# #101

1. Create an account in [Kaggle](https://www.kaggle.com/)
2. Go to your account (https://www.kaggle.com/{username}/account)
3. Generate a new API Token if you don't have one
4. Upload the downloaded `kaggle.json` in this notebook folder

# Installing Dependecies

In [64]:
!pip install kaggle pandas nltk tensorflow scikit-learn



In [12]:
!mkdir ~/.kaggle

mkdir: cannot create directory ‘/home/borba/.kaggle’: File exists


In [13]:
!cp kaggle/kaggle.json ~/.kaggle

!chmod 600 ~/.kaggle/kaggle.json

# Downloading Dataset

In [14]:
!kaggle datasets download -d stefanlarson/outofscope-intent-classification-dataset

Downloading outofscope-intent-classification-dataset.zip to /home/borba/Workplace/college/IF704-ChatBot/notebooks
100%|█████████████████████████████████████████| 285k/285k [00:00<00:00, 879kB/s]
100%|█████████████████████████████████████████| 285k/285k [00:00<00:00, 876kB/s]


In [3]:
import os

if not os.path.exists('./dataset'):
        os.makedirs('./dataset')

In [4]:
!mv outofscope-intent-classification-dataset.zip ./dataset

mv: cannot stat 'outofscope-intent-classification-dataset.zip': No such file or directory


In [18]:
import zipfile

with zipfile.ZipFile('./dataset/outofscope-intent-classification-dataset.zip', 'r') as zip_ref:
    zip_ref.extractall('./dataset')

# Setup Dependencies

In [5]:
import pandas as pd

# ignore words
import nltk
nltk.download('stopwords')

# tokenize and vetorize text
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# one-hot encoding labels
from sklearn import preprocessing
import numpy as np
from tensorflow.keras.utils import to_categorical

# deep learning
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation, Conv2D
from tensorflow.keras.layers import Input, Dense
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to /home/borba/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2021-12-04 08:57:25.671188: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-04 08:57:25.671225: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [6]:
from nltk.corpus import stopwords
words = set(stopwords.words("english"))
print(words)

{"it's", 'is', 'of', 'hasn', 'so', 'were', 'doesn', 'this', 'or', 'can', 'its', 'herself', 'hadn', 'them', 't', 'being', 'be', 'needn', 'won', "mightn't", 'same', 'you', 'm', 'wasn', "needn't", 'having', 'had', 'too', 'but', 'was', 'has', 'those', 'own', 'does', 'did', "wasn't", 'about', 'll', "wouldn't", 'themselves', "weren't", 'him', 'on', 'between', 're', 'his', 'their', 'yours', 'it', 'hers', 'who', 'any', 'if', 'from', "don't", 'shan', 'at', 'with', "couldn't", 'am', 'above', 'other', 'ourselves', 'most', 'will', 'myself', 'aren', "shan't", "doesn't", 'what', 'down', 'just', 'they', 'shouldn', "hadn't", 'she', 'very', 'that', 'been', 'further', 'over', 'how', 'didn', 'during', 'by', 'ma', "isn't", 'after', 'ours', "didn't", "won't", 'which', 'ain', "aren't", 'mustn', 'no', 's', 'when', 'haven', 'while', 'once', 'not', 'mightn', 'into', 'd', 'y', 'these', 'here', 'me', 'until', "mustn't", 'doing', "you'll", 'an', 'why', 'a', 'her', 'more', "should've", 'weren', 'then', 'in', "hasn

# Pre-processing Data

In [7]:
scope_test_json = pd.read_json("./dataset/is_test.json")
scope_train_json = pd.read_json("./dataset/is_train.json")
scope_val_json = pd.read_json("./dataset/is_val.json")

In [8]:
# Extracting dialogs
scope_test_text = [str(line).strip() for line in scope_test_json[0]]
scope_train_text = [str(line).strip() for line in scope_train_json[0]]

model_text = scope_train_text + scope_test_text
scope_val_text = [str(line).strip() for line in scope_val_json[0]]

In [9]:
# Extracting labels
scope_test_labels = [str(line).strip() for line in scope_test_json[1]]
scope_train_labels = [str(line).strip() for line in scope_train_json[1]]

model_labels = scope_train_labels + scope_test_labels
scope_val_labels = [str(line).strip() for line in scope_val_json[1]]

# Tokenize words from dialogues

In [10]:
tok = Tokenizer()
tok.fit_on_texts(model_text)
word_index = tok.word_index

# Vectorizing dialogues

In [11]:
model_tokens = tok.texts_to_sequences(model_text)

max_vocabulary_size = len(word_index) + 1
input_length = max(map(lambda x: len(x), model_tokens))

In [12]:
model_input = pad_sequences(model_tokens, input_length)

validation_tokens = tok.texts_to_sequences(scope_val_text)
validation_input = pad_sequences(validation_tokens, input_length)

# One-hot encoding labels

In [13]:
label_transformer = preprocessing.LabelEncoder()
label_transformer.fit(model_labels)

encoded_validation_labels = label_transformer.transform(scope_val_labels)
encoded_model_labels = label_transformer.transform(model_labels)

In [14]:
categorical_validation_labels = to_categorical(np.asarray(encoded_validation_labels))
categorical_model_labels = to_categorical(np.asarray(encoded_model_labels))

# Split train data to isolate test dataset

In [15]:
X_train, X_val, y_train, y_val = train_test_split(model_input, categorical_model_labels, test_size=0.2, random_state=13)

# Learning

In [87]:
import keras.backend as K

def precision(y_true, y_pred):

    # Count positive samples.
    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))

    # If there are no true samples, fix the F1 score at 0.
    if c3 == 0:
        return 0.0

    # How many selected items are relevant?
    return c1 / c2

def recall(y_true, y_pred):

    # Count positive samples.
    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))

    # If there are no true samples, fix the F1 score at 0.
    if c3 == 0:
        return 0.0

    # How many relevant items are selected?
    return c1 / c3

def f1_score(y_true, y_pred):

    # Count positive samples.
    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))

    # If there are no true samples, fix the F1 score at 0.
    if c3 == 0:
        return 0.0

    # How many selected items are relevant?
    precision = c1 / c2

    # How many relevant items are selected?
    recall = c1 / c3

    # Calculate f1_score
    f1_score = 2 * (precision * recall) / (precision + recall)
    return f1_score

In [88]:
from tensorflow.keras import metrics

model = Sequential([
  Embedding(max_vocabulary_size, 300, input_length=input_length),                 
  Conv1D(filters=32, kernel_size=8, activation='relu'),
  MaxPooling1D(pool_size=3),
  Flatten(),
  Dense(180, activation='relu'),
  Dense(150, activation='sigmoid')
])

model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['accuracy', precision, recall, f1_score])

In [89]:
model.summary()

Model: "sequential_16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_16 (Embedding)    (None, 28, 300)           1787400   
                                                                 
 conv1d_16 (Conv1D)          (None, 21, 32)            76832     
                                                                 
 max_pooling1d_16 (MaxPoolin  (None, 7, 32)            0         
 g1D)                                                            
                                                                 
 flatten_16 (Flatten)        (None, 224)               0         
                                                                 
 dense_32 (Dense)            (None, 180)               40500     
                                                                 
 dense_33 (Dense)            (None, 150)               27150     
                                                     

In [90]:
model.fit(X_train, y_train, epochs=6, verbose=1)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7fbab8c88220>

In [92]:
model.evaluate(X_val, y_val)



[0.42374059557914734,
 0.904358983039856,
 0.3591681718826294,
 0.898455798625946,
 0.5102159976959229]

# Predicting

In [93]:
predictions = model.predict(validation_input)

In [94]:
def acc(y_true, y_pred):
    return np.equal(np.argmax(y_true, axis=-1), np.argmax(y_pred, axis=-1)).mean()

print(acc(categorical_validation_labels, predictions))

0.8523333333333334


In [95]:
def get_intent(sentence):
  data = [[sentence]]
  df = pd.DataFrame(data)
  input = df[0]
  input = tok.texts_to_sequences(input)
  input = pad_sequences(input, input_length)
  prediction = model.predict(input)

  return model_labels[np.where(encoded_model_labels == np.argmax(prediction))[0][0]]

# Test

In [96]:
get_intent("how can i say meet me in the bar in spanish")

'translate'

In [97]:
get_intent("translate hello word to spanish")

'translate'

In [98]:
get_intent("how can i say posso pegar teu carro emprestado in english")

'translate'