<a href="https://colab.research.google.com/github/ayushs0911/NLP-Projects/blob/main/Intent_Classification_using_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re
import nltk 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
!pip install -q kaggle
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json
! kaggle datasets download -d bitext/training-dataset-for-chatbotsvirtual-assistants


Downloading training-dataset-for-chatbotsvirtual-assistants.zip to /content
  0% 0.00/1.16M [00:00<?, ?B/s]
100% 1.16M/1.16M [00:00<00:00, 110MB/s]


In [3]:
!unzip /content/training-dataset-for-chatbotsvirtual-assistants.zip

Archive:  /content/training-dataset-for-chatbotsvirtual-assistants.zip
  inflating: 20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample/20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample/20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample.csv  
  inflating: 20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample/20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample/20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample.xlsx  
  inflating: 20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample/20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample/LICENSE.txt  
  inflating: 20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample/20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample/README.txt  
  inflating: Bitext_Sample_Customer_Service_Tra

In [4]:
df = pd.read_csv("/content/20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample/20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample/20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample.csv")

In [5]:
df.head()

Unnamed: 0,flags,utterance,category,intent
0,BILC,"I don't have an online account, what do I have...",ACCOUNT,create_account
1,BILQZ,can you tell me if i can regisger two accounts...,ACCOUNT,create_account
2,BPLC,"I have no online account, open one, please",ACCOUNT,create_account
3,BIPLD,"could you ask an agent how to open an account,...",ACCOUNT,create_account
4,BLQC,"i want an online account, create one",ACCOUNT,create_account


In [6]:
intents = list(set(df['intent']))
intents

['switch_account',
 'track_order',
 'complaint',
 'edit_account',
 'recover_password',
 'check_invoices',
 'newsletter_subscription',
 'delete_account',
 'check_cancellation_fee',
 'contact_human_agent',
 'change_order',
 'get_refund',
 'delivery_period',
 'delivery_options',
 'review',
 'check_payment_methods',
 'change_shipping_address',
 'get_invoice',
 'cancel_order',
 'registration_problems',
 'payment_issue',
 'create_account',
 'contact_customer_service',
 'check_refund_policy',
 'set_up_shipping_address',
 'place_order',
 'track_refund']

In [7]:
num_classes = len(intents)
num_classes

27

In [8]:
dict_intents = {intents[i]: i for i in range(len(intents))}

In [9]:
dict_intents


{'switch_account': 0,
 'track_order': 1,
 'complaint': 2,
 'edit_account': 3,
 'recover_password': 4,
 'check_invoices': 5,
 'newsletter_subscription': 6,
 'delete_account': 7,
 'check_cancellation_fee': 8,
 'contact_human_agent': 9,
 'change_order': 10,
 'get_refund': 11,
 'delivery_period': 12,
 'delivery_options': 13,
 'review': 14,
 'check_payment_methods': 15,
 'change_shipping_address': 16,
 'get_invoice': 17,
 'cancel_order': 18,
 'registration_problems': 19,
 'payment_issue': 20,
 'create_account': 21,
 'contact_customer_service': 22,
 'check_refund_policy': 23,
 'set_up_shipping_address': 24,
 'place_order': 25,
 'track_refund': 26}

In [10]:
def encoder(df, input_col = 'intent', output_col = 'encoded'):
  df.loc[:, output_col] = df.loc[:, input_col].apply(lambda x : dict_intents[x])
  return df 

In [11]:
df = encoder(df)

In [12]:
df.head()

Unnamed: 0,flags,utterance,category,intent,encoded
0,BILC,"I don't have an online account, what do I have...",ACCOUNT,create_account,21
1,BILQZ,can you tell me if i can regisger two accounts...,ACCOUNT,create_account,21
2,BPLC,"I have no online account, open one, please",ACCOUNT,create_account,21
3,BIPLD,"could you ask an agent how to open an account,...",ACCOUNT,create_account,21
4,BLQC,"i want an online account, create one",ACCOUNT,create_account,21


In [13]:
df.drop(columns = ['flags', 'category', 'intent'], inplace = True)

In [14]:
df.head()

Unnamed: 0,utterance,encoded
0,"I don't have an online account, what do I have...",21
1,can you tell me if i can regisger two accounts...,21
2,"I have no online account, open one, please",21
3,"could you ask an agent how to open an account,...",21
4,"i want an online account, create one",21


In [15]:
data = df.copy()

In [16]:
data['tokenized'] = data['utterance'].apply(lambda x : word_tokenize(x))

In [17]:
data.head()

Unnamed: 0,utterance,encoded,tokenized
0,"I don't have an online account, what do I have...",21,"[I, do, n't, have, an, online, account, ,, wha..."
1,can you tell me if i can regisger two accounts...,21,"[can, you, tell, me, if, i, can, regisger, two..."
2,"I have no online account, open one, please",21,"[I, have, no, online, account, ,, open, one, ,..."
3,"could you ask an agent how to open an account,...",21,"[could, you, ask, an, agent, how, to, open, an..."
4,"i want an online account, create one",21,"[i, want, an, online, account, ,, create, one]"


In [18]:

def remove_stop(words):
  filtered = [x for x in words if not x.lower() in stop_words]
  return filtered

In [19]:
def punctuation(words):
  filtered = []
  for word in words:
    if re.findall("[()!><.,`?']", word):
      pass
    else:
      filtered.append(word)
    
  return filtered 


In [20]:
def clean_text(data, input_col = "tokenized", output_col = "clean_text"):
  texts = data.loc[:, input_col]
  word = []

  for text in texts:
    t = remove_stop(text)
    t = punctuation(t)
    t = " ".join([str(i) for i in t])
    word.append(t)

  data.loc[:, output_col] = word
  return data


In [21]:
data = clean_text(data)

In [22]:
data.head()

Unnamed: 0,utterance,encoded,tokenized,clean_text
0,"I don't have an online account, what do I have...",21,"[I, do, n't, have, an, online, account, ,, wha...",online account register
1,can you tell me if i can regisger two accounts...,21,"[can, you, tell, me, if, i, can, regisger, two...",tell regisger two accounts single email address
2,"I have no online account, open one, please",21,"[I, have, no, online, account, ,, open, one, ,...",online account open one please
3,"could you ask an agent how to open an account,...",21,"[could, you, ask, an, agent, how, to, open, an...",could ask agent open account please
4,"i want an online account, create one",21,"[i, want, an, online, account, ,, create, one]",want online account create one


In [23]:
data.drop(columns = ['utterance', 'tokenized',], inplace = True)

In [24]:
data.head()

Unnamed: 0,encoded,clean_text
0,21,online account register
1,21,tell regisger two accounts single email address
2,21,online account open one please
3,21,could ask agent open account please
4,21,want online account create one


In [25]:
len(data)

21534

In [26]:
from sklearn.model_selection import train_test_split 
X_train, X_val, y_train, y_val = train_test_split(data['clean_text'].to_numpy(),
                                                  data['encoded'].to_numpy(),
                                                  test_size = 0.1,
                                                  random_state = 0)

In [27]:
len(X_train), len(X_val)

(19380, 2154)

In [28]:
X_train[:5]

array(['problem paying online order need help reporting',
       'online account need help opening one',
       'user account hacked get back',
       'need information send email Client Service',
       'cotnact Client Service'], dtype=object)

In [38]:
y_train[:5]

array([20, 21,  4, 22, 22])

In [29]:
count = 0
for text in X_train:
  count += len(text.split())

In [30]:
max_length = np.round(count/len(X_train))
max_length

5.0

In [31]:
max_vocab_length = 10000

In [32]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [33]:
text_vectorizer = TextVectorization(max_tokens = max_vocab_length, 
                                    output_mode = 'int',
                                    output_sequence_length = 5)

In [34]:
text_vectorizer.adapt(X_train)

In [35]:
import random 
random_sent = random.choice(X_train)

In [36]:
print(random_sent)

want user account cancel


In [37]:
text_vectorizer(random_sent)

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([  2,  35,   4, 106,   0])>

In [39]:
from tensorflow.keras import layers 

In [41]:
embeddings = layers.Embedding(input_dim = max_vocab_length,
                              output_dim = 128, 
                              input_length = max_length)

In [62]:
inputs = layers.Input(shape= (1, ), dtype = 'string')
x = text_vectorizer(inputs)
x = embeddings(x)

x = layers.LSTM(64)(x)
x = layers.Dropout(0.5)(x)
x = layers.Dense(800, activation = 'relu')(x)
x = layers.Dropout(0.5)(x)
x = layers.Dense(200, activation = 'relu')(x)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(num_classes, activation = 'softmax')(x)

model = tf.keras.Model(inputs, outputs)


In [65]:
model.compile(loss = 'sparse_categorical_crossentropy',
              optimizer = tf.keras.optimizers.Adam(), 
              metrics = ['accuracy'])

In [66]:
model.fit(X_train, y_train, 
          epochs = 1,
          validation_data = (X_val, y_val))



<keras.callbacks.History at 0x7f5801f29ff0>

In [68]:
pred_probs = model.predict(X_val)



In [78]:
predictions = []

for i, x in enumerate(pred_probs):
  predictions.append(np.argmax(pred_probs[i]))

In [81]:
predictions[:5]

[0, 1, 22, 4, 5]

In [82]:
from sklearn.metrics import accuracy_score

In [85]:
accuracy = accuracy_score(y_pred = predictions, 
                          y_true = y_val) 
accuracy*100

95.636025998143

In [119]:
sentence = "I've got no account, can I create one?"

In [120]:
sentence = word_tokenize(sentence)

In [121]:
sentance = [ x for x in sentence if not x.lower() in stop_words]

In [122]:
sentance = " ".join([i for i in sentance])

In [123]:
np.argmax(model.predict([sentance]))



21

In [124]:
def find_key(dictionary, value):
    for key, val in dictionary.items():
        if val == value:
            return key
    return key

In [126]:
find_key(dict_intents, value = 21)

'create_account'