In [133]:
import csv
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM, Dropout,Embedding, Bidirectional
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [134]:
vocab_size = 5000
embedding_dim = 64
max_length = 200
oov_tok = '<OOV>' #  Out of Vocabulary
training_portion = 0.8

In [135]:
articles = []
labels = []

with open("Data/main.csv", 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)
    for row in reader:
        labels.append(row[1])
        article = row[0]
        for word in STOPWORDS:
            token = ' ' + word + ' '
            article = article.replace(token, ' ')
            article = article.replace(' ', ' ')
        articles.append(article)

In [136]:
len(labels),len(articles)

(8752, 8752)

In [137]:
print(labels[0])

weather


In [138]:
train_size = int(len(articles) * training_portion)

train_articles = articles[0: train_size]
train_labels = labels[0: train_size]

validation_articles = articles[train_size:]
validation_labels = labels[train_size:]

In [139]:
print('train_articles' ,len(train_articles))
print('train_labels', len(train_labels))
print('validation_articles', len(validation_articles))
print('validation_labels', len(validation_labels))

train_articles 7001
train_labels 7001
validation_articles 1751
validation_labels 1751


In [140]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_articles)
word_index = tokenizer.word_index

In [141]:
word_index

{'<OOV>': 1,
 'the': 2,
 'is': 3,
 'you': 4,
 'what': 5,
 'in': 6,
 'to': 7,
 'i': 8,
 'for': 9,
 'who': 10,
 'on': 11,
 'and': 12,
 'my': 13,
 'me': 14,
 'of': 15,
 'it': 16,
 'a': 17,
 'be': 18,
 'are': 19,
 'do': 20,
 'at': 21,
 'they': 22,
 'weather': 23,
 'will': 24,
 'this': 25,
 'how': 26,
 'play': 27,
 'can': 28,
 'their': 29,
 'tell': 30,
 'like': 31,
 'okay': 32,
 'time': 33,
 'next': 34,
 'today': 35,
 'please': 36,
 'about': 37,
 'have': 38,
 'did': 39,
 'with': 40,
 'that': 41,
 'place': 42,
 'last': 43,
 "who's": 44,
 'calendar': 45,
 'appointment': 46,
 'check': 47,
 'know': 48,
 "what's": 49,
 'week': 50,
 'when': 51,
 'going': 52,
 'march': 53,
 'your': 54,
 'now': 55,
 'there': 56,
 'schedule': 57,
 'am': 58,
 'set': 59,
 'event': 60,
 'need': 61,
 'want': 62,
 'add': 63,
 'goalkeeper': 64,
 "i'm": 65,
 'day': 66,
 'game': 67,
 'pm': 68,
 'forecast': 69,
 'starting': 70,
 'first': 71,
 'date': 72,
 'name': 73,
 'free': 74,
 'was': 75,
 'events': 76,
 'hi': 77,
 'would

In [142]:
tokenizer.texts_to_sequences(['what will be the weather tommorrow morning ?'])

[[5, 24, 18, 2, 23, 1, 168]]

In [143]:
train_sequences = tokenizer.texts_to_sequences(train_articles)
len(train_sequences)

7001

In [144]:
train_sequences[0]

[24, 16, 18, 387, 6, 434, 283, 11, 318]

In [145]:
train_padded = pad_sequences(train_sequences, maxlen=max_length,)

In [146]:
len(train_padded),len(train_padded[0])

(7001, 200)

In [147]:
validation_sequences = tokenizer.texts_to_sequences(validation_articles)
validation_padded = pad_sequences(validation_sequences, maxlen=max_length)

In [148]:
set(labels)

{'general',
 'greeting',
 'recommendation',
 'schedule',
 'sports',
 'thank',
 'weather'}

In [149]:
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)

training_label_seq = np.array(label_tokenizer.texts_to_sequences(train_labels))
validation_label_seq = np.array(label_tokenizer.texts_to_sequences(validation_labels))

In [150]:
validation_label_seq

array([[3],
       [1],
       [4],
       ...,
       [5],
       [4],
       [1]])

In [151]:
# train_labels[744], training_label_seq[744]

In [152]:
model = Sequential()
model.add(Embedding(vocab_size,embedding_dim ))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(embedding_dim)))
model.add(Dense(8,activation='softmax' ))
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, None, 64)          320000    
                                                                 
 dropout_4 (Dropout)         (None, None, 64)          0         
                                                                 
 bidirectional_4 (Bidirectio  (None, 128)              66048     
 nal)                                                            
                                                                 
 dense_4 (Dense)             (None, 8)                 1032      
                                                                 
Total params: 387,080
Trainable params: 387,080
Non-trainable params: 0
_________________________________________________________________


In [153]:
opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)
model.compile(loss='sparse_categorical_crossentropy',optimizer=opt,metrics=['accuracy'])

In [154]:
num_epochs = 8
history = model.fit(train_padded, training_label_seq, epochs=num_epochs, validation_data=(validation_padded, validation_label_seq), verbose=2)

Epoch 1/8
219/219 - 41s - loss: 0.8693 - accuracy: 0.7039 - val_loss: 0.2691 - val_accuracy: 0.9200 - 41s/epoch - 185ms/step
Epoch 2/8
219/219 - 35s - loss: 0.2242 - accuracy: 0.9334 - val_loss: 0.1838 - val_accuracy: 0.9412 - 35s/epoch - 159ms/step
Epoch 3/8
219/219 - 32s - loss: 0.1459 - accuracy: 0.9573 - val_loss: 0.1495 - val_accuracy: 0.9537 - 32s/epoch - 147ms/step
Epoch 4/8
219/219 - 33s - loss: 0.0991 - accuracy: 0.9710 - val_loss: 0.1328 - val_accuracy: 0.9600 - 33s/epoch - 150ms/step
Epoch 5/8
219/219 - 34s - loss: 0.0746 - accuracy: 0.9781 - val_loss: 0.1293 - val_accuracy: 0.9612 - 34s/epoch - 153ms/step
Epoch 6/8
219/219 - 32s - loss: 0.0594 - accuracy: 0.9834 - val_loss: 0.1461 - val_accuracy: 0.9583 - 32s/epoch - 146ms/step
Epoch 7/8
219/219 - 31s - loss: 0.0495 - accuracy: 0.9843 - val_loss: 0.1226 - val_accuracy: 0.9629 - 31s/epoch - 141ms/step
Epoch 8/8
219/219 - 31s - loss: 0.0391 - accuracy: 0.9889 - val_loss: 0.1177 - val_accuracy: 0.9652 - 31s/epoch - 142ms/step


In [155]:
txt_list = ["where can I buy a shirt from a shop nearby?","how are you today?","Who won the final match in champions league?",
            "will it be windy tommorrow?","it was a really bad day!","tell me a movie to watch tonight"
            ,"I have an important meeting at 2 pm.","that is so boring"]
labels = ['sports','schedule','general','weather','recommendation','greeting','thank']
answer = list()
for txt in txt_list:
    seq = tokenizer.texts_to_sequences([txt])
    padded = pad_sequences(seq, maxlen=max_length)
    pred = model.predict(padded)
    try:
        label = labels[np.argmax(pred)-1] 
        answer.append(label)
    except:
        answer.append('general')
print(answer)

['recommendation', 'general', 'sports', 'weather', 'general', 'recommendation', 'schedule', 'general']
