In [1]:
import pandas as pd
import numpy as np
import nltk
import warnings
warnings.filterwarnings('ignore')

Read Data

In [2]:
df = pd.read_csv("./LabelledData.txt",sep=",,,",header=None ,names=['ques','label'])

Get some feel of data

In [3]:
df.head(10)

Unnamed: 0,ques,label
0,how did serfdom develop in and then leave russ...,unknown
1,what films featured the character popeye doyle ?,what
2,how can i find a list of celebrities ' real na...,unknown
3,what fowl grabs the spotlight after the chines...,what
4,what is the full form of .com ?,what
5,what contemptible scoundrel stole the cork fro...,what
6,what team did baseball 's st. louis browns bec...,what
7,what is the oldest profession ?,what
8,what are liver enzymes ?,what
9,name the scar-faced bounty hunter of the old w...,unknown


In [4]:
df.shape

(1483, 2)

In [5]:
df['label'].values

array([' unknown', ' what', ' unknown', ..., ' affirmation',
       ' affirmation', ' affirmation'], dtype=object)

To remove the unnecessary spaces we have to use strip function 

In [6]:
df['label']=df['label'].str.strip()

In [7]:
df['label'].values

array(['unknown', 'what', 'unknown', ..., 'affirmation', 'affirmation',
       'affirmation'], dtype=object)

In [8]:
df['ques'].values

array(['how did serfdom develop in and then leave russia ? ',
       'what films featured the character popeye doyle ? ',
       "how can i find a list of celebrities ' real names ? ", ...,
       'does this hose have one ? ', 'can i get it in india ? ',
       'would this work on a 2008 ford edge with a naked roof ? '],
      dtype=object)

Questions have unnecessary character like ? at the end also we like to have all characters uniform so lets convert everyone to lower

In [9]:
df['ques'] = df['ques'].apply(lambda x: x.lower())

In [10]:
df['ques'].values

array(['how did serfdom develop in and then leave russia ? ',
       'what films featured the character popeye doyle ? ',
       "how can i find a list of celebrities ' real names ? ", ...,
       'does this hose have one ? ', 'can i get it in india ? ',
       'would this work on a 2008 ford edge with a naked roof ? '],
      dtype=object)

Now remove bad characters

In [11]:
import re
df['ques'] = df['ques'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

In [12]:
df['ques'].values

array(['how did serfdom develop in and then leave russia  ',
       'what films featured the character popeye doyle  ',
       'how can i find a list of celebrities  real names  ', ...,
       'does this hose have one  ', 'can i get it in india  ',
       'would this work on a 2008 ford edge with a naked roof  '],
      dtype=object)

Extra space at will already be taken care by stemming

In [13]:
df['ques'].values

array(['how did serfdom develop in and then leave russia  ',
       'what films featured the character popeye doyle  ',
       'how can i find a list of celebrities  real names  ', ...,
       'does this hose have one  ', 'can i get it in india  ',
       'would this work on a 2008 ford edge with a naked roof  '],
      dtype=object)

In [14]:
df['label'].value_counts()

what           609
who            402
unknown        272
affirmation    104
when            96
Name: label, dtype: int64

In [15]:
from nltk.stem import SnowballStemmer
from nltk import word_tokenize

In [16]:
stemmer = SnowballStemmer('english').stem

In [17]:
def stem_tokenize(que):
    return [stemmer(i) for i in word_tokenize(que)]

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(analyzer='word',lowercase=True,tokenizer=stem_tokenize)

In [19]:
import pickle as pkl
with open('vectorizer.pk', 'wb') as fin:
    pkl.dump(vectorizer, fin)

In [20]:
X_train = vectorizer.fit_transform(df['ques'].values)


In [21]:
print(X_train)

  (0, 1476)	0.2001182106667961
  (0, 915)	0.2164717885466938
  (0, 2570)	0.44430594185596844
  (0, 901)	0.3908042029254287
  (0, 1513)	0.1559749199122551
  (0, 227)	0.21472868956601282
  (0, 2881)	0.44430594185596844
  (0, 1710)	0.36336093563745936
  (0, 2508)	0.4038334387467139
  (1, 3116)	0.1235234721399405
  (1, 1171)	0.37249295925536097
  (1, 1151)	0.4335020087076065
  (1, 2877)	0.10822427463109435
  (1, 613)	0.3994898937121077
  (1, 2275)	0.47965357299329037
  (1, 977)	0.5066505074500371
  (2, 1476)	0.21639785501877104
  (2, 545)	0.26162555778836594
  (2, 1500)	0.2689391174420063
  (2, 1175)	0.32355459878126197
  (2, 128)	0.18630174321482454
  (2, 1744)	0.4366853453189645
  (2, 2100)	0.15562402195047553
  (2, 589)	0.45484943942921086
  (2, 2397)	0.4366853453189645
  :	:
  (1479, 1580)	0.42648606740706074
  (1479, 356)	0.4662673594796969
  (1479, 2422)	0.6492839321004531
  (1480, 946)	0.30548170589714263
  (1480, 2117)	0.5030643155825888
  (1480, 1405)	0.4066523770633723
  (1480, 2

In [22]:
split_ratio=0.2

indices = np.arange(X_train.shape[0])
np.random.shuffle(indices)
X_train = X_train[indices]
labels = df['label']
labels = labels[indices]
test_samples = int(split_ratio * X_train.shape[0])

In [23]:
x_train = X_train[:-test_samples]
y_train = labels[:-test_samples]
x_test = X_train[-test_samples:]
y_test = labels[-test_samples:]

In [24]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB(alpha=1.0)
clf.fit(x_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [25]:
pred = clf.predict(x_test)

In [26]:
from sklearn.metrics import classification_report
print(classification_report(pred,y_test))

             precision    recall  f1-score   support

affirmation       0.06      1.00      0.11         1
    unknown       0.48      1.00      0.64        29
       what       0.99      0.53      0.69       205
       when       0.00      0.00      0.00         0
        who       0.66      0.97      0.78        61

avg / total       0.87      0.67      0.70       296



In [27]:
print("Accuracy : {}".format(clf.score(x_test,y_test)))

Accuracy : 0.6655405405405406


Such a low accuracy shows that the model is not able to capture the underlying meaning of the sentence it means we have to use the model like LSTM which captures the long term dependencies

In [28]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [29]:
tokenizer = Tokenizer(num_words=30000, split=' ')
tokenizer.fit_on_texts(df['ques'].values)
X = tokenizer.texts_to_sequences(df['ques'].values)
X = pad_sequences(X, maxlen=30)

In [30]:
X

array([[   0,    0,    0, ..., 1135,  288,  382],
       [   0,    0,    0, ...,  128,  612, 1137],
       [   0,    0,    0, ..., 1138,  384,  147],
       ...,
       [   0,    0,    0, ..., 1126,   40,  133],
       [   0,    0,    0, ...,   26,    6,  369],
       [   0,    0,    0, ...,    8,  942, 3685]])

In [31]:
from sklearn.preprocessing import LabelEncoder

In [32]:
e = LabelEncoder()

In [33]:
e.fit(labels)
labels = e.transform(labels) 

In [34]:
from keras.utils.np_utils import to_categorical
labels = to_categorical(np.asarray(labels))

In [35]:
labels

array([[0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       ...,
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0.]])

In [36]:
embeddings_index = {}
f = open('./glove.42B.300d.txt', encoding="utf8")
for line in f:
    w = line.split()
    word = w[0]
    embed = np.asarray(w[1:], dtype='float32')
    embeddings_index[word] = embed
f.close()

In [37]:
len(embeddings_index)

1917494

In [38]:
word_index = tokenizer.word_index
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [39]:
indices = np.arange(X.shape[0])
np.random.shuffle(indices)
X = X[indices]
labels = labels[indices]
val_samples = int(0.2 * X.shape[0])

x_train = X[:-val_samples]
y_train = labels[:-val_samples]
x_val = X[-val_samples:]
y_val = labels[-val_samples:]

In [232]:
from keras.layers import Embedding

embedding_layer = Embedding(len(word_index) + 1,
                            300,
                            weights=[embedding_matrix],
                            input_length=30,
                            trainable=True)

In [233]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.layers import Bidirectional, Conv1D, GlobalMaxPooling1D, SpatialDropout1D

In [234]:
model = Sequential()
model.add(embedding_layer)
model.add(Bidirectional(LSTM(units=256, dropout=0.1, recurrent_dropout=0.1,return_sequences=True)))
model.add(Conv1D(64, kernel_size=5, padding='valid', kernel_initializer='glorot_uniform'))
model.add(GlobalMaxPooling1D())
model.add(Dense(units=5, activation="softmax"))
model.compile(loss = 'categorical_crossentropy', optimizer='rmsprop',metrics = ['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 30, 300)           1105800   
_________________________________________________________________
bidirectional_9 (Bidirection (None, 30, 512)           1140736   
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 26, 64)            163904    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 64)                0         
_________________________________________________________________
dense_9 (Dense)              (None, 5)                 325       
Total params: 2,410,765
Trainable params: 2,410,765
Non-trainable params: 0
_________________________________________________________________
None


In [261]:
model.fit(x_train, y_train,
          batch_size=128,
          epochs=10,
          validation_data=(x_val, y_val))

Train on 1187 samples, validate on 296 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x10d953358>

In [262]:
example = tokenizer.texts_to_sequences(["what time does the train leave"])
example = pad_sequences(example, maxlen=30)

In [263]:
e.inverse_transform(np.argmax(model.predict(example)))

'what'

Since the data has a class imabalance highest with 'what' class thats why it is predicting most of the times with that class only due to time limitation i have not augment the data set with class 'when' which is only 90  also the validation accuracy does not show any improvement for the model we can conclude that the dataset should be well prepared an then trained.