## Niki.ai

In [1]:
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt

%matplotlib inline

ModuleNotFoundError: No module named 'nltk'

In [2]:
df = pd.read_csv("label.txt",sep=",,,",header=None ,names=['question','type'])

  """Entry point for launching an IPython kernel.


In [4]:
df.head()

Unnamed: 0,question,type
0,how did serfdom develop in and then leave russ...,unknown
1,what films featured the character popeye doyle ?,what
2,how can i find a list of celebrities ' real na...,unknown
3,what fowl grabs the spotlight after the chines...,what
4,what is the full form of .com ?,what


In [5]:
df.shape

(1483, 2)

In [9]:
df['type']=df['type'].str.strip()

In [10]:
df['type'].unique()

array(['unknown', 'what', 'when', 'who', 'affirmation'], dtype=object)

In [11]:
df['question'].values

array(['how did serfdom develop in and then leave russia ? ',
       'what films featured the character popeye doyle ? ',
       "how can i find a list of celebrities ' real names ? ", ...,
       'does this hose have one ? ', 'can i get it in india ? ',
       'would this work on a 2008 ford edge with a naked roof ? '], dtype=object)

In [35]:
df['question'] = df['question'].apply(lambda x: x.lower())
df['question'] = df['question'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

In [41]:
VALIDATION_SPLIT=0.20

## Naive Bayes

In [55]:
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle as pkl
from sklearn.naive_bayes import MultinomialNB
# organize imports
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [37]:
from nltk.stem import SnowballStemmer
from nltk import word_tokenize
from nltk.corpus import wordnet as wn

In [38]:
class StemTokenizer(object):
    def __init__(self):
        self.ignore_set = {'footnote', 'nietzsche', 'plato', 'mr.'}

    def __call__(self, doc):
        words = []
        for word in word_tokenize(doc):
            word = word.lower()
            w = wn.morphy(word)
            if w and len(w) > 1 and w not in self.ignore_set:
                words.append(w)
        return words

In [60]:
stemmer = SnowballStemmer('english').stem
def stem_tokenize(text):
    return [stemmer(i) for i in word_tokenize(text)]

### Using Count Vectorizer 

In [86]:
vectorizer = CountVectorizer(analyzer='word',lowercase=True,tokenizer=stem_tokenize)
X_train = vectorizer.fit_transform(df.question.values)
with open('vectorizer.pk', 'wb') as fin:
    pkl.dump(vectorizer, fin)

In [87]:
labels = data['type']

In [88]:
# split the data into a training set and a validation set
indices = np.arange(X_train.shape[0])
np.random.shuffle(indices)
X_train = X_train[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * X_train.shape[0])

x_train = X_train[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = X_train[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

In [89]:
clf = MultinomialNB()
clf.fit(x_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [90]:
# evaluate the model of test data
preds = clf.predict(x_val)
print(classification_report(preds,y_val))
print("Accuracy :",clf.score(x_val,y_val))

             precision    recall  f1-score   support

affirmation       0.61      1.00      0.76        17
    unknown       0.72      0.85      0.78        52
       what       0.98      0.78      0.87       139
       when       0.33      0.75      0.46         8
        who       0.96      0.94      0.95        80

avg / total       0.89      0.85      0.86       296

Accuracy : 0.847972972973


In [91]:
example=vectorizer.transform(["What time does the train leave"])
clf.predict(example)

array(['what'],
      dtype='<U11')

### Using TF-IDF (though bad choice for short sequences or corpus)

In [92]:
tf_vectorizer = TfidfVectorizer(analyzer='word',lowercase=True,tokenizer=stem_tokenize)
X_train = tf_vectorizer.fit_transform(df.question.values)
with open('tf_vectorizer.pk', 'wb') as fin:
    pkl.dump(tf_vectorizer, fin)

In [93]:
labels = data['type']
# split the data into a training set and a validation set
indices = np.arange(X_train.shape[0])
np.random.shuffle(indices)
X_train = X_train[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * X_train.shape[0])

x_train = X_train[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = X_train[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

In [94]:
clf = MultinomialNB()
clf.fit(x_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [95]:
# evaluate the model of test data
preds = clf.predict(x_val)
print(classification_report(preds,y_val))
print("Accuracy :",clf.score(x_val,y_val))

             precision    recall  f1-score   support

affirmation       0.10      1.00      0.18         2
    unknown       0.45      1.00      0.62        32
       what       1.00      0.56      0.72       194
       when       0.00      0.00      0.00         0
        who       0.80      0.88      0.84        68

avg / total       0.89      0.69      0.73       296

Accuracy : 0.685810810811


  'recall', 'true', average, warn_for)


In [96]:
example=tf_vectorizer.transform(["What time does the train leave"])
clf.predict(example)

array(['what'],
      dtype='<U11')

## LSTM

In [24]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical

In [12]:
MAX_NB_WORDS = 20000
MAX_SEQUENCE_LENGTH=30

In [16]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import re

In [21]:
data=df.copy()

In [22]:
print(data['type'].value_counts())

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, split=' ')
tokenizer.fit_on_texts(data['question'].values)
X = tokenizer.texts_to_sequences(data['question'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)

what           609
who            402
unknown        272
affirmation    104
when            96
Name: type, dtype: int64




In [75]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Y = data['type']
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(Y)
Y=le.transform(Y) 
labels = to_categorical(np.asarray(Y))
print('Shape of data tensor:', X.shape)
print('Shape of label tensor:', labels.shape)


# split the data into a training set and a validation set
indices = np.arange(X.shape[0])
np.random.shuffle(indices)
X = X[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * X.shape[0])

x_train = X[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = X[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

Found 3685 unique tokens.
Shape of data tensor: (1483, 30)
Shape of label tensor: (1483, 5)


In [26]:
embeddings_index = {}
f = open('E:/Projects/Word2Vec/glove.42B.300d.txt', encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 1917495 word vectors.


In [27]:
EMBEDDING_DIM=300

In [29]:
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [30]:
from keras.layers import Embedding

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

In [32]:
embed_dim = 300
lstm_out = 196

model = Sequential()
model.add(embedding_layer)
model.add(LSTM(lstm_out, dropout_U=0.25, dropout_W=0.25))
model.add(Dense(5,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 30, 300)           1105800   
_________________________________________________________________
lstm_2 (LSTM)                (None, 196)               389648    
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 985       
Total params: 1,496,433
Trainable params: 390,633
Non-trainable params: 1,105,800
_________________________________________________________________
None


In [33]:
model.fit(x_train, y_train,
          batch_size=128,
          epochs=20,
          validation_data=(x_val, y_val))

Train on 1187 samples, validate on 296 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x258b850f2e8>

In [79]:
example = tokenizer.texts_to_sequences(["What time does the flight leave"])
example = pad_sequences(example, maxlen=MAX_SEQUENCE_LENGTH)

In [85]:
le.inverse_transform(np.argmax(model.predict(example)))

'when'