In [9]:
import spacy
import numpy as np
import pandas as pd
from stopwords import ENGLISH_STOP_WORDS
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


In [2]:
en_nlp = spacy.en.English()

In [3]:
def spacy_get_vec(sentence):
    vec = np.zeros(300)
    doc = en_nlp((sentence))
    for word in doc:
        if word.lower_ in ENGLISH_STOP_WORDS:
            continue
        vec += word.vector
    return vec

In [190]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS)
vectorizer.fit_transform([''.join(line.split(',')[0]) for line in lines])


<128x111 sparse matrix of type '<class 'numpy.float64'>'
	with 185 stored elements in Compressed Sparse Row format>

In [191]:
vectorizer.stop_words_

set()

In [192]:
def get_idf(sentence):
    score = 1.0
    for word in sentence.split():
        if word[-1] == '\n' or word[-1] == ',' or word[-1] == '.' or word[-1] == ['!']:
            word = word[:-1]
        if word not in vectorizer.vocabulary_:
            continue
        index = vectorizer.vocabulary_[word]
        score = score / vectorizer.idf_[index]
    return score


In [4]:
lines = open('./class.txt').readlines()
vecs = []
intents = []
idfs = []
for line in lines:
    tokens = line.split(',')
    sentence = tokens[0]
    intent = tokens[1]
    if intent[-1] == '\n':
        intent = intent[:-1]
    vecs.append(spacy_get_vec(sentence))
    intents.append(intent)
    #idfs.append(get_idf(sentence))

df = pd.DataFrame(vecs, columns=['vec_%d' % i for i in range(300)])
#df['idf'] = idfs
df['intents'] = intents
df.intents = df.intents.astype('category')

In [5]:
from sklearn.utils import shuffle
df = shuffle(df)

In [6]:
df.head()

Unnamed: 0,vec_0,vec_1,vec_2,vec_3,vec_4,vec_5,vec_6,vec_7,vec_8,vec_9,...,vec_291,vec_292,vec_293,vec_294,vec_295,vec_296,vec_297,vec_298,vec_299,intents
28,-0.047148,-0.052614,-0.019019,-0.028234,0.099283,0.090774,-0.025177,-0.125545,0.024765,0.019097,...,-0.012836,0.019333,-0.12263,-0.008697,-0.007496,-0.101008,-0.029107,-0.069498,-0.029899,intent
95,0.007811,-0.023038,-0.085239,-0.048018,0.053016,0.072656,-0.03316,-0.008235,-0.014402,-0.073086,...,0.012784,0.00536,-0.098058,0.003844,-0.03188,0.012176,0.037164,-0.062658,-0.025203,non_intent
61,0.154205,0.069486,0.049501,-0.050952,0.10784,0.082489,-0.015041,-0.00509,0.040752,0.021366,...,0.003494,-0.01246,-0.108506,-0.075914,-0.0384,-0.088636,0.089918,0.094394,-0.069333,non_intent
84,0.066939,-0.047388,0.005932,0.041755,0.121324,-0.057365,0.047533,0.076065,0.004791,0.010513,...,0.012315,-0.013224,-0.06454,-0.030876,0.006241,-0.038575,0.108856,0.016149,-0.013379,non_intent
0,0.014738,0.011007,0.001503,0.006558,0.068473,0.047063,0.03928,0.127276,0.070502,-0.074319,...,-0.013501,0.035775,-0.058489,-0.026455,-0.128484,-0.133027,-0.043225,-0.024182,0.006277,intent


In [7]:
X = df.iloc[:, :-1].values
y = df.iloc[:,-1:].values.ravel()

In [10]:
from sklearn.cross_validation import train_test_split
X_train,X_val,y_train,y_val = train_test_split(X, y, test_size=0.20)

In [11]:
from sklearn.linear_model import LogisticRegression
logit_model = LogisticRegression(C=5.0, class_weight={'intent': 1.2, 'non_intent': 0.8})
logit_model.fit(X_train, y_train)

LogisticRegression(C=5.0, class_weight={'intent': 1.2, 'non_intent': 0.8},
          dual=False, fit_intercept=True, intercept_scaling=1,
          max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2',
          random_state=None, solver='liblinear', tol=0.0001, verbose=0,
          warm_start=False)

In [12]:
print(logit_model.score(X_train, y_train))
print(logit_model.score(X_val, y_val))

0.965217391304
0.793103448276


In [13]:
sent = 'it looks cloudy'
#gradboost.predict_proba(np.append(spacy_get_vec(sent), get_idf(sent)))
logit_model.predict_proba(spacy_get_vec(sent))

array([[ 0.29767589,  0.70232411]])

In [14]:
from sklearn.ensemble import GradientBoostingClassifier
gradboost = GradientBoostingClassifier(n_estimators=500, max_depth=25, max_features='log2')

In [15]:
gradboost.fit(X_train, y_train)
print(gradboost.score(X_train, y_train))
print(gradboost.score(X_val, y_val))

0.965217391304
0.862068965517


In [16]:
sent = 'it looks cloudy'
#gradboost.predict_proba(np.append(spacy_get_vec(sent), get_idf(sent)))
gradboost.predict_proba(spacy_get_vec(sent))

array([[  1.58935433e-06,   9.99998411e-01]])

In [17]:
gradboost.classes_

array(['intent', 'non_intent'], dtype=object)

In [18]:
from sklearn.svm import SVC
svc = SVC(kernel='linear', degree=2, probability=True)
svc.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=2, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [19]:
print(svc.score(X_train, y_train))
print(svc.score(X_val, y_val))

0.947826086957
0.931034482759


In [20]:
sent = 'i need to fly home'
#gradboost.predict_proba(np.append(spacy_get_vec(sent), get_idf(sent)))
svc.predict_proba(spacy_get_vec(sent))

array([[ 0.99133295,  0.00866705]])

In [25]:
sent = 'it appears dark outside'
svc.predict_proba(spacy_get_vec(sent))

array([[ 0.89129347,  0.10870653]])

In [34]:
sent = 'my name is Gopal'
svc.predict_proba(spacy_get_vec(sent))

array([[ 0.61296589,  0.38703411]])

In [27]:
sent = 'it looks cloudy'
svc.predict_proba(spacy_get_vec(sent))

array([[ 0.50630058,  0.49369942]])

In [28]:
from sklearn.neural_network import MLPClassifier
nn = MLPClassifier(hidden_layer_sizes=(256, 128, 2), activation='tanh', learning_rate='adaptive', solver='lbfgs', max_iter=1000, )
nn.fit(X_train, y_train)


MLPClassifier(activation='tanh', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(256, 128, 2), learning_rate='adaptive',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='lbfgs', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [29]:
print(nn.score(X_train, y_train))
print(nn.score(X_val, y_val))

0.965217391304
0.793103448276


In [30]:
sent = 'I have to fly home'
nn.predict_proba(spacy_get_vec(sent))

array([[  9.99818681e-01,   1.81318571e-04]])

In [32]:
sent = 'my name is Gopal'
nn.predict_proba(spacy_get_vec(sent))

array([[  1.87735158e-05,   9.99981226e-01]])

In [33]:
sent = 'it looks cloudy'
nn.predict_proba(spacy_get_vec(sent))

array([[  1.87734762e-05,   9.99981227e-01]])

In [35]:
from sklearn.externals import joblib
joblib.dump(svc, 'class.pkl')

['class.pkl']