In [9]:
import spacy
import numpy as np
import pandas as pd
from stopwords import ENGLISH_STOP_WORDS
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


In [2]:
en_nlp = spacy.en.English()

In [3]:
def spacy_get_vec(sentence):
    vec = np.zeros(300)
    doc = en_nlp((sentence))
    for word in doc:
        #if word.lower_ in ENGLISH_STOP_WORDS:
        #    continue
        vec += word.vector
    return vec

In [30]:
lines = open('./is_question.txt').readlines()
vecs = []
intents = []
idfs = []
for line in lines:
    tokens = line.split(',')
    sentence = tokens[0]
    intent = tokens[1]
    if intent[-1] == '\n':
        intent = intent[:-1]
    vecs.append(spacy_get_vec(sentence))
    intents.append(intent)

df = pd.DataFrame(vecs, columns=['vec_%d' % i for i in range(300)])
df['intents'] = intents
df.intents = df.intents.astype('category')

In [31]:
from sklearn.utils import shuffle
df = shuffle(df)

In [32]:
df.head()

Unnamed: 0,vec_0,vec_1,vec_2,vec_3,vec_4,vec_5,vec_6,vec_7,vec_8,vec_9,...,vec_291,vec_292,vec_293,vec_294,vec_295,vec_296,vec_297,vec_298,vec_299,intents
40,0.121055,0.038133,0.012964,-0.020334,0.172047,-0.018421,0.012288,0.080411,0.024699,0.064562,...,0.112001,0.017225,-0.103424,0.014032,-0.122818,-0.040137,0.270733,-0.004087,0.040575,sentiment
96,0.184373,-0.027292,-0.003579,-0.157275,0.120644,0.097438,-0.022361,0.015892,0.075208,-0.044138,...,0.035711,0.143653,-0.209778,-0.099372,-0.021035,-0.146351,0.091656,0.039907,-0.022892,sentiment
104,0.21144,0.160685,-0.051632,-0.143768,-0.004933,0.070291,-0.258258,-0.179165,0.088886,-0.110131,...,-0.053267,0.092519,-0.351504,0.118517,-0.268701,-0.147621,0.174754,0.017807,-0.075447,question
58,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,sentiment
3,0.153262,-0.020188,-0.023359,-0.003338,0.014698,-0.003429,-0.123272,0.051771,0.055314,-0.073981,...,0.053647,0.067319,0.034739,0.019138,-0.08166,0.19035,0.129133,-0.046141,0.086841,sentiment


In [33]:
X = df.iloc[:, :-1].values
y = df.iloc[:,-1:].values.ravel()

In [34]:
from sklearn.cross_validation import train_test_split
X_train,X_val,y_train,y_val = train_test_split(X, y, test_size=0.20)

In [35]:
from sklearn.linear_model import LogisticRegression
logit_model = LogisticRegression()
logit_model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [36]:
print(logit_model.score(X_train, y_train))
print(logit_model.score(X_val, y_val))

1.0
1.0


In [37]:
sent = 'i forgot your name'
print(logit_model.predict_proba(spacy_get_vec(sent)))
print(logit_model.predict(spacy_get_vec(sent)))

[[ 0.4979607  0.5020393]]
['sentiment']


In [38]:
from sklearn.ensemble import GradientBoostingClassifier
gradboost = GradientBoostingClassifier(n_estimators=500, max_depth=25)

In [39]:
gradboost.fit(X_train, y_train)
print(gradboost.score(X_train, y_train))
print(gradboost.score(X_val, y_val))

1.0
0.838709677419


In [40]:
sent = 'i am feeling very happy'
gradboost.predict(spacy_get_vec(sent))

array(['question'], dtype=object)

In [42]:
sent = 'i think i forgot your name'
gradboost.predict(spacy_get_vec(sent))

array(['sentiment'], dtype=object)

In [43]:
from sklearn.svm import SVC
svc = SVC(kernel='linear', degree=2, probability=True)
svc.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=2, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [44]:
print(svc.score(X_train, y_train))
print(svc.score(X_val, y_val))

1.0
1.0


In [50]:
sent = 'do you live in France'
svc.predict(spacy_get_vec(sent))

array(['question'], dtype=object)

In [46]:
sent = 'my name is Batman'
svc.predict(spacy_get_vec(sent))

array(['question'], dtype=object)

In [49]:
sent = 'i think i forgot your name'
svc.predict(spacy_get_vec(sent))

array(['question'], dtype=object)

In [48]:
sent = 'it looks cloudy'
svc.predict(spacy_get_vec(sent))

array(['sentiment'], dtype=object)

In [22]:
from sklearn.neural_network import MLPClassifier
nn = MLPClassifier(hidden_layer_sizes=(256, 128, 2), activation='tanh', learning_rate='adaptive', solver='lbfgs', max_iter=1000, )
nn.fit(X_train, y_train)


MLPClassifier(activation='tanh', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(256, 128, 2), learning_rate='adaptive',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='lbfgs', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [23]:
print(nn.score(X_train, y_train))
print(nn.score(X_val, y_val))

0.992592592593
0.852941176471


In [24]:
sent = 'I have to fly home'
nn.predict_proba(spacy_get_vec(sent))



array([[  9.99981281e-01,   1.87194896e-05]])

In [25]:
sent = 'my name is Batman'
nn.predict_proba(spacy_get_vec(sent))



array([[  9.99981281e-01,   1.87194927e-05]])

In [26]:
sent = 'it looks cloudy'
nn.predict_proba(spacy_get_vec(sent))



array([[  6.55637001e-05,   9.99934436e-01]])

In [51]:
from sklearn.externals import joblib
joblib.dump(svc, 'is_question.pkl')

['is_question.pkl']