In [9]:
import spacy
import numpy as np
import pandas as pd
from stopwords import ENGLISH_STOP_WORDS
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


In [2]:
en_nlp = spacy.en.English()

In [3]:
def spacy_get_vec(sentence):
    vec = np.zeros(600)
    doc = en_nlp((sentence))

    allowed_words = set(['not', 'what', 'why', 'how', 'where'])
    """
    tempvec = None
    i = -1
    for index,word in enumerate(doc):
        #if word.lower_ not in allowed_words and word.lower_ in ENGLISH_STOP_WORDS:
        #    continue
        i += 1
        if tempvec is None:
            tempvec = word.vector
        else:
            tempvec = np.append(tempvec, word.vector)
        if (i + 1) % 2 == 0:
            vec += tempvec
            tempvec = None
            """
    for i,word in enumerate(doc):
        if i < len(doc) -1:
            tempvec = np.append(word.vector, doc[i + 1].vector)
            vec += tempvec
        else:
            tempvec = np.append(word.vector, np.zeros(300))
            vec += tempvec
    return vec

In [4]:
lines = open('./sentiment.txt').readlines()
vecs = []
intents = []
idfs = []
for line in lines:
    tokens = line.split(',')
    sentence = tokens[0]
    intent = tokens[1]
    if intent[-1] == '\n':
        intent = intent[:-1]
    vecs.append(spacy_get_vec(sentence))
    intents.append(intent)
    #idfs.append(get_idf(sentence))

df = pd.DataFrame(vecs, columns=['vec_%d' % i for i in range(600)])
#df['idf'] = idfs
df['intents'] = intents
df.intents = df.intents.astype('category')

In [5]:
df.head()

Unnamed: 0,vec_0,vec_1,vec_2,vec_3,vec_4,vec_5,vec_6,vec_7,vec_8,vec_9,...,vec_591,vec_592,vec_593,vec_594,vec_595,vec_596,vec_597,vec_598,vec_599,intents
0,-0.017591,0.182894,-0.087779,-0.068835,0.025753,0.033162,-0.084063,0.058867,0.082818,-0.04025,...,-0.04411,0.014063,-0.040921,-0.060923,0.010078,0.017878,0.01614,0.064506,-0.004897,congrats
1,0.060574,0.109562,-0.069634,-0.088281,-0.029968,0.105061,-0.129419,0.11322,0.035708,-0.039846,...,-0.165324,-0.0268,-0.049312,-0.156678,-0.030167,-0.029797,-0.007047,0.047412,0.054314,congrats
2,0.1453,0.035108,-0.01127,-0.071486,0.004,0.046305,-0.111672,0.056375,0.11762,-0.131979,...,0.030691,0.022986,-0.0035,-0.121251,-0.042003,0.186914,-0.022237,0.033146,0.021581,congrats
3,0.153262,-0.020188,-0.023359,-0.003338,0.014698,-0.003429,-0.123272,0.051771,0.055314,-0.073981,...,0.014924,0.011049,0.015859,-0.009561,-0.007343,0.168358,0.005968,8.3e-05,0.05811,congrats
4,0.046152,0.124861,0.010688,-0.105057,0.007412,0.083475,-0.137084,-0.040911,0.154606,-0.14078,...,-0.00617,-0.05144,-0.138807,0.034807,-0.035373,-0.165428,0.050296,0.014325,0.021373,congrats


In [6]:
from sklearn.utils import shuffle
df = shuffle(df)

In [7]:
X = df.iloc[:, :-1].values
y = df.iloc[:,-1:].values.ravel()

In [10]:
from sklearn.cross_validation import train_test_split
X_train,X_val,y_train,y_val = train_test_split(X, y, test_size=0.20)

In [11]:
from sklearn.linear_model import LogisticRegression
logit_model = LogisticRegression()
logit_model.fit(X_train, y_train)
print(logit_model.score(X_train, y_train))
print(logit_model.score(X_val, y_val))

0.875
0.8


In [12]:
sent = 'I think I am feeling well'
print(logit_model.predict_proba(spacy_get_vec(sent)))
print(logit_model.predict(spacy_get_vec(sent)))

[[ 0.09656709  0.18002798  0.72340492]]
['sorry']


In [13]:
from sklearn.ensemble import GradientBoostingClassifier
gradboost = GradientBoostingClassifier(n_estimators=600, max_depth=24)
gradboost.fit(X_train, y_train)
print(gradboost.score(X_train, y_train))
print(gradboost.score(X_val, y_val))

1.0
0.65


In [18]:
sent = 'I think I am not feeling lonely'
print(gradboost.predict_proba(spacy_get_vec(sent)))
print(gradboost.predict(spacy_get_vec(sent)))

[[ 0.22311493  0.49973071  0.27715437]]
['neutral']


In [19]:
sent = 'I think I am feeling lonely'
print(gradboost.predict_proba(spacy_get_vec(sent)))
print(gradboost.predict(spacy_get_vec(sent)))

[[ 0.22327484  0.33443184  0.44229332]]
['sorry']


In [21]:
sent = 'I think I am feeling better'
print(gradboost.predict_proba(spacy_get_vec(sent)))
print(gradboost.predict(spacy_get_vec(sent)))

[[ 0.23699733  0.39907823  0.36392444]]
['neutral']


In [22]:
gradboost.classes_

array(['congrats', 'neutral', 'sorry'], dtype=object)

In [23]:
from sklearn.neural_network import MLPClassifier
nn = MLPClassifier(hidden_layer_sizes=(300,128, 64, 4), solver='lbfgs', activation='tanh', max_iter=1000)

In [24]:
nn.fit(X_train, y_train)
print(nn.score(X_train, y_train))
print(nn.score(X_val, y_val))

1.0
0.85


In [26]:
sent = 'i think I am not feeling unwell'
print(nn.predict_proba(spacy_get_vec(sent)))
print(nn.predict(spacy_get_vec(sent)))

[[  2.68671592e-15   9.98797803e-01   1.20219729e-03]]
['neutral']


In [27]:
sent = 'i think I am feeling unwell'
print(nn.predict_proba(spacy_get_vec(sent)))
print(nn.predict(spacy_get_vec(sent)))

[[  1.93001485e-02   2.18527467e-06   9.80697666e-01]]
['sorry']


In [29]:
sent = 'i think I am feeling well'
print(nn.predict_proba(spacy_get_vec(sent)))
print(nn.predict(spacy_get_vec(sent)))

[[  1.36342389e-09   5.99877418e-01   4.00122580e-01]]
['neutral']


In [30]:
sent = 'i think I am not feeling well'
print(nn.predict_proba(spacy_get_vec(sent)))
print(nn.predict(spacy_get_vec(sent)))

[[  8.73595625e-05   2.37434055e-04   9.99675206e-01]]
['sorry']


In [34]:
from sklearn.externals import joblib
joblib.dump(nn, 'sentiment.pkl')

['sentiment.pkl']

In [31]:
from sklearn.svm import SVC
svc = SVC(kernel='rbf', degree=4, gamma=0.8, probability=True)
svc.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=4, gamma=0.8, kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [32]:
print(svc.score(X_train, y_train))
print(svc.score(X_val, y_val))

1.0
0.7


In [33]:
sent = 'i think i am not feeling depressed'
print(svc.predict_proba(spacy_get_vec(sent)))
print(svc.predict(spacy_get_vec(sent)))

[[ 0.17088662  0.3746642   0.45444918]]
['sorry']
