In [1]:
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
import re
import os
import random
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
import warnings
# Suppress FutureWarning messages for the rest of the session so library
# deprecation notices do not clutter the cell outputs.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
def extract_words(sent):
    """Normalize a piece of text and split it into lowercase word tokens.

    Processing order: lowercase the text, replace HTML-like tags with a
    space, delete apostrophes that sit between two word characters (so
    "don't" becomes "dont"), replace every remaining non-word character
    with a space, then split on whitespace.

    Args:
        sent: The input text (str).

    Returns:
        A list of token strings; an empty list if no word characters remain.
    """
    sent = sent.lower()
    sent = re.sub(r'<[^>]+>', ' ', sent)  # strip html tags
    # Remove apostrophes inside words. The previous replacement string '\1\2'
    # was not a raw string, so it inserted the control characters \x01\x02
    # instead of the captured letters and "don't" was tokenized as "do".
    # Lookarounds avoid consuming the neighbouring letters, so chained
    # apostrophes ("rock'n'roll") are handled as well.
    # NOTE(review): if the Doc2Vec model was trained on text tokenized by the
    # old version of this function, consider re-tokenizing/retraining - confirm.
    sent = re.sub(r"(?<=\w)'(?=\w)", '', sent)
    sent = re.sub(r'\W', ' ', sent)  # remove punctuation
    # str.split() with no argument already collapses runs of whitespace and
    # ignores leading/trailing whitespace.
    return sent.split()

In [4]:
# test text cleanup block
# Note: the adjacent '' pairs in the literal below are implicit string-literal
# concatenation, so the resulting text contains no quote characters around
# "alkdj" - it is three literals joined together.
words = 'ajshd asda, + 1231 "sd + ''alkdj''    '
print(extract_words(words))

['ajshd', 'asda', '1231', 'sd', 'alkdj']


In [5]:
#Load doc to vec model
# The file name is relative, so 'reviews.d2v' must exist in the notebook's
# working directory; this notebook only loads the model, it does not train it.
doc2vec_model_name = 'reviews.d2v'
model = Doc2Vec.load(doc2vec_model_name)

In [6]:
# Sanity check: tokenize one sentence and infer its document vector
# (the recorded output is a 50-element float32 array).
model.infer_vector(extract_words("This is very bad video. I don't like it"))

array([-0.00097921,  0.22152558, -0.02575502, -0.05142282, -0.02964645,
       -0.08908243, -0.02033703, -0.13232681,  0.15730113,  0.0219765 ,
       -0.10882669, -0.24273105, -0.19880445, -0.04594366,  0.07880894,
        0.09912173,  0.12206709,  0.13162981,  0.11471985,  0.22692327,
       -0.34206438,  0.0351825 ,  0.03957016, -0.09352863,  0.07131914,
        0.1587591 ,  0.08153705,  0.14199504,  0.05146929,  0.06841698,
        0.31849444, -0.05760468,  0.14546221, -0.15244794, -0.10526983,
        0.15762284,  0.13843904, -0.08263648,  0.40170503, -0.15400773,
       -0.2481152 , -0.27001816, -0.24904737, -0.10606316,  0.20778543,
       -0.03677601,  0.07447217, -0.03276308, -0.0848314 ,  0.15688238],
      dtype=float32)

In [7]:
# Cosine similarity between the inferred vectors of two sentences that both
# express a negative opinion about a video. Each vector is wrapped in a list
# because cosine_similarity expects 2-D inputs (one row per sample).
cosine_similarity(
[model.infer_vector(extract_words("This is very bad video. I don't like it"))],
[model.infer_vector(extract_words("video sucks."))])

array([[0.727353]], dtype=float32)

In [8]:
# Same comparison for two sentences on unrelated topics, for contrast with
# the previous cell.
cosine_similarity(
[model.infer_vector(extract_words("It is now snowing in New York"))],
[model.infer_vector(extract_words("I feel sick. Dont feel like going to school"))])

array([[0.44124275]], dtype=float32)

In [9]:
# Build three parallel sequences from the labelled-sentence files:
#   sentences  - raw sentence text
#   sentvecs   - Doc2Vec vector inferred for each sentence
#   sentiments - integer label parsed from the second tab-separated field
sentences = []
sentvecs = []
sentiments = []
# download test dataset : https://www.kaggle.com/rahulin05/sentiment-labelled-sentences-data-set?login=true
for fname in ["yelp", "amazon_cells", "imdb"]:
    with open("data/sentiment-labelled-sentences-data-set/%s_labelled.txt" % fname, encoding='UTF-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Skip blank lines (e.g. a trailing newline at end of file),
                # which would otherwise raise IndexError below.
                continue
            line_split = line.split('\t')
            sentences.append(line_split[0])
            words = extract_words(line_split[0])
            sentvecs.append(model.infer_vector(words, steps=10)) # create a vector for this document
            sentiments.append(int(line_split[1]))
# shuffle sentences, sentvecs, sentiments together
# A dedicated, seeded Random instance makes the sample order (and therefore
# the cross-validation folds built from it) repeatable across runs without
# touching the global random state.
combined = list(zip(sentences, sentvecs, sentiments))
random.Random(42).shuffle(combined)
sentences, sentvecs, sentiments = zip(*combined)

In [10]:
# Peek at the first two raw sentences (in shuffled order).
sentences[0:2]

('Never heard of any of them except Cole who was totally unbelievable in the part.  ',
 'We are sending it back.')

In [11]:
# Peek at the inferred vectors belonging to those same two sentences.
sentvecs[0:2]

(array([-0.04064234,  0.12417678,  0.11082955,  0.20703144, -0.06779366,
        -0.05296742,  0.04924719, -0.53294235,  0.33222738,  0.12901679,
         0.5557371 , -0.49287882, -0.17895997,  0.08885325, -0.06590942,
         0.04583032,  0.13890967,  0.49246433, -0.11303261,  0.10337923,
         0.20695312,  0.11026672, -0.38203767, -0.5026883 , -0.14516264,
         0.23198043,  0.32230964,  0.671394  ,  0.35248262,  0.135847  ,
        -0.15433632,  0.06664702,  0.13038336, -0.0049192 , -0.4993708 ,
        -0.3800676 ,  0.05269672, -0.24363501,  0.4880163 ,  0.07636135,
        -0.49558273, -0.4762459 ,  0.27767465, -0.32609206, -0.20652016,
        -0.18339564,  0.02523056, -0.11994597, -0.53938174,  0.45793116],
       dtype=float32),
 array([ 0.1093405 ,  0.13401023, -0.09033331,  0.20825808,  0.02629481,
        -0.09775223, -0.09246983, -0.1619282 , -0.08687596,  0.228578  ,
         0.27323732, -0.2287021 , -0.29045078, -0.08940927,  0.10642046,
         0.2064396 ,  0.099

In [12]:
# Peek at the integer labels belonging to those same two sentences.
sentiments[0:2]

(0, 0)

### Build a classifier

In [13]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn import metrics
import numpy as np

  from numpy.core.umath_tests import inner1d


In [14]:
# k-nearest-neighbours classifier configured with 9 neighbours.
clfkn = KNeighborsClassifier(n_neighbors=9)
# Random forest with library-default settings; no random_state is passed,
# so results may vary from run to run.
clfrf = RandomForestClassifier()

In [15]:
# 5-fold cross-validation of the KNN classifier on the Doc2Vec vectors;
# report the mean and standard deviation of the per-fold scores.
scores = cross_val_score(clfkn, sentvecs, sentiments, cv =5)
print("avg : {0} std : {1}".format(np.mean(scores), np.std(scores)))

avg : 0.7863333333333333 std : 0.01279756921363497


In [16]:
# 5-fold cross-validation of the random forest on the same Doc2Vec vectors.
# Note: `scores` is overwritten here, replacing the KNN results.
scores = cross_val_score(clfrf, sentvecs, sentiments, cv =5)
print("avg : {0} std : {1}".format(np.mean(scores), np.std(scores)))

avg : 0.729 std : 0.012046207333061738


#### Fit the model and save it for later use

In [17]:
#Train the model
# Fits the KNN classifier on ALL labelled vectors (no hold-out split is made).
clfkn.fit(sentvecs, sentiments)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=9, p=2,
           weights='uniform')

In [18]:
#Train the model
# Fits the random forest on ALL labelled vectors (no hold-out split is made).
clfrf.fit(sentvecs, sentiments)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [19]:
# Predict labels for the first ten samples and print them next to the truth.
# NOTE(review): these ten samples were part of the data passed to clfkn.fit,
# so this is a smoke test of the fitted model, not a held-out evaluation.
y_pred = clfkn.predict(sentvecs[0:10])
print("y_pred : {0}".format(y_pred))
y_test = sentiments[0:10]
print("y_test : {0}".format(y_test))

y_pred : [0 0 1 0 0 1 0 0 1 0]
y_test : (0, 0, 1, 0, 0, 1, 0, 0, 1, 0)


In [20]:
# Accuracy over the ten predictions above. y_test was sliced from the data
# the classifier was fitted on, so this figure is optimistic.
accuracy = metrics.accuracy_score(y_test, y_pred)
print("accuracy : {0}".format(accuracy))

accuracy : 1.0


In [21]:
# Confusion matrix for the ten predictions above (true labels first,
# predicted labels second). The printed label previously read
# "Confusion Metrics", which is not the name of this table.
cf = metrics.confusion_matrix(y_test, y_pred)
print("Confusion Matrix : \n{0}".format(cf))

Confusion Metrics : 
[[7 0]
 [0 3]]


In [22]:
#Save the model
# sklearn.externals.joblib is deprecated and has been removed from newer
# scikit-learn releases; prefer the standalone joblib package and fall back
# to the vendored copy only on old installations that lack it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [23]:
# Persist the fitted KNN classifier with joblib (path is relative to the
# working directory). `model_name` is reassigned by later cells.
model_name = 'doc2vec_kn.model'
joblib.dump(clfkn, model_name)

['doc2vec_kn.model']

In [24]:
# Persist the fitted random forest with joblib; this overwrites `model_name`
# from the previous cell.
model_name = 'doc2vec_rf.model'
joblib.dump(clfrf, model_name)

['doc2vec_rf.model']

In [25]:
import pickle

In [26]:
# save the model to disk
model_name = 'doc2vec_rf.sav'
# Use a context manager so the file handle is flushed and closed as soon as
# the dump finishes instead of being left open until garbage collection.
with open(model_name, 'wb') as model_file:
    pickle.dump(clfrf, model_file)

In [27]:
# load the model from disk
# SECURITY: pickle.load can execute arbitrary code; only load files that
# were produced by this notebook or come from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open(model_name, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
# Round-trip check: score the reloaded model on the first ten samples. These
# samples were part of the training data, so this verifies persistence only,
# not generalization.
result = loaded_model.score(sentvecs[0:10], sentiments[0:10])
print(result)

1.0


#### Bag-of-Words model comparison

In [28]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
# Bag-of-words baseline: token counts -> TF-IDF weighting -> random forest.
# The pipeline consumes raw sentence strings, so it does not go through
# extract_words or the Doc2Vec model.
pipeline = make_pipeline(CountVectorizer(), TfidfTransformer(), RandomForestClassifier())

In [29]:
# 5-fold cross-validation of the bag-of-words pipeline on the raw sentences,
# using the same labels as the Doc2Vec experiments above.
scores = cross_val_score(pipeline, sentences, sentiments, cv =5)
print("avg : {0} std : {1}".format(np.mean(scores), np.std(scores)))

avg : 0.7470000000000001 std : 0.01596524001977072


##### Bag-of-Words and Doc2Vec performance seem to be close. However, if we have a lot more training examples, Doc2Vec works better.