In [11]:
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
import re
import os
import random
from sklearn.metrics.pairwise import cosine_similarity

In [12]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [13]:
def extract_words(sent):
    sent = sent.lower()
    sent = re.sub(r'<[^>]+>', ' ', sent) #strip html tags
    sent = re.sub(r'(\w)\'(\w)', '\1\2', sent) # remoce apostrophes
    sent = re.sub(r'\W',' ', sent) # remove punctuation
    sent = re.sub(r'\s+', ' ', sent) # remove repeated space
    sent = sent.strip()
    return sent.split()

In [34]:
# test text cleanup block
words = 'Good, + bad / "ugly + '';''    '
print(extract_words(words))

['good', 'bad', 'ugly']


In [36]:
proj_root = os.path.dirname(os.getcwd())
#print("Current working dir : {0}".format(proj_root))

In [37]:
model_dir = proj_root + "/" + "model"
doc2vec_model = model_dir + "/" + 'comments.d2v'

#Load doc to vec model
model = Doc2Vec.load(doc2vec_model)

In [16]:
model.infer_vector(extract_words("This is very bad video. I don't like it"))

array([ 0.0007488 ,  0.23022519, -0.02249053, -0.05488956, -0.03843215,
       -0.09444682, -0.02938818, -0.12492562,  0.15748744,  0.01971101,
       -0.12412424, -0.24856669, -0.20035094, -0.03914883,  0.09236484,
        0.09049756,  0.12286548,  0.12651494,  0.11997791,  0.22038384,
       -0.34141788,  0.04001606,  0.03116315, -0.08485164,  0.08007647,
        0.15309156,  0.07524484,  0.15937917,  0.06037891,  0.07296979,
        0.31113595, -0.05051051,  0.13974777, -0.15265502, -0.10611851,
        0.15341866,  0.14397682, -0.07252364,  0.38951796, -0.14550473,
       -0.24489135, -0.27200776, -0.23849759, -0.10863675,  0.21872744,
       -0.04544033,  0.08326463, -0.02799641, -0.08214579,  0.16252297],
      dtype=float32)

In [51]:
labeled_senti_subdir = 'traindata/labeled-senti-data'
labeled_senti_dir = proj_root + "/" + labeled_senti_subdir

sentences = []
sentvecs = []
sentiments = []


for fname in sorted(os.listdir(labeled_senti_dir)):
    print("file name : {0}".format(fname))
    if fname.endswith('.txt'):
        with open(labeled_senti_dir + "/" + fname, encoding = 'UTF-8') as f:
            for i, line in enumerate(f):
                line_split = line.strip().split('\t')
                sentences.append(line_split[0])
                words = extract_words(line_split[0])
                sentvecs.append(model.infer_vector(words, steps=10))
                sentiments.append(int(line_split[1]))
        
combined = list(zip(sentences, sentvecs, sentiments))
random.shuffle(combined)
sentences, sentvecs, sentiments = zip(*combined)

file name : amazon.txt
file name : imdb.txt
file name : yelp.txt


In [56]:
sentences[0:2]

("I didn't think that the instructions provided were helpful to me.",
 "I promise they won't disappoint.")

In [57]:
sentvecs[0:2]

(array([ 0.48536435, -0.17653368,  0.2098451 , -0.20381974,  0.09497128,
        -0.044964  ,  0.3941045 , -0.5218856 , -0.1571676 ,  0.16231482,
         0.05511091, -0.17999406,  0.07783119,  0.03506007,  0.41333073,
        -0.15531571, -0.14621882, -0.1358074 , -0.02331984,  0.01832723,
        -0.05436605, -0.04734341,  0.08598659, -0.2657693 , -0.22395723,
         0.03232431, -0.10989751,  0.5301082 , -0.2534588 ,  0.24355558,
         0.43762696,  0.09672873,  0.24846926,  0.18541782,  0.03174235,
        -0.09439711,  0.2080433 , -0.28092685,  0.3195152 , -0.0998478 ,
        -0.3149838 , -0.01844147,  0.21288288, -0.21128532,  0.1586669 ,
        -0.05897464,  0.20594312, -0.24821277, -0.6573384 ,  0.07050503],
       dtype=float32),
 array([-0.14356126,  0.19225422, -0.01631394,  0.13989933,  0.05279529,
        -0.10171934,  0.1476186 , -0.31477496,  0.23688883,  0.07492894,
         0.05177709,  0.13395412, -0.0982677 , -0.0616399 ,  0.09610128,
        -0.05369568, -0.141

In [58]:
sentiments[0:2]

(0, 1)

### Build a classifier

In [59]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn import metrics
import numpy as np

In [60]:
clfkn = KNeighborsClassifier(n_neighbors=9)
clfrf = RandomForestClassifier()

In [61]:
scores = cross_val_score(clfkn, sentvecs, sentiments, cv =5)
print("avg : {0} std : {1}".format(np.mean(scores), np.std(scores)))

avg : 0.7853333333333333 std : 0.004268749491621889


In [62]:
scores = cross_val_score(clfrf, sentvecs, sentiments, cv =5)
print("avg : {0} std : {1}".format(np.mean(scores), np.std(scores)))

avg : 0.7386666666666667 std : 0.01180395413975053


#### Fit the model and save it for later use

In [63]:
#Train the model
clfkn.fit(sentvecs, sentiments)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=9, p=2,
           weights='uniform')

In [64]:
#Train the model
clfrf.fit(sentvecs, sentiments)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [65]:
y_pred = clfkn.predict(sentvecs[0:10])
print("y_pred : {0}".format(y_pred))
y_test = sentiments[0:10]
print("y_test : {0}".format(y_test))

y_pred : [0 1 0 1 0 1 0 0 1 1]
y_test : (0, 1, 0, 0, 0, 1, 0, 0, 1, 1)


In [66]:
accuracy = metrics.accuracy_score(y_test, y_pred)
print("accuracy : {0}".format(accuracy))

accuracy : 0.9


In [69]:
cf = metrics.confusion_matrix(y_test, y_pred)
print("Confusion Metrics : \n{0}".format(cf))

Confusion Metrics : 
[[5 1]
 [0 4]]


In [70]:
#Save the model
from sklearn.externals import joblib

In [None]:
model_dir = proj_root + "/" + "model"

In [72]:
#Persist the model to disk
model_name = model_dir + "/" + 'doc2vec_senti_kn.model'
joblib.dump(clfkn, model_name)

['/Users/dbiswas/Documents/Malabika/MS/Fall2018/social_media_mining/project/comments_analysis/model/doc2vec_senti_kn.model']

In [73]:
#Persist the model to disk
model_name = model_dir + "/" + 'doc2vec_senti_rf.model'
joblib.dump(clfrf, model_name)

['/Users/dbiswas/Documents/Malabika/MS/Fall2018/social_media_mining/project/comments_analysis/model/doc2vec_senti_rf.model']

In [None]:
#Demo with pickle

In [75]:
import pickle

In [76]:
# save the model to disk
model_name = model_dir + "/" + 'doc2vec_senti_rf.pickle'
pickle.dump(clfrf, open(model_name, 'wb'))

#### Bag-of-Words model comparison

In [80]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
pipeline = make_pipeline(CountVectorizer(), TfidfTransformer(), RandomForestClassifier())

In [81]:
scores = cross_val_score(pipeline, sentences, sentiments, cv =5)
print("avg : {0} std : {1}".format(np.mean(scores), np.std(scores)))

avg : 0.748 std : 0.01941934888483933


##### Bag-of-Words and Doc2Vec performance seems to be close. However, if we have lot more training example Doc2Vec works better