### Load the models and make predictions
##### Input Args:

doc_dir = directory containing the files with the comments to be predicted

doc2vec_model_name = file name of the already-trained doc2vec model

trained_clf = file name of the trained classifier


In [20]:
from gensim.models import Doc2Vec
import re
import os
import random
from sklearn import metrics
import numpy as np
import pandas as pd
from sklearn.externals import joblib
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [21]:
import warnings
# Ignore every warning of category FutureWarning so it does not clutter the cell outputs.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [22]:
def extract_words(sent):
    """Normalize a raw comment and split it into lowercase word tokens.

    The text is lowercased, HTML tags are stripped, apostrophes inside words
    are dropped ("don't" -> "dont"), every remaining non-word character is
    turned into a space, and the result is split on whitespace.

    Args:
        sent: Raw comment text. Any non-string value (for example ``None`` or
            the float NaN that pandas produces for an empty cell) is treated
            as an empty comment.

    Returns:
        list[str]: The tokens in order of appearance; empty when the comment
        contains no word characters.
    """
    if not isinstance(sent, str):
        return []

    sent = sent.lower()
    sent = re.sub(r'<[^>]+>', ' ', sent)  # strip html tags
    # The replacement must be a raw string: a plain '\1\2' is the two control
    # characters \x01\x02, not backreferences, so "don't" used to become "do".
    # NOTE(review): if the doc2vec model was trained with the old tokenization,
    # confirm that joined tokens such as "dont" are what inference should see.
    sent = re.sub(r"(\w)'(\w)", r'\1\2', sent)  # remove apostrophes
    sent = re.sub(r'\W', ' ', sent)  # remove punctuation
    # split() without arguments collapses repeated whitespace and ignores
    # leading/trailing whitespace, so no separate strip step is needed.
    return sent.split()

In [25]:
# Load the doc2vec model from the file named below (a relative path, so it is
# resolved against the notebook's working directory).
doc2vec_model_name = 'reviews.d2v'
doc2vec_model = Doc2Vec.load(doc2vec_model_name)

In [27]:
# Sanity check of the loaded doc2vec model: infer a vector for each of two
# comments and display the cosine similarity between the two vectors.
vec_first = doc2vec_model.infer_vector(
    extract_words("This is really exciting video. Thank you for presenting to us."))
vec_second = doc2vec_model.infer_vector(
    extract_words("Exciting video. Keep it coming"))
cosine_similarity([vec_first], [vec_second])

array([[0.6753069]], dtype=float32)

In [28]:
#Converts the comments to vectors using the doc2vec model trained earlier
def get_doc2vec(model, doc_dir, comment_col_name='Comment', steps=10):
    """Read every comment file in a directory and infer a vector per comment.

    Each regular, non-hidden file in ``doc_dir`` is read as a tab-separated
    table with a header row. Three columns are added to every table:
    ``FNAME`` (the source file name), ``WORDS`` (tokens produced by
    ``extract_words``) and ``SENTVECS`` (``model.infer_vector`` of the tokens).

    Args:
        model: Object exposing ``infer_vector(words, steps=...)``.
        doc_dir: Directory holding the input files.
        comment_col_name: Name of the column that contains the comment text.
        steps: Value forwarded to ``infer_vector`` as ``steps``.
            NOTE(review): newer gensim releases reportedly call this argument
            ``epochs`` - confirm against the installed version.

    Returns:
        pandas.DataFrame: All per-file tables concatenated in sorted file-name
        order. Row labels are kept as read, so they restart for each file.

    Raises:
        ValueError: If ``doc_dir`` contains no readable input file.
    """
    comments_dfs = []

    for fname in sorted(os.listdir(doc_dir)):
        fname_full = os.path.join(doc_dir, fname)
        # Skip sub-directories and hidden entries (e.g. the .ipynb_checkpoints
        # folder Jupyter creates); read_csv cannot parse them.
        if fname.startswith('.') or not os.path.isfile(fname_full):
            continue
        print("file being read : {0}".format(fname_full))
        data = pd.read_csv(fname_full, sep='\t', header='infer')
        data['FNAME'] = fname
        # read_csv turns empty cells (and strings such as "NA" or "null") into
        # NaN; treat those as empty comments instead of failing on them.
        data['WORDS'] = data[comment_col_name].fillna('').map(extract_words)
        data['SENTVECS'] = data['WORDS'].map(
            lambda words: model.infer_vector(words, steps=steps))
        comments_dfs.append(data)

    if not comments_dfs:
        # pd.concat([]) would raise a ValueError too, but without saying why.
        raise ValueError("no input files found in {0}".format(doc_dir))

    return pd.concat(comments_dfs)

In [29]:
# Directory whose files get_doc2vec reads (relative to the working directory).
doc_dir = "../indata"
# Single DataFrame with the rows of all input files plus the FNAME, WORDS and
# SENTVECS columns added by get_doc2vec.
comments = get_doc2vec(doc2vec_model, doc_dir)

file being read : ../indata/EdSheeran.csv
file being read : ../indata/Maroon5.csv
file being read : ../indata/SuperBowlLI.csv
file being read : ../indata/bing-bang-theory.csv
file being read : ../indata/bitcoin-vs-ethereum.csv
file being read : ../indata/coldpay.csv
file being read : ../indata/eminem-not-affraid.csv
file being read : ../indata/federer-rafael.csv
file being read : ../indata/formula1-2018.csv
file being read : ../indata/got8.csv
file being read : ../indata/house-of-cards.csv
file being read : ../indata/iphone-xs-vs-note9.csv
file being read : ../indata/mac-vs-windows.csv
file being read : ../indata/nfl-highlights.csv
file being read : ../indata/oculus.csv
file being read : ../indata/samsung-vr.csv
file being read : ../indata/tonight-show.csv
file being read : ../indata/tswift.csv
file being read : ../indata/warriors.csv


In [30]:
# Preview the first five rows, including the FNAME, WORDS and SENTVECS columns.
comments.head(5)

Unnamed: 0,Comment,CreateTimeStamp,Type,videoID,videoTitle,FNAME,WORDS,SENTVECS
0,December everyone,2018-12-06T06:05:20.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,"[december, everyone]","[-0.068403006, -0.044122253, 0.01688296, 0.053..."
1,the sweetest song ever in love with this song,2018-12-06T05:58:13.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,"[the, sweetest, song, ever, in, love, with, th...","[0.14944652, 0.37444994, 0.38711354, -0.232493..."
2,Love this song,2018-12-06T05:58:04.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,"[love, this, song]","[0.17394641, 0.35500512, 0.13546918, -0.124175..."
3,m gonna kill that son of bitch,2018-12-06T05:32:23.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,"[m, gonna, kill, that, son, of, bitch]","[0.1346761, -0.044986937, -0.053727243, 0.1896..."
4,Wuuuuaaaaoooo,2018-12-06T05:31:54.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,[wuuuuaaaaoooo],"[0.0053953473, -0.003310422, -0.0038284012, 0...."


In [31]:
# Show the distinct source files and the distinct values of the Type column.
# NOTE(review): the labels say "No of ..." but unique() prints the values
# themselves, not a count - confirm which one is intended.
print("No of files : {0}".format(comments.FNAME.unique()))
print("No of Type : {}".format(comments.Type.unique()))

No of files : ['EdSheeran.csv' 'Maroon5.csv' 'SuperBowlLI.csv' 'bing-bang-theory.csv'
 'bitcoin-vs-ethereum.csv' 'coldpay.csv' 'eminem-not-affraid.csv'
 'federer-rafael.csv' 'formula1-2018.csv' 'got8.csv' 'house-of-cards.csv'
 'iphone-xs-vs-note9.csv' 'mac-vs-windows.csv' 'nfl-highlights.csv'
 'oculus.csv' 'samsung-vr.csv' 'tonight-show.csv' 'tswift.csv'
 'warriors.csv']
No of Type : ['Comment' 'Reply']


In [32]:
# File name of the first saved classifier (presumably a random forest, going
# by the "rf" suffix - TODO confirm).
trained_clf = 'doc2vec_rf.model'
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code - only load model files from a trusted source.
loaded_clf = joblib.load(trained_clf)

In [33]:
# Predict one label per row from the inferred vectors (passed as a list of
# vectors) and store the result in a new SENTIMENT_RF column of `comments`.
comments['SENTIMENT_RF'] = loaded_clf.predict(comments['SENTVECS'].tolist())

In [34]:
# Preview the first five rows with the new SENTIMENT_RF column.
comments.head(5)

Unnamed: 0,Comment,CreateTimeStamp,Type,videoID,videoTitle,FNAME,WORDS,SENTVECS,SENTIMENT_RF
0,December everyone,2018-12-06T06:05:20.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,"[december, everyone]","[-0.068403006, -0.044122253, 0.01688296, 0.053...",1
1,the sweetest song ever in love with this song,2018-12-06T05:58:13.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,"[the, sweetest, song, ever, in, love, with, th...","[0.14944652, 0.37444994, 0.38711354, -0.232493...",1
2,Love this song,2018-12-06T05:58:04.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,"[love, this, song]","[0.17394641, 0.35500512, 0.13546918, -0.124175...",1
3,m gonna kill that son of bitch,2018-12-06T05:32:23.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,"[m, gonna, kill, that, son, of, bitch]","[0.1346761, -0.044986937, -0.053727243, 0.1896...",0
4,Wuuuuaaaaoooo,2018-12-06T05:31:54.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,[wuuuuaaaaoooo],"[0.0053953473, -0.003310422, -0.0038284012, 0....",0


In [35]:
# File name of the second saved classifier (presumably k-nearest-neighbours,
# going by the "kn" suffix - TODO confirm).
trained_clf = 'doc2vec_kn.model'
# Rebinds loaded_clf: from here on it refers to this classifier, not the one
# loaded earlier in the notebook.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code - only load model files from a trusted source.
loaded_clf = joblib.load(trained_clf)

In [36]:
# Predict one label per row with the currently loaded classifier and store the
# result in a new SENTIMENT_KN column of `comments`.
comments['SENTIMENT_KN'] = loaded_clf.predict(comments['SENTVECS'].tolist())

In [37]:
# Preview the first five rows with both SENTIMENT_RF and SENTIMENT_KN columns.
comments.head(5)

Unnamed: 0,Comment,CreateTimeStamp,Type,videoID,videoTitle,FNAME,WORDS,SENTVECS,SENTIMENT_RF,SENTIMENT_KN
0,December everyone,2018-12-06T06:05:20.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,"[december, everyone]","[-0.068403006, -0.044122253, 0.01688296, 0.053...",1,1
1,the sweetest song ever in love with this song,2018-12-06T05:58:13.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,"[the, sweetest, song, ever, in, love, with, th...","[0.14944652, 0.37444994, 0.38711354, -0.232493...",1,1
2,Love this song,2018-12-06T05:58:04.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,"[love, this, song]","[0.17394641, 0.35500512, 0.13546918, -0.124175...",1,1
3,m gonna kill that son of bitch,2018-12-06T05:32:23.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,"[m, gonna, kill, that, son, of, bitch]","[0.1346761, -0.044986937, -0.053727243, 0.1896...",0,1
4,Wuuuuaaaaoooo,2018-12-06T05:31:54.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,[wuuuuaaaaoooo],"[0.0053953473, -0.003310422, -0.0038284012, 0....",0,1


In [38]:
# Keep only the columns to export: the original comment fields, the source file
# name and both predictions. The WORDS and SENTVECS working columns are left out.
out_df = comments[['Comment','CreateTimeStamp','Type', 'videoID', 'videoTitle', 'FNAME','SENTIMENT_RF','SENTIMENT_KN']]

In [43]:
# Preview the first five rows of the frame that will be written to disk.
out_df.head(5)

Unnamed: 0,Comment,CreateTimeStamp,Type,videoID,videoTitle,FNAME,SENTIMENT_RF,SENTIMENT_KN
0,December everyone,2018-12-06T06:05:20.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,1,1
1,the sweetest song ever in love with this song,2018-12-06T05:58:13.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,1,1
2,Love this song,2018-12-06T05:58:04.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,1,1
3,m gonna kill that son of bitch,2018-12-06T05:32:23.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,0,1
4,Wuuuuaaaaoooo,2018-12-06T05:31:54.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,0,1


In [40]:
out_file_name = '../outdata/video_comments_analysis_sentiments.csv'
# to_csv does not create missing parent directories, so make sure the output
# directory exists before writing.
os.makedirs(os.path.dirname(out_file_name), exist_ok=True)
# The file is tab-separated despite its .csv extension; the row index is not written.
out_df.to_csv(out_file_name, sep='\t', index=False)

In [41]:
# Marker printed once every cell above has run.
print('****End of script****')

****End of script****
