### Load the trained models and make predictions
##### Input Args:

doc_dir = directory containing the files with the comments to be classified

doc2vec_model_name = file name of the already-trained doc2vec model

trained_clf = file name of the trained classifier


In [1]:
from gensim.models import Doc2Vec
import re
import os
import random
from sklearn import metrics
import numpy as np
import pandas as pd
from sklearn.externals import joblib
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [2]:
import warnings
# Ignore FutureWarning messages emitted from this point on, to keep cell output readable.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
def extract_words(sent):
    """Normalize a raw comment and split it into lowercase word tokens.

    Steps: lowercase, replace HTML tags with a space, drop apostrophes that
    sit between two word characters ("don't" -> "dont"), replace every other
    non-word character with a space, then split on whitespace.

    Args:
        sent: The comment text. ``None`` and float NaN (what pandas yields
            for an empty cell) are treated as an empty comment.

    Returns:
        list[str]: The tokens in order; an empty list when there are none.

    NOTE(review): the apostrophe replacement previously used a non-raw
    ``'\\1\\2'`` string, which inserted control characters instead of the
    captured letters, so "don't" became "do". If the doc2vec model was
    trained with that tokenization, confirm whether it should be retrained.
    """
    # Missing values carry no words; avoid an AttributeError on .lower().
    if sent is None or (isinstance(sent, float) and sent != sent):
        return []

    sent = sent.lower()
    sent = re.sub(r'<[^>]+>', ' ', sent)  # strip html tags
    # The raw replacement string keeps \1 and \2 as backreferences to the
    # letters on either side of the apostrophe.
    sent = re.sub(r"(\w)'(\w)", r'\1\2', sent)  # remove apostrophes
    sent = re.sub(r'\W', ' ', sent)  # remove punctuation
    # split() with no argument collapses whitespace runs and ignores
    # leading/trailing whitespace, so no separate squeeze/strip is needed.
    return sent.split()

In [4]:
# Load the previously trained doc2vec model from the file named below
# (the relative name is resolved against the notebook's working directory).
doc2vec_model_name = 'reviews.d2v'
doc2vec_model = Doc2Vec.load(doc2vec_model_name)

In [5]:
# Sanity-check the loaded doc2vec model: infer a vector for each of two short
# negative comments and display the cosine similarity between them.
# NOTE(review): no random seed is set, so the inferred vectors (and this
# similarity) presumably vary slightly between runs -- confirm.
cosine_similarity(
[doc2vec_model.infer_vector(extract_words("This is very bad video. I don't like it"))],
[doc2vec_model.infer_vector(extract_words("video sucks."))])

array([[0.7246897]], dtype=float32)

In [6]:
# Converts the comments to vectors using the doc2vec model trained earlier
def get_doc2vec(model, doc_dir, comment_col_name='Comment', infer_steps=10):
    """Read every tab-separated file in ``doc_dir`` and vectorize its comments.

    Each regular file in ``doc_dir`` (in sorted name order) is read with
    pandas as a tab-separated table with a header row, and three columns are
    added: ``FNAME`` (source file name), ``WORDS`` (tokens produced by
    ``extract_words``) and ``SENTVECS`` (``model.infer_vector`` of the tokens).

    Args:
        model: Object exposing ``infer_vector(words, steps=...)``, e.g. the
            loaded doc2vec model.
        doc_dir: Directory containing the input files.
        comment_col_name: Name of the column holding the comment text.
        infer_steps: Value passed as ``steps`` to ``model.infer_vector``.
            NOTE(review): newer gensim releases call this keyword ``epochs``;
            confirm against the installed version.

    Returns:
        pandas.DataFrame: All files concatenated. The per-file row index is
        kept, so index values repeat across files.

    Raises:
        ValueError: If ``doc_dir`` contains no regular files.
    """
    comments_dfs = []

    for fname in sorted(os.listdir(doc_dir)):
        fname_full = os.path.join(doc_dir, fname)
        # Sub-directories (e.g. notebook checkpoint folders) cannot be parsed
        # as tables; skip them instead of failing inside read_csv.
        if not os.path.isfile(fname_full):
            continue
        print("files being read : {0}".format(fname_full))
        data = pd.read_csv(fname_full, sep='\t', header='infer')
        data['FNAME'] = fname
        data['WORDS'] = data[comment_col_name].map(lambda text: extract_words(text))
        data['SENTVECS'] = data['WORDS'].map(
            lambda words: model.infer_vector(words, steps=infer_steps))
        comments_dfs.append(data)

    if not comments_dfs:
        # pd.concat would raise a less specific ValueError on an empty list.
        raise ValueError("No input files found in {0}".format(doc_dir))

    return pd.concat(comments_dfs)

In [7]:
# Vectorize the comments of every file found in the input directory.
doc_dir = "../indata"
comments = get_doc2vec(doc2vec_model, doc_dir)

files being read : ../indata/samsung-vr.csv
files being read : ../indata/tswift.csv


In [8]:
# Preview the first rows, including the added FNAME / WORDS / SENTVECS columns.
comments.head(5)

Unnamed: 0,Comment,CreateTimeStamp,Type,videoID,FNAME,WORDS,SENTVECS
0,I ll just leave this right here,2017-07-27T19:00:40.000Z,Comment,jBcHv3h1pHs,samsung-vr.csv,"[i, ll, just, leave, this, right, here]","[0.19757406, 0.16072297, 0.3160726, 0.04200211..."
1,How does sound work,2018-11-30T13:18:40.000Z,Comment,jBcHv3h1pHs,samsung-vr.csv,"[how, does, sound, work]","[-0.09989608, 0.21812965, 0.1737743, -0.207742..."
2,hey,2018-11-29T04:51:42.000Z,Comment,jBcHv3h1pHs,samsung-vr.csv,[hey],"[0.0071364488, 0.13148789, 0.04221164, -0.0763..."
3,Whats the song in the beginning,2018-11-23T20:46:13.000Z,Comment,jBcHv3h1pHs,samsung-vr.csv,"[whats, the, song, in, the, beginning]","[0.4085205, 0.23637642, 0.05918572, -0.0892807..."
4,Can t compete with a PS VR,2018-11-21T10:54:45.000Z,Comment,jBcHv3h1pHs,samsung-vr.csv,"[can, t, compete, with, a, ps, vr]","[-0.081133395, 0.075557545, 0.14778389, 0.0053..."


In [9]:
# Show which source files and which comment types are present.
# NOTE(review): the labels say "No of ..." but the distinct values themselves
# are printed, not their counts.
print("No of files : {0}".format(comments.FNAME.unique()))
print("No of Type : {}".format(comments.Type.unique()))

No of files : ['samsung-vr.csv' 'tswift.csv']
No of Type : ['Comment' 'Reply']


In [10]:
# Load the first persisted classifier (the file name suggests a random forest -- confirm).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load model files from a trusted source.
trained_clf = 'doc2vec_spam_rf.model'
loaded_clf = joblib.load(trained_clf)

  from numpy.core.umath_tests import inner1d


In [11]:
# Predict a label for every comment from its inferred vector and store it in a
# new column. This adds a column to `comments` in place.
comments['SPAM_IND_RF'] = loaded_clf.predict(comments['SENTVECS'].tolist())

In [17]:
# Preview the predictions.
# NOTE(review): the recorded output of this cell already shows SPAM_IND_KN,
# which is only created by a later cell, so the notebook was executed out of
# order; on a fresh top-to-bottom run only SPAM_IND_RF is present here.
comments.head(20)

Unnamed: 0,Comment,CreateTimeStamp,Type,videoID,FNAME,WORDS,SENTVECS,SPAM_IND_RF,SPAM_IND_KN
0,I ll just leave this right here,2017-07-27T19:00:40.000Z,Comment,jBcHv3h1pHs,samsung-vr.csv,"[i, ll, just, leave, this, right, here]","[0.19757406, 0.16072297, 0.3160726, 0.04200211...",0,1
1,How does sound work,2018-11-30T13:18:40.000Z,Comment,jBcHv3h1pHs,samsung-vr.csv,"[how, does, sound, work]","[-0.09989608, 0.21812965, 0.1737743, -0.207742...",0,0
2,hey,2018-11-29T04:51:42.000Z,Comment,jBcHv3h1pHs,samsung-vr.csv,[hey],"[0.0071364488, 0.13148789, 0.04221164, -0.0763...",0,0
3,Whats the song in the beginning,2018-11-23T20:46:13.000Z,Comment,jBcHv3h1pHs,samsung-vr.csv,"[whats, the, song, in, the, beginning]","[0.4085205, 0.23637642, 0.05918572, -0.0892807...",0,0
4,Can t compete with a PS VR,2018-11-21T10:54:45.000Z,Comment,jBcHv3h1pHs,samsung-vr.csv,"[can, t, compete, with, a, ps, vr]","[-0.081133395, 0.075557545, 0.14778389, 0.0053...",1,1
5,I think the oculus go is better,2018-11-16T21:47:51.000Z,Comment,jBcHv3h1pHs,samsung-vr.csv,"[i, think, the, oculus, go, is, better]","[0.0062060654, 0.062335677, 0.2105212, -0.0596...",0,0
6,does it work with S,2018-11-15T08:39:24.000Z,Comment,jBcHv3h1pHs,samsung-vr.csv,"[does, it, work, with, s]","[-0.0535581, -0.056835603, 0.049192593, 0.1074...",0,0
7,Gear vr fkn sucks Wait for quest,2018-11-13T17:43:11.000Z,Comment,jBcHv3h1pHs,samsung-vr.csv,"[gear, vr, fkn, sucks, wait, for, quest]","[-0.080691904, 0.29959255, -0.123865984, 0.065...",1,0
8,It really should come with dual controller sup...,2018-11-13T09:57:45.000Z,Comment,jBcHv3h1pHs,samsung-vr.csv,"[it, really, should, come, with, dual, control...","[-0.06152313, -0.15247348, 0.22437231, -0.0310...",0,0
9,I have something like this from Samsung but IT...,2018-11-12T04:58:29.000Z,Comment,jBcHv3h1pHs,samsung-vr.csv,"[i, have, something, like, this, from, samsung...","[0.18959036, 0.13804175, 0.3886805, 0.05065775...",0,1


In [18]:
# Load the second persisted classifier (the file name suggests k-nearest neighbours -- confirm).
# This rebinds trained_clf / loaded_clf, replacing the classifier loaded earlier.
# NOTE(review): joblib.load unpickles the file; only load trusted model files.
trained_clf = 'doc2vec_spam_kn.model'
loaded_clf = joblib.load(trained_clf)

In [19]:
# Predict with the second classifier on the same vectors and store the result
# in its own column so both predictions can be compared side by side.
comments['SPAM_IND_KN'] = loaded_clf.predict(comments['SENTVECS'].tolist())

In [20]:
# Preview the first rows with both prediction columns.
comments.head(10)

Unnamed: 0,Comment,CreateTimeStamp,Type,videoID,FNAME,WORDS,SENTVECS,SPAM_IND_RF,SPAM_IND_KN
0,I ll just leave this right here,2017-07-27T19:00:40.000Z,Comment,jBcHv3h1pHs,samsung-vr.csv,"[i, ll, just, leave, this, right, here]","[0.19757406, 0.16072297, 0.3160726, 0.04200211...",0,1
1,How does sound work,2018-11-30T13:18:40.000Z,Comment,jBcHv3h1pHs,samsung-vr.csv,"[how, does, sound, work]","[-0.09989608, 0.21812965, 0.1737743, -0.207742...",0,0
2,hey,2018-11-29T04:51:42.000Z,Comment,jBcHv3h1pHs,samsung-vr.csv,[hey],"[0.0071364488, 0.13148789, 0.04221164, -0.0763...",0,0
3,Whats the song in the beginning,2018-11-23T20:46:13.000Z,Comment,jBcHv3h1pHs,samsung-vr.csv,"[whats, the, song, in, the, beginning]","[0.4085205, 0.23637642, 0.05918572, -0.0892807...",0,0
4,Can t compete with a PS VR,2018-11-21T10:54:45.000Z,Comment,jBcHv3h1pHs,samsung-vr.csv,"[can, t, compete, with, a, ps, vr]","[-0.081133395, 0.075557545, 0.14778389, 0.0053...",1,1
5,I think the oculus go is better,2018-11-16T21:47:51.000Z,Comment,jBcHv3h1pHs,samsung-vr.csv,"[i, think, the, oculus, go, is, better]","[0.0062060654, 0.062335677, 0.2105212, -0.0596...",0,0
6,does it work with S,2018-11-15T08:39:24.000Z,Comment,jBcHv3h1pHs,samsung-vr.csv,"[does, it, work, with, s]","[-0.0535581, -0.056835603, 0.049192593, 0.1074...",0,0
7,Gear vr fkn sucks Wait for quest,2018-11-13T17:43:11.000Z,Comment,jBcHv3h1pHs,samsung-vr.csv,"[gear, vr, fkn, sucks, wait, for, quest]","[-0.080691904, 0.29959255, -0.123865984, 0.065...",1,0
8,It really should come with dual controller sup...,2018-11-13T09:57:45.000Z,Comment,jBcHv3h1pHs,samsung-vr.csv,"[it, really, should, come, with, dual, control...","[-0.06152313, -0.15247348, 0.22437231, -0.0310...",0,0
9,I have something like this from Samsung but IT...,2018-11-12T04:58:29.000Z,Comment,jBcHv3h1pHs,samsung-vr.csv,"[i, have, something, like, this, from, samsung...","[0.18959036, 0.13804175, 0.3886805, 0.05065775...",0,1


In [21]:
# Keep only the original comment fields and the two prediction columns;
# the helper columns (FNAME, WORDS, SENTVECS) are left out of the export.
out_df = comments[['Comment','CreateTimeStamp','Type', 'videoID','SPAM_IND_RF','SPAM_IND_KN']]

In [22]:
# Preview the frame that will be written to disk.
out_df.head(5)

Unnamed: 0,Comment,CreateTimeStamp,Type,videoID,SPAM_IND_RF,SPAM_IND_KN
0,I ll just leave this right here,2017-07-27T19:00:40.000Z,Comment,jBcHv3h1pHs,0,1
1,How does sound work,2018-11-30T13:18:40.000Z,Comment,jBcHv3h1pHs,0,0
2,hey,2018-11-29T04:51:42.000Z,Comment,jBcHv3h1pHs,0,0
3,Whats the song in the beginning,2018-11-23T20:46:13.000Z,Comment,jBcHv3h1pHs,0,0
4,Can t compete with a PS VR,2018-11-21T10:54:45.000Z,Comment,jBcHv3h1pHs,1,1


In [25]:
# Write the predictions to disk. The file is tab-separated despite its .csv
# extension, and the DataFrame index is not written.
out_file_name = '../outdata/video_comments_analysis_spam.csv'
# Create the output directory if it is missing; to_csv does not create
# parent directories and would otherwise fail on a fresh checkout.
os.makedirs(os.path.dirname(out_file_name), exist_ok=True)
out_df.to_csv(out_file_name, sep='\t', index=False)

In [24]:
# Marker printed when the notebook has run through to the end.
print('****End of script****')

****End of script****
