### Load the trained models and make predictions
##### Input Args:

doc_dir = directory containing the comment files to run predictions on

doc2vec_model_name = file name of the already-trained doc2vec model

trained_clf = file name of the trained classifier


In [1]:
from gensim.models import Doc2Vec
import re
import os
import random
from sklearn import metrics
import numpy as np
import pandas as pd
from sklearn.externals import joblib
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [2]:
import warnings
# Suppress every FutureWarning raised from this point on so library
# deprecation notices do not clutter the cell outputs.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
def extract_words(sent):
    """Normalize a raw comment and split it into lowercase word tokens.

    Processing order: lowercase the text, replace HTML tags with a space,
    drop an apostrophe that sits between two word characters (so "don't"
    becomes "dont"), replace every remaining non-word character with a space,
    and finally split on whitespace.

    Args:
        sent: Raw comment text. ``None`` and float NaN (what pandas yields for
            an empty CSV cell) are treated as an empty comment; any other
            non-string value is converted with ``str()`` first.

    Returns:
        list of str: The tokens in their original order; an empty list when
        the input contains no word characters.
    """
    # Missing values carry no words; calling .lower() on them would raise.
    if sent is None or (isinstance(sent, float) and sent != sent):
        return []

    sent = str(sent).lower()
    sent = re.sub(r'<[^>]+>', ' ', sent)  # strip html tags
    # Remove apostrophes inside words. The replacement MUST be a raw string:
    # a plain '\1\2' is read by Python as the control characters \x01\x02, not
    # as backreferences, which would wipe out the letters around the apostrophe.
    sent = re.sub(r"(\w)'(\w)", r'\1\2', sent)
    sent = re.sub(r'\W', ' ', sent)  # remove punctuation
    # str.split() with no argument already collapses runs of whitespace and
    # ignores leading/trailing whitespace.
    return sent.split()

In [4]:
# Load the previously trained doc2vec model from disk.
# The file name is a relative path, so it is resolved against the current
# working directory.
doc2vec_model_name = 'reviews.d2v'
doc2vec_model = Doc2Vec.load(doc2vec_model_name)

In [5]:
# Sanity-check the loaded doc2vec model: infer a vector for each of two
# negative comments about a video and display their cosine similarity.
# NOTE(review): infer_vector may not be deterministic, so the exact value can
# differ between runs -- confirm before relying on a specific number.
long_comment_vec = doc2vec_model.infer_vector(
    extract_words("This is very bad video. I don't like it")
)
short_comment_vec = doc2vec_model.infer_vector(extract_words("video sucks."))
cosine_similarity([long_comment_vec], [short_comment_vec])

array([[0.72811055]], dtype=float32)

In [6]:
#Converts the comments to vectors using the doc2vec model trained earlier
def get_doc2vec(model, doc_dir, comment_col_name='Comment', infer_steps=10):
    """Read every comment file in a directory and infer a doc2vec vector per comment.

    Each regular file in ``doc_dir`` is read as a tab-separated table with a
    header row. Three columns are added to every table before they are
    concatenated:

    * ``FNAME``    - name of the file the row came from
    * ``WORDS``    - tokens produced by ``extract_words`` for the comment text
    * ``SENTVECS`` - vector returned by ``model.infer_vector`` for those tokens

    Args:
        model: Object exposing ``infer_vector(words, steps=...)``.
        doc_dir: Directory containing the input files.
        comment_col_name: Name of the column holding the comment text.
        infer_steps: Value passed as ``steps`` to ``model.infer_vector``.
            NOTE(review): newer gensim releases may expect ``epochs`` instead
            of ``steps`` -- confirm against the installed version.

    Returns:
        pandas.DataFrame: All rows from all files, in sorted file-name order.
        The per-file row index is kept, so index values repeat across files.

    Raises:
        ValueError: If ``doc_dir`` contains no regular files.
    """
    comments_dfs = []

    for fname in sorted(os.listdir(doc_dir)):
        fname_full = os.path.join(doc_dir, fname)
        # Skip sub-directories (for example the .ipynb_checkpoints folder that
        # Jupyter creates); read_csv would fail on them.
        if not os.path.isfile(fname_full):
            continue
        print("files being read : {0}".format(fname_full))
        data = pd.read_csv(fname_full, sep='\t', header='infer')
        data['FNAME'] = fname
        data['WORDS'] = data[comment_col_name].map(extract_words)
        data['SENTVECS'] = data['WORDS'].map(
            lambda words: model.infer_vector(words, steps=infer_steps)
        )
        comments_dfs.append(data)

    # pd.concat raises an opaque ValueError on an empty list; fail with a
    # message that points at the actual problem instead.
    if not comments_dfs:
        raise ValueError("no input files found in {0}".format(doc_dir))

    return pd.concat(comments_dfs)

In [8]:
# Directory holding the comment files to score (relative to the working directory).
doc_dir = "../indata"
# Read every file in doc_dir and attach the FNAME / WORDS / SENTVECS columns.
comments = get_doc2vec(doc2vec_model, doc_dir)

files being read : ../indata/eminem-not-affraid.csv
files being read : ../indata/got8.csv
files being read : ../indata/house-of-cards.csv
files being read : ../indata/samsung-vr.csv
files being read : ../indata/tonight-show.csv
files being read : ../indata/tswift.csv
files being read : ../indata/warriors.csv


In [10]:
# Preview the first five rows, including the derived FNAME, WORDS and SENTVECS columns.
comments.head(5)

Unnamed: 0,Comment,CreateTimeStamp,Type,videoID,videoTitle,FNAME,WORDS,SENTVECS
0,Give Him Some Water Man,2018-12-03T10:11:17.000Z,Comment,j5-yKhDd64s,Eminem - Not Afraid,eminem-not-affraid.csv,"[give, him, some, water, man]","[-0.07413684, 0.43375567, 0.23789428, -0.15710..."
1,What s going on Eminem,2018-12-03T10:09:18.000Z,Comment,j5-yKhDd64s,Eminem - Not Afraid,eminem-not-affraid.csv,"[what, s, going, on, eminem]","[0.14541733, -0.19542317, 0.38643086, 0.133048..."
2,No one can beat you,2018-12-03T09:46:24.000Z,Comment,j5-yKhDd64s,Eminem - Not Afraid,eminem-not-affraid.csv,"[no, one, can, beat, you]","[-0.06301007, 0.45594293, 0.19862762, 0.180713..."
3,EMINEM THE ALLTIME BEST RAPPER IN THE WORLD,2018-12-03T09:45:16.000Z,Comment,j5-yKhDd64s,Eminem - Not Afraid,eminem-not-affraid.csv,"[eminem, the, alltime, best, rapper, in, the, ...","[0.06287132, 0.072824255, 0.07605592, 0.010044..."
4,Much love from india,2018-12-03T09:03:50.000Z,Comment,j5-yKhDd64s,Eminem - Not Afraid,eminem-not-affraid.csv,"[much, love, from, india]","[0.17051177, 0.23125106, 0.12662126, 0.0427657..."


In [11]:
# Show which input files and which comment types are present in the combined frame.
# Despite the "No of" labels, these lines print the distinct values themselves,
# not a count.
print("No of files : {0}".format(comments["FNAME"].unique()))
print("No of Type : {}".format(comments["Type"].unique()))

No of files : ['eminem-not-affraid.csv' 'got8.csv' 'house-of-cards.csv' 'samsung-vr.csv'
 'tonight-show.csv' 'tswift.csv' 'warriors.csv']
No of Type : ['Comment' 'Reply']


In [12]:
# Load the first persisted classifier; its predictions are stored as SPAM_IND_RF.
# Note that trained_clf holds the model's file name, while loaded_clf is the
# classifier object itself.
# NOTE(review): joblib.load unpickles the file, so only load model files that
# come from a trusted source.
# NOTE(review): presumably a random-forest model, judging by the "_rf" suffix -- confirm.
trained_clf = 'doc2vec_spam_rf.model'
loaded_clf = joblib.load(trained_clf)

  from numpy.core.umath_tests import inner1d


In [13]:
# Turn the SENTVECS column into a plain list (one vector per comment), run the
# loaded classifier on it, and store the result in a new SPAM_IND_RF column.
# This adds the column to `comments` in place.
comments['SPAM_IND_RF'] = loaded_clf.predict(comments['SENTVECS'].tolist())

In [14]:
# Preview the first ten rows together with the new SPAM_IND_RF column.
comments.head(10)

Unnamed: 0,Comment,CreateTimeStamp,Type,videoID,videoTitle,FNAME,WORDS,SENTVECS,SPAM_IND_RF
0,Give Him Some Water Man,2018-12-03T10:11:17.000Z,Comment,j5-yKhDd64s,Eminem - Not Afraid,eminem-not-affraid.csv,"[give, him, some, water, man]","[-0.07413684, 0.43375567, 0.23789428, -0.15710...",0
1,What s going on Eminem,2018-12-03T10:09:18.000Z,Comment,j5-yKhDd64s,Eminem - Not Afraid,eminem-not-affraid.csv,"[what, s, going, on, eminem]","[0.14541733, -0.19542317, 0.38643086, 0.133048...",0
2,No one can beat you,2018-12-03T09:46:24.000Z,Comment,j5-yKhDd64s,Eminem - Not Afraid,eminem-not-affraid.csv,"[no, one, can, beat, you]","[-0.06301007, 0.45594293, 0.19862762, 0.180713...",0
3,EMINEM THE ALLTIME BEST RAPPER IN THE WORLD,2018-12-03T09:45:16.000Z,Comment,j5-yKhDd64s,Eminem - Not Afraid,eminem-not-affraid.csv,"[eminem, the, alltime, best, rapper, in, the, ...","[0.06287132, 0.072824255, 0.07605592, 0.010044...",0
4,Much love from india,2018-12-03T09:03:50.000Z,Comment,j5-yKhDd64s,Eminem - Not Afraid,eminem-not-affraid.csv,"[much, love, from, india]","[0.17051177, 0.23125106, 0.12662126, 0.0427657...",0
5,Who is right now listening to this masterpiece...,2018-12-03T09:02:20.000Z,Comment,j5-yKhDd64s,Eminem - Not Afraid,eminem-not-affraid.csv,"[who, is, right, now, listening, to, this, mas...","[0.20082739, 0.16013142, 0.2667011, -0.1218121...",0
6,dec anyone,2018-12-03T08:56:34.000Z,Comment,j5-yKhDd64s,Eminem - Not Afraid,eminem-not-affraid.csv,"[dec, anyone]","[0.1491602, 0.13057116, 0.13942419, -0.1626210...",0
7,Who still listening in december,2018-12-03T08:25:08.000Z,Comment,j5-yKhDd64s,Eminem - Not Afraid,eminem-not-affraid.csv,"[who, still, listening, in, december]","[0.028714284, 0.1683977, 0.14882058, -0.069841...",0
8,EMINEM,2018-12-03T08:21:25.000Z,Comment,j5-yKhDd64s,Eminem - Not Afraid,eminem-not-affraid.csv,[eminem],"[0.009762872, 0.014688463, 0.022014672, 0.0420...",0
9,The music in is so bad Eminem is my last chance,2018-12-03T08:10:18.000Z,Comment,j5-yKhDd64s,Eminem - Not Afraid,eminem-not-affraid.csv,"[the, music, in, is, so, bad, eminem, is, my, ...","[0.21369755, 0.46041906, 0.14770782, 0.0245659...",0


In [15]:
# Load the second persisted classifier; its predictions are stored as SPAM_IND_KN.
# This rebinds trained_clf and loaded_clf, so the previously loaded classifier
# is no longer reachable through these names.
# NOTE(review): joblib.load unpickles the file, so only load model files that
# come from a trusted source.
# NOTE(review): presumably a k-nearest-neighbours model, judging by the "_kn" suffix -- confirm.
trained_clf = 'doc2vec_spam_kn.model'
loaded_clf = joblib.load(trained_clf)

In [16]:
# Run the currently loaded classifier on the same SENTVECS vectors and store
# the result in a new SPAM_IND_KN column (added to `comments` in place).
comments['SPAM_IND_KN'] = loaded_clf.predict(comments['SENTVECS'].tolist())

In [17]:
# Preview the first ten rows, now with both SPAM_IND_RF and SPAM_IND_KN columns.
comments.head(10)

Unnamed: 0,Comment,CreateTimeStamp,Type,videoID,videoTitle,FNAME,WORDS,SENTVECS,SPAM_IND_RF,SPAM_IND_KN
0,Give Him Some Water Man,2018-12-03T10:11:17.000Z,Comment,j5-yKhDd64s,Eminem - Not Afraid,eminem-not-affraid.csv,"[give, him, some, water, man]","[-0.07413684, 0.43375567, 0.23789428, -0.15710...",0,0
1,What s going on Eminem,2018-12-03T10:09:18.000Z,Comment,j5-yKhDd64s,Eminem - Not Afraid,eminem-not-affraid.csv,"[what, s, going, on, eminem]","[0.14541733, -0.19542317, 0.38643086, 0.133048...",0,1
2,No one can beat you,2018-12-03T09:46:24.000Z,Comment,j5-yKhDd64s,Eminem - Not Afraid,eminem-not-affraid.csv,"[no, one, can, beat, you]","[-0.06301007, 0.45594293, 0.19862762, 0.180713...",0,0
3,EMINEM THE ALLTIME BEST RAPPER IN THE WORLD,2018-12-03T09:45:16.000Z,Comment,j5-yKhDd64s,Eminem - Not Afraid,eminem-not-affraid.csv,"[eminem, the, alltime, best, rapper, in, the, ...","[0.06287132, 0.072824255, 0.07605592, 0.010044...",0,0
4,Much love from india,2018-12-03T09:03:50.000Z,Comment,j5-yKhDd64s,Eminem - Not Afraid,eminem-not-affraid.csv,"[much, love, from, india]","[0.17051177, 0.23125106, 0.12662126, 0.0427657...",0,0
5,Who is right now listening to this masterpiece...,2018-12-03T09:02:20.000Z,Comment,j5-yKhDd64s,Eminem - Not Afraid,eminem-not-affraid.csv,"[who, is, right, now, listening, to, this, mas...","[0.20082739, 0.16013142, 0.2667011, -0.1218121...",0,0
6,dec anyone,2018-12-03T08:56:34.000Z,Comment,j5-yKhDd64s,Eminem - Not Afraid,eminem-not-affraid.csv,"[dec, anyone]","[0.1491602, 0.13057116, 0.13942419, -0.1626210...",0,1
7,Who still listening in december,2018-12-03T08:25:08.000Z,Comment,j5-yKhDd64s,Eminem - Not Afraid,eminem-not-affraid.csv,"[who, still, listening, in, december]","[0.028714284, 0.1683977, 0.14882058, -0.069841...",0,0
8,EMINEM,2018-12-03T08:21:25.000Z,Comment,j5-yKhDd64s,Eminem - Not Afraid,eminem-not-affraid.csv,[eminem],"[0.009762872, 0.014688463, 0.022014672, 0.0420...",0,0
9,The music in is so bad Eminem is my last chance,2018-12-03T08:10:18.000Z,Comment,j5-yKhDd64s,Eminem - Not Afraid,eminem-not-affraid.csv,"[the, music, in, is, so, bad, eminem, is, my, ...","[0.21369755, 0.46041906, 0.14770782, 0.0245659...",0,0


In [22]:
# Keep only the columns that go into the output file: the original comment
# metadata, the source file name and the two prediction columns. The WORDS and
# SENTVECS working columns are left out.
out_df = comments[[
    'Comment',
    'CreateTimeStamp',
    'Type',
    'videoID',
    'videoTitle',
    'FNAME',
    'SPAM_IND_RF',
    'SPAM_IND_KN',
]]

In [23]:
# Preview the first five rows of the trimmed output frame.
out_df.head(5)

Unnamed: 0,Comment,CreateTimeStamp,Type,videoID,videoTitle,FNAME,SPAM_IND_RF,SPAM_IND_KN
0,Give Him Some Water Man,2018-12-03T10:11:17.000Z,Comment,j5-yKhDd64s,Eminem - Not Afraid,eminem-not-affraid.csv,0,0
1,What s going on Eminem,2018-12-03T10:09:18.000Z,Comment,j5-yKhDd64s,Eminem - Not Afraid,eminem-not-affraid.csv,0,1
2,No one can beat you,2018-12-03T09:46:24.000Z,Comment,j5-yKhDd64s,Eminem - Not Afraid,eminem-not-affraid.csv,0,0
3,EMINEM THE ALLTIME BEST RAPPER IN THE WORLD,2018-12-03T09:45:16.000Z,Comment,j5-yKhDd64s,Eminem - Not Afraid,eminem-not-affraid.csv,0,0
4,Much love from india,2018-12-03T09:03:50.000Z,Comment,j5-yKhDd64s,Eminem - Not Afraid,eminem-not-affraid.csv,0,0


In [24]:
# Write the selected columns as a tab-separated file without the index column.
out_file_name = '../outdata/video_comments_analysis_spam.csv'
# Create the output directory if it is missing, so a fresh checkout does not
# fail at the very last step after all vectors and predictions were computed.
os.makedirs(os.path.dirname(out_file_name), exist_ok=True)
out_df.to_csv(out_file_name, sep='\t', index=False)

In [25]:
# Print a marker showing that the final cell was reached.
print('****End of script****')

****End of script****
