### Load the models and make predictions
##### Input Args:

doc_dir = directory containing the files with the comments to be classified

doc2vec_model_name = doc2vec model already trained

trained_clf = Trained classifier


In [1]:
from gensim.models import Doc2Vec
import re
import os
import random
from sklearn import metrics
import numpy as np
import pandas as pd
from sklearn.externals import joblib
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [2]:
import warnings

# Keep FutureWarning messages out of the cell outputs.
warnings.simplefilter('ignore', category=FutureWarning)

In [3]:
def extract_words(sent):
    """Normalise a raw comment string and split it into lowercase word tokens.

    Steps, in order: lowercase, replace HTML tags with a space, drop
    apostrophes that sit between two word characters ("don't" -> "dont"),
    replace every remaining non-word character with a space, then split on
    whitespace.

    Args:
        sent: the comment text (a ``str``).

    Returns:
        list of str: the tokens; an empty list when nothing is left.

    NOTE(review): the apostrophe step previously turned "don't" into "do"
    (see the comment on that line). If the Doc2Vec model was trained on text
    produced by the old behaviour, confirm that training and inference still
    use the same normalisation.
    """
    sent = sent.lower()
    sent = re.sub(r'<[^>]+>', ' ', sent)  # strip html tags
    # The replacement must be a raw string: a plain '\1\2' is the two control
    # characters \x01\x02, not back-references, so the letters around the
    # apostrophe were thrown away by the punctuation step below.
    sent = re.sub(r'(\w)\'(\w)', r'\1\2', sent)  # remove apostrophes
    sent = re.sub(r'\W', ' ', sent)  # remove punctuation
    # split() without arguments already collapses repeated whitespace and
    # ignores leading/trailing blanks, so no separate squeeze/strip is needed.
    return sent.split()

In [4]:
# Parent directory of the current working directory; used as the project root
# when building the model paths.
proj_root = os.path.split(os.getcwd())[0]

In [5]:
# Directory holding the serialised models, located under the project root.
model_dir = os.path.join(proj_root, "model")
# The path gets its own name so that `doc2vec_model` only ever refers to the
# loaded model object and never to a string.
doc2vec_model_path = os.path.join(model_dir, 'comments.d2v')

# Load the previously trained doc2vec model
doc2vec_model = Doc2Vec.load(doc2vec_model_path)

In [6]:
# Smoke-test the loaded doc2vec model: infer a vector for each of two sample
# comments and display the cosine similarity between them.
long_comment_vec = doc2vec_model.infer_vector(
    extract_words("This is very bad video. I don't like it"))
short_comment_vec = doc2vec_model.infer_vector(extract_words("video sucks."))
cosine_similarity([long_comment_vec], [short_comment_vec])

array([[0.74414426]], dtype=float32)

In [7]:
#Converts the comments to vectors using the doc2vec model trained earlier
def get_doc2vec(model, doc_dir,comment_col_name='Comment'):
    """Read every tab-separated file in ``doc_dir`` and embed its comments.

    Each regular file in ``doc_dir`` is read with ``pd.read_csv`` using a tab
    separator. Sub-directories are skipped. Three columns are added per file:

    * ``FNAME``    - name of the file the row came from,
    * ``WORDS``    - tokens returned by ``extract_words`` for the comment
                     (an empty list when the comment cell is missing),
    * ``SENTVECS`` - ``model.infer_vector(tokens, steps=10)``.

    Args:
        model: object exposing ``infer_vector(words, steps=...)``; in this
            notebook the loaded Doc2Vec model.
        doc_dir: directory containing the input files.
        comment_col_name: name of the column that holds the comment text.

    Returns:
        pandas.DataFrame: the rows of all files concatenated in sorted
        file-name order. The per-file row index is kept, so index values
        repeat across files.

    Raises:
        ValueError: if ``doc_dir`` contains no files to read.
    """
    comments_dfs = []

    for fname in sorted(os.listdir(doc_dir)):
        fname_full = os.path.join(doc_dir, fname)
        # Directories (e.g. Jupyter's .ipynb_checkpoints) cannot be parsed as CSV.
        if not os.path.isfile(fname_full):
            continue
        print("files being read : {0}".format(fname_full))
        data = pd.read_csv(fname_full, sep='\t', header='infer')
        data['FNAME'] = fname
        # read_csv turns empty cells and strings such as "NA"/"null" into NaN,
        # which is a float and has no .lower(); treat those as "no words".
        data['WORDS'] = data[comment_col_name].map(
            lambda x: extract_words(str(x)) if pd.notnull(x) else [])
        # NOTE(review): newer gensim releases call this argument `epochs`;
        # confirm against the installed version before upgrading.
        data['SENTVECS'] = data['WORDS'].map(lambda x: model.infer_vector(x, steps=10))
        comments_dfs.append(data)

    if not comments_dfs:
        raise ValueError("No input files found in {0}".format(doc_dir))

    comments = pd.concat(comments_dfs)
    return comments

In [8]:
# Embed the comments of every file found in the input directory.
doc_dir = "../indata"
comments = get_doc2vec(model=doc2vec_model, doc_dir=doc_dir)

files being read : ../indata/EdSheeran.csv
files being read : ../indata/Maroon5.csv
files being read : ../indata/SuperBowlLI.csv
files being read : ../indata/bing-bang-theory.csv
files being read : ../indata/bitcoin-vs-ethereum.csv
files being read : ../indata/coldpay.csv
files being read : ../indata/eminem-not-affraid.csv
files being read : ../indata/federer-rafael.csv
files being read : ../indata/formula1-2018.csv
files being read : ../indata/got8.csv
files being read : ../indata/house-of-cards.csv
files being read : ../indata/iphone-xs-vs-note9.csv
files being read : ../indata/mac-vs-windows.csv
files being read : ../indata/nfl-highlights.csv
files being read : ../indata/oculus.csv
files being read : ../indata/samsung-vr.csv
files being read : ../indata/tonight-show.csv
files being read : ../indata/tswift.csv
files being read : ../indata/warriors.csv


In [9]:
# Preview the first five rows, including the added WORDS and SENTVECS columns.
comments.head(n=5)

Unnamed: 0,Comment,CommentID,CreateTimeStamp,Type,videoID,videoTitle,FNAME,WORDS,SENTVECS
0,Like maroon Comment ed sheeran,UgxD46CBoJVhQCAibLp4AaABAg,2018-12-08T02:32:48.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,"[like, maroon, comment, ed, sheeran]","[0.09525244, 0.0007141725, 0.0357948, -0.23296..."
1,Its Legendary December,Ugx91N6QKBVsR24qOH94AaABAg,2018-12-08T02:31:02.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,"[its, legendary, december]","[-0.0021700214, -0.011219359, -0.07317611, -0...."
2,Feelings so deep in feelings,UgzG83e-1TszyKCMO8F4AaABAg,2018-12-08T02:28:16.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,"[feelings, so, deep, in, feelings]","[0.236262, 0.12739281, 0.005141589, 0.21333386..."
3,I love your songs seriously I would like to me...,UgydHSuRFNFux7vGSQ94AaABAg,2018-12-08T02:20:23.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,"[i, love, your, songs, seriously, i, would, li...","[0.79627186, 0.7702556, 0.24780324, -0.2365534..."
4,I love it,UgwTYwXgN9KGAB7WIaN4AaABAg,2018-12-08T02:13:49.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,"[i, love, it]","[0.046548694, 0.15993516, 0.00060374994, 0.007..."


In [10]:
# NOTE: despite the "No of" labels, these lines print the distinct values
# themselves (file names and comment types), not how many there are.
print("No of files : {0}".format(comments['FNAME'].unique()))
print("No of Type : {0}".format(comments['Type'].unique()))

No of files : ['EdSheeran.csv' 'Maroon5.csv' 'SuperBowlLI.csv' 'bing-bang-theory.csv'
 'bitcoin-vs-ethereum.csv' 'coldpay.csv' 'eminem-not-affraid.csv'
 'federer-rafael.csv' 'formula1-2018.csv' 'got8.csv' 'house-of-cards.csv'
 'iphone-xs-vs-note9.csv' 'mac-vs-windows.csv' 'nfl-highlights.csv'
 'oculus.csv' 'samsung-vr.csv' 'tonight-show.csv' 'tswift.csv'
 'warriors.csv']
No of Type : ['Comment' 'Reply']


In [11]:
# Load the first trained spam classifier (presumably a random forest, judging
# by the file name). joblib.load unpickles the file, so only load model files
# that come from a trusted source.
trained_clf = "/".join([model_dir, 'doc2vec_spam_rf.model'])
loaded_clf = joblib.load(trained_clf)

  from numpy.core.umath_tests import inner1d


In [12]:
# Predict a label for every comment vector and store it as SPAM_IND_RF.
# `loaded_clf` is whichever classifier was loaded most recently, so the
# 'doc2vec_spam_rf.model' load cell must be the last load that ran.
sentence_vectors = comments['SENTVECS'].tolist()
comments['SPAM_IND_RF'] = loaded_clf.predict(sentence_vectors)

In [13]:
# Preview the first ten rows together with the new SPAM_IND_RF column.
comments.head(n=10)

Unnamed: 0,Comment,CommentID,CreateTimeStamp,Type,videoID,videoTitle,FNAME,WORDS,SENTVECS,SPAM_IND_RF
0,Like maroon Comment ed sheeran,UgxD46CBoJVhQCAibLp4AaABAg,2018-12-08T02:32:48.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,"[like, maroon, comment, ed, sheeran]","[0.09525244, 0.0007141725, 0.0357948, -0.23296...",1
1,Its Legendary December,Ugx91N6QKBVsR24qOH94AaABAg,2018-12-08T02:31:02.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,"[its, legendary, december]","[-0.0021700214, -0.011219359, -0.07317611, -0....",0
2,Feelings so deep in feelings,UgzG83e-1TszyKCMO8F4AaABAg,2018-12-08T02:28:16.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,"[feelings, so, deep, in, feelings]","[0.236262, 0.12739281, 0.005141589, 0.21333386...",0
3,I love your songs seriously I would like to me...,UgydHSuRFNFux7vGSQ94AaABAg,2018-12-08T02:20:23.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,"[i, love, your, songs, seriously, i, would, li...","[0.79627186, 0.7702556, 0.24780324, -0.2365534...",0
4,I love it,UgwTYwXgN9KGAB7WIaN4AaABAg,2018-12-08T02:13:49.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,"[i, love, it]","[0.046548694, 0.15993516, 0.00060374994, 0.007...",0
5,como me encanta esta cancion,UgztF8wJRD-SI92_V_Z4AaABAg,2018-12-08T02:12:03.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,"[como, me, encanta, esta, cancion]","[0.17076933, 0.030146817, 0.27426234, -0.06725...",0
6,Awwww love your music your the best,Ugwdf7L_S85fVKV2FSl4AaABAg,2018-12-08T02:08:46.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,"[awwww, love, your, music, your, the, best]","[0.16973305, 0.40647706, 0.23566242, 0.1104907...",0
7,I love this song thank you for making it,Ugxn4ht_YPobdCoyUfB4AaABAg,2018-12-08T02:06:01.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,"[i, love, this, song, thank, you, for, making,...","[0.14801028, 0.51117325, 0.14707884, -0.214295...",0
8,December,UgxHvVj4_XaLBTLKqW14AaABAg,2018-12-08T01:56:15.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,[december],"[-0.085088514, -0.044014324, -0.047773987, -0....",0
9,it s perfect,Ugwmueq-3gsvUR3HAHZ4AaABAg,2018-12-08T01:49:54.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,"[it, s, perfect]","[0.077113256, 0.060511004, 0.13120939, 0.06240...",0


In [14]:
# Load the second trained spam classifier (presumably k-nearest-neighbours,
# judging by the file name). This rebinds `trained_clf` and `loaded_clf`.
# joblib.load unpickles the file, so only load model files that come from a
# trusted source.
trained_clf = "/".join([model_dir, 'doc2vec_spam_kn.model'])
loaded_clf = joblib.load(trained_clf)

In [15]:
# Predict a label for every comment vector and store it as SPAM_IND_KN.
# `loaded_clf` is whichever classifier was loaded most recently, so the
# 'doc2vec_spam_kn.model' load cell must be the last load that ran.
sentence_vectors = comments['SENTVECS'].tolist()
comments['SPAM_IND_KN'] = loaded_clf.predict(sentence_vectors)

In [16]:
# Preview the first ten rows with both prediction columns side by side.
comments.head(n=10)

Unnamed: 0,Comment,CommentID,CreateTimeStamp,Type,videoID,videoTitle,FNAME,WORDS,SENTVECS,SPAM_IND_RF,SPAM_IND_KN
0,Like maroon Comment ed sheeran,UgxD46CBoJVhQCAibLp4AaABAg,2018-12-08T02:32:48.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,"[like, maroon, comment, ed, sheeran]","[0.09525244, 0.0007141725, 0.0357948, -0.23296...",1,1
1,Its Legendary December,Ugx91N6QKBVsR24qOH94AaABAg,2018-12-08T02:31:02.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,"[its, legendary, december]","[-0.0021700214, -0.011219359, -0.07317611, -0....",0,0
2,Feelings so deep in feelings,UgzG83e-1TszyKCMO8F4AaABAg,2018-12-08T02:28:16.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,"[feelings, so, deep, in, feelings]","[0.236262, 0.12739281, 0.005141589, 0.21333386...",0,0
3,I love your songs seriously I would like to me...,UgydHSuRFNFux7vGSQ94AaABAg,2018-12-08T02:20:23.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,"[i, love, your, songs, seriously, i, would, li...","[0.79627186, 0.7702556, 0.24780324, -0.2365534...",0,0
4,I love it,UgwTYwXgN9KGAB7WIaN4AaABAg,2018-12-08T02:13:49.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,"[i, love, it]","[0.046548694, 0.15993516, 0.00060374994, 0.007...",0,0
5,como me encanta esta cancion,UgztF8wJRD-SI92_V_Z4AaABAg,2018-12-08T02:12:03.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,"[como, me, encanta, esta, cancion]","[0.17076933, 0.030146817, 0.27426234, -0.06725...",0,1
6,Awwww love your music your the best,Ugwdf7L_S85fVKV2FSl4AaABAg,2018-12-08T02:08:46.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,"[awwww, love, your, music, your, the, best]","[0.16973305, 0.40647706, 0.23566242, 0.1104907...",0,0
7,I love this song thank you for making it,Ugxn4ht_YPobdCoyUfB4AaABAg,2018-12-08T02:06:01.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,"[i, love, this, song, thank, you, for, making,...","[0.14801028, 0.51117325, 0.14707884, -0.214295...",0,0
8,December,UgxHvVj4_XaLBTLKqW14AaABAg,2018-12-08T01:56:15.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,[december],"[-0.085088514, -0.044014324, -0.047773987, -0....",0,0
9,it s perfect,Ugwmueq-3gsvUR3HAHZ4AaABAg,2018-12-08T01:49:54.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,"[it, s, perfect]","[0.077113256, 0.060511004, 0.13120939, 0.06240...",0,0


In [17]:
# Columns kept for the output file: the original comment metadata plus both
# spam predictions. The intermediate WORDS and SENTVECS columns are left out.
output_columns = [
    'Comment', 'CommentID', 'CreateTimeStamp', 'Type', 'videoID',
    'videoTitle', 'FNAME', 'SPAM_IND_RF', 'SPAM_IND_KN',
]
out_df = comments[output_columns]

In [18]:
# Preview the first five rows of the frame that will be written out.
out_df.head(n=5)

Unnamed: 0,Comment,CommentID,CreateTimeStamp,Type,videoID,videoTitle,FNAME,SPAM_IND_RF,SPAM_IND_KN
0,Like maroon Comment ed sheeran,UgxD46CBoJVhQCAibLp4AaABAg,2018-12-08T02:32:48.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,1,1
1,Its Legendary December,Ugx91N6QKBVsR24qOH94AaABAg,2018-12-08T02:31:02.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,0,0
2,Feelings so deep in feelings,UgzG83e-1TszyKCMO8F4AaABAg,2018-12-08T02:28:16.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,0,0
3,I love your songs seriously I would like to me...,UgydHSuRFNFux7vGSQ94AaABAg,2018-12-08T02:20:23.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,0,0
4,I love it,UgwTYwXgN9KGAB7WIaN4AaABAg,2018-12-08T02:13:49.000Z,Comment,2Vv-BfVoq4g,Ed Sheeran - Perfect (Official Music Video),EdSheeran.csv,0,0


In [19]:
# Write the predictions as a tab-separated file without the row index.
out_file_name = '../outdata/video_comments_analysis_spam.csv'
out_df.to_csv(path_or_buf=out_file_name, sep='\t', index=False)

In [20]:
# Marker showing that the notebook ran through to its last cell.
end_of_script_banner = '****End of script****'
print(end_of_script_banner)

****End of script****
