## Spam Detection
https://archive.ics.uci.edu/ml/datasets/YouTube+Spam+Collection

### Downloaded here:

data/YouTube-Spam-Collection-v1

In [15]:
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
import re
import os
import random
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [17]:
def extract_words(sent):
    sent = sent.lower()
    sent = re.sub(r'<[^>]+>', ' ', sent) #strip html tags
    sent = re.sub(r'(\w)\'(\w)', '\1\2', sent) # remoce apostrophes
    sent = re.sub(r'\W',' ', sent) # remove punctuation
    sent = re.sub(r'\s+', ' ', sent) # remove repeated space
    sent = sent.strip()
    return sent.split()

In [18]:
# test text cleanup block
words = 'ajshd asda, + 1231 "sd + ''alkdj''    '
print(extract_words(words))

['ajshd', 'asda', '1231', 'sd', 'alkdj']


In [19]:
#Load doc to vec model
doc2vec_model_name = 'reviews.d2v'
model = Doc2Vec.load(doc2vec_model_name)

In [20]:
model.infer_vector(extract_words("This is very bad video. I don't like it"))

array([-0.00062089,  0.2351284 , -0.03454486, -0.05228289, -0.02684396,
       -0.09259088, -0.02277846, -0.12211672,  0.16191505,  0.01832944,
       -0.11064825, -0.24176133, -0.20306163, -0.04831441,  0.07895452,
        0.09381681,  0.11844175,  0.13362783,  0.10960958,  0.22026704,
       -0.33733156,  0.03500133,  0.03830094, -0.09355066,  0.0709269 ,
        0.15171033,  0.08753077,  0.15251824,  0.06004937,  0.07809102,
        0.31834617, -0.05555944,  0.14290863, -0.14791834, -0.09593711,
        0.15710084,  0.13722666, -0.07485071,  0.39205426, -0.15195712,
       -0.23429897, -0.27143204, -0.23950902, -0.10757922,  0.21703528,
       -0.03553951,  0.07372943, -0.03376171, -0.08742584,  0.15654385],
      dtype=float32)

In [21]:
cosine_similarity(
[model.infer_vector(extract_words("This is very bad video. I don't like it"))],
[model.infer_vector(extract_words("video sucks."))])

array([[0.7280318]], dtype=float32)

In [22]:
cosine_similarity(
[model.infer_vector(extract_words("It is now snowing in New York"))],
[model.infer_vector(extract_words("I feel sick. Dont feel like going to school"))])

array([[0.43282947]], dtype=float32)

In [23]:
d = pd.concat([pd.read_csv("data/YouTube-Spam-Collection-v1/Youtube01-Psy.csv"), 
               pd.read_csv("data/YouTube-Spam-Collection-v1/Youtube02-KatyPerry.csv"),
              pd.read_csv("data/YouTube-Spam-Collection-v1/Youtube03-LMFAO.csv"),
              pd.read_csv("data/YouTube-Spam-Collection-v1/Youtube04-Eminem.csv"),
              pd.read_csv("data/YouTube-Spam-Collection-v1/Youtube05-Shakira.csv")])
d = d.sample(frac=1)

In [24]:
d['CONTENT'][0:5]

416                    every bady yust have a good time﻿
42     they Shuffle hard that they made an Earthquake...
284    The Perry you're doing a good job good job I l...
253    I am now going to voyage to the first comment....
118                  Very pleasant to hear, haha, good.﻿
Name: CONTENT, dtype: object

In [25]:
d['WORDS'] = d['CONTENT'].map(lambda x: extract_words(x))

In [26]:
d['WORDS'][0:5]

416             [every, bady, yust, have, a, good, time]
42     [they, shuffle, hard, that, they, made, an, ea...
284    [the, perry, yo, e, doing, a, good, job, good,...
253    [i, am, now, going, to, voyage, to, the, first...
118               [very, pleasant, to, hear, haha, good]
Name: WORDS, dtype: object

In [67]:
sentences = d['CONTENT']

In [27]:
d['SENTVECS'] = d['WORDS'].map(lambda x: model.infer_vector(x, steps=10))

In [28]:
d['SENTVECS'][0:5]

416    [-0.07048194, 0.30158433, 0.055230945, -0.1037...
42     [0.018265551, 0.14901805, 0.06305703, 0.103425...
284    [0.21629643, 0.37278166, 0.47778076, 0.1357919...
253    [0.036792, -0.076096356, 0.32669353, -0.204220...
118    [0.12418369, 0.37762502, 0.14810343, -0.269280...
Name: SENTVECS, dtype: object

In [44]:
sentvecs = d['SENTVECS'].tolist()
labels = d['CLASS'].tolist()

### Build a classifier

In [45]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn import metrics
import numpy as np

In [46]:
clfkn = KNeighborsClassifier(n_neighbors=9)
clfrf = RandomForestClassifier()

In [47]:
scores = cross_val_score(clfkn, sentvecs, labels, cv =5)
print("avg : {0} std : {1}".format(np.mean(scores), np.std(scores)))

avg : 0.9059188892948484 std : 0.012107819061874497


In [48]:
scores = cross_val_score(clfrf, sentvecs, labels, cv =5)
print("avg : {0} std : {1}".format(np.mean(scores), np.std(scores)))

avg : 0.8992757972754319 std : 0.013055168550590612


#### Fit the model and save it for later use

In [49]:
#Train the model
clfkn.fit(sentvecs, label)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=9, p=2,
           weights='uniform')

In [51]:
#Train the model
clfrf.fit(sentvecs, labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [52]:
y_pred = clfkn.predict(sentvecs[0:10])
print("y_pred : {0}".format(y_pred))
y_test = labels[0:10]
print("y_test : {0}".format(y_test))

y_pred : [0 0 1 1 0 1 1 0 1 1]
y_test : [0, 0, 1, 0, 0, 1, 1, 0, 1, 1]


In [53]:
accuracy = metrics.accuracy_score(y_test, y_pred)
print("accuracy : {0}".format(accuracy))

accuracy : 0.9


In [54]:
cf = metrics.confusion_matrix(y_test, y_pred)
print("Confusion Metrics : \n{0}".format(cf))

Confusion Metrics : 
[[4 1]
 [0 5]]


In [55]:
#Save the model
from sklearn.externals import joblib

In [56]:
model_name = 'doc2vec_spam_kn.model'
joblib.dump(clfkn, model_name)

['doc2vec_spam_kn.model']

In [57]:
model_name = 'doc2vec_spam_rf.model'
joblib.dump(clfrf, model_name)

['doc2vec_spam_rf.model']

In [59]:
import pickle

In [62]:
# save the model to disk
model_name = 'doc2vec_spam_rf.sav'
pickle.dump(clfrf, open(model_name, 'wb'))

In [63]:
# load the model from disk
loaded_model = pickle.load(open(model_name, 'rb'))
result = loaded_model.score(sentvecs[0:10], labels[0:10])
print(result)

1.0


#### Bag-of-Words model comparison

In [64]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
pipeline = make_pipeline(CountVectorizer(), TfidfTransformer(), RandomForestClassifier())

In [68]:
scores = cross_val_score(pipeline, sentences, labels, cv =5)
print("avg : {0} std : {1}".format(np.mean(scores), np.std(scores)))

avg : 0.937115715851558 std : 0.012280288547147656


##### Bag-of-Words and Doc2Vec performance seems to be close. However, if we have lot more training example Doc2Vec works better