## Spam Detection

In [1]:
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
import re
import os
import random
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
def extract_words(sent):
    """Normalize a raw comment and split it into lowercase word tokens.

    Steps, in order: lowercase the text, replace HTML tags with a space,
    drop apostrophes that sit between two word characters (so "don't"
    becomes "dont"), replace every remaining non-word character with a
    space, and split on whitespace.

    Args:
        sent: The raw text to tokenize.

    Returns:
        A list of tokens; empty when the text contains no word characters.
    """
    sent = sent.lower()
    sent = re.sub(r'<[^>]+>', ' ', sent)  # strip html tags
    # The replacement must be a raw string. A plain '\1\2' is the two control
    # characters \x01\x02, which replaced the letters around the apostrophe
    # and were then wiped out by the punctuation pass ("don't" -> "do").
    # NOTE(review): if the saved Doc2Vec model was trained on tokens produced
    # by the old behavior, confirm whether it needs retraining.
    sent = re.sub(r"(\w)'(\w)", r'\1\2', sent)  # remove apostrophes
    sent = re.sub(r'\W', ' ', sent)  # remove punctuation
    # split() without arguments collapses whitespace runs and ignores leading
    # and trailing whitespace, so no separate squeeze/strip pass is needed.
    return sent.split()

In [4]:
# Sanity-check extract_words on a sample that mixes punctuation, digits and
# repeated whitespace; the printed token list should contain only words.
words = 'Chirstmas time...Santa time, 2018  + ''//''    '
print(extract_words(words))

['chirstmas', 'time', 'santa', 'time', '2018']


In [6]:
# The project root is the parent directory of the current working directory.
proj_root = os.path.split(os.getcwd())[0]

In [7]:
# Paths to the model directory and the saved Doc2Vec model inside it.
model_dir = "/".join([proj_root, "model"])
doc2vec_model = "/".join([model_dir, 'comments.d2v'])

# Load the Doc2Vec model from disk.
model = Doc2Vec.load(doc2vec_model)

In [8]:
# Smoke test: tokenize a short sample comment and infer its vector with the
# loaded Doc2Vec model.
model.infer_vector(extract_words("This is awesome..."))

array([ 0.10651165,  0.08706113, -0.03325614, -0.09894038,  0.03064333,
       -0.019945  ,  0.03584543, -0.20947409,  0.05308295,  0.07270622,
        0.00521208, -0.08042626, -0.0354125 , -0.03602903, -0.05012208,
        0.04042677,  0.00169689,  0.06674345, -0.00993093,  0.09504773,
       -0.05088811, -0.03965221, -0.11067194,  0.04205915,  0.0532117 ,
       -0.03844444,  0.08866784,  0.05826956, -0.07038295,  0.00540849,
        0.07919995, -0.09133605,  0.07977698,  0.01242394, -0.03988631,
        0.07748676, -0.05657875, -0.03706421,  0.13353555, -0.02652852,
       -0.02558131, -0.22499587,  0.02591169, -0.10143113,  0.12842202,
        0.00634575,  0.06762049,  0.07074005, -0.044052  ,  0.04046854],
      dtype=float32)

In [22]:
# Load the five labeled spam CSV files with pandas and stack them into a
# single frame. pd.concat is called without ignore_index, so the index labels
# restart at 0 for every file.
labeled_spam_subdir = 'traindata/labeled-spam-data'
labeled_spam_dir = "/".join([proj_root, labeled_spam_subdir])

labeled_spam_files = [
    "Youtube01-Psy.csv",
    "Youtube02-KatyPerry.csv",
    "Youtube03-LMFAO.csv",
    "Youtube04-Eminem.csv",
    "Youtube05-Shakira.csv",
]
data = pd.concat(
    [pd.read_csv(labeled_spam_dir + "/" + csv_name) for csv_name in labeled_spam_files]
)

In [23]:
# Preview the first two raw comments.
data['CONTENT'].head(2)

0    Huh, anyway check out this you[tube] channel: ...
1    Hey guys check out my new channel and our firs...
Name: CONTENT, dtype: object

In [25]:
# Tokenize every comment; extract_words is passed directly as the mapper.
data['WORDS'] = data['CONTENT'].map(extract_words)

In [26]:
# Preview the token lists of the first two comments.
data['WORDS'].head(2)

0    [huh, anyway, check, out, this, you, tube, cha...
1    [hey, guys, check, out, my, new, channel, and,...
Name: WORDS, dtype: object

In [27]:
# Raw comment text; passed to the bag-of-words pipeline further down.
sentences = data['CONTENT']

In [28]:
# Infer a Doc2Vec vector for each tokenized comment.
# NOTE(review): `steps` appears to be the gensim 3.x keyword; gensim 4 is
# believed to call it `epochs` -- confirm against the installed version
# before upgrading.
data['SENTVECS'] = data['WORDS'].map(lambda x: model.infer_vector(x, steps=10))

In [29]:
# Preview the inferred vectors of the first five comments.
data['SENTVECS'].head(5)

0    [0.16520166, 0.20271881, 0.1698854, 0.1000886,...
1    [0.09364169, 0.09967823, -0.21531972, 0.103081...
2    [-0.029987874, 0.02582819, 0.18829851, -0.3173...
3    [0.2704871, 0.07241474, 0.17938806, -0.2955648...
4    [0.13608585, 0.04917111, 0.07191952, 0.082964,...
Name: SENTVECS, dtype: object

In [30]:
# Convert the feature vectors and the CLASS labels to plain Python lists;
# these are the inputs for cross-validation and fitting below.
sentvecs = data['SENTVECS'].tolist()
labels = data['CLASS'].tolist()

### Build a classifier

In [31]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn import metrics
import numpy as np

  from numpy.core.umath_tests import inner1d


In [32]:
# Two candidate classifiers: k-nearest neighbours with k=9 and a random
# forest with default settings.
# NOTE(review): the forest is created without random_state, so its scores
# presumably vary from run to run -- consider fixing a seed.
clfkn = KNeighborsClassifier(n_neighbors=9)
clfrf = RandomForestClassifier()

In [33]:
# 5-fold cross-validation of the k-NN classifier on the Doc2Vec vectors.
scores = cross_val_score(clfkn, sentvecs, labels, cv=5)
print("avg : {0} std : {1}".format(scores.mean(), scores.std()))

avg : 0.8936909546427266 std : 0.030292858107524863


In [34]:
# 5-fold cross-validation of the random forest on the Doc2Vec vectors.
scores = cross_val_score(clfrf, sentvecs, labels, cv=5)
print("avg : {0} std : {1}".format(scores.mean(), scores.std()))

avg : 0.8783639542773631 std : 0.04028561450393364


#### Fit the model and save it for later use

In [36]:
# Fit the k-NN classifier on all labeled comment vectors.
clfkn.fit(sentvecs, labels)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=9, p=2,
           weights='uniform')

In [37]:
# Fit the random forest on all labeled comment vectors.
clfrf.fit(sentvecs, labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [38]:
# Spot-check the k-NN predictions on the first ten rows. These rows were part
# of the data passed to fit(), so this is not a held-out evaluation.
y_test = labels[:10]
y_pred = clfkn.predict(sentvecs[:10])
print("y_pred : {0}".format(y_pred))
print("y_test : {0}".format(y_test))

y_pred : [1 1 1 1 1 1 1 1 1 1]
y_test : [1, 1, 1, 1, 1, 1, 1, 0, 1, 1]


In [40]:
#Save the model
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone joblib package and fall back to the vendored
# copy only on older installs that lack it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [41]:
# Directory that the fitted classifiers are written to.
model_dir = "/".join([proj_root, "model"])

In [42]:
# Persist the fitted k-NN classifier with joblib.
model_name = "/".join([model_dir, 'doc2vec_spam_kn.model'])
joblib.dump(clfkn, model_name)

['/Users/dbiswas/Documents/Malabika/MS/Fall2018/social_media_mining/project/comments_analysis/model/doc2vec_spam_kn.model']

In [43]:
# Persist the fitted random forest with joblib.
model_name = "/".join([model_dir, 'doc2vec_spam_rf.model'])
joblib.dump(clfrf, model_name)

['/Users/dbiswas/Documents/Malabika/MS/Fall2018/social_media_mining/project/comments_analysis/model/doc2vec_spam_rf.model']

In [45]:
import pickle

In [46]:
# Save the random forest to disk with pickle. The context manager closes the
# file handle as soon as the dump finishes; the bare open() call previously
# left the handle open until garbage collection.
model_name = model_dir + "/" + 'doc2vec_spam_rf.sav'
with open(model_name, 'wb') as model_file:
    pickle.dump(clfrf, model_file)

#### Bag-of-Words model comparison

In [47]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
# Bag-of-words baseline: token counts -> TF-IDF weighting -> random forest,
# chained into a single estimator so it can be cross-validated on raw text.
pipeline = make_pipeline(CountVectorizer(), TfidfTransformer(), RandomForestClassifier())

In [48]:
# 5-fold cross-validation of the bag-of-words pipeline on the raw comment text.
scores = cross_val_score(pipeline, sentences, labels, cv=5)
print("avg : {0} std : {1}".format(scores.mean(), scores.std()))

avg : 0.9151312698992641 std : 0.01103018082373847


##### Bag-of-Words and Doc2Vec performance seem to be close. However, if we have a lot more training examples, Doc2Vec works better.