# Predict whether a YouTube comment is spam
### Import the data as a pandas DataFrame

In [1]:
import pandas as pd
import numpy as np

# Four per-video comment dumps form the training set; the fifth file is
# loaded separately further down as the validation set.
train_filenames = ['Youtube01-Psy.csv', 'Youtube02-KatyPerry.csv', 'Youtube03-LMFAO.csv', 'Youtube04-Eminem.csv']
valid_filename = 'Youtube05-Shakira.csv'

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# every source file restarts at 0, so labels repeat and label-based lookups
# such as train_df.loc[0] would return one row per file.
train_df = pd.concat(
    [pd.read_csv('data/' + filename, encoding='utf-8-sig') for filename in train_filenames],
    ignore_index=True,
)

train_df.CONTENT.head()

0    Huh, anyway check out this you[tube] channel: ...
1    Hey guys check out my new channel and our firs...
2               just for test I have to say murdev.com
3     me shaking my sexy ass on my channel enjoy ^_^ ﻿
4              watch?v=vtaRGgvGtWQ   Check this out .﻿
Name: CONTENT, dtype: object

### Get word indices

In [2]:
from collections import Counter
import itertools
import nltk
import re

def replace_url(phrase):
    """Replace every http(s) URL in ``phrase`` with the token ``URLURLURLURL``.

    Args:
        phrase: the raw comment text (str).

    Returns:
        A copy of ``phrase`` in which each URL match has been substituted by
        the placeholder; text without URLs is returned unchanged.
    """
    # Raw string so the backslashes reach the regex engine untouched and no
    # invalid-escape warning is raised by the Python parser.
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    # A single re.sub replaces each match in place. Collecting the matches and
    # calling str.replace per URL goes wrong when one URL is a prefix of
    # another: the shorter one is replaced inside the longer one first, leaving
    # the tail (e.g. "/b") behind.
    return re.sub(url_pattern, 'URLURLURLURL', phrase)

def format_phrase(phrase):
    """Split ``phrase`` into a list of lower-cased word tokens.

    Every character that is not a regex word character (letter, digit or
    underscore) acts as a separator, so punctuation is discarded and a lone
    underscore survives as its own token.

    NOTE: ``replace_url`` is not applied here, so URLs and domain names are
    broken into their component tokens (e.g. "murdev.com" -> murdev, com).

    Args:
        phrase: the raw comment text (str).

    Returns:
        list of str tokens, in order of appearance; empty for an empty string.
    """
    # Raw string: "\w" in a plain literal is an invalid escape sequence.
    words = re.sub(r"[^\w]", " ", phrase).split()
    # split() already strips all whitespace, so only lower-casing is needed.
    return [w.lower() for w in words]
    
def get_unique_words(phrases):
    """Return the distinct words found across all token lists.

    Args:
        phrases: iterable (e.g. a pandas Series) whose elements are lists of
            word tokens.

    Returns:
        numpy array of the unique words, sorted as ``np.unique`` sorts them.
        Empty input yields an empty array.
    """
    # Flatten in one linear pass. Summing the lists concatenates them pairwise
    # (quadratic in the total number of tokens) and, for an empty Series,
    # yields the integer 0 instead of an empty list.
    words_list = list(itertools.chain.from_iterable(phrases))
    return np.unique(np.array(words_list))

def words2idxs(phrase, vocab=None):
    """Map each word of ``phrase`` to its integer vocabulary index.

    Args:
        phrase: iterable of word tokens.
        vocab: optional word -> index mapping. When omitted, the
            notebook-level ``word2idx`` dictionary is used, which must have
            been built before the first call.

    Returns:
        list of int, one per token. Every word missing from the mapping is
        given the same index, ``len(mapping)``.
    """
    if vocab is None:
        vocab = word2idx
    unknown_idx = len(vocab)
    # dict.get does the membership test and the lookup in one step.
    return [vocab.get(word, unknown_idx) for word in phrase]

# Tokenise every training comment into a list of lower-cased words.
train_df = train_df.assign(CONTENT_WORDS=train_df["CONTENT"].apply(format_phrase))

# Vocabulary: each distinct training word is numbered by its position in the
# array returned by get_unique_words.
unique_words = get_unique_words(train_df["CONTENT_WORDS"])
word2idx = {word: position for position, word in enumerate(unique_words)}

# Widen the column display so whole comments are visible in the preview.
pd.set_option('display.max_colwidth', 300)
train_df.head()

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS,CONTENT_WORDS
0,LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU,Julius NM,2013-11-07T06:20:48,"Huh, anyway check out this you[tube] channel: kobyoshi02",1,"[huh, anyway, check, out, this, you, tube, channel, kobyoshi02]"
1,LZQPQhLyRh_C2cTtd9MvFRJedxydaVW-2sNg5Diuo4A,adam riyati,2013-11-07T12:37:15,"Hey guys check out my new channel and our first vid THIS IS US THE MONKEYS!!! I'm the monkey in the white shirt,please leave a like comment and please subscribe!!!!",1,"[hey, guys, check, out, my, new, channel, and, our, first, vid, this, is, us, the, monkeys, i, m, the, monkey, in, the, white, shirt, please, leave, a, like, comment, and, please, subscribe]"
2,LZQPQhLyRh9MSZYnf8djyk0gEF9BHDPYrrK-qCczIY8,Evgeny Murashkin,2013-11-08T17:34:21,just for test I have to say murdev.com,1,"[just, for, test, i, have, to, say, murdev, com]"
3,z13jhp0bxqncu512g22wvzkasxmvvzjaz04,ElNino Melendez,2013-11-09T08:28:43,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1,"[me, shaking, my, sexy, ass, on, my, channel, enjoy, _]"
4,z13fwbwp1oujthgqj04chlngpvzmtt3r3dw,GsMega,2013-11-10T16:05:38,watch?v=vtaRGgvGtWQ Check this out .﻿,1,"[watch, v, vtarggvgtwq, check, this, out]"


In [3]:
# Sanity check of the tokeniser: punctuation is dropped and mixed case is lower-cased.
format_phrase('Agnes Blog is totALLy awesome :) !!!!')

['agnes', 'blog', 'is', 'totally', 'awesome']

In [4]:
from keras import backend as K
from keras.preprocessing import sequence

# Encode each tokenised comment as a list of vocabulary indices.
train_df = train_df.assign(
    CONTENT_IDX=lambda frame: frame["CONTENT_WORDS"].apply(words2idxs)
)

# The longest training comment fixes the sequence length used everywhere else.
maxlen = train_df["CONTENT_IDX"].map(len).max()

# NOTE(review): the pad value -1 is later fed to an Embedding layer whose
# input_dim is len(word2idx) + 1, so -1 is not one of its regular row indices.
# How a negative index is looked up depends on the backend; consider reserving
# a dedicated non-negative index for padding. TODO confirm.
train_content_idx = sequence.pad_sequences(
    train_df["CONTENT_IDX"], maxlen=maxlen, value=-1
)
train_df.head()

Using Theano backend.
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5103)


Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS,CONTENT_WORDS,CONTENT_IDX
0,LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU,Julius NM,2013-11-07T06:20:48,"Huh, anyway check out this you[tube] channel: kobyoshi02",1,"[huh, anyway, check, out, this, you, tube, channel, kobyoshi02]","[1839, 518, 861, 2626, 3500, 3911, 3590, 850, 2070]"
1,LZQPQhLyRh_C2cTtd9MvFRJedxydaVW-2sNg5Diuo4A,adam riyati,2013-11-07T12:37:15,"Hey guys check out my new channel and our first vid THIS IS US THE MONKEYS!!! I'm the monkey in the white shirt,please leave a like comment and please subscribe!!!!",1,"[hey, guys, check, out, my, new, channel, and, our, first, vid, this, is, us, the, monkeys, i, m, the, monkey, in, the, white, shirt, please, leave, a, like, comment, and, please, subscribe]","[1779, 1697, 861, 2626, 2456, 2492, 850, 493, 2625, 1482, 3692, 3500, 1957, 3649, 3477, 2407, 1855, 2252, 3477, 2406, 1899, 3477, 3807, 3130, 2746, 2122, 364, 2144, 937, 493, 2746, 3354]"
2,LZQPQhLyRh9MSZYnf8djyk0gEF9BHDPYrrK-qCczIY8,Evgeny Murashkin,2013-11-08T17:34:21,just for test I have to say murdev.com,1,"[just, for, test, i, have, to, say, murdev, com]","[2021, 1509, 3464, 1855, 1746, 3528, 3049, 2448, 929]"
3,z13jhp0bxqncu512g22wvzkasxmvvzjaz04,ElNino Melendez,2013-11-09T08:28:43,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1,"[me, shaking, my, sexy, ass, on, my, channel, enjoy, _]","[2303, 3114, 2456, 3110, 560, 2592, 2456, 850, 1308, 348]"
4,z13fwbwp1oujthgqj04chlngpvzmtt3r3dw,GsMega,2013-11-10T16:05:38,watch?v=vtaRGgvGtWQ Check this out .﻿,1,"[watch, v, vtarggvgtwq, check, this, out]","[3762, 3667, 3733, 861, 3500, 2626]"


In [5]:
# Load the held-out video and run it through the same tokenise -> index steps
# as the training data. Words unseen in training all receive the shared
# out-of-vocabulary index produced by words2idxs.
valid_df = (
    pd.read_csv('data/' + valid_filename, encoding='utf-8-sig')
    .assign(CONTENT_WORDS=lambda frame: frame["CONTENT"].apply(format_phrase))
    .assign(CONTENT_IDX=lambda frame: frame["CONTENT_WORDS"].apply(words2idxs))
)

# Pad with the training maxlen so both matrices have the same width.
valid_content_idx = sequence.pad_sequences(
    valid_df["CONTENT_IDX"], maxlen=maxlen, value=-1
)

In [6]:
from keras.layers import MaxPooling1D, Conv1D, BatchNormalization
from keras.layers import Flatten, Dense, Embedding, Dropout, Dense, SpatialDropout1D
from keras.models import Sequential
from keras.optimizers import Adam, Adamax, RMSprop, SGD
from keras.regularizers import l2

# One embedding row more than the vocabulary: words2idxs maps every
# out-of-vocabulary word to index len(word2idx).
vocab_size = len(word2idx) + 1
vgg_model = Sequential([

    # The `dropout=0.4` keyword previously passed to Embedding was removed:
    # Keras 2 no longer supports it on this layer (it is discarded with a
    # warning), so it had no effect. The Dropout layer below regularises the
    # embedding output instead.
    Embedding(vocab_size, 32, input_length=maxlen, embeddings_regularizer=l2(1e-4)),
    Dropout(0.4),

    # Conv Block 1
    Conv1D(64, 5, padding='same', activation='relu'),
    MaxPooling1D(),
    Dropout(0.6),

    # FC layers with BatchNorm
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(100, activation='relu'),
    BatchNormalization(),
    Dropout(0.7),
    Dense(1, activation='sigmoid')])


vgg_model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

# (learning rate, epochs) for each training stage. Note that 10e-3 == 0.01,
# 10e-4 == 0.001 and 10e-5 == 0.0001.
lr_schedule = [(10e-3, 10), (10e-4, 40), (10e-5, 40)]

for stage_lr, stage_epochs in lr_schedule:
    # Update the optimizer's learning-rate variable in place. Rebinding
    # `vgg_model.optimizer.lr` to a Python float replaces the backend variable,
    # and once the training function has been built by the first fit() call
    # the new value is not picked up, so later stages would keep training at
    # the first rate. NOTE(review): behaviour of the Keras 2 / Theano stack
    # this notebook was run on - confirm on your version.
    K.set_value(vgg_model.optimizer.lr, stage_lr)
    history = vgg_model.fit(train_content_idx, train_df.CLASS,
                            validation_data=(valid_content_idx, valid_df.CLASS),
                            epochs=stage_epochs, batch_size=64)

# Display the History object of the final stage as the cell output.
history



Train on 1586 samples, validate on 370 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 1586 samples, validate on 370 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Train on 1586 samples, validate on 370 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/4

<keras.callbacks.History at 0x7fad004eb8d0>

In [7]:
# Hand-written probes: ordinary comments versus typical spam phrasing.
non_spam_texts = ['lol love it', 'awesome video', 'i love this song', 'so many views', 'she must have so much money']
spam_texts = ['check my channel', 'want to have more money contact me mail', 'email me to earn a lot of money',
              'email me to at agne@gmal.com', 'suscribe to my yt channel', 'http://salut.com']


def _encode_phrases(phrases):
    """Tokenise, index and pad raw comment strings with the training settings."""
    as_indices = [words2idxs(format_phrase(text)) for text in phrases]
    return sequence.pad_sequences(as_indices, maxlen=maxlen, value=-1)


spams = _encode_phrases(spam_texts)
non_spams = _encode_phrases(non_spam_texts)

vgg_model.predict(non_spams)

array([[ 0.00021692],
       [ 0.00022692],
       [ 0.00020616],
       [ 0.00021896],
       [ 0.00024789]], dtype=float32)

In [8]:
# Score the hand-written spam probes. The model ends in a single sigmoid unit
# trained against CLASS, so values near 1 correspond to CLASS 1 (the label
# carried by the spam-like rows shown in the training preview).
vgg_model.predict(spams)

array([[  1.00000000e+00],
       [  3.19520719e-02],
       [  8.76859762e-04],
       [  7.98932850e-01],
       [  9.99083638e-01],
       [  9.99604404e-01]], dtype=float32)