# Predict whether a YouTube comment is spam
### Import the data as a pandas DataFrame

In [1]:
import pandas as pd
import numpy as np

# Four per-video CSV files make up the training set; the fifth file is kept
# aside and loaded later as the validation set.
train_filenames = [
    'Youtube01-Psy.csv',
    'Youtube02-KatyPerry.csv',
    'Youtube03-LMFAO.csv',
    'Youtube04-Eminem.csv',
]
valid_filename = 'Youtube05-Shakira.csv'

# Every file is read with its own 0-based row index, so a plain concat would
# leave duplicated row labels in the combined frame. ignore_index=True gives
# each training row a unique label.
train_df = pd.concat(
    [pd.read_csv('data/' + filename, encoding='utf-8-sig') for filename in train_filenames],
    ignore_index=True,
)

train_df.CONTENT.head(30)

0     Huh, anyway check out this you[tube] channel: ...
1     Hey guys check out my new channel and our firs...
2                just for test I have to say murdev.com
3      me shaking my sexy ass on my channel enjoy ^_^ ﻿
4               watch?v=vtaRGgvGtWQ   Check this out .﻿
5     Hey, check out my new website!! This site is a...
6                             Subscribe to my channel ﻿
7     i turned it on mute as soon is i came on i jus...
8       You should check my channel for Funny VIDEOS!!﻿
9     and u should.d check my channel and tell me wh...
10                                 Hey subscribe to me﻿
11     Once you have started reading do not stop. If...
12                 https://twitter.com/GBphotographyGB﻿
13                              subscribe like comment﻿
14    please like :D https://premium.easypromosapp.c...
15    Hello! Do you like gaming, art videos, scienti...
16                         I'm only checking the views﻿
17    http://www.ebay.com/itm/171183229277?ssPag

### Get word indexes

In [2]:
from collections import Counter
import itertools
import nltk
import re

def replace_url(phrase):
    """Replace every http(s) URL found in ``phrase`` with ``'URLURLURLURL'``.

    Args:
        phrase: The text to scan (a ``str``).

    Returns:
        A copy of ``phrase`` where each URL match is replaced by the
        placeholder token; text without URLs is returned unchanged.
    """
    # A single re.sub pass substitutes each match exactly where it was found.
    # Collecting the matches first and calling str.replace() per match breaks
    # when one URL is a prefix of another (e.g. 'http://a.com' and
    # 'http://a.com/b'): the shorter one is replaced inside the longer one and
    # leaves a stray '/b' behind.
    # Raw string: '\(' and '\)' are not valid escapes in a normal literal.
    return re.sub(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        'URLURLURLURL',
        phrase,
    )

def format_phrase(phrase):
    """Split ``phrase`` into lower-cased word tokens.

    A token is a maximal run of ``\\w`` characters (letters, digits and
    underscore); every other character, including punctuation and whitespace,
    acts as a separator. URLs are not special-cased: they are split like any
    other text.

    Args:
        phrase: The raw comment text (a ``str``).

    Returns:
        A list of lower-cased tokens, empty when ``phrase`` has no word
        characters.
    """
    # Raw string: '\w' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return [word.lower() for word in re.findall(r"\w+", phrase)]
    
def get_unique_words(phrases):
    """Return the sorted array of distinct words found in ``phrases``.

    Args:
        phrases: An iterable of token lists, e.g. a pandas Series whose values
            are lists of words.

    Returns:
        A 1-D numpy array with each distinct word once, in sorted order
        (``np.unique`` semantics). It is empty when there are no words.
    """
    # Flatten in one linear pass. Series.sum() on lists concatenates them
    # pairwise (quadratic copying) and returns the integer 0 for an empty
    # Series, which would yield a bogus one-element vocabulary.
    words_list = list(itertools.chain.from_iterable(phrases))
    return np.unique(np.array(words_list))

def words2idxs(phrase):
    """Translate a list of words into their vocabulary indexes.

    Looks each word up in the module-level ``word2idx`` mapping. Words missing
    from the mapping all receive the same index, ``len(word2idx)``, i.e. one
    past the largest known index.

    Args:
        phrase: An iterable of word tokens.

    Returns:
        A list of integer indexes, one per input word.
    """
    unknown_idx = len(word2idx)
    return [word2idx.get(token, unknown_idx) for token in phrase]

# Tokenise every training comment into a new CONTENT_WORDS column.
train_df = train_df.assign(
    CONTENT_WORDS=lambda frame: frame.CONTENT.apply(format_phrase)
)

# Vocabulary: each distinct training word is mapped to its position in the
# array returned by get_unique_words.
unique_words = get_unique_words(train_df.CONTENT_WORDS)
word2idx = dict(zip(unique_words, range(len(unique_words))))

train_df.head(10)

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS,CONTENT_WORDS
0,LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU,Julius NM,2013-11-07T06:20:48,"Huh, anyway check out this you[tube] channel: ...",1,"[huh, anyway, check, out, this, you, tube, cha..."
1,LZQPQhLyRh_C2cTtd9MvFRJedxydaVW-2sNg5Diuo4A,adam riyati,2013-11-07T12:37:15,Hey guys check out my new channel and our firs...,1,"[hey, guys, check, out, my, new, channel, and,..."
2,LZQPQhLyRh9MSZYnf8djyk0gEF9BHDPYrrK-qCczIY8,Evgeny Murashkin,2013-11-08T17:34:21,just for test I have to say murdev.com,1,"[just, for, test, i, have, to, say, murdev, com]"
3,z13jhp0bxqncu512g22wvzkasxmvvzjaz04,ElNino Melendez,2013-11-09T08:28:43,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1,"[me, shaking, my, sexy, ass, on, my, channel, ..."
4,z13fwbwp1oujthgqj04chlngpvzmtt3r3dw,GsMega,2013-11-10T16:05:38,watch?v=vtaRGgvGtWQ Check this out .﻿,1,"[watch, v, vtarggvgtwq, check, this, out]"
5,LZQPQhLyRh9-wNRtlZDM90f1k0BrdVdJyN_YsaSwfxc,Jason Haddad,2013-11-26T02:55:11,"Hey, check out my new website!! This site is a...",1,"[hey, check, out, my, new, website, this, site..."
6,z13lfzdo5vmdi1cm123te5uz2mqig1brz04,ferleck ferles,2013-11-27T21:39:24,Subscribe to my channel ﻿,1,"[subscribe, to, my, channel]"
7,z122wfnzgt30fhubn04cdn3xfx2mxzngsl40k,Bob Kanowski,2013-11-28T12:33:27,i turned it on mute as soon is i came on i jus...,0,"[i, turned, it, on, mute, as, soon, is, i, cam..."
8,z13ttt1jcraqexk2o234ghbgzxymz1zzi04,Cony,2013-11-28T16:01:47,You should check my channel for Funny VIDEOS!!﻿,1,"[you, should, check, my, channel, for, funny, ..."
9,z12avveb4xqiirsix04chxviiljryduwxg0,BeBe Burkey,2013-11-28T16:30:13,and u should.d check my channel and tell me wh...,1,"[and, u, should, d, check, my, channel, and, t..."


In [3]:
# Quick check of the tokenizer on a hand-written phrase with mixed case,
# an emoticon and repeated punctuation.
format_phrase('Agnes Blog is totALLy awesome :) !!!!')

['agnes', 'blog', 'is', 'totally', 'awesome']

In [4]:
from keras import backend as K
from keras.preprocessing import sequence

# Turn each token list into the matching list of vocabulary indexes.
train_df = train_df.assign(CONTENT_IDX=train_df.CONTENT_WORDS.apply(words2idxs))

# Length (in tokens) of the longest training comment; every sequence is padded
# to this length.
maxlen = train_df.CONTENT_IDX.map(len).max()

# Pad with len(word2idx), the index words2idxs already uses for unknown words
# and the last row of the embedding (the model allocates len(word2idx) + 1 rows).
# The previous pad value of -1 is not a valid embedding index: it only worked
# because the Theano backend wraps negative indexes around to that same last
# row, so results are unchanged there while other backends no longer receive an
# out-of-range index.
train_content_idx = sequence.pad_sequences(
    train_df.CONTENT_IDX, maxlen=maxlen, value=len(word2idx)
)
train_df.head(30)

Using Theano backend.
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5103)


Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS,CONTENT_WORDS,CONTENT_IDX
0,LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU,Julius NM,2013-11-07T06:20:48,"Huh, anyway check out this you[tube] channel: ...",1,"[huh, anyway, check, out, this, you, tube, cha...","[1839, 518, 861, 2626, 3500, 3911, 3590, 850, ..."
1,LZQPQhLyRh_C2cTtd9MvFRJedxydaVW-2sNg5Diuo4A,adam riyati,2013-11-07T12:37:15,Hey guys check out my new channel and our firs...,1,"[hey, guys, check, out, my, new, channel, and,...","[1779, 1697, 861, 2626, 2456, 2492, 850, 493, ..."
2,LZQPQhLyRh9MSZYnf8djyk0gEF9BHDPYrrK-qCczIY8,Evgeny Murashkin,2013-11-08T17:34:21,just for test I have to say murdev.com,1,"[just, for, test, i, have, to, say, murdev, com]","[2021, 1509, 3464, 1855, 1746, 3528, 3049, 244..."
3,z13jhp0bxqncu512g22wvzkasxmvvzjaz04,ElNino Melendez,2013-11-09T08:28:43,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1,"[me, shaking, my, sexy, ass, on, my, channel, ...","[2303, 3114, 2456, 3110, 560, 2592, 2456, 850,..."
4,z13fwbwp1oujthgqj04chlngpvzmtt3r3dw,GsMega,2013-11-10T16:05:38,watch?v=vtaRGgvGtWQ Check this out .﻿,1,"[watch, v, vtarggvgtwq, check, this, out]","[3762, 3667, 3733, 861, 3500, 2626]"
5,LZQPQhLyRh9-wNRtlZDM90f1k0BrdVdJyN_YsaSwfxc,Jason Haddad,2013-11-26T02:55:11,"Hey, check out my new website!! This site is a...",1,"[hey, check, out, my, new, website, this, site...","[1779, 861, 2626, 2456, 2492, 3779, 3500, 3178..."
6,z13lfzdo5vmdi1cm123te5uz2mqig1brz04,ferleck ferles,2013-11-27T21:39:24,Subscribe to my channel ﻿,1,"[subscribe, to, my, channel]","[3354, 3528, 2456, 850]"
7,z122wfnzgt30fhubn04cdn3xfx2mxzngsl40k,Bob Kanowski,2013-11-28T12:33:27,i turned it on mute as soon is i came on i jus...,0,"[i, turned, it, on, mute, as, soon, is, i, cam...","[1855, 3594, 1962, 2592, 2453, 549, 3243, 1957..."
8,z13ttt1jcraqexk2o234ghbgzxymz1zzi04,Cony,2013-11-28T16:01:47,You should check my channel for Funny VIDEOS!!﻿,1,"[you, should, check, my, channel, for, funny, ...","[3911, 3143, 861, 2456, 850, 1509, 1563, 3695]"
9,z12avveb4xqiirsix04chxviiljryduwxg0,BeBe Burkey,2013-11-28T16:30:13,and u should.d check my channel and tell me wh...,1,"[and, u, should, d, check, my, channel, and, t...","[493, 3610, 3143, 1056, 861, 2456, 850, 493, 3..."


In [5]:
# Load the held-out file and run it through the same steps as the training
# data: tokenize, then map to indexes with the vocabulary built from training.
valid_df = (
    pd.read_csv('data/' + valid_filename, encoding='utf-8-sig')
    .assign(CONTENT_WORDS=lambda frame: frame.CONTENT.apply(format_phrase))
    .assign(CONTENT_IDX=lambda frame: frame.CONTENT_WORDS.apply(words2idxs))
)

# Pad to the training maxlen so the arrays fit the model input. The pad value
# is len(word2idx), the index words2idxs uses for unknown words and the last
# embedding row. The previous value of -1 is not a valid embedding index and
# only worked because the Theano backend wraps negative indexes to that same
# row.
valid_content_idx = sequence.pad_sequences(
    valid_df.CONTENT_IDX, maxlen=maxlen, value=len(word2idx)
)
valid_df.head(10)

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS,CONTENT_WORDS,CONTENT_IDX
0,z13lgffb5w3ddx1ul22qy1wxspy5cpkz504,dharma pal,2015-05-29T02:30:18.971000,Nice song﻿,0,"[nice, song]","[2498, 3239]"
1,z123dbgb0mqjfxbtz22ucjc5jvzcv3ykj,Tiza Arellano,2015-05-29T00:14:48.748000,I love song ﻿,0,"[i, love, song]","[1855, 2219, 3239]"
2,z12quxxp2vutflkxv04cihggzt2azl34pms0k,Prìñçeśś Âliś Łøvê Dømíñø Mâđiś™ ﻿,2015-05-28T21:00:08.607000,I love song ﻿,0,"[i, love, song]","[1855, 2219, 3239]"
3,z12icv3ysqvlwth2c23eddlykyqut5z1h,Eric Gonzalez,2015-05-28T20:47:12.193000,"860,000,000 lets make it first female to reach...",0,"[860, 000, 000, lets, make, it, first, female,...","[3940, 2, 2, 2130, 2270, 1962, 1482, 1449, 352..."
4,z133stly3kete3tly22petvwdpmghrlli,Analena López,2015-05-28T17:08:29.827000,shakira is best for worldcup﻿,0,"[shakira, is, best, for, worldcup]","[3115, 1957, 664, 1509, 3940]"
5,z12myn4rltf4ejddv23mwr3piuapcbl0r,jehoiada wellington,2015-05-28T17:06:37.288000,The best world cup song ever!!!!﻿,0,"[the, best, world, cup, song, ever]","[3477, 664, 3852, 1038, 3239, 1340]"
6,z135vzqy1yrjhluew23kibopnrmqsplux,Kara Cuthbertson,2015-05-28T15:46:42.482000,I love﻿,0,"[i, love]","[1855, 2219]"
7,z12uujnj2sifvzvav04chpypvofvexpoggg,Sudheer Yadav,2015-05-28T10:28:25.133000,SEE SOME MORE SONG OPEN GOOGLE AND TYPE Shakir...,1,"[see, some, more, song, open, google, and, typ...","[3080, 3229, 2415, 3239, 2603, 1648, 493, 3608..."
8,z13lvh1qnma4d15sy23lyvqq5riafz52m,Alex John,2015-05-28T07:44:52.614000,Awesome ﻿,0,[awesome],[589]
9,z135hlk5grfwjhmym04ced0gyzrvsn5avuw0k,Nirab Valobasha,2015-05-27T21:31:38.388000,I like shakira..﻿,0,"[i, like, shakira]","[1855, 2144, 3115]"


In [6]:
from keras.layers import MaxPooling1D, Conv1D, BatchNormalization
from keras.layers import Flatten, Dense, Embedding, Dropout, Dense, SpatialDropout1D
from keras.models import Sequential
from keras.optimizers import Adam, Adamax, RMSprop, SGD
from keras.regularizers import l2



# One embedding row per known word plus one extra (last) row, which is the
# index words2idxs assigns to words missing from the vocabulary.
vocab_size = len(word2idx) + 1

vgg_model = Sequential([
    # `dropout=` is not an Embedding argument in Keras 2 (2.0.x drops it with a
    # warning, later releases reject it), so it is removed here; the Dropout
    # layer right below keeps regularising the embedded sequence.
    Embedding(vocab_size, 32, input_length=maxlen, embeddings_regularizer=l2(1e-4)),
    Dropout(0.4),

    # Convolutional block
    Conv1D(64, 5, padding='same', activation='relu'),
    MaxPooling1D(),
    Dropout(0.6),

    # Fully connected layers, with BatchNorm after the second one
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(100, activation='relu'),
    BatchNormalization(),
    Dropout(0.7),
    Dense(1, activation='sigmoid'),
])

vgg_model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

# (learning rate, epochs) for each training stage. The values are the same as
# the former 10e-3 / 10e-4 / 10e-5 literals, written in plain form.
LR_SCHEDULE = [(1e-2, 10), (1e-3, 40), (1e-4, 40)]

for learning_rate, n_epochs in LR_SCHEDULE:
    # Assigning a float to `optimizer.lr` only rebinds the Python attribute:
    # once the first fit() has built the training function, later assignments
    # are never seen and every stage trains at the first rate. K.set_value
    # updates the backend variable in place so each stage really uses its rate.
    K.set_value(vgg_model.optimizer.lr, learning_rate)
    history = vgg_model.fit(
        train_content_idx, train_df.CLASS,
        validation_data=(valid_content_idx, valid_df.CLASS),
        epochs=n_epochs, batch_size=64,
    )



Train on 1586 samples, validate on 370 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 1586 samples, validate on 370 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Train on 1586 samples, validate on 370 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/4

<keras.callbacks.History at 0x7f3a248ac310>

In [7]:
# Hand-written phrases for a quick check of the trained model.
non_spam_texts = ['lol love it', 'awesome video', 'i love this song', 'so many views', 'she must have so much money']
spam_texts = ['check my channel', 'want to have more money contact me mail', 'email me to earn a lot of money',
              'email me to at agne@gmal.com', 'suscribe to my yt channel', 'http://salut.com']

# Same preprocessing as the training data: tokenize, map to vocabulary indexes,
# pad to maxlen. The raw strings keep their own names so that `spams` and
# `non_spams` only ever hold the padded index arrays.
# The pad value is len(word2idx), the index words2idxs uses for unknown words
# and the last embedding row. The previous value of -1 is not a valid embedding
# index and only worked because the Theano backend wraps negative indexes to
# that same row.
spams = sequence.pad_sequences(
    [words2idxs(format_phrase(text)) for text in spam_texts],
    maxlen=maxlen, value=len(word2idx),
)
non_spams = sequence.pad_sequences(
    [words2idxs(format_phrase(text)) for text in non_spam_texts],
    maxlen=maxlen, value=len(word2idx),
)

vgg_model.predict(non_spams)

array([[  2.02950192e-04],
       [  2.21498092e-04],
       [  3.87404543e-06],
       [  3.24840978e-04],
       [  7.98035762e-04]], dtype=float32)

In [8]:
# Model outputs (sigmoid of the last layer) for the hand-written spam phrases.
vgg_model.predict(spams)

array([[ 0.99865913],
       [ 0.99848181],
       [ 0.99802542],
       [ 0.99839884],
       [ 0.99853694],
       [ 0.99855644]], dtype=float32)