# Predict if a YouTube comment is a spam
### Import the data as pandas DataFrame

In [1]:
import os
from keras.utils.data_utils import get_file
import cPickle as pickle
import bcolz


def get_glove_dataset(dataset):
    md5sums = {'6B.50d': '8e1557d1228decbda7db6dfd81cd9909',
               '6B.100d': 'c92dbbeacde2b0384a43014885a60b2c',
               '6B.200d': 'af271b46c04b0b2e41a84d8cd806178d',
               '6B.300d': '30290210376887dcc6d0a5a6374d8255'}
    glove_path = os.path.abspath('data/glove/results')
    %mkdir -p $glove_path
    return get_file(dataset,
                    'http://files.fast.ai/models/glove/' + dataset + '.tgz',
                    cache_subdir=glove_path,
                    md5_hash=md5sums.get(dataset, None),
                    untar=True)

def load_vectors(loc):
    return (load_array(loc+'.dat'),
        pickle.load(open(loc+'_words.pkl','rb')),
        pickle.load(open(loc+'_idx.pkl','rb')))

def load_array(fname):
    return bcolz.open(fname)[:]

vecs, words, wordidx = load_vectors(get_glove_dataset('6B.50d'))

Using Theano backend.
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5103)


In [2]:
from collections import Counter
import itertools
import nltk
import re
import pandas as pd
import numpy as np

train_filenames = ['Youtube01-Psy.csv', 'Youtube02-KatyPerry.csv', 'Youtube03-LMFAO.csv', 'Youtube04-Eminem.csv']
valid_filename = 'Youtube05-Shakira.csv'

train_df = pd.concat([pd.read_csv('data/' + filename, encoding='utf-8-sig') for filename in train_filenames])

train_df.CONTENT.head()

def replace_url(phrase):
    urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', phrase)
    for url in urls:
        phrase = phrase.replace(url, 'LINKHTTP')
    return phrase

def format_phrase(phrase):
    phrase = replace_url(phrase)
    words = re.sub("[^\w]", " ",  phrase).split()
    words = nltk.word_tokenize(phrase)
    return [w.replace(" ", "").lower() for w in words]
    
def get_unique_words(phrases):
    words_list = phrases.sum()
    return np.unique(np.array(words_list))

def words2idxs(phrase):
    words_count = len(wordidx) - 1
    return [wordidx[word] if word in wordidx else words_count for word in phrase]

train_df = train_df.assign(CONTENT_WORDS=train_df.CONTENT.apply(format_phrase))

#unique_words = get_unique_words(train_df.CONTENT_WORDS)
#word2idx = {v: k for k, v in enumerate(unique_words)}

train_df.head(3)

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS,CONTENT_WORDS
0,LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU,Julius NM,2013-11-07T06:20:48,"Huh, anyway check out this you[tube] channel: ...",1,"[huh, ,, anyway, check, out, this, you, [, tub..."
1,LZQPQhLyRh_C2cTtd9MvFRJedxydaVW-2sNg5Diuo4A,adam riyati,2013-11-07T12:37:15,Hey guys check out my new channel and our firs...,1,"[hey, guys, check, out, my, new, channel, and,..."
2,LZQPQhLyRh9MSZYnf8djyk0gEF9BHDPYrrK-qCczIY8,Evgeny Murashkin,2013-11-08T17:34:21,just for test I have to say murdev.com,1,"[just, for, test, i, have, to, say, murdev.com]"


### Get words indexes

In [3]:
format_phrase('Agnes Blog is totALLy awesome :) !!!!')

['agnes', 'blog', 'is', 'totally', 'awesome', ':', ')', '!', '!', '!', '!']

In [4]:
from keras import backend as K
from keras.preprocessing import sequence

train_df = train_df.assign(CONTENT_IDX=train_df.CONTENT_WORDS.apply(words2idxs))

maxlen = train_df.CONTENT_IDX.map(len).max()
train_content_idx = sequence.pad_sequences(train_df.CONTENT_IDX, maxlen=maxlen, value=-1)

In [5]:
valid_df = pd.read_csv('data/' + valid_filename, encoding='utf-8-sig')
valid_df = valid_df.assign(CONTENT_WORDS=valid_df.CONTENT.apply(format_phrase))
valid_df = valid_df.assign(CONTENT_IDX=valid_df.CONTENT_WORDS.apply(words2idxs))
valid_content_idx = sequence.pad_sequences(valid_df.CONTENT_IDX, maxlen=maxlen, value=-1)

In [6]:
from keras.layers import MaxPooling1D, Conv1D, BatchNormalization
from keras.layers import Flatten, Dense, Embedding, Dropout, Dense, SpatialDropout1D
from keras.models import Sequential
from keras.optimizers import Adam, Adamax, RMSprop, SGD
from keras.regularizers import l2

#embeddings_regularizer=l2(1e-4),

vocab_size = len(wordidx)
vgg_model = Sequential([
    
    Embedding(vocab_size, 50, input_length=maxlen, embeddings_regularizer=l2(1e-4), 
              dropout=0.2, weights=[vecs], trainable=False),
    
    # Conv Block 1
    Conv1D(64, 5, padding='same', activation='relu'),
    #Conv1D(64, 3, padding='same', activation='relu'),
    MaxPooling1D(),
    Dropout(0.5),
    
    # Conv Block 2
    #Conv1D(128, 3, padding='same', activation='relu'),
    #Conv1D(128, 3, padding='same', activation='relu'),
    #MaxPooling1D(),
    #Dropout(0.6),
        
    # FC layers wiht BatchNorm
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.5),
    Dense(100, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(1, activation='sigmoid')])


vgg_model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
vgg_model.optimizer.lr = 10e-5
vgg_model.fit(train_content_idx, train_df.CLASS, validation_data=(valid_content_idx, valid_df.CLASS), 
              epochs=3, batch_size=64)

vgg_model.optimizer.lr = 10e-3
vgg_model.fit(train_content_idx, train_df.CLASS, validation_data=(valid_content_idx, valid_df.CLASS), 
              epochs=10, batch_size=64)
vgg_model.optimizer.lr = 10e-4
vgg_model.fit(train_content_idx, train_df.CLASS, validation_data=(valid_content_idx, valid_df.CLASS), 
              epochs=40, batch_size=64)
vgg_model.optimizer.lr = 10e-5
vgg_model.fit(train_content_idx, train_df.CLASS, validation_data=(valid_content_idx, valid_df.CLASS), 
              epochs=40, batch_size=64)



Train on 1586 samples, validate on 370 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Train on 1586 samples, validate on 370 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 1586 samples, validate on 370 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Train on 1586 samples, validate on 370 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch

<keras.callbacks.History at 0x7fd813ad6810>

In [7]:
non_spams = ['lol love it', 'awesome video', 'i love this song', 'so many views', 'she must have so much money']
spams = ['check my channel', 'want to have more money contact me mail', 'email me to earn a lot of money',
         'email me to at agne@gmal.com', 'suscribe to my yt channel', 'http://salut.com']

spams = [words2idxs(format_phrase(spam)) for spam in spams]
non_spams = [words2idxs(format_phrase(spam)) for spam in non_spams]


spams = sequence.pad_sequences(spams, maxlen=maxlen, value=-1)
non_spams = sequence.pad_sequences(non_spams, maxlen=maxlen, value=-1)

vgg_model.predict(non_spams)

array([[ 0.13810994],
       [ 0.37778828],
       [ 0.14262445],
       [ 0.06725907],
       [ 0.26699576]], dtype=float32)

In [8]:
vgg_model.predict(spams)

array([[ 0.99768353],
       [ 0.9818902 ],
       [ 0.96827245],
       [ 0.98142713],
       [ 0.97258818],
       [ 0.49787214]], dtype=float32)