In [1]:
import pandas as pd
import os
# Select the Keras backend through the environment; this assignment sits ahead of
# every `keras` import below so the choice is already in place when keras loads.
os.environ['KERAS_BACKEND']='theano'

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

# NOTE(review): `Embedding` is imported twice (here and two lines below), and
# `Merge` / `to_categorical` are not referenced in the cells visible here.
# `Merge` is presumably unavailable in newer Keras 2.x releases — confirm before upgrading.
from keras.layers import Embedding, concatenate
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout
from keras.models import Model

from keras.callbacks import ModelCheckpoint

import numpy as np
import pickle

Using Theano backend.


In [2]:
# Load the labelled relevance dataset; the displayed frame has the columns
# query, title, description, id and Label.
data = pd.read_csv('../data/youtube_relevance_labelling.csv')

In [3]:
# Preview the first rows only; rendering the whole frame bloats the notebook.
data.head(10)

Unnamed: 0,query,title,description,id,Label
0,Algebra videos,Algebra Introduction - Basic Overview - Online...,This math video tutorial provides a basic over...,grnP3mduZkM,1
1,Algebra videos,Algebra Basics: What Is Algebra? - Math Antics,This video gives an overview of Algebra and in...,NybHckSEQBI,1
2,Algebra videos,Quick Math Review to Prep for Algebra 1,This is 1 of 4 videos I custom made for an edu...,6KtQrCP01OE,1
3,Algebra videos,Algebra - Completing the square,Hi Algebrinos! As we progress with our problem...,DJMH2F3GuIc,0
4,Algebra videos,Algebra Basics: Solving Basic Equations Part 1...,This video shows students how to solve simple ...,l3XzepN03KQ,0
5,Algebra videos,Algebra Basics: What Are Polynomials? - Math A...,This video introduces students to polynomials ...,ffLLmV4mZwU,0
6,Algebra videos,Basic Algebra Lessons for Beginners 😊 (P1) -- ...,Master Algebra the easiest way without even Le...,V3dFHt9p5W8,1
7,Algebra videos,Algebra Basics: The Distributive Property - Ma...,This video introduces the Distributive Propert...,v-6MShC82ow,0
8,Algebra videos,Algebra 25 - Linear Equations in the Real World,Linear equations can be used to solve many typ...,8eXb-6wQUks,1
9,What are some SAT math hacks?,SAT Math Hacks: Tips and Tricks to Destroy the...,Hack your way to a better score on the Math se...,Rqmtjl_c-uo,1


In [4]:
# Replace missing queries with empty strings so the tokenizer only ever sees text.
# `fillna` catches every NaN/None, whereas `x is np.nan` matches a single object identity.
data["query"] = data["query"].fillna('')

In [5]:
# Replace missing titles with empty strings so the tokenizer only ever sees text.
# `fillna` catches every NaN/None, whereas `x is np.nan` matches a single object identity.
data["title"] = data["title"].fillna('')

In [6]:
# Replace missing descriptions with empty strings so the tokenizer only ever sees text.
# `fillna` catches every NaN/None, whereas `x is np.nan` matches a single object identity.
data["description"] = data["description"].fillna('')

In [7]:
# Vocabulary cap handed to the Tokenizer.
MAX_NB_WORDS = 20000
# Width of each word vector; matches the 100-d GloVe file loaded further down.
EMBEDDING_DIM = 100
# Every query/title/description is padded or truncated to this many token ids,
# and it is also the length of each model input.
MAX_SEQUENCE_LENGTH = 100

In [8]:
# Tokenizer shared by all three text fields. `num_words` is the Keras 2 name of the
# vocabulary cap (the Keras 1 spelling `nb_words` is deprecated); it limits how many
# of the most frequent words are kept when texts are converted to id sequences.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)



In [9]:
# Stack the three text columns end-to-end into one Series so that a single
# vocabulary can be fitted over queries, titles and descriptions together.
texts = pd.concat([data[column] for column in ('query', 'title', 'description')])

In [10]:
# Sanity check: three text columns stacked, so the length is 3x the number of rows.
texts.shape

(1698,)

In [11]:
# Build the vocabulary from the combined queries, titles and descriptions.
tokenizer.fit_on_texts(texts)

In [12]:
# Convert each query string into a list of integer token ids.
query_sequences = tokenizer.texts_to_sequences(data['query'])

In [13]:
# Convert each title string into a list of integer token ids.
title_sequences = tokenizer.texts_to_sequences(data['title'])

In [14]:
# Convert each description string into a list of integer token ids.
desc_sequences = tokenizer.texts_to_sequences(data['description'])

In [15]:
# Force every query to exactly MAX_SEQUENCE_LENGTH ids so they stack into one 2-D array.
# NOTE(review): presumably Keras pads/truncates at the front by default — confirm.
query_data = pad_sequences(query_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [16]:
# Force every title to exactly MAX_SEQUENCE_LENGTH ids so they stack into one 2-D array.
title_data = pad_sequences(title_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [17]:
# Force every description to exactly MAX_SEQUENCE_LENGTH ids so they stack into one 2-D array.
desc_data = pad_sequences(desc_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [18]:
# Training target: the `Label` column as a NumPy array (0/1 in the rows displayed above).
labels = np.array(data['Label'])

In [19]:
# Token -> integer id mapping learned by the tokenizer; it is iterated below to
# fill the embedding matrices row by row.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 1965 unique tokens.


In [20]:
# Parse the pre-trained GloVe file into a {word: float32 vector} lookup.
# Each line is split on whitespace: the first field is the word, the rest its coefficients.
embeddings_index = {}
# The context manager closes the file even if a line fails to parse, and the
# explicit encoding avoids decoding the file with a platform-dependent default codec.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [21]:
# Initial weights for the first embedding layer: one row per token id, plus row 0,
# which no entry of `word_index` writes to in this loop.
# Rows start as uniform random values from np.random.random.
embedding_matrix_1 = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Overwrite with the pre-trained vector; words missing from the embedding
        # index keep their random initial row (they are NOT zeroed).
        embedding_matrix_1[i] = embedding_vector

In [22]:
# Initial weights for the second embedding layer: one row per token id, plus row 0,
# which no entry of `word_index` writes to in this loop.
# Rows start as uniform random values from np.random.random.
embedding_matrix_2 = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Overwrite with the pre-trained vector; words missing from the embedding
        # index keep their random initial row (they are NOT zeroed).
        embedding_matrix_2[i] = embedding_vector

In [23]:
# Initial weights for the third embedding layer: one row per token id, plus row 0,
# which no entry of `word_index` writes to in this loop.
# Rows start as uniform random values from np.random.random.
embedding_matrix_3 = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Overwrite with the pre-trained vector; words missing from the embedding
        # index keep their random initial row (they are NOT zeroed).
        embedding_matrix_3[i] = embedding_vector

In [24]:
# Branch 1 of 3: receives `query_data` when the model is fitted (first model input).
# The embedding starts from embedding_matrix_1 and is fine-tuned (trainable=True).
embedding_layer_1 = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix_1],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

# Integer token ids -> embeddings -> Conv1D (128 filters, window 5, ReLU)
# -> max-pool over windows of 5 -> 20% dropout -> flat feature vector.
# NOTE(review): with Keras' default padding the time axis presumably shrinks
# 100 -> 96 -> 19 — confirm with model.summary().
sequence_input_1 = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_1 = embedding_layer_1(sequence_input_1)
l_cov1_1= Conv1D(128, 5, activation='relu')(embedded_sequences_1)
l_pool1_1 = MaxPooling1D(5)(l_cov1_1)
l_dropout1_1 = Dropout(0.2)(l_pool1_1)
l_flat_1 = Flatten()(l_dropout1_1)

In [25]:
# Branch 2 of 3: receives `title_data` when the model is fitted (second model input).
# The embedding starts from embedding_matrix_2 and is fine-tuned (trainable=True).
embedding_layer_2 = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix_2],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

# Integer token ids -> embeddings -> Conv1D (128 filters, window 5, ReLU)
# -> max-pool over windows of 5 -> 20% dropout -> flat feature vector.
# NOTE(review): with Keras' default padding the time axis presumably shrinks
# 100 -> 96 -> 19 — confirm with model.summary().
sequence_input_2 = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_2 = embedding_layer_2(sequence_input_2)
l_cov1_2= Conv1D(128, 5, activation='relu')(embedded_sequences_2)
l_pool1_2 = MaxPooling1D(5)(l_cov1_2)
l_dropout1_2 = Dropout(0.2)(l_pool1_2)
l_flat_2 = Flatten()(l_dropout1_2)

In [26]:
# Branch 3 of 3: receives `desc_data` when the model is fitted (third model input).
# The embedding starts from embedding_matrix_3 and is fine-tuned (trainable=True).
embedding_layer_3 = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix_3],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

# Integer token ids -> embeddings -> Conv1D (128 filters, window 5, ReLU)
# -> max-pool over windows of 5 -> 20% dropout -> flat feature vector.
# NOTE(review): with Keras' default padding the time axis presumably shrinks
# 100 -> 96 -> 19 — confirm with model.summary().
sequence_input_3 = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_3 = embedding_layer_3(sequence_input_3)
l_cov1_3 = Conv1D(128, 5, activation='relu')(embedded_sequences_3)
l_pool1_3 = MaxPooling1D(5)(l_cov1_3)
l_dropout1_3 = Dropout(0.2)(l_pool1_3)
l_flat_3 = Flatten()(l_dropout1_3)

In [27]:
# Join the flattened features of the three branches into a single vector.
x = concatenate([l_flat_1, l_flat_2, l_flat_3])

In [28]:
# 64-unit ReLU layer applied to the merged branch features.
x = Dense(64, activation='relu')(x)

In [29]:
# Single sigmoid unit: one score in (0, 1) per example, trained against `labels`.
main_output = Dense(1, activation='sigmoid', name='main_output')(x)

In [30]:
# Three-input, one-output model. The input order here fixes the order of the
# arrays passed to `fit`: query, title, description.
model = Model(inputs=[sequence_input_1, sequence_input_2, sequence_input_3], outputs=[main_output])

In [31]:
# Write the model to best_model.h5 only when validation accuracy reaches a new maximum.
# 'val_acc' corresponds to the 'acc' metric name requested in `compile`.
checkpoint = ModelCheckpoint('best_model.h5', monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

In [32]:
# Binary cross-entropy pairs with the single sigmoid output; accuracy is tracked
# under the name 'acc', which the checkpoint callback monitors as 'val_acc'.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

In [33]:
# Train for 11 epochs in mini-batches of 5, holding out 10% of the rows for validation.
# NOTE(review): Keras presumably takes `validation_split` from the tail of the arrays
# before shuffling, and the displayed rows are grouped by query — the validation set
# may therefore cover only the last few queries. Confirm, or shuffle the rows first.
model.fit([query_data, title_data, desc_data], [labels], epochs=11, batch_size=5, callbacks=callbacks_list,
          validation_split=0.1)

Train on 509 samples, validate on 57 samples
Epoch 1/11
Epoch 2/11
Epoch 3/11
Epoch 4/11
Epoch 5/11
Epoch 6/11
Epoch 7/11
Epoch 8/11
Epoch 9/11
Epoch 10/11
Epoch 11/11


<keras.callbacks.History at 0x11aed2d30>

In [56]:
# Sanity check: one label per row (should equal train + validation sample counts).
labels.shape

(566,)

In [34]:
# Persist the fitted tokenizer so the same word -> id mapping can be reloaded later.
# The context manager guarantees the file is flushed and closed; the bare
# `open(...)` previously passed to `pickle.dump` was never closed explicitly.
with open('tokenizer.p', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)