In [92]:
# Loading the cleaned data set
%store -r data

# Imports

import pandas as pd
from sklearn.model_selection import train_test_split

# Parallelisation
from joblib import parallel_backend

# Data
import numpy as np
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# Model Imports
from keras.layers import Dense, Input, GlobalMaxPooling1D, Conv1D, Embedding, Flatten, \
                            Dropout, Activation
from keras.models import Model
from keras.models import Sequential
from keras.initializers import Constant

# Convolutional Neural Network (CNN)

In [93]:
data = data.sample(n = 1000000)

# Training and testing split
df_train, df_test = train_test_split(data, test_size=0.1)

### Data Pre-Processing 

In [None]:
# Tokenizing and Sequence Padding for training data

MAX_SEQUENCE_LENGTH = 50
MAX_NUM_WORDS = 10000
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(df_train['text'])
sequences = tokenizer.texts_to_sequences(df_train['text'])

features = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(df_train['target']))

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
print('Shape of data tensor:', features.shape)
print('Shape of label tensor:', labels.shape)

# Training and validation split

VALIDATION_SPLIT = 0.11111
indices  = np.arange(features.shape[0])
np.random.shuffle(indices)
features = features[indices]
labels   = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * features.shape[0])
x_train = features[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val   = features[-num_validation_samples:]
y_val   = labels[-num_validation_samples:]

### Creating Embedding Matrix

In [97]:
# Generating embedding index from Glove text file

embeddings_index = {}

# opening the downloaded glove embeddings file
f = open('glove.6B.300d.txt')
for line in f:
    # For each line file, the words are split and stored in a list
    values = line.split()
    word   = value = values[0]
    coefs  = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
    
f.close()
print('Found %s word vectors.' %len(embeddings_index))

Found 400000 word vectors.


In [98]:
# Creating embedding matrix

EMBEDDING_DIM = 300
embedding_matrix = np.zeros((MAX_NUM_WORDS, EMBEDDING_DIM))

for word, i in tokenizer.word_index.items():
    if i >= MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

### Building the model

In [None]:
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)

embedding_layer = Embedding(MAX_NUM_WORDS,EMBEDDING_DIM,input_length=MAX_SEQUENCE_LENGTH,trainable=True, weights = [embedding_matrix])

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

x = Conv1D(64, 3, activation='relu')(embedded_sequences)
x = Conv1D(32, 3, activation='relu')(x)
x = Conv1D(16, 3, activation='relu')(x)

x = GlobalMaxPooling1D()(x)

x = Flatten()(x)

x = Dropout(0.2)(x)
x = Dense(180, activation='relu')(x)
x = Dropout(0.2)(x)
x = Dense(2, activation='relu')(x)


output = Activation('sigmoid')(x)

model = Model(inputs=[sequence_input], outputs=[output])
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['acc'])

model.summary()

### Training the model

In [None]:
with parallel_backend('threading'):
    history = model.fit(x_train, y_train,batch_size=32, epochs=5, validation_data=(x_val, y_val))

### Evaluating the model

In [67]:
tokenizer.fit_on_texts(df_test['text'])
sequences = tokenizer.texts_to_sequences(df_test['text'])

features_test = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels_test = to_categorical(np.asarray(df_test['target']))

In [69]:
# evaluate the model
score = model.evaluate(features_test, labels_test, verbose=0)
print("%s: %.2f%%" % (model.metrics_names[1], score[1]*100))

acc: 59.38%


### Plot

In [None]:
import matplotlib.pyplot as plt

plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])

plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train','test'], loc = 'upper left')
plt.show()

plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])

plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train','test'], loc = 'upper left')
plt.show()

### Exporting Model

In [None]:
model.save("group144_pretrained_model", save_format="h5")