# Multi-Convolutional Net for Sentiment Classification

This Conv Net performs sentiment analysis on the comments of the Google toxicity dataset, predicting six toxicity labels for each comment.

In [1]:
import os
# Select the PlaidML backend for Keras through the environment.
# NOTE(review): this assignment is placed before `import keras` on purpose --
# presumably Keras resolves its backend while being imported; keep the order.
# The recorded output of this cell shows the backend could not be imported, so
# the plaidml package must be available in the kernel's environment.
os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"

import keras
# Print the installed Keras version for reference.
print(keras.__version__)

ValueError: Unable to import backend : plaidml.keras.backend

In [None]:
from keras.datasets import imdb
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import text
from keras.models import Model, Sequential
from keras.layers import Input, concatenate
from keras.layers import Dense, Flatten, Dropout, Activation, BatchNormalization
from keras.layers import Embedding, Conv1D, SpatialDropout1D, GlobalMaxPool1D, LSTM
from keras.layers.wrappers import Bidirectional
from keras.callbacks import ModelCheckpoint, EarlyStopping

from keras_contrib.layers.advanced_activations import SineReLU

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

import numpy as np

import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

#### Set Hyperparameters

In [None]:
# --- Output location -------------------------------------------------------
# Directory that receives the model checkpoint file.
output_dir = 'model_output/multi-conv'

# Not referenced by any of the cells shown in this notebook.
e_param = 0.05

# --- Task ------------------------------------------------------------------
# Number of units in the output layer (one per label column).
n_classes = 6

# --- Training --------------------------------------------------------------
epochs = 3          # passed to model.fit
patience = 1        # passed to EarlyStopping
batch_size = 128    # passed to model.fit
test_split = 0.3    # fraction handed to train_test_split as test_size

# --- Vector-space embedding ------------------------------------------------
n_dim = 128                # embedding output dimension
n_unique_words = 20000     # tokenizer num_words / embedding input dimension
max_review_length = 400    # every sequence is padded/truncated to this length
pad_type = 'pre'           # padding mode given to pad_sequences
trunc_type = 'pre'         # truncating mode given to pad_sequences

# --- Convolutional branches (filters / kernel size per branch) --------------
n_conv_1, k_conv_1 = 32, 2
n_conv_2, k_conv_2 = 64, 4
n_conv_3, k_conv_3 = 128, 5
drop_conv = 0.5            # dropout rate applied in each branch

# --- Dense head ------------------------------------------------------------
n_dense = 512              # units in each of the two dense layers
dropout = 0.3              # dropout rate after each dense layer

#### Load Data

In [None]:
# Locations of the labelled training comments and the unlabelled test comments.
train_csv_path = 'kaggle/datasets/toxicity/train.csv'
test_csv_path = 'kaggle/datasets/toxicity/test.csv'

# Read both CSV files into DataFrames.
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

#### Preprocess Data

In [None]:
# Display the (rows, columns) shape of the test DataFrame.
test_df.shape

In [None]:
# Raw comment strings as arrays; missing comments become the placeholder "_".
train_sentences_series = train_df['comment_text'].fillna("_").values
test_sentences_series = test_df['comment_text'].fillna("_").values

# Tokenize the training data. The tokenizer is fitted on the training comments
# only, with num_words=n_unique_words limiting the vocabulary it emits.
tokenizer = text.Tokenizer(num_words=n_unique_words)
tokenizer.fit_on_texts(list(train_sentences_series))
train_tokenized_sentences = tokenizer.texts_to_sequences(train_sentences_series)

# Tokenize the test data with the same fitted tokenizer so that word indices
# agree between the two sets.
test_tokenized_sentences = tokenizer.texts_to_sequences(test_sentences_series)

# Label columns: toxic,severe_toxic,obscene,threat,insult,identity_hate
classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y_train = train_df[classes].values

# Pad/truncate every sequence to max_review_length, filling with index 0.
X_train = pad_sequences(train_tokenized_sentences, maxlen=max_review_length, padding=pad_type, truncating=trunc_type, value=0)
X_test_sub = pad_sequences(test_tokenized_sentences, maxlen=max_review_length, padding=pad_type, truncating=trunc_type, value=0)

# Hold out a validation split from the labelled data. The seed is fixed so the
# split -- and therefore checkpoint selection and the reported validation
# score -- is reproducible from one run of the notebook to the next.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=test_split, random_state=42)

#### Design Deep Net Architecture

In [None]:
# Values passed as the first argument of SineReLU: one for the activations in
# the convolutional branches, one for the activations after the dense layers.
cnn_epsilon = 0.0025
dense_epsilon = 0.0083

# Each sample is a sequence of max_review_length integer word indices.
# int16 is sufficient here because n_unique_words (20000) is below 32767.
input_layer = Input(shape=(max_review_length,), dtype='int16', name='input')

# One embedding shared by all three convolutional branches.
embedding_layer = Embedding(n_unique_words, n_dim, input_length=max_review_length, name='embedding_1')(input_layer)

# Branch 1: Conv1D -> SineReLU -> global max pool -> dropout -> batch norm.
conv_1 = Conv1D(n_conv_1, k_conv_1, name='conv_1')(embedding_layer)
act1 = SineReLU(cnn_epsilon)(conv_1)

maxp_1 = GlobalMaxPool1D(name='maxp_1')(act1)
drop_1 = Dropout(drop_conv)(maxp_1)
norm_1 = BatchNormalization()(drop_1)

# Branch 2: same pipeline with n_conv_2 filters and kernel size k_conv_2.
conv_2 = Conv1D(n_conv_2, k_conv_2, name='conv_2')(embedding_layer)
act2 = SineReLU(cnn_epsilon)(conv_2)

maxp_2 = GlobalMaxPool1D(name='maxp_2')(act2)
drop_2 = Dropout(drop_conv)(maxp_2)
norm_2 = BatchNormalization()(drop_2)

# Branch 3: same pipeline with n_conv_3 filters and kernel size k_conv_3.
conv_3 = Conv1D(n_conv_3, k_conv_3, name='conv_3')(embedding_layer)
act3 = SineReLU(cnn_epsilon)(conv_3)

maxp_3 = GlobalMaxPool1D(name='maxp_3')(act3)
drop_3 = Dropout(drop_conv)(maxp_3)
norm_3 = BatchNormalization()(drop_3)

# Merge the pooled features of the three branches into a single vector.
concat = concatenate([norm_1, norm_2, norm_3])

# Dense head: two Dense -> SineReLU -> Dropout stages.
dense_layer_1 = Dense(n_dense, name='dense_1')(concat)
act4 = SineReLU(dense_epsilon)(dense_layer_1)
drop_dense_layer_1 = Dropout(dropout, name='drop_dense_1')(act4)

dense_layer_2 = Dense(n_dense, name='dense_2')(drop_dense_layer_1)
act5 = SineReLU(dense_epsilon)(dense_layer_2)

drop_dense_layer_2 = Dropout(dropout, name='drop_dense_2')(act5)

# One independent sigmoid unit per label, so several labels can be active for
# the same comment.
predictions = Dense(n_classes, activation='sigmoid', name='output')(drop_dense_layer_2)

model = Model(input_layer, predictions)


In [None]:
# Print the layer-by-layer summary of the model built above.
model.summary()

#### Configure the Model

In [None]:
# Binary cross-entropy matches the per-label sigmoid outputs.
# The metric is requested as 'acc' (not 'accuracy') so that its validation key
# is 'val_acc' -- the quantity the ModelCheckpoint and EarlyStopping callbacks
# monitor -- both on older Keras releases and on releases that report metrics
# under exactly the name the user supplied.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

In [None]:
# File inside output_dir that receives the best model seen during training.
checkpoint_path = output_dir+'/weights-multicnn-toxicity_new.hdf5'

# Overwrite the checkpoint only when validation accuracy improves.
modelCheckpoint = ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='val_acc',
    mode='max',
    save_best_only=True,
)

# Stop training once validation accuracy has failed to improve for
# `patience` epochs.
earlyStopping = EarlyStopping(
    monitor='val_acc',
    mode='max',
    patience=patience,
)

In [None]:
# Make sure the checkpoint directory exists before training starts.
# exist_ok=True makes this a no-op when the directory is already there, without
# a separate existence check that could race with another process, and still
# raises if the path exists but is not a directory.
os.makedirs(output_dir, exist_ok=True)

#### Train the Model

In [None]:
# Train on the training split, evaluating on the validation split after every
# epoch; the callbacks handle checkpointing and early stopping.
model.fit(
    X_train,
    y_train,
    validation_data=(X_valid, y_valid),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[modelCheckpoint, earlyStopping],
    verbose=1,
)

#### Evaluate

In [None]:
# Reload the checkpoint written during training so that evaluation uses the
# best saved model rather than the state after the last epoch.
# The network contains the SineReLU layer from keras_contrib; handing the class
# to load_model explicitly means deserialisation does not depend on that layer
# having been registered globally.
model = keras.models.load_model(
    output_dir + '/weights-multicnn-toxicity_new.hdf5',
    custom_objects={'SineReLU': SineReLU},
)

In [None]:
# Predict on the padded test-set sequences; the result is later written into
# the submission file, so its row order must not be changed.
y_hat = model.predict(X_test_sub)

In [None]:
# Histogram of the predicted values, with a vertical marker at 0.5.
plt.hist(y_hat)
# Assign to `_` so the returned line object is not echoed as cell output.
_ = plt.axvline(x=0.5, color='orange')

In [None]:
# ROC AUC has to be computed from predictions made on the same rows as the
# labels, so predict on the held-out validation split and score those
# predictions against y_valid.
# y_hat (the test-set predictions) is deliberately left untouched here: the
# test set has no labels in this notebook to score against, and its row order
# must stay aligned with the submission file written further down.
y_valid_hat = model.predict(X_valid)
pct_auc = roc_auc_score(y_valid, y_valid_hat) * 100

In [None]:
# Show the AUC percentage rounded to two decimal places.
'{:0.2f}'.format(pct_auc)

In [None]:
# Inspect the first row of the test-set predictions.
y_hat[0]

In [None]:
# Template provided with the dataset and the destination of the filled-in copy.
submission_template_path = "kaggle/datasets/toxicity/sample_submission.csv"
submission_output_path = "kaggle/datasets/toxicity/submission_multicnn_relus.csv"

sample_submission = pd.read_csv(submission_template_path)

# Shape lookup kept from the original cell; as a bare expression in the middle
# of a cell it has no visible effect.
sample_submission.shape

# Overwrite the label columns with the model's test-set predictions and save
# the result without the DataFrame index.
sample_submission[classes] = y_hat
sample_submission.to_csv(submission_output_path, index=False)