### Kaggle Toxic Comment Classification Challenge - LSTM Classifier

The competition can be found at the following URL:
https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge

#### 1. Preparing the data

In [1]:
import pandas as pd
import os

# Project root. Defaults to the original author's location, but can be redirected with
# the TOXIC_COMMENT_DIR environment variable so the notebook also runs on other machines.
PROJECT_DIR = os.environ.get('TOXIC_COMMENT_DIR', 'D://Analytics/Kaggle/toxic_comment_challenge/')
os.chdir(PROJECT_DIR)
print(os.getcwd())

# dev  - labelled training comments
# val  - unlabelled competition test comments (scored for the submission)
# sub1 - a previously generated baseline submission, blended with this model's output at the end
dev = pd.read_csv('data/raw/train.csv')
val = pd.read_csv('data/raw/test.csv')
sub1 = pd.read_csv('data/submissions/baseline.csv')
print(dev.shape)
print(val.shape)

D:\Analytics\Kaggle\toxic_comment_challenge
(95851, 8)
(226998, 2)


In [2]:
# Preview the first rows of the training frame (id, comment text and the label columns)
dev.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0
1,27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0
2,54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0


In [3]:
# Identifying the target columns (every column other than the id and the raw text)
y_cols = [c for c in dev.columns if c not in ['id','comment_text']]
# DataFrame.as_matrix() is deprecated and removed in pandas 1.0; .values returns the
# same ndarray and works on both old and new pandas versions.
y_vals = dev[y_cols].values

# Flagging the validation ids
vid = val['id'].values

# Concatenating the dev and val datasets (dev rows first, so rows [:nrows] are dev
# and rows [nrows:] are val); missing comments are replaced by the literal "unknown"
df_txt = pd.concat([dev['comment_text'], val['comment_text']], axis=0).fillna("unknown")

# Number of rows in the dev sample
nrows = dev.shape[0]

In [4]:
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Get a distinct list of stop words (stored as a set; clean_doc tests membership against it)
stop_words = set(stopwords.words('english'))

# Initialize a stemmer (passed to clean_doc, where the stemming step is currently commented out)
stemmer = PorterStemmer()

# Translation table that deletes every ASCII punctuation character. Built once at
# definition time instead of on every call, because it never changes.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


# Function that turns a doc into clean tokens
def clean_doc(doc, stemmer, stop_words):
    """Turn a raw document into a list of cleaned, lowercase tokens.

    Steps, applied to each whitespace-separated token:
      1. delete ASCII punctuation and lowercase the token;
      2. drop it if it is a stop word;
      3. drop it if fewer than two characters remain.

    Args:
        doc: The raw text of one document.
        stemmer: Accepted for interface compatibility; stemming is currently
            disabled, so this argument is not used.
        stop_words: Collection of stop words to remove (a set is fastest).

    Returns:
        The list of surviving tokens, in their original order.
    """
    tokens = []
    # Split into individual tokens by white space
    for raw in doc.split():
        # Remove punctuation and set to lowercase
        word = raw.translate(_PUNCT_TABLE).lower()
        # Removing all known stop words. Stop words that contain an apostrophe
        # (e.g. "don't") can never equal the punctuation-free form ("dont"), so the
        # token is also checked with only its surrounding punctuation trimmed.
        if word in stop_words or raw.lower().strip(string.punctuation) in stop_words:
            continue
        # Remove tokens that aren't at least two characters in length
        if len(word) > 1:
            tokens.append(word)
    # NOTE: the "entirely alphabetical" filter and the stemming step are intentionally
    # disabled, so digits are kept and tokens are not stemmed.
    return tokens

In [7]:
from collections import Counter

# Vocabulary: cleaned token -> number of occurrences across every comment in df_txt
# (df_txt holds the dev comments followed by the val comments)
vocab = Counter(
    token
    for comment in df_txt
    for token in clean_doc(comment, stemmer, stop_words)
)

In [8]:
# One string per comment, in df_txt order: the cleaned tokens that are present in
# `vocab`, concatenated with single spaces
lines = [
    ' '.join(
        token
        for token in clean_doc(comment, stemmer, stop_words)
        if token in vocab
    )
    for comment in df_txt
]

In [11]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

MAX_WORDS = 50000  # vocabulary cap handed to the tokenizer (matches the Embedding input size used below)
MAX_LEN = 500      # every sequence is padded / truncated to this many token ids

# Map each cleaned comment to a fixed-length row of integer token ids
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)
data = pad_sequences(sequences, maxlen=MAX_LEN)
print(data.shape)

(322849, 500)


In [None]:
# Show the vocabulary size and only the first few word -> index pairs; printing the
# whole mapping emits one entry per distinct token and floods the notebook output.
print(len(tokenizer.word_index))
print(list(tokenizer.word_index.items())[:20])

In [12]:
from sklearn.model_selection import train_test_split

# Hold out half of the labelled (dev) rows for validation; data[:nrows] are the dev rows
x_train, x_test, y_train, y_test = train_test_split(
    data[:nrows],
    y_vals,
    test_size=.5,
    random_state=52,
)
for split in (x_train, y_train, x_test, y_test):
    print(split.shape)

(47925, 500)
(47925, 6)
(47926, 500)
(47926, 6)


#### 2. Train LSTM model - Toxic comments

In [16]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import LSTM, Conv1D, MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Create the model: embedding -> dropout -> 1D convolution -> max pooling -> LSTM -> sigmoid.
# The single sigmoid unit outputs the probability of label column 0 ("toxic").
# NOTE(review): the same architecture and training recipe is repeated in the cells for the
# other labels; a shared builder function would remove the duplication.
model = Sequential()
model.add(Embedding(50000, 128, input_length=x_train.shape[1]))
model.add(Dropout(0.2))
model.add(Conv1D(64, 5, activation='relu'))
model.add(MaxPooling1D(pool_size=4))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only appends a stray "None" line to the output.
model.summary()

# Early stopping: stop once val_loss has not improved by at least 0.001 for 2 epochs
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=2, verbose=True)

# Checkpoint - used to get the best weights during the model training process
checkpoint = ModelCheckpoint(filepath='models/toxic_best_weights.h5', monitor='val_loss', save_best_only=True)

# Train the model on the training half, validating on the held-out half
model.fit(x_train, 
          y_train[:,0], 
          validation_data=(x_test, y_test[:,0]), 
          epochs=20, 
          batch_size=64,
          callbacks=[early_stopping, checkpoint],
          verbose=2
          )

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 500, 128)          6400000   
_________________________________________________________________
dropout_4 (Dropout)          (None, 500, 128)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 496, 64)           41024     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 124, 64)           0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 128)               98816     
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 129       
Total params: 6,539,969
Trainable params: 6,539,969
Non-trainable params: 0
_________________________________________________________________


<keras.callbacks.History at 0x2472b5ada20>

In [17]:
# Restore the checkpointed weights (best val_loss) and score the val rows, i.e. data[nrows:]
model.load_weights('models/toxic_best_weights.h5')
model.compile(optimizer='adam', loss='binary_crossentropy')
test_features = data[nrows:]
preds_toxic = model.predict(test_features)
print(preds_toxic.shape)

(226998, 1)


#### 3. Train model - Severe toxic

In [18]:
# Create the model: embedding -> dropout -> 1D convolution -> max pooling -> LSTM -> sigmoid.
# The single sigmoid unit outputs the probability of label column 1 ("severe_toxic").
# NOTE(review): the same architecture and training recipe is repeated in the cells for the
# other labels; a shared builder function would remove the duplication.
model = Sequential()
model.add(Embedding(50000, 128, input_length=x_train.shape[1]))
model.add(Dropout(0.2))
model.add(Conv1D(64, 5, activation='relu'))
model.add(MaxPooling1D(pool_size=4))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only appends a stray "None" line to the output.
model.summary()

# Early stopping: stop once val_loss has not improved by at least 0.001 for 2 epochs
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=2, verbose=True)

# Checkpoint - used to get the best weights during the model training process
checkpoint = ModelCheckpoint(filepath='models/severe_toxic_best_weights.h5', monitor='val_loss', save_best_only=True)

# Train the model on the training half, validating on the held-out half
model.fit(x_train, 
          y_train[:,1], 
          validation_data=(x_test, y_test[:,1]), 
          epochs=20, 
          batch_size=64,
          callbacks=[early_stopping, checkpoint],
          verbose=2
          )

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 500, 128)          6400000   
_________________________________________________________________
dropout_5 (Dropout)          (None, 500, 128)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 496, 64)           41024     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 124, 64)           0         
_________________________________________________________________
lstm_5 (LSTM)                (None, 128)               98816     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 129       
Total params: 6,539,969
Trainable params: 6,539,969
Non-trainable params: 0
_________________________________________________________________


<keras.callbacks.History at 0x2472e5015c0>

In [19]:
# Restore the checkpointed weights (best val_loss) and score the val rows, i.e. data[nrows:]
model.load_weights('models/severe_toxic_best_weights.h5')
model.compile(optimizer='adam', loss='binary_crossentropy')
test_features = data[nrows:]
preds_severe_toxic = model.predict(test_features)
print(preds_severe_toxic.shape)

(226998, 1)


#### 4. Train model - Obscene

In [20]:
# Create the model: embedding -> dropout -> 1D convolution -> max pooling -> LSTM -> sigmoid.
# The single sigmoid unit outputs the probability of label column 2 ("obscene").
# NOTE(review): the same architecture and training recipe is repeated in the cells for the
# other labels; a shared builder function would remove the duplication.
model = Sequential()
model.add(Embedding(50000, 128, input_length=x_train.shape[1]))
model.add(Dropout(0.2))
model.add(Conv1D(64, 5, activation='relu'))
model.add(MaxPooling1D(pool_size=4))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only appends a stray "None" line to the output.
model.summary()

# Early stopping: stop once val_loss has not improved by at least 0.001 for 2 epochs
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=2, verbose=True)

# Checkpoint - used to get the best weights during the model training process
checkpoint = ModelCheckpoint(filepath='models/obscene_best_weights.h5', monitor='val_loss', save_best_only=True)

# Train the model on the training half, validating on the held-out half
model.fit(x_train, 
          y_train[:,2], 
          validation_data=(x_test, y_test[:,2]), 
          epochs=20, 
          batch_size=64,
          callbacks=[early_stopping, checkpoint],
          verbose=2
          )

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 500, 128)          6400000   
_________________________________________________________________
dropout_6 (Dropout)          (None, 500, 128)          0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 496, 64)           41024     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 124, 64)           0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 128)               98816     
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 129       
Total params: 6,539,969
Trainable params: 6,539,969
Non-trainable params: 0
_________________________________________________________________


<keras.callbacks.History at 0x24731380f60>

In [21]:
# Restore the checkpointed weights (best val_loss) and score the val rows, i.e. data[nrows:]
model.load_weights('models/obscene_best_weights.h5')
model.compile(optimizer='adam', loss='binary_crossentropy')
test_features = data[nrows:]
preds_obscene = model.predict(test_features)
print(preds_obscene.shape)

(226998, 1)


#### 5. Train model - Threat

In [22]:
# Create the model: embedding -> dropout -> 1D convolution -> max pooling -> LSTM -> sigmoid.
# The single sigmoid unit outputs the probability of label column 3 ("threat").
# NOTE(review): the same architecture and training recipe is repeated in the cells for the
# other labels; a shared builder function would remove the duplication.
model = Sequential()
model.add(Embedding(50000, 128, input_length=x_train.shape[1]))
model.add(Dropout(0.2))
model.add(Conv1D(64, 5, activation='relu'))
model.add(MaxPooling1D(pool_size=4))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only appends a stray "None" line to the output.
model.summary()

# Early stopping: stop once val_loss has not improved by at least 0.001 for 2 epochs
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=2, verbose=True)

# Checkpoint - used to get the best weights during the model training process
checkpoint = ModelCheckpoint(filepath='models/threat_best_weights.h5', monitor='val_loss', save_best_only=True)

# Train the model on the training half, validating on the held-out half
model.fit(x_train, 
          y_train[:,3], 
          validation_data=(x_test, y_test[:,3]), 
          epochs=20, 
          batch_size=64,
          callbacks=[early_stopping, checkpoint],
          verbose=2
          )

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 500, 128)          6400000   
_________________________________________________________________
dropout_7 (Dropout)          (None, 500, 128)          0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 496, 64)           41024     
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 124, 64)           0         
_________________________________________________________________
lstm_7 (LSTM)                (None, 128)               98816     
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 129       
Total params: 6,539,969
Trainable params: 6,539,969
Non-trainable params: 0
_________________________________________________________________


<keras.callbacks.History at 0x24773bdbfd0>

In [23]:
# Restore the checkpointed weights (best val_loss) and score the val rows, i.e. data[nrows:]
model.load_weights('models/threat_best_weights.h5')
model.compile(optimizer='adam', loss='binary_crossentropy')
test_features = data[nrows:]
preds_threat = model.predict(test_features)
print(preds_threat.shape)

(226998, 1)


#### 6. Train model - Insult & Identity Hate

In [24]:
# Create the model: embedding -> dropout -> 1D convolution -> max pooling -> LSTM -> sigmoid.
# Unlike the single-label models, the final layer has two sigmoid units: one for label
# column 4 ("insult") and one for label column 5 ("identity_hate").
# NOTE(review): the same architecture and training recipe is repeated in the cells for the
# other labels; a shared builder function would remove the duplication.
model = Sequential()
model.add(Embedding(50000, 128, input_length=x_train.shape[1]))
model.add(Dropout(0.2))
model.add(Conv1D(64, 5, activation='relu'))
model.add(MaxPooling1D(pool_size=4))
model.add(LSTM(128))
model.add(Dense(2, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only appends a stray "None" line to the output.
model.summary()

# Early stopping: stop once val_loss has not improved by at least 0.001 for 2 epochs
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=2, verbose=True)

# Checkpoint - used to get the best weights during the model training process
checkpoint = ModelCheckpoint(filepath='models/insult_hate_best_weights.h5', monitor='val_loss', save_best_only=True)

# Train the model on the training half, validating on the held-out half
model.fit(x_train, 
          y_train[:,4:], 
          validation_data=(x_test, y_test[:,4:]), 
          epochs=20, 
          batch_size=64,
          callbacks=[early_stopping, checkpoint],
          verbose=2
          )

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 500, 128)          6400000   
_________________________________________________________________
dropout_8 (Dropout)          (None, 500, 128)          0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 496, 64)           41024     
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 124, 64)           0         
_________________________________________________________________
lstm_8 (LSTM)                (None, 128)               98816     
_________________________________________________________________
dense_8 (Dense)              (None, 2)                 258       
Total params: 6,540,098
Trainable params: 6,540,098
Non-trainable params: 0
_________________________________________________________________


<keras.callbacks.History at 0x24775499ba8>

In [26]:
# Restore the checkpointed weights (best val_loss) and score the val rows, i.e. data[nrows:].
# This model has two outputs, so the prediction array has two columns.
model.load_weights('models/insult_hate_best_weights.h5')
model.compile(optimizer='adam', loss='binary_crossentropy')
test_features = data[nrows:]
preds_insult_hate = model.predict(test_features)
print(preds_insult_hate.shape)

(226998, 2)


#### 8. Submission

In [29]:
import numpy as np

# One row per val comment, one column per label in y_cols order
preds = np.zeros((val.shape[0], len(y_cols)))

# Per-label prediction vectors, listed in the column order they occupy in `preds`;
# the last two come from the joint two-output model
label_predictions = (
    preds_toxic[:,0],
    preds_severe_toxic[:,0],
    preds_obscene[:,0],
    preds_threat[:,0],
    preds_insult_hate[:,0],
    preds_insult_hate[:,1],
)
for col, values in enumerate(label_predictions):
    preds[:, col] = values

print(preds.shape)

(226998, 6)


In [30]:
# Put the val ids next to the per-label predictions and write the submission file
submid = pd.DataFrame({'id': vid})
label_frame = pd.DataFrame(preds, columns=y_cols)
submission = pd.concat([submid, label_frame], axis=1)
submission.to_csv('data/submissions/lstm.csv', index=False)

#### 9. Weighted average submission

In [31]:
# Drop the leading id column from each submission and average the two with equal weight.
# DataFrame.as_matrix() is deprecated and removed in pandas 1.0; .values returns the same ndarray.
# NOTE(review): assumes baseline.csv has its rows in the same order as `submission` - confirm.
sub = sub1.values[:,1:]
sub2 = submission.values[:,1:]
sub3 = (sub * .5) + (sub2 * .5)
print(sub3.shape)

(226998, 6)


In [32]:
# Re-attach the val ids to the blended predictions and write the averaged submission
blended_frame = pd.DataFrame(sub3, columns=y_cols)
submission2 = pd.concat([submid, blended_frame], axis=1)
submission2.to_csv('data/submissions/sub1_sub2_wtd_avg.csv', index=False)