### Kaggle Toxic Comment Classification Challenge - LSTM Classifier

The competition can be found at the following URL:
https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge

#### 1. Loading the data

In [1]:
import pandas as pd
import os

# Project root. Defaults to the original author's local directory; set the
# TOXIC_COMMENT_DIR environment variable to run the notebook on another
# machine without editing this cell.
project_dir = os.environ.get('TOXIC_COMMENT_DIR', 'D://Analytics/Kaggle/toxic_comment_challenge/')

# Every other path in this notebook (data/..., models/...) is relative, so the
# working directory is switched to the project root before anything is read.
os.chdir(project_dir)
print(os.getcwd())

# dev: train.csv (the frame that carries the label columns)
# val: test.csv (id and comment_text only)
dev = pd.read_csv('data/raw/train.csv')
val = pd.read_csv('data/raw/test.csv')
# A previously written submission file, used for the weighted average at the end
sub1 = pd.read_csv('data/submissions/sub1_sub2_wtd_avg.csv')
print(dev.shape)
print(val.shape)
print(dev.head())

D:\Analytics\Kaggle\toxic_comment_challenge
(95851, 8)
(226998, 2)
         id                                       comment_text  toxic  \
0  22256635  Nonsense?  kiss off, geek. what I said is true...      1   
1  27450690  "\n\n Please do not vandalize pages, as you di...      0   
2  54037174  "\n\n ""Points of interest"" \n\nI removed the...      0   
3  77493077  Asking some his nationality is a Racial offenc...      0   
4  79357270  The reader here is not going by my say so for ...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        0       0       0              0  
2             0        0       0       0              0  
3             0        0       0       0              0  
4             0        0       0       0              0  


In [2]:
# Identifying the target columns (every column except the id and the raw text)
y_cols = [c for c in dev.columns if c not in ['id','comment_text']]
# DataFrame.as_matrix() is deprecated and has been removed from pandas;
# .values returns the same ndarray and works on old and new versions alike.
y_vals = dev[y_cols].values

# Flagging the validation ids
vid = val['id'].values

# Concatenating the dev and val datasets.
# dev comes first, so the first `nrows` entries are always the dev comments.
df_txt = pd.concat([dev['comment_text'], val['comment_text']], axis=0)
# Missing comments become the placeholder token "unknown" so that the text
# cleaning step always receives a string
df_txt = df_txt.fillna("unknown")

# Number of rows in the dev sample
nrows = dev.shape[0]

#### 2. Preprocessing the text for deep learning

In [3]:
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Initialize a stemmer (passed to clean_doc by the cells that follow)
stemmer = PorterStemmer()

# English stop words, collected into a set so that duplicates collapse
stop_words = {*stopwords.words('english')}

# Translation table that deletes every character in string.punctuation.
# Built once here instead of being rebuilt on every clean_doc call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

# Function that turns a doc into clean tokens
def clean_doc(doc, stemmer, stop_words):
    """Split a document into cleaned, lowercase tokens.

    Steps, in order:
      1. split on whitespace;
      2. delete punctuation characters from each token and lowercase it;
      3. drop tokens found in ``stop_words``;
      4. drop tokens shorter than three characters.

    Alphabetic-only filtering and stemming are intentionally not applied.

    Parameters
    ----------
    doc : str
        Raw document text.
    stemmer : object
        Currently unused (stemming is disabled); kept so existing callers
        that pass a stemmer keep working.
    stop_words : collection of str
        Lowercase tokens to discard; only membership tests are performed.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    # Lazily strip punctuation and lowercase each whitespace-separated token
    tokens = (w.translate(_PUNCT_TABLE).lower() for w in doc.split())
    # Keep tokens with more than two characters that are not stop words
    return [w for w in tokens if len(w) > 2 and w not in stop_words]

In [4]:
from collections import Counter

# Vocabulary: cleaned token -> number of occurrences, counted over every
# comment in df_txt (dev and val together)
vocab = Counter(
    token
    for text in df_txt
    for token in clean_doc(text, stemmer, stop_words)
)

In [5]:
# One cleaned, space-joined string per document, in the same order as df_txt.
# Each document is tokenized, its tokens are filtered by the vocabulary, and
# the survivors are concatenated with a single space.
# NOTE(review): vocab was built from these same texts and is not pruned in
# between, so the `in vocab` filter appears to keep every token - confirm
# whether a minimum-occurrence cut-off was intended.
lines = [
    ' '.join(w for w in clean_doc(text, stemmer, stop_words) if w in vocab)
    for text in df_txt
]

In [17]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Vocabulary cap handed to the tokenizer, and the fixed length of every
# padded sequence
num_words, max_length = 100000, 1000

# Fit the word -> integer index mapping on the cleaned documents
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(lines)

# Encode every document as a list of word indices, then bring all of them
# to exactly max_length columns
sequences = tokenizer.texts_to_sequences(lines)
data = pad_sequences(sequences=sequences, maxlen=max_length)
print(data.shape)

(322849, 1000)


In [18]:
# Number of entries in the fitted tokenizer's word index
n_words = len(tokenizer.word_index)
print('Number of words: %d' % (n_words,))

Number of words: 493879


In [19]:
from sklearn.model_selection import train_test_split

# Only the first nrows rows of `data` belong to the labelled dev sample;
# half of them are held out for validation during training
x_train, x_test, y_train, y_test = train_test_split(
    data[:nrows],
    y_vals,
    test_size=0.5,
    random_state=52,
)
# One shape per line, in the order: x_train, y_train, x_test, y_test
print(*(part.shape for part in (x_train, y_train, x_test, y_test)), sep='\n')

(47925, 1000)
(47925, 6)
(47926, 1000)
(47926, 6)


#### 3. Train LSTM model

In [20]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import LSTM, Conv1D, MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Create the model: embedding -> dropout -> 1D convolution -> max pooling
# -> LSTM -> one sigmoid output per target column
model = Sequential()
model.add(Embedding(num_words, 256, input_length=max_length))
model.add(Dropout(0.2))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(pool_size=4))
model.add(LSTM(128, recurrent_dropout=0.15))
# Six independent sigmoid outputs, one per label column
model.add(Dense(6, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() would emit an extra "None" line
model.summary()

# Early stopping: stop once val_loss has failed to improve by at least 0.001
# for 2 consecutive epochs
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=2, verbose=True)

# Checkpoint - used to get the best weights during the model training process.
# The target directory is created up front so the first save cannot fail
# because it is missing.
os.makedirs('models', exist_ok=True)
checkpoint = ModelCheckpoint(filepath='models/lstm_best_weights2.h5', monitor='val_loss', save_best_only=True)

# Train the model
model.fit(x_train, 
          y_train, 
          validation_data=(x_test, y_test), 
          epochs=20, 
          batch_size=128,
          callbacks=[early_stopping, checkpoint],
          verbose=2
          )

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 1000, 256)         25600000  
_________________________________________________________________
dropout_6 (Dropout)          (None, 1000, 256)         0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 996, 128)          163968    
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 249, 128)          0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_6 (Dense)              (None, 6)                 774       
Total params: 25,896,326
Trainable params: 25,896,326
Non-trainable params: 0
________________________________________________________________

KeyboardInterrupt: 

#### 4. Submission using trained LSTM model

In [26]:
from keras.models import load_model

# Restore the model from the file the ModelCheckpoint callback writes to
model = load_model('models/lstm_best_weights2.h5')

# Rows from nrows onward are the val (test.csv) comments
preds = model.predict(x=data[nrows:])
print(preds.shape)

(226998, 6)


In [27]:
# Id column of the submission, in the same row order as the predictions
submid = pd.DataFrame({'id': vid})
# Attach one prediction column per target; both frames share the default
# 0..n-1 index, so rows line up positionally
sub2 = submid.join(pd.DataFrame(preds, columns=y_cols))
sub2.to_csv('data/submissions/lstm_gpu.csv', index=False)

#### 5. Weighted average submission

In [28]:
# Weighted blend: 90% of the earlier submission + 10% of the LSTM predictions.
# The first column is skipped: it is the id in sub2, and sub1 is assumed to
# share that layout and row order - verify against the file.
# .values replaces DataFrame.as_matrix(), which has been removed from pandas;
# it returns the same ndarray.
sub3 = (sub1.values[:, 1:] * .9) + (sub2.values[:, 1:] * .1)
print(sub3.shape)

(226998, 6)


In [29]:
# Wrap the blended values in a frame with the target column names and put the
# id column in front; sub3 is rebound from the raw array to the final frame
sub3 = submid.join(pd.DataFrame(sub3, columns=y_cols))
sub3.to_csv('data/submissions/wtd_avg_2.csv', index=False)