# LSTM Training
This notebook trains an LSTM for text classification and generates predictions for the Kaggle competition found [here](https://www.kaggle.com/c/quora-insincere-questions-classification).

The notebook uses Keras for tokenization and padding, and pre-trained GloVe word embeddings to initialise the embedding layer. Keras with a TensorFlow backend is then used to train a bidirectional LSTM.

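Ensure that `train.csv` and `test.csv` are in the `../input/` directory (relative to this notebook), and that the GloVe vectors are available at `../input/embeddings/glove.840B.300d/glove.840B.300d.txt`.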

In [None]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dense, Embedding, CuDNNLSTM, Bidirectional, SpatialDropout1D, GlobalMaxPool1D, Dropout
from keras.models import Model

%load_ext autoreload
%autoreload 2

In [None]:
# Read the labelled training questions and preview the first rows
train_path = '../input/train.csv'
train_df = pd.read_csv(train_path)
train_df.head(n=5)

In [None]:
# Count the whitespace-separated words of every question once and reuse the
# result for both statistics. The vectorised .str accessor leaves missing
# questions as NaN (ignored by mean/max) instead of raising on them.
question_word_counts = train_df['question_text'].str.split().str.len()

print('The average word length of questions in the training set is {0:.0f}.'
      .format(question_word_counts.mean()))
print('The maximum word length for a question in the training set is {0:.0f}.'
      .format(question_word_counts.max()))

In [None]:
# Extract the question text and the corresponding labels; missing questions
# are replaced by the placeholder token 'unk'
text = train_df['question_text'].fillna('unk').values
labels = train_df['target'].values

# Hold out 20% of the rows for validation. The seed is fixed so that a fresh
# "Restart & Run All" reproduces exactly the same train/validation partition.
SEED = 42
X_train, X_val, y_train, y_val = train_test_split(text, labels,
                                                  test_size=0.2,
                                                  random_state=SEED)

In [None]:
# Preprocessing / embedding hyper-parameters shared by the cells below.

# Maximum number of words kept per question
maxlen = 100
# Number of distinct words to keep (i.e. the number of rows of the embedding matrix)
max_words = 50000
# Dimensionality of each word vector
embed_size = 300

In [None]:
## Fit the tokenizer on the training questions only
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(list(X_train))

word_index = tokenizer.word_index
print('The word index consists of {} unique tokens.'.format(len(word_index)))

## Turn every question into a sequence of word indices and pad it to maxlen
X_train = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=maxlen)
X_val = pad_sequences(tokenizer.texts_to_sequences(X_val), maxlen=maxlen)

In [None]:
# Create the embedding dictionary (token -> float32 vector) from the word
# embedding file. Each line holds a token followed by `embed_size` numbers.
embedding_dict = {}
skipped_lines = 0
filename = os.path.join('../input/embeddings/', 'glove.840B.300d/glove.840B.300d.txt')
# Read explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open(filename, encoding='utf-8') as f:
    for line in f:
        # Split from the right: the last `embed_size` fields are the vector and
        # everything before them is the token. A token that itself contains
        # spaces therefore stays intact instead of corrupting the vector.
        fields = line.rstrip().rsplit(' ', embed_size)
        if len(fields) != embed_size + 1:
            # Blank or truncated line - count it rather than crash or hide it
            skipped_lines += 1
            continue
        try:
            embedding_dict[fields[0]] = np.asarray(fields[1:], dtype='float32')
        except ValueError:
            # Non-numeric vector component: skip this line only, but keep count
            skipped_lines += 1
print('The embedding dictionary has {} items'.format(len(embedding_dict)))
if skipped_lines:
    print('Skipped {} malformed lines'.format(skipped_lines))

In [None]:
# Build the weight matrix of the embedding layer: row i holds the pre-trained
# vector of the word whose tokenizer index is i. Rows stay all-zero for words
# that have no entry in the embedding dictionary.
embed_mat = np.zeros(shape=[max_words, embed_size])
for token, row in word_index.items():
    # The word index is ordered from most to least frequent, so indices at or
    # beyond max_words belong to the rarer words, which are left out
    if row < max_words:
        pretrained = embedding_dict.get(token)
        if pretrained is not None:
            embed_mat[row] = pretrained

In [None]:
def create_lstm():
    """Build and compile the bidirectional LSTM classifier.

    The network takes a padded sequence of ``maxlen`` word indices, looks the
    words up in a frozen embedding layer initialised from ``embed_mat``, runs a
    bidirectional ``CuDNNLSTM`` over the sequence, max-pools over time and
    finishes with a small dense head that ends in a single sigmoid unit.

    Returns:
        The compiled Keras ``Model`` (binary cross-entropy loss, Adam
        optimizer, accuracy metric).
    """
    # Named `inputs` rather than `input` so the Python builtin is not shadowed
    inputs = Input(shape=(maxlen,))
    x = Embedding(max_words, embed_size, weights=[embed_mat], trainable=False)(inputs)
    x = Bidirectional(CuDNNLSTM(64, return_sequences=True))(x)
    x = GlobalMaxPool1D()(x)
    x = Dense(16, activation="relu")(x)
    x = Dropout(0.1)(x)
    x = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=inputs, outputs=x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() prints the table itself and returns None; wrapping it in
    # print() only added a stray "None" line to the output
    model.summary()

    return model

In [None]:
# Build the network, then fit it for three epochs while monitoring the
# held-out validation split
lstm = create_lstm()
lstm.fit(
    x=X_train,
    y=y_train,
    batch_size=512,
    epochs=3,
    validation_data=(X_val, y_val),
)

# Predictions
The remainder of this notebook generates predictions for the test set and writes them to a submission CSV file.

In [None]:
# Load the test questions, replacing missing text with 'unk' exactly as was
# done for the training questions
test_df = pd.read_csv('../input/test.csv')
X_test = test_df['question_text'].fillna('unk').values

# Reuse the tokenizer fitted on the training split and pad to the same length
X_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(X_test, maxlen=maxlen)

# The model ends in a single sigmoid unit, so predict() yields one probability
# per question. Round it to a 0/1 label, then flatten and cast to int so the
# column holds integer labels rather than a 2-D float array.
preds = np.rint(lstm.predict([X_test], batch_size=1024, verbose=1))
test_df['prediction'] = preds.ravel().astype(int)


In [None]:
# Preview a few test questions that were predicted as class 1. Rows and column
# are selected in a single .loc call instead of chained indexing.
test_df.loc[test_df['prediction'] == 1, 'question_text'].head()

In [None]:
# Remove the question text so that only the remaining columns are written to
# the submission. errors='ignore' keeps the cell idempotent: re-running it
# after the column is already gone no longer raises a KeyError.
test_df = test_df.drop(columns='question_text', errors='ignore')


In [None]:
# Sanity-check the first rows of the frame that is about to be written out
test_df.head(n=5)

In [None]:
# Write the predictions to the submission file, leaving out the DataFrame index
submission_path = 'submission.csv'
test_df.to_csv(submission_path, index=False)
