# For introduction look at Keras Introduction+ lstm 1 layer + GloVe + Early Stopping .ipynb

In [27]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers import Bidirectional,LSTM,RepeatVector,TimeDistributed,Activation
from keras.optimizers import Adam
from keras.layers import BatchNormalization, Flatten, Conv1D, MaxPooling1D,GlobalMaxPool1D,CuDNNLSTM,CuDNNGRU
from keras.models import Model
from keras.layers import Dropout,SpatialDropout1D
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras import backend as K
from keras.engine.topology import Layer, InputSpec



In [14]:
from keras import initializers,regularizers, constraints

In [4]:
# read in the data

#df_train = pd.read_csv('train.csv.zip')
#df_test = pd.read_csv('test.csv.zip')

df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

print(df_train.shape)
print(df_test.shape)

(159571, 8)
(153164, 2)


In [5]:
# combine the train and test sets for encoding and padding

train_len = len(df_train)
df_combined =  pd.concat(objs=[df_train, df_test], axis=0).reset_index(drop=True)

print(df_combined.shape)

(312735, 8)


In [6]:
print(df_combined.head)

<bound method NDFrame.head of                                              comment_text                id  \
0       Explanation\nWhy the edits made under my usern...  0000997932d777bf   
1       D'aww! He matches this background colour I'm s...  000103f0d9cfb60f   
2       Hey man, I'm really not trying to edit war. It...  000113f07ec002fd   
3       "\nMore\nI can't make any real suggestions on ...  0001b41b1c6bb37e   
4       You, sir, are my hero. Any chance you remember...  0001d958c54c6e35   
5       "\n\nCongratulations from me as well, use the ...  00025465d4725e87   
6            COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK  0002bcb3da6cb337   
7       Your vandalism to the Matt Shirvington article...  00031b1e95af7921   
8       Sorry if the word 'nonsense' was offensive to ...  00037261f536c51d   
9       alignment on this subject and which are contra...  00040093b2687caa   
10      "\nFair use rationale for Image:Wonju.jpg\n\nT...  0005300084f90edc   
11      bbq \n\nbe a m

In [7]:
# define text data
docs_combined = df_combined['comment_text'].astype(str)

# initialize the tokenizer
t = Tokenizer()
t.fit_on_texts(docs_combined)
vocab_size = len(t.word_index) + 1

# integer encode the text data
encoded_docs = t.texts_to_sequences(docs_combined)

# pad the vectors to create uniform length
padded_docs_combined = pad_sequences(encoded_docs, maxlen=150, padding='post')

In [8]:
# seperate the train and test sets

df_train_padded = padded_docs_combined[:train_len]
df_test_padded = padded_docs_combined[train_len:]

print(df_train_padded.shape)
print(df_test_padded.shape)

(159571, 150)
(153164, 150)


### **Load the GloVe embeddings**

In [9]:
# load the glove840B embedding into memory after downloading and unzippping

embeddings_index = dict()
f = open('/home/nbuser/glove/glove.840B.300d.txt', encoding="utf8")

for line in f:
    # Note: use split(' ') instead of split() if you get an error.
	values = line.split(' ')
	word = values[0]
	coefs = np.asarray(values[1:], dtype='float32')
	embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

# create a weight matrix
embedding_matrix = np.zeros((vocab_size, 300))
for word, i in t.word_index.items():
	embedding_vector = embeddings_index.get(word)
	if embedding_vector is not None:
		embedding_matrix[i] = embedding_vector



Loaded 2196016 word vectors.


### **Define X and y**

In [10]:
X = df_train_padded
X_test = df_test_padded

In [11]:
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = df_train[list_classes].values

### **Train and generate predictions for each of the 6 target columns:**

In [12]:
#preds = []    
# create a stratified split
X_train, X_eval, y_train ,y_eval = train_test_split(X, y,test_size=0.2,shuffle=True)
                                                    
#random_state=5

### Attention Models


<img src="images/15.PNG">



<img src="images/16.PNG">


In [15]:
CONTEXT_DIM = 100

class Attention(Layer):

    def __init__(self, regularizer=regularizers.l2(1e-10), **kwargs):
        self.regularizer = regularizer
        self.supports_masking = True
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3        
        self.W = self.add_weight(name='W',
                                 shape=(input_shape[-1], CONTEXT_DIM),
                                 initializer='normal',
                                 trainable=True, 
                                 regularizer=self.regularizer)
        self.b = self.add_weight(name='b',
                                 shape=(CONTEXT_DIM,),
                                 initializer='normal',
                                 trainable=True, 
                                 regularizer=self.regularizer)
        self.u = self.add_weight(name='u',
                                 shape=(CONTEXT_DIM,),
                                 initializer='normal',
                                 trainable=True, 
                                 regularizer=self.regularizer)        
        super(Attention, self).build(input_shape)

    @staticmethod
    def softmax(x, dim):
        """Computes softmax along a specified dim. Keras currently lacks this feature.
        """
        if K.backend() == 'tensorflow':
            import tensorflow as tf
            return tf.nn.softmax(x, dim)
        elif K.backend() == 'theano':
            # Theano cannot softmax along an arbitrary dim.
            # So, we will shuffle `dim` to -1 and un-shuffle after softmax.
            perm = np.arange(K.ndim(x))
            perm[dim], perm[-1] = perm[-1], perm[dim]
            x_perm = K.permute_dimensions(x, perm)
            output = K.softmax(x_perm)

            # Permute back
            perm[dim], perm[-1] = perm[-1], perm[dim]
            output = K.permute_dimensions(x, output)
            return output
        else:
            raise ValueError("Backend '{}' not supported".format(K.backend()))

    def call(self, x, mask=None):
        ut = K.tanh(K.bias_add(K.dot(x, self.W), self.b)) * self.u

        # Collapse `attention_dims` to 1. This indicates the weight for each time_step.
        ut = K.sum(ut, axis=-1, keepdims=True)

        # Convert those weights into a distribution but along time axis.
        # i.e., sum of alphas along `time_steps` axis should be 1.
        self.at = self.softmax(ut, dim=1)
        if mask is not None:
            self.at *= K.cast(K.expand_dims(mask, -1), K.floatx())

        # Weighted sum along `time_steps` axis.
        return K.sum(x * self.at, axis=-2)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])
    
    def get_config(self):
        config = {}
        base_config = super(Attention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    def compute_mask(self, inputs, mask):
        return None

In [29]:
#MAIN Create LSTM model
#Best performing model-lr=0.0003
model=Sequential()
model.add(Embedding(vocab_size, 300, weights=[embedding_matrix], 
                  input_length=150, trainable=False))
model.add(SpatialDropout1D(0.2))
model.add((Bidirectional(CuDNNLSTM(50,return_sequences=True))))
model.add(Attention())
model.add(Dense(70, activation="relu"))
model.add(Dropout(0.3))
model.add(Dense(6, activation="sigmoid"))


In [30]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 150, 300)          118436400 
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 150, 300)          0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 150, 100)          140800    
_________________________________________________________________
attention_3 (Attention)      (None, 100)               10200     
_________________________________________________________________
dense_5 (Dense)              (None, 70)                7070      
_________________________________________________________________
dropout_5 (Dropout)          (None, 70)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 6)                 426       
Total para

In [31]:
 # compile the model
Adam_opt = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.000015)
model.compile(optimizer=Adam_opt, loss='binary_crossentropy', metrics=['acc'])

early_stopping = EarlyStopping(monitor='val_loss', patience=4, mode='min',min_delta=0.0005)
save_best = ModelCheckpoint('/home/nbuser/toxiclstmattention.hdf', save_best_only=True, 
                           monitor='val_acc', mode='max')

history = model.fit(X_train, y_train, validation_data=(X_eval, y_eval),
                    epochs=40, verbose=1,callbacks=[early_stopping,save_best],batch_size=128)

                        

Train on 127656 samples, validate on 31915 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40

KeyboardInterrupt: 

In [16]:
model.load_weights('/home/nbuser/toxiclstmattention.hdf')

In [17]:
# make a prediction on y (target column)
    
predictions = model.predict(X_test)


In [18]:
predictions

array([[9.9745911e-01, 5.8220989e-01, 9.7239596e-01, 1.3976243e-01,
        9.2421460e-01, 4.4856468e-01],
       [9.3540637e-04, 6.0126801e-08, 8.4280065e-05, 1.4785840e-06,
        3.9639646e-05, 1.6033798e-06],
       [3.8379602e-04, 3.3369105e-08, 3.4451561e-05, 1.0978166e-06,
        1.3349451e-05, 9.3679165e-07],
       ...,
       [1.8271645e-04, 1.0796542e-08, 2.3067649e-05, 2.9302774e-07,
        6.5523991e-06, 3.1244323e-07],
       [1.9745632e-04, 8.6652975e-08, 1.6009037e-05, 2.9206708e-06,
        7.5593130e-06, 1.5606578e-05],
       [9.7628772e-01, 1.8008012e-02, 8.5504109e-01, 7.2783773e-04,
        6.7450976e-01, 2.4795048e-03]], dtype=float32)

### **Create a submission file**

In [19]:
sample_submission = pd.read_csv("data/sample_submission.csv")

In [20]:
sample_submission[list_classes] = predictions

In [21]:
sample_submission.to_csv("baselinelstmattention.csv", index=False)

In [22]:
z = pd.read_csv('baselinelstmattention.csv')

In [23]:
z

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999612,4.432712e-01,9.750918e-01,1.026920e-02,9.394193e-01,4.166672e-01
1,0000247867823ef7,0.000064,9.601138e-10,5.475502e-06,4.189792e-08,1.766603e-06,2.485000e-07
2,00013b17ad220c46,0.000035,1.099106e-09,3.012348e-06,6.663038e-08,1.019648e-06,3.778804e-07
3,00017563c3f7919a,0.000005,6.409814e-11,1.663322e-06,1.807170e-08,3.011706e-07,2.538108e-08
4,00017695ad8997eb,0.005163,2.495954e-06,4.381569e-04,8.765717e-05,1.002813e-04,1.765878e-05
5,0001ea8717f6de06,0.000016,1.419354e-09,2.041784e-06,5.578204e-07,1.354267e-06,5.824378e-07
6,00024115d4cbde0f,0.000538,2.576459e-09,4.073026e-05,6.135154e-08,3.693880e-05,4.503346e-07
7,000247e83dcc1211,0.426644,1.459220e-04,3.122146e-02,7.474751e-05,2.829204e-02,6.031641e-04
8,00025358d4737918,0.065159,1.986716e-06,6.723492e-03,2.475270e-06,9.288167e-03,4.892949e-05
9,00026d1092fe71cc,0.000134,5.137209e-09,2.005500e-05,1.180496e-07,4.062455e-06,5.496302e-07


In [None]:
y_preds = predictions[:,0]

In [6]:
# append the prediction to a python list
preds.append(y_preds)

NameError: name 'preds' is not defined

In [None]:
#df_results = pd.DataFrame({'id':df_test.id,
                            "toxic":preds[0],
                           "severe_toxic":preds[1],
                           "obscene":preds[2],
                           "threat":preds[3],
                           "insult":preds[4],
                           "identity_hate":preds[5]}).set_index('id')

# Pandas automatically sorts the columns alphabetically by column name.
# Therefore, we need to re-order the columns to match the sample submission file.
#df_results = df_results[["toxic","severe_toxic","obscene","threat","insult','identity_hate']]

# create a submission csv file
#df_results.to_csv('kaggle_submission.csv', 
                  #columns=['toxic','severe_toxic','obscene','threat','insult','identity_hate']) 

***
### **Resources**


These are a few cnn and nlp resources I found helpful:

- What are word embeddings?<br>
https://www.youtube.com/watch?v=Eku_pbZ3-Mw


- Blog post with a simple example explaining how to use pre trained embeddings:<br>https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/


- Online cnn course:<br>
https://www.coursera.org/learn/convolutional-neural-networks<br>
This course can be taken for free. 


- Lesson 5 notes from the fast.ai course:<br>
http://wiki.fast.ai/index.php/Lesson_5_Notes


- GloVe: Global Vectors for Word Representation<br>
Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014.<br>
https://nlp.stanford.edu/projects/glove/


- Machine learning with text<br>
https://www.youtube.com/watch?v=ZiKMIuYidY0


- NLTK Tutorial series<br>
https://pythonprogramming.net/tokenizing-words-sentences-nltk-tutorial/

***
This competition is a great learning experience. Thank you to all who have been commenting and publishing.

Happy new year!