In [1]:
import os
import re
import csv
import codecs
import numpy as np
import pandas as pd

In [2]:
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation

In [3]:
from gensim.models import KeyedVectors
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Layer
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [4]:
import sys

from keras import backend as K
#from keras import initializations
from keras import initializers, regularizers, constraints
from sklearn.metrics import roc_auc_score

In [5]:
class Attention(Layer):
    # Input shape 3D tensor with shape: `(samples, steps, features)`.
    # Output shape 2D tensor with shape: `(samples, features)`.

    def __init__(
        self,
        step_dim,
        W_regulizer=None,
        b_regulizer=None,
        W_constraint=None,
        b_constraint=None,
        bias=True,
        **kwargs
    ):

        self.W_regulizer = W_regulizer
        self.b_regulizer = b_regulizer

        self.W_constraint = W_constraint
        self.b_constraint = b_constraint

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        self.init = initializers.get("glorot_uniform")
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3
        # Create a trainable weight variable for this layer.
        self.W = self.add_weight(
            shape=(input_shape[-1],),
            initializer=self.init,
            constraint=self.W_constraint,
            regularizer=self.W_regulizer,
            name="{}_W".format(self.name),
        )

        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight(
                (input_shape[1],),
                initializer="zero",
                name="{}_b".format(self.name),
                regularizer=self.b_regulizer,
                constraint=self.b_constraint,
            )
        else:
            self.b = None
        super(Attention, self).build(input_shape)

    def call(self, x, mask=None):

        features_dim = self.features_dim
        step_dim = self.step_dim

        eij = K.reshape(
            K.dot(
                K.reshape(x, (-1, features_dim)), K.reshape(self.W, (features_dim, 1))
            ),
            (-1, step_dim),
        )

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:

            a *= K.cast(mask, K.floatx())

        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a

        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0], self.features_dim


In [6]:
path = "./data/"
path1 = './'
EMBEDDING_FILE=path1+'glove.6B.100d.txt'
TRAIN_DATA_FILE=path+'train.csv'
TEST_DATA_FILE=path+'test.csv'

MAX_SEQUENCE_LENGTH = 150
MAX_NB_WORDS = 100000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.1

num_lstm = 300
num_dense = 256
lstm_dropout_rate = 0.25
dense_dropout_rate = 0.25

act = 'relu'

In [7]:
print('Indexing word vectors')
embedding_index = {}
with open(EMBEDDING_FILE,'r',encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype = 'float32')
        embedding_index[word] = coefs
print('Indexed the word vectors')   
print('Found %s word vectors.' %len(embedding_index))     

train_df = pd.read_csv(TRAIN_DATA_FILE)
test_df = pd.read_csv(TEST_DATA_FILE)

Indexing word vectors
Indexed the word vectors
Found 400000 word vectors.


In [8]:
print('performing some basic preprocessing on data')

#regex for removing non-alphanumeric characters and spaces
remove_special_char = re.compile('r[^a-z\d]',re.IGNORECASE)

#regex to replace all numerics
replace_numerics = re.compile(r'\d+',re.IGNORECASE)

performing some basic preprocessing on data


In [9]:
def preprocess_text(text, remove_stopwords = True, perform_stemming = True):
    #convert text to lowercase and split.
    text = text.lower().split()
    
    #stopword removal(you can use your own set of stopwords, here we are using default from nltk stopwords)
    if(remove_stopwords):
        stop_words = set(stopwords.words('english'))
        text = [word for word in text if word not in stop_words]
     
    text = ' '.join(text)   
    
    text = remove_special_char.sub('', text)
    text = replace_numerics.sub('n', text)
        
    if(perform_stemming):
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = ' '.join(stemmed_words)
        
    return text    

In [10]:
train_df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [13]:
raw_train_comments = train_df['comment_text'].fillna('NA').values
raw_test_comments = test_df['comment_text'].fillna('NA').values
classes_to_predict = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train_df[classes_to_predict].values
#y_test_predicted = test_df[classes_to_predict].values

processed_train_comments = []
for comment in raw_train_comments:
    processed_train_comments.append(preprocess_text(comment))
    
processed_test_comments = []    
for comment in raw_test_comments:
    processed_test_comments.append(preprocess_text(comment))

In [17]:
tokenizer = Tokenizer(num_words = MAX_NB_WORDS)
tokenizer.fit_on_texts(processed_train_comments + processed_test_comments)

train_sequences = tokenizer.texts_to_sequences(processed_train_comments)
test_sequences = tokenizer.texts_to_sequences(processed_test_comments)

print('found %s tokens in text.' %(len(tokenizer.word_index)))

train_data = pad_sequences(train_sequences, maxlen = MAX_SEQUENCE_LENGTH)
final_test_data = pad_sequences(test_sequences, maxlen = MAX_SEQUENCE_LENGTH)

print('shape of train_data(will be divided further into final_train_data + final_validation_data) ready for feeding to network is [%s,%s]' %(train_data.shape))
print('shape of final_test_data ready for fedding to network is [%s,%s]' %(final_test_data.shape))
print('shape of label(y) is [%s,%s]' %(y.shape))

found 503597 tokens in text.
shape of train_data(will be divided further into final_train_data + final_validation_data) ready for feeding to network is [159571,150]
shape of final_test_data ready for fedding to network is [153164,150]
shape of label(y) is [159571,6]


In [19]:
print('preparing embedding matrix')
word_index = tokenizer.word_index
nb_words  = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if(i> MAX_NB_WORDS):
        continue
    embedding_vector = embedding_index.get(word) 
    if(embedding_vector is not None):
        embedding_matrix[i] = embedding_vector
print('embedding matrix preparation complete')    

preparing embedding matrix


ValueError: could not broadcast input array from shape (100,) into shape (300,)