In [1]:
import os
import re
import csv
import codecs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation

In [3]:
from gensim.models import KeyedVectors
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.optimizers import Adam
import sys

Using TensorFlow backend.


In [4]:
from keras import backend as K
from keras.engine.topology import Layer
#from keras import initializations
from keras import initializers, regularizers, constraints


In [None]:
class Attention(Layer):
    """Attention pooling over the time axis of a 3D sequence tensor.

    Keras layer implementing the feed-forward attention mechanism of
    Raffel et al. [https://arxiv.org/abs/1512.08756]. Supports masking.

    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.

    Put it on top of an RNN layer (GRU/LSTM/SimpleRNN) created with
    `return_sequences=True`. The feature dimension is read from the input
    shape in `build`; the number of time steps must be passed as `step_dim`.

    Example:
        model.add(LSTM(64, return_sequences=True))
        model.add(Attention(step_dim=MAX_SEQUENCE_LENGTH))
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        """Configure the layer.

        :param step_dim: number of time steps of the incoming sequence.
        :param W_regularizer: regularizer for the attention weight vector.
        :param b_regularizer: regularizer for the per-step bias.
        :param W_constraint: constraint for the attention weight vector.
        :param b_constraint: constraint for the per-step bias.
        :param bias: whether to add a per-step bias before the tanh.
        :param kwargs: forwarded to `Layer.__init__`.
        """
        # Run the base initialiser first: it assigns its own default for
        # `supports_masking`, so setting the flag beforehand would be lost.
        super(Attention, self).__init__(**kwargs)
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        # Filled in by build() from the last axis of the input shape.
        self.features_dim = 0

    def build(self, input_shape):
        """Create the weight vector (one entry per feature) and optional bias."""
        assert len(input_shape) == 3

        # Keyword arguments keep this call valid across Keras versions whose
        # add_weight() takes `name` (not `shape`) as first positional argument.
        self.W = self.add_weight(name='{}_W'.format(self.name),
                                 shape=(input_shape[-1],),
                                 initializer=self.init,
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            # One bias value per time step.
            self.b = self.add_weight(name='{}_b'.format(self.name),
                                     shape=(input_shape[1],),
                                     initializer='zeros',
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        """Return None: the time axis is reduced, so no mask is propagated."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over the time axis."""
        features_dim = self.features_dim
        step_dim = self.step_dim

        # K.dot(x, self.W) on the 3D tensor is not supported by the TF
        # backend, so flatten to 2D, multiply, and restore (samples, steps).
        flat_x = K.reshape(x, (-1, features_dim))
        w_column = K.reshape(self.W, (features_dim, 1))
        eij = K.reshape(K.dot(flat_x, w_column), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # Apply the mask after the exp; the weights are re-normalised next.
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in Theano.
            a *= K.cast(mask, K.floatx())

        # Early in training the sum may be almost zero, hence the epsilon.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        # (samples, steps) -> (samples, steps, 1) so it broadcasts over features.
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Output shape is `(samples, features)`."""
        return input_shape[0], self.features_dim

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = {
            'step_dim': self.step_dim,
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(Attention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
        

In [None]:
# --- Input locations -------------------------------------------------------
path = '../data/preprocessed_data/'
TRAIN_DATA_FILE = '{}train.csv'.format(path)
TEST_DATA_FILE = '{}test.csv'.format(path)
EMBEDDING_FILE = './word_embeddings/glove.840B.300d.txt'

# --- Text / embedding settings ---------------------------------------------
MAX_NB_WORDS = 40000            # passed to Tokenizer(num_words=...)
MAX_SEQUENCE_LENGTH = 200       # pad/truncate length for every sequence
EMBEDDING_DIM = 300             # width of each embedding vector
VALIDATION_SPLIT = 0.15         # fraction of the training rows held out

# --- Model / training hyper-parameters -------------------------------------
num_lstm = 200                  # LSTM units
num_dense = 256                 # units of the dense layer after attention
rate_drop_lstm = 0.4            # LSTM input and recurrent dropout
rate_drop_dense = 0.4           # dropout around the dense layer
act = 'relu'                    # dense-layer activation
num_epochs = 10
batch_size = 128

In [None]:

# Load the train and test splits from the paths set in the config cell.
train_df = pd.read_csv(TRAIN_DATA_FILE)
test_df = pd.read_csv(TEST_DATA_FILE)
# Separately labelled evaluation set (presumably hand-labelled, judging by
# the `human_df` name -- TODO confirm provenance).
human_df = pd.read_csv('../Terminator Mode/test_data_labelled')
# Preview a bounded number of rows instead of rendering the whole frame.
human_df.head(10)

Unnamed: 0.1,Unnamed: 0,Comment,Labels
0,1000,""" \r\r\r\n == Speedy deletion of """"Mason Henso...",0.0
1,1001,onwhat ehbcdpyedwcdo vaeotgbcdjvfh8dwikxosmn b...,0.0
2,1002,*I don't know what's going on here - the note ...,0.0
3,1003,""" \r\r\r\n\r\r\r\n \r\r\r\n == Final draft… m...",0.0
4,1004,""" \r\r\r\n\r\r\r\n : This is not an edit propo...",1.0
5,1005,Is he contesting 2014 Lok Sabha elecction. If ...,0.0
6,1006,== 'citation needed' is not needed == \r\r\r\n...,0.0
7,1007,""" \r\r\r\n\r\r\r\n \r\r\r\n\r\r\r\n WOW, so t...",1.0
8,1008,""" \r\r\r\n\r\r\r\n :Looking at the issue again...",0.0
9,1009,Hi David_FLXD. I am coming back to you with re...,0.0


In [None]:
########################################
## index word vectors
########################################
print('Indexing word vectors')

# GloVe vectors: map each token to its float32 coefficient array.
embeddings_index = {}
# `with` closes the file even if parsing a line raises.
with open(EMBEDDING_FILE, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        # The last EMBEDDING_DIM fields are the coefficients; whatever comes
        # before them is the token, re-joined in case it was split on spaces.
        word = ' '.join(values[:-EMBEDDING_DIM])
        coefs = np.asarray(values[-EMBEDDING_DIM:], dtype='float32')
        embeddings_index[word] = coefs

print('Total %s word vectors.' % len(embeddings_index))


Indexing word vectors
Total 2195895 word vectors.


In [None]:
########################################
## process texts in datasets
########################################
print('Processing text dataset')

# Matches any character that is not a letter a-z (case-insensitive),
# a digit or a space.
special_character_removal = re.compile(r'[^a-z\d ]', re.IGNORECASE)

# Matches each run of consecutive digits.
replace_numbers = re.compile(r'\d+', re.IGNORECASE)


def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    """Normalise a comment and return it as a single cleaned string.

    Despite the name, the result is a space-joined string, not a list.

    Steps, in order: lower-case and collapse whitespace; optionally drop
    English stop words; delete every character matched by
    `special_character_removal`; replace each digit run with 'n';
    optionally stem each word with the English Snowball stemmer.

    :param text: raw comment string.
    :param remove_stopwords: drop NLTK English stop words when True.
    :param stem_words: stem every remaining word when True.
    :return: the cleaned text as one string.
    """
    tokens = text.lower().split()

    if remove_stopwords:
        stops = set(stopwords.words("english"))
        tokens = [tok for tok in tokens if tok not in stops]

    # Special characters are deleted (not replaced by a space), so
    # punctuation inside a token glues its parts together.
    cleaned = special_character_removal.sub('', " ".join(tokens))
    cleaned = replace_numbers.sub('n', cleaned)

    if stem_words:
        stemmer = SnowballStemmer('english')
        cleaned = " ".join(stemmer.stem(tok) for tok in cleaned.split())

    return cleaned

print("Done")

Processing text dataset
Done


In [None]:
# Raw comment strings and labels for the train and test splits; missing
# comments become the literal string "NA".
list_sentences_train = train_df["Comment"].fillna("NA").values
y = train_df["Labels"].values
list_sentences_test = test_df["Comment"].fillna("NA").values
y_real = test_df["Labels"]
# First 1000 rows of the separately labelled set. Missing comments are
# filled exactly like train/test: a NaN float has no .lower(), so
# text_to_wordlist would otherwise raise on it.
list_sentences_human = human_df["Comment"].fillna("NA").iloc[:1000]
y_human = human_df["Labels"].iloc[:1000]
y_human

0      0.0
1      0.0
2      0.0
3      0.0
4      1.0
5      0.0
6      0.0
7      1.0
8      0.0
9      0.0
10     1.0
11     0.0
12     0.0
13     0.0
14     0.0
15     0.0
16     0.0
17     0.0
18     0.0
19     0.0
20     0.0
21     0.0
22     1.0
23     0.0
24     0.0
25     0.0
26     0.0
27     0.0
28     0.0
29     0.0
      ... 
970    0.0
971    0.0
972    0.0
973    0.0
974    1.0
975    0.0
976    0.0
977    0.0
978    0.0
979    0.0
980    0.0
981    0.0
982    0.0
983    1.0
984    0.0
985    1.0
986    0.0
987    0.0
988    0.0
989    0.0
990    0.0
991    0.0
992    0.0
993    0.0
994    0.0
995    0.0
996    0.0
997    0.0
998    1.0
999    0.0
Name: Labels, Length: 1000, dtype: float64

In [None]:
# Clean every corpus with the same normaliser.
comments = [text_to_wordlist(raw) for raw in list_sentences_train]
test_comments = [text_to_wordlist(raw) for raw in list_sentences_test]
human_comments = [text_to_wordlist(raw) for raw in list_sentences_human]

# The vocabulary is fitted on train + test comments; the human set is only
# transformed with it.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(comments + test_comments)

# Map each cleaned comment to its sequence of word indices.
sequences, test_sequences, human_sequences = (
    tokenizer.texts_to_sequences(corpus)
    for corpus in (comments, test_comments, human_comments)
)

word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

# Pad/truncate every sequence to MAX_SEQUENCE_LENGTH.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', y.shape)

test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of test_data tensor:', test_data.shape)

human_data = pad_sequences(human_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of human data tensor:', human_data.shape)

Found 271583 unique tokens
Shape of data tensor: (159571, 200)
Shape of label tensor: (159571,)
Shape of test_data tensor: (63978, 200)
Shape of human data tensor: (1000, 200)


In [None]:
########################################
## prepare embeddings
########################################
print('Preparing embedding matrix')
# Keras Tokenizer indices start at 1 (row 0 is left for padding), so a
# vocabulary of V words needs V + 1 rows. Without the +1 the highest index
# would fall outside the matrix whenever V < MAX_NB_WORDS.
nb_words = min(MAX_NB_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    # Skip words whose index has no row in the matrix.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in the embedding index stay all-zeros.
        embedding_matrix[i] = embedding_vector

# Rows whose coefficients sum to zero (this includes the padding row 0).
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))


Preparing embedding matrix
Null word embeddings: 5051


In [None]:
########################################
## sample train/validation data
########################################
# A dedicated, seeded RandomState makes the split reproducible across
# kernel restarts without reseeding NumPy's global generator.
split_rng = np.random.RandomState(1234)
perm = split_rng.permutation(len(data))

# Index in `perm` where the validation part begins.
n_train = int(len(data) * (1 - VALIDATION_SPLIT))
idx_train = perm[:n_train]
idx_val = perm[n_train:]

data_train = data[idx_train]
labels_train = y[idx_train]
print(data_train.shape, labels_train.shape)

data_val = data[idx_val]
labels_val = y[idx_val]

print(data_val.shape, labels_val.shape)


(135635, 200) (135635,)
(23936, 200) (23936,)


In [None]:
########################################
## define the model structure
########################################
# Embedding initialised from `embedding_matrix` and frozen (trainable=False).
embedding_layer = Embedding(nb_words,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False)
# return_sequences=True keeps the output for every time step so the Attention
# layer below can pool over them.
lstm_layer = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm,return_sequences=True)

# Functional graph: padded word indices -> embedding -> LSTM -> dropout
# -> attention pooling -> dense -> dropout -> batch norm -> sigmoid output.
comment_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences= embedding_layer(comment_input)
x = lstm_layer(embedded_sequences)
x = Dropout(rate_drop_dense)(x)
# Attention receives the sequence length as its `step_dim` argument.
merged = Attention(MAX_SEQUENCE_LENGTH)(x)
merged = Dense(num_dense, activation=act)(merged)
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)
# Single sigmoid unit: one value in (0, 1) per comment.
preds = Dense(1, activation='sigmoid')(merged)


In [None]:
########################################
## train the model
########################################
model = Model(inputs=[comment_input], outputs=preds)
# NOTE(review): this configured optimizer is NOT passed to compile() below;
# the string 'adam' makes Keras build an Adam instance with its default
# settings. Behaviour is left unchanged here; pass `optimizer=adam` if the
# custom learning rate / betas are actually intended.
adam = Adam(lr=0.00001, beta_1=0.9, beta_2=0.99, epsilon=1e-8)
model.compile(loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line.
model.summary()
STAMP = 'simple_lstm_glove_vectors_%.2f_%.2f' % (rate_drop_lstm, rate_drop_dense)
print(STAMP)
bst_model_path = STAMP + '.h5'

#early_stopping =EarlyStopping(monitor='val_loss', patience=10)
# Halve the learning rate after 2 epochs without val_loss improvement.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=1, mode='min')
# Keep only the weights of the best epoch seen so far.
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

hist = model.fit(data_train, labels_train,
        validation_data=(data_val, labels_val),
        epochs=num_epochs, batch_size=batch_size, shuffle=True,
        callbacks=[model_checkpoint, reduce_lr])
# Restore the best checkpoint before predicting.
model.load_weights(bst_model_path)
bst_val_score = min(hist.history['val_loss'])
# Predicted probabilities for the test set and the separately labelled set.
y_test = model.predict([test_data], batch_size=1024, verbose=1)
y_human_pred = model.predict([human_data], batch_size=1000, verbose=1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 200)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 200, 300)          12000000  
_________________________________________________________________
lstm_1 (LSTM)                (None, 200, 200)          400800    
_________________________________________________________________
dropout_1 (Dropout)          (None, 200, 200)          0         
_________________________________________________________________
attention_1 (Attention)      (None, 200)               400       
_________________________________________________________________
dense_1 (Dense)              (None, 256)               51456     
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
__________

In [None]:
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.metrics import precision_recall_fscore_support
y_test = np.where(y_test > 0.5, 1, 0)
confmat = confusion_matrix(y_real, y_test)
print(confmat)

In [None]:
roc_auc = roc_auc_score(y_real, y_test, 'macro')
print("ROC AUC score mean: %f" % (roc_auc))

In [None]:
# Display the metrics recorded by model.fit (hist.history is indexed by
# metric name elsewhere in this notebook, e.g. 'loss' and 'val_loss').
hist.history

In [None]:
## Plot training and validation loss
history = hist.history
# Use the number of epochs actually recorded rather than the configured
# `num_epochs`, so the plot still works if training stops early.
epochs_ran = np.arange(len(history['loss']))
# Older Keras logs accuracy as 'acc'/'val_acc', newer releases as
# 'accuracy'/'val_accuracy'; pick whichever is present.
acc_key = 'acc' if 'acc' in history else 'accuracy'

fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(15, 10), dpi=80)
fig.subplots_adjust(hspace=0.4, wspace=0.4)

ax_loss.plot(epochs_ran, history['val_loss'], label="Validation Loss")
ax_loss.plot(epochs_ran, history['loss'], label="Training loss")
ax_loss.set_xlabel("Epochs")
ax_loss.set_ylabel("Binary Cross Entropy Loss")
ax_loss.set_ylim((0, 0.3))
ax_loss.legend(loc='upper left')

ax_acc.plot(epochs_ran, history['val_' + acc_key], label="Validation Accuracy")
ax_acc.plot(epochs_ran, history[acc_key], label="Training Accuracy")
ax_acc.set_xlabel("Epochs")
ax_acc.set_ylabel("Accuracy")
ax_acc.set_ylim((0.9, 1))
ax_acc.legend(loc='upper left');

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Confusion matrix of the most recent evaluation, drawn as an annotated heatmap.
cm = pd.DataFrame(confmat)
heatmap_ax = sns.heatmap(cm, annot=True, fmt='g')
heatmap_ax.set_ylabel('True label')
heatmap_ax.set_xlabel('Predicted label')
print("ROC AUC score mean: %f" % (roc_auc))

In [None]:
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.metrics import precision_recall_fscore_support
y_human_pred = np.where(y_human_pred > 0.5, 1, 0)
confmat = confusion_matrix(y_human, y_human_pred)
print(confmat)

In [None]:
roc_auc = roc_auc_score(y_human, y_human_pred, 'macro')
confmat

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import roc_auc_score

# Confusion matrix of the most recent evaluation, drawn as an annotated heatmap.
cm = pd.DataFrame(confmat)
heatmap_ax = sns.heatmap(cm, annot=True, fmt='g')
heatmap_ax.set_ylabel('True label')
heatmap_ax.set_xlabel('Predicted label')
print("ROC AUC score mean: %f" % (roc_auc))