In [224]:
import pandas as pd
import numpy as np
import pickle
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, GRU, Embedding, Dropout, Activation, BatchNormalization, AveragePooling1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from Attention import *
from tqdm import tqdm
from sklearn.utils.class_weight import compute_class_weight
import tensorflow as tf
from imblearn.over_sampling import RandomOverSampler

In [240]:
data = pd.read_csv('../data/labeled_data.csv')

In [4]:
data.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [241]:
index = np.arange(len(data)).reshape(-1,1)
ros = RandomOverSampler()
balanced_index, balanced_labels = ros.fit_sample(index, labels)

In [244]:
balanced_index = balanced_index.flatten()
balanced_labels = balanced_labels.flatten()

In [245]:
data = data.iloc[balanced_index, :]
y_train = to_categorical(balanced_labels)

In [247]:
data['class'].value_counts()

2    19190
1    19190
0    19190
Name: class, dtype: int64

In [248]:
tweets = data.tweet
labels = data['class'].values

In [249]:
MAX_NB_WORDS = 200000
EMBEDDING_DIM = 300
MAX_SEQUENCE_LENGTH = 100

In [250]:
tokenizer = Tokenizer(num_words = MAX_NB_WORDS)
tokenizer.fit_on_texts(tweets)

In [251]:
train_sequences = tokenizer.texts_to_sequences(tweets)
X_train = np.array([np.array(_) for _ in train_sequences])
X_train = pad_sequences(train_sequences, maxlen = MAX_SEQUENCE_LENGTH)

In [252]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 36508 unique tokens.


In [12]:
embeddings_index = {}
f = open('../embeddings/glove.840B.300d.txt')
for line in f:
    values = line.split()
    word = values[0]
    try:
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    except:
        print("error reading word", word)
f.close()

print('Found %s word vectors.' % len(embeddings_index))

error reading word .
error reading word at
error reading word .
error reading word to
error reading word .
error reading word .
error reading word email
error reading word or
error reading word contact
error reading word Email
error reading word on
error reading word At
error reading word by
error reading word in
error reading word emailing
error reading word Contact
error reading word at
error reading word •
error reading word at
error reading word is
Found 2195884 word vectors.


In [13]:
########################################
print('Preparing embedding matrix')
nb_words = min(MAX_NB_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

Preparing embedding matrix
Null word embeddings: 17622


In [253]:
embedding_matrix.shape

(36509, 300)

In [254]:
embedding_layer = Embedding(nb_words, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False)

In [255]:
tweet_input = Input(shape=[MAX_SEQUENCE_LENGTH], dtype='int32')

with tf.device('/gpu:0'):
    x = embedding_layer(tweet_input)

with tf.device('/gpu:1'):
    x = Bidirectional(LSTM(300, dropout=0.1, return_sequences=True, recurrent_dropout=0.1))(x)
    x = Attention()(x)
    x = Dense(100, activation='relu')(x)
    x = Dropout(0.3)(x)
    x = BatchNormalization()(x)
    predictions = Dense(3, activation='softmax')(x)

In [256]:
model = Model(tweet_input, predictions)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc'])

In [257]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 100, 300)          10952700  
_________________________________________________________________
bidirectional_5 (Bidirection (None, 100, 600)          1442400   
_________________________________________________________________
attention_4 (Attention)      (None, 600)               700       
_________________________________________________________________
dense_7 (Dense)              (None, 100)               60100     
_________________________________________________________________
dropout_4 (Dropout)          (None, 100)               0         
_________________________________________________________________
batch_normalization_4 (Batch (None, 100)               400       
__________

In [258]:
early_stopping = EarlyStopping(monitor='val_loss', patience=10)
model_ckpt = ModelCheckpoint(filepath='../models/sentiment_analysis.h5', save_best_only=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', patience=5)
callbacks = [early_stopping, model_ckpt, reduce_lr]

In [259]:
# def batch_generator(X, y, batch_size):
#     idx = np.arange(X)
#     while True:
#         np.random.shuffle(idx)
#         X = X[idx]
#         y = y[idx]
#         for i in range(0, len(X)):
#             yield (pad_sequences(X[i:i + batch_size]), y[i:i + batch_size])

In [260]:
model.fit(X_train, y_train, batch_size=256, epochs=20, validation_split=0.2, callbacks=callbacks)

Train on 46056 samples, validate on 11514 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fa2560a2c18>

In [167]:
model.load_weights('../models/sentiment_analysis.h5')

In [168]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [172]:
train_comments = tokenizer.texts_to_sequences(train['comment_text'])
train_comments = pad_sequences(train_comments, maxlen=MAX_SEQUENCE_LENGTH)

In [175]:
train_probs = model.predict(train_comments)

In [182]:
train['prob_hate_speech'] = np.nan
train['prob_offensive_language'] = np.nan

In [183]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,prob_hate_speech,prob_offensive_language
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0,,
1,27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0,,
2,54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0,,
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0,,
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0,,


In [186]:
train.loc[:, ['prob_hate_speech', 'prob_offensive_language']] = train_probs[:, :2]

In [188]:
train.to_csv('../data/processed/train_sent_analysis.csv', index=False)

In [190]:
test_comments = tokenizer.texts_to_sequences(test['comment_text'].fillna('UNK'))
test_comments = pad_sequences(test_comments, maxlen=MAX_SEQUENCE_LENGTH)

In [191]:
test_probs = model.predict(test_comments)

In [192]:
test['prob_hate_speech'] = np.nan
test['prob_offensive_language'] = np.nan

In [193]:
test.loc[:, ['prob_hate_speech', 'prob_offensive_language']] = test_probs[:, :2]

In [194]:
test.head()

Unnamed: 0,id,comment_text,prob_hate_speech,prob_offensive_language
0,6044863,==Orphaned non-free media (Image:41cD1jboEvL. ...,0.042976,0.1362989
1,6102620,::Kentuckiana is colloquial. Even though the ...,0.000807,0.006388481
2,14563293,"Hello fellow Wikipedians,\nI have just modifie...",0.000168,8.061553e-07
3,21086297,"AKC Suspensions \nThe Morning Call - Feb 24, 2...",0.024494,0.03593352
4,22982444,== [WIKI_LINK: Talk:Celts] ==,0.025673,0.05709131
