In [17]:
import numpy as np
import pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, GRU, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D, MaxPooling1D, Conv1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.callbacks import EarlyStopping, ModelCheckpoint
import gc
from sklearn.model_selection import train_test_split
from keras.models import load_model
import tensorflow as tf
from keras.models import model_from_json
import keras.backend
import unidecode
import json
import regex as re
import pickle

In [2]:
# Parameters
EMBEDSIZE = 50
MAXFEATURES = 2000
MAXLEN = 100
batch_size = 32
epochs = 3


In [3]:
train = pd.read_csv('data/train.csv')
test  = pd.read_csv('data/test.csv')
EMBEDDING_FILE = f'glove-twitter-27B/glove.twitter.27B.50d.txt'

In [4]:
def glove_preprocess(text):
    """
    adapted from https://nlp.stanford.edu/projects/glove/preprocess-twitter.rb

    """
    # Different regex parts for smiley faces
    eyes = "[8:=;]"
    nose = "['`\-]?"
    text = re.sub("https?:* ", "<URL>", text)
    text = re.sub("www.* ", "<URL>", text)
    text = re.sub("\[\[User(.*)\|", '<USER>', text)
    text = re.sub("<3", '<HEART>', text)
    text = re.sub("[-+]?[.\d]*[\d]+[:,.\d]*", "<NUMBER>", text)
    text = re.sub(eyes + nose + "[Dd)]", '<SMILE>', text)
    text = re.sub("[(d]" + nose + eyes, '<SMILE>', text)
    text = re.sub(eyes + nose + "p", '<LOLFACE>', text)
    text = re.sub(eyes + nose + "\(", '<SADFACE>', text)
    text = re.sub("\)" + nose + eyes, '<SADFACE>', text)
    text = re.sub(eyes + nose + "[/|l*]", '<NEUTRALFACE>', text)
    text = re.sub("/", " / ", text)
    text = re.sub("[-+]?[.\d]*[\d]+[:,.\d]*", "<NUMBER>", text)
    text = re.sub("([!]){2,}", "! <REPEAT>", text)
    text = re.sub("([?]){2,}", "? <REPEAT>", text)
    text = re.sub("([.]){2,}", ". <REPEAT>", text)
    pattern = re.compile(r"(.)\1{2,}")
    text = pattern.sub(r"\1" + " <ELONG>", text)

    return text

In [5]:
X_train, X_test, y_train, y_test = train_test_split(train,
                                        train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]],
                                        test_size = 0.10, random_state = 42)
list_sentences_train = X_train["comment_text"].apply(glove_preprocess)
list_sentences_test = X_test["comment_text"].apply(glove_preprocess)

In [6]:
tokenizer = Tokenizer(num_words=MAXFEATURES,char_level=True)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_sentences_test = tokenizer.texts_to_sequences(list_sentences_test)

X_t = pad_sequences(list_tokenized_train, maxlen=MAXLEN)
X_te = pad_sequences(list_sentences_test, maxlen=MAXLEN)

In [7]:
def get_model(embedding_matrix, dropout = 0.2):
    inp = Input(shape=(MAXLEN,))
    x = Embedding(MAXFEATURES, EMBEDSIZE, weights=[ embedding_matrix])(inp)
    x = Conv1D(filters = 100, kernel_size = 4, padding = 'same', activation = 'relu' )(x)
    x = MaxPooling1D(pool_size =4)(x)
    x = Bidirectional(GRU(60, return_sequences=True, dropout=dropout, recurrent_dropout=0.2))(x)
    x = GlobalMaxPool1D()(x)
    x = Dense(50, activation="relu")(x)
    x = Dropout(dropout)(x)
    x = Dense(6, activation = "sigmoid")(x)
    model = Model(inputs= inp, outputs = x)
    model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    return model

In [8]:
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(EMBEDDING_FILE,encoding="utf8"))

In [9]:
word_index = tokenizer.word_index
num_words = min(MAXFEATURES, len(word_index))
embedding_matrix = np.zeros((num_words, EMBEDSIZE))
for word, i in word_index.items():
    if i >= MAXFEATURES:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [10]:
model = get_model(embedding_matrix, dropout=0.2)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 50)           100000    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 100, 100)          20100     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 25, 100)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 25, 120)           57960     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 120)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 50)                6050      
__________

In [11]:
# Call backs
wtFile = "weights.best.hdf5"
checkpoint = ModelCheckpoint(wtFile, monitor = 'val_loss', verbose=1, save_best_only=True, mode = 'min')
early = EarlyStopping(monitor="val_loss", mode="min", patience=20)
callbacks_list = [checkpoint, early] #early

In [12]:
model.fit(X_t, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_te,y_test), callbacks=callbacks_list)
print("Saved Model Weights to file!!")

Train on 143613 samples, validate on 15958 samples
Epoch 1/3

Epoch 00001: val_loss improved from inf to 0.07408, saving model to weights.best.hdf5
Epoch 2/3

Epoch 00002: val_loss improved from 0.07408 to 0.06754, saving model to weights.best.hdf5
Epoch 3/3

Epoch 00003: val_loss improved from 0.06754 to 0.06474, saving model to weights.best.hdf5
Saved Model Weights to file!!


In [13]:
model.load_weights(wtFile)

y_test = model.predict(X_te)

In [14]:
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
print("Saved Model to file!!")    

Saved Model to file!!


In [18]:
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
print("Saved Tokenizer to file!!")  

Saved Tokenizer to file!!


In [15]:
# tweet = X_test["comment_text"].iloc[0]
# tweet_pre = glove_preprocess(tweet)
# print(tweet_pre)
# print(list_sentences_train[0])
# tweet_seq = tokenizer.texts_to_sequences([tweet_pre])
# print(tweet_seq)

In [16]:
tweet_seq = pad_sequences(tweet_seq, maxlen= MAXLEN)
print(tweet)
print(tweet_seq)
print(tweet_seq.shape)

NameError: name 'tweet_seq' is not defined

In [None]:
# y_tweet = model.predict(tweet_seq)
# y_tweet