# Bidirectional Gated Recurrent Unit  
Amanda Maiwald, Martin Falli, Radoslav Evtimov

## Import libraries, data and setup

In [0]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
from sklearn.metrics import f1_score 

from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split, KFold
import random

from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer

from keras.preprocessing import text, sequence
from keras.layers import Embedding, SpatialDropout1D
from keras.models import Model, Sequential
from keras.layers import Dense, Embedding, Input
from keras.optimizers import RMSprop
import keras.backend as K
from keras.layers import Dense, Input, GRU, LSTM, Bidirectional, Dropout, CuDNNLSTM, CuDNNGRU
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold

Using TensorFlow backend.


In [0]:
# source https://github.com/QuantLet/AOBDL_code/blob/master/AOBDL_DL/separate_models/code_6_BGRU.ipynb

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Local directory structure
base_dir = '/content/drive/My Drive/BERT/BERT_Code_Input_Output/'
data_dir = base_dir + 'Data/'
model_dir = base_dir + 'Model_Output_BGRU'

In [0]:
train = pd.read_csv(data_dir + 'train_cleaned_no_punkt.csv') 

## Data preparation

In [0]:
train['mal'] = train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1) >= 1  
train.drop(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'], axis=1, inplace=True)
train.comment_text.fillna("empty", inplace=True)

In [0]:
from sklearn.model_selection import train_test_split
rs=42

In [0]:
train, test = train_test_split(train, test_size = 0.2, stratify = train.mal)

In [0]:
true_labels = test.iloc[:,2:4]
y_test = true_labels
true_labels.head(3)

Unnamed: 0,mal
89938,False
83878,False
144391,False


In [0]:
test.drop(['mal'], axis=1, inplace=True) 
X_test= test.comment_text
test.head(3)

Unnamed: 0,id,comment_text
89938,f098086336fa0bc8,happy haunts there actually about ish happy ha...
83878,e078062f230ba9a0,alexander or justin david alexander vincent or...
144391,0c50408ed7ca7bdd,there no rule agreement except common way to h...


In [0]:
y = train.iloc[:,2:3]
y_train = y
y.head(3)

Unnamed: 0,mal
264,False
20537,False
102615,False


In [0]:
train.drop(['mal'], axis=1, inplace=True) 
X= train.comment_text
X_train = X
X.head(3)

264       education please keep list colleges in pittsbu...
20537     november utc dear rev michael s margolin bias ...
102615    by way i like how you page completely rigged a...
Name: comment_text, dtype: object

Defining some important features of the model

In [0]:
max_features = 40000
maxlen = 128
dropout_rate = 0
rs = 42
epochs = 4
batch_size = 250
embed_dim = 50
rec_units = 150

#### Creating the model and running the training process

In [0]:
def gru_keras(max_features, maxlen, dropout_rate, embed_dim, rec_units):
    if K.backend == 'tensorflow':        
        K.clear_session()
    input_layer = Input(shape=(maxlen,))
    embedding_layer = Embedding(max_features, output_dim=embed_dim, trainable=True)(input_layer)
    x = SpatialDropout1D(dropout_rate)(embedding_layer)
    x = Bidirectional(CuDNNGRU(units=rec_units, return_sequences=False))(x)
    output_layer = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(loss='binary_crossentropy',
                  optimizer=RMSprop(clipvalue=1, clipnorm=1),
                  metrics=['acc'])
    print( model.summary())
    return model

In [0]:
K.clear_session()
#X_train = X
#y_train = y

tokenizer = text.Tokenizer(num_words=max_features, oov_token='unknown')
tokenizer.fit_on_texts(X_train)

list_tokenized_train = tokenizer.texts_to_sequences(X_train)
list_tokenized_test  = tokenizer.texts_to_sequences(X_test)
X_train = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
X_test = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

model = gru_keras(max_features, maxlen, dropout_rate, embed_dim, rec_units)

y_train = np.array(y_train)
y_test = np.array(y_test)

print('Fitting')
history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, shuffle=True, verbose=1)
model.save_weights(f'{model_dir}/BGRU.h5')
probs = model.predict(X_test, batch_size=batch_size, verbose=1)
auc_f = average_precision_score(y_test, probs)
roc_f = roc_auc_score(y_test, probs)

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 128)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 128, 50)           2000000   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 128, 50)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 300)               181800    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 301       
Total params: 2,182,101
Trainable params: 2,182,101
Non-trainable params: 0
_________________________________________________________________
None
Fitting
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


####Evaluation of results on test data 

In [0]:
probs_class = probs.copy()
probs_class=pd.DataFrame(probs_class)

In [0]:
probs_class = np.where(probs_class.iloc[:,0] >= 0.3, 1, 0)

In [0]:
print("AUC PR:",round(roc_f, 4))
print ("AUC ROC:",round(auc_f, 4) )
print("F1 Score:", round(sklearn.metrics.f1_score (y_test, probs_class),4))

AUC PR: 0.9794
AUC ROC: 0.9001
F1 Score: 0.8138


In [0]:
history_df = pd.DataFrame({'acc':history.history['acc'],
                    'loss': history.history['loss']#,
                    #'val_acc': history.history['val_acc'],
                    #'val_loss': history.history['val_loss']
                    })

In [0]:
history_df

Unnamed: 0,acc,loss
0,0.946019,0.164124
1,0.963088,0.102813
2,0.966629,0.09039
3,0.96829,0.085025


####Saving the results

In [0]:
# save to csv: 
#filename = 'BGRU_history.csv'
#with open(filename, mode='w') as f:
    #history_df.to_csv(model_dir + '/' + filename)
