In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
# from utils import *

# helps in text preprocessing
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# helps in model building
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import SimpleRNN
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.callbacks import EarlyStopping

# Explaining the model
import ktrain
from ktrain import text

%matplotlib inline



In [2]:
# Absolute path of the directory one level above the current working directory;
# the data files are located relative to it.
working_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(working_dir, os.pardir))

In [3]:

# Folder that holds the preprocessed English splits. Each path component is
# passed to os.path.join separately so the platform's own separator is used;
# a hard-coded backslash inside the string only resolves on Windows.
preprocessed_dir = os.path.join(parent_dir, 'Data', 'PreprocessedData')

# Load the train / dev / test splits.
df_train = pd.read_csv(os.path.join(preprocessed_dir, 'english_train_preprocess.csv'))
df_dev = pd.read_csv(os.path.join(preprocessed_dir, 'english_dev_preprocess.csv'))
df_test = pd.read_csv(os.path.join(preprocessed_dir, 'english_test_preprocess.csv'))

In [4]:
# Keep only the cleaned text and its label in every split.
keep_cols = ['preprocessed_text', 'label']
df_train, df_dev, df_test = (
    split[keep_cols] for split in (df_train, df_dev, df_test)
)

In [5]:
# Integer codes for the raw label strings. Code 2 ('not-English') is only a
# marker: rows carrying it are removed from every split below.
label_replacement = {
    'Hope_speech': 0,
    'Non_hope_speech': 1,
    'not-English': 2,
}


def _prepare_split(frame):
    """Encode labels, drop 'not-English' rows and normalise columns for one split.

    Parameters
    ----------
    frame : pandas.DataFrame
        A split with the columns 'preprocessed_text' and 'label'.

    Returns
    -------
    pandas.DataFrame
        A new frame with columns 'text', 'label' (values mapped through
        ``label_replacement``) and 'index' (the row's index label in the
        input frame). Rows whose encoded label is 2 are removed. The input
        frame is not modified.
    """
    encoded = frame.assign(label=frame['label'].replace(label_replacement))
    # Take an explicit copy of the filtered rows so that adding the 'index'
    # column below writes to an independent frame, not to a slice.
    kept = encoded[encoded['label'] != 2].copy()
    kept['index'] = kept.index
    # Only the text column changes name; 'label' already has its final name.
    return kept.rename(columns={'preprocessed_text': 'text'})


df_train = _prepare_split(df_train)
df_test = _prepare_split(df_test)
df_dev = _prepare_split(df_dev)

In [6]:
# Build ktrain's preprocessed training / validation data and the preprocessor.
# Note: despite its name, `x_test` is built from the dev split (df_dev is
# passed as val_df); df_test is not used here.
x_train, x_test, preproc = text.texts_from_df(
    train_df=df_train,
    val_df=df_dev,
    text_column='text',
    label_columns='label',
)

['not_label', 'label']
   not_label  label
0        0.0    1.0
1        0.0    1.0
2        0.0    1.0
3        0.0    1.0
4        0.0    1.0
['not_label', 'label']
   not_label  label
0        0.0    1.0
1        0.0    1.0
2        0.0    1.0
3        0.0    1.0
4        1.0    0.0
language: en
Word Counts: 19984
Nrows: 22740
22740 train sequences
train sequence lengths:
	mean : 17
	95percentile : 46
	99percentile : 86
x_train shape: (22740,400)
y_train shape: (22740, 2)
Is Multi-Label? False
2841 test sequences
test sequence lengths:
	mean : 16
	95percentile : 45
	99percentile : 85
x_test shape: (2841,400)
y_test shape: (2841, 2)


In [7]:
# Text and label columns of each split as NumPy arrays.
X_train = df_train['text'].to_numpy()
y_train = df_train['label'].to_numpy()

X_dev = df_dev['text'].to_numpy()
y_dev = df_dev['label'].to_numpy()

X_test = df_test['text'].to_numpy()
y_test = df_test['label'].to_numpy()

In [8]:
# Fit a Keras Tokenizer on the training texts only; its word_index is used
# to size the embedding vocabulary (vocab_size).
t = Tokenizer()
t.fit_on_texts(X_train)

In [9]:
# Sequence length given to the Embedding layer (input_length). 400 matches the
# padded width reported by texts_from_df above (x_train shape: (22740,400)).
max_length = 400

In [10]:
# Number of rows in the embedding table: one per word known to the tokenizer,
# plus one extra row for index 0 (padding).
vocab_size = len(t.word_index) + 1

# Embedding -> single SimpleRNN layer -> 2-way softmax classifier.
model = Sequential()
model.add(Embedding(vocab_size, 24, input_length=max_length))
model.add(SimpleRNN(24, return_sequences=False))
model.add(Dense(2, activation='softmax'))

# The head is a 2-unit softmax over one-hot targets, so categorical
# cross-entropy is the matching loss (the same loss the training cell uses).
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

# summary() prints the table itself and returns None; wrapping it in print()
# only appends a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 400, 24)           479640    
                                                                 
 simple_rnn (SimpleRNN)      (None, 24)                1176      
                                                                 
 dense (Dense)               (None, 2)                 50        
                                                                 
Total params: 480,866
Trainable params: 480,866
Non-trainable params: 0
_________________________________________________________________
None


In [11]:
# Sequential, Dense and Embedding are already imported in the first cell, so
# they are not imported again here.
# NOTE(review): GlobalAveragePooling1D is not used in this cell; the import is
# kept only in case other cells rely on the name.
from tensorflow.keras.layers import GlobalAveragePooling1D

# Rebuild the classifier from scratch (this replaces any earlier `model`):
# Embedding -> SimpleRNN -> 2-way softmax.
model = Sequential()
# vocab_size already includes the extra slot for the padding index; the input
# length must equal the padded sequence width, so reuse max_length rather than
# repeating the literal 400.
model.add(Embedding(vocab_size, 24, input_length=max_length))
model.add(SimpleRNN(24, return_sequences=False))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Wrap the model and the ktrain data in a learner. `x_test` holds the dev
# split (it was created from df_dev), so it serves as validation data.
learner = ktrain.get_learner(model, train_data=x_train, val_data=x_test)

# Train: max learning rate 0.005, at most 10 epochs, early-stopping value 3.
learner.autofit(0.005, 10, early_stopping=3)



begin training using triangular learning rate policy with max lr of 0.005...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 5: early stopping
Weights from best epoch have been loaded into model.


<keras.callbacks.History at 0x1cb5431e4c0>

In [12]:
# Show the five validation examples with the highest loss, rendered as text
# through the ktrain preprocessor.
learner.view_top_losses(
    n=5,
    preproc=preproc,
)

----------
id:2062 | loss:5.26 | true:not_label | pred:label)

it happens in my country scum politician using rasism as a means to get power back once they lost it in elections it works people are like sheep especially people that sit on thier asses all day
----------
id:2782 | loss:5.1 | true:not_label | pred:label)

the walls came down in this interview
----------
id:1830 | loss:4.98 | true:not_label | pred:label)

i am saying all lives matter and i say law matter s and we need our cops and fire and we need to support our law and mostly our cops i hope our government or governors start supporting our cops let them do their job and i think they need a raise i think we need to stop charging them with dum stuff yes some need to go to jail the one who choked that guy yes but the one with a gun no he just doing his job give him a raise good cop
----------
id:1267 | loss:4.97 | true:not_label | pred:label)

nursing is a major because of societal pressure against the men who do it when coll

In [18]:
# Bundle the trained model with its preprocessor so raw strings can be scored.
trained_model = learner.model
predictor = ktrain.get_predictor(trained_model, preproc)

# All dev-set texts as a plain Python list.
dev_set = df_dev['text'].tolist()

# Text to explain: the first dev-set entry (the name says "random", but the
# selection is a fixed index, not a random draw).
random_text = dev_set[0]

predictor.explain(random_text)



Contribution?,Feature
6.769,Highlighted in text (sum)
0.667,<BIAS>
