In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Libraries

In [None]:
import pandas as pd
import numpy as np
import random
import time
import datetime

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS

import re
import json
from tqdm.autonotebook import tqdm
import string

import nltk
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize

from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import confusion_matrix, plot_confusion_matrix

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

from keras.preprocessing import sequence, text
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import utils
from keras.models import *
from keras.layers import *
from keras.callbacks import *

from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers import Dense, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils.vis_utils import plot_model

import warnings
warnings.filterwarnings('ignore')



In [None]:
#define stopwords
from nltk.corpus import stopwords

#English stopwords, followed by every single punctuation character and a few multi-character quote/ellipsis tokens
punctuation_tokens = list(string.punctuation)
extra_tokens = ["''", '""', '...', '``']
stopwords_list = [*stopwords.words('english'), *punctuation_tokens, *extra_tokens]

# DEEP LEARNING: MLP, LSTM, RNN, GLOVE, GRU

* Separate the data into training and test sets.
* Use tokenizer methods to count the unique words in our vocabulary and assign each of those words to indices.
* Calling fit_on_texts() automatically creates a word index lookup of our vocabulary.
* We limit our vocabulary to the top words by passing a num_words param to the tokenizer.
* With our tokenizer, we can now use the `texts_to_sequences` method to create the training data that we’ll pass to our model.
* We feed a one-hot vector to our model.
* After we transform our features and labels in a format Keras can read, we are ready to build our text classification model.
* When we build our model, all we need to do is tell Keras the shape of our input data, the shape of our output data, and the type of each layer; Keras will look after the rest.
* When training the model, we’ll call the fit() method, pass it our training data and labels, batch size and epochs.

# OBTAIN

In [None]:
#load the training data; the first CSV column becomes the index
train_path = '../input/flatiron-capstone-project/train_df.csv'
train_df = pd.read_csv(train_path, index_col = 0)
train_df.head()

In [None]:
#load the submission template; the first CSV column becomes the index
submission_path = '../input/flatiron-capstone-project/submission.csv'
submission_df = pd.read_csv(submission_path, index_col = 0)
submission_df.head()

Now, our job is to fill in the 'PredictionString' column with the cleaned_label that matches the given text.

# PREPROCESSING

### Train-Test-Split

In [None]:
#features are the dataset titles, targets are the cleaned labels (both as numpy arrays)
X, y = (train_df[column].to_numpy() for column in ('dataset_title', 'cleaned_label'))

#split the training data into training and validation sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 123, test_size = 0.2)

In [None]:
#report the shape of each array produced by the split
split_report = ('Train sentences:', X_train.shape, '\n',
                'Test sentences:', X_test.shape, '\n',
                'Train labels:', y_train.shape, '\n',
                'Test labels:', y_test.shape)
print(*split_report)

In [None]:
# average number of tokens (whitespace-separated words) per title in the training set
tokens_per_title = [len(title.split()) for title in X_train]
round(sum(tokens_per_title) / len(X_train))

### Vectorize

In [None]:
max_vocab_length = 10000 #upper bound on the vocabulary size
max_length = 5 #length of every vectorized sequence

import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

#integer-encode text: every token is replaced by its vocabulary index
vectorizer_options = dict(max_tokens = max_vocab_length,
                          output_mode = 'int',
                          output_sequence_length = max_length)
text_vectorizer = TextVectorization(**vectorizer_options)

#learn the vocabulary from the training titles only
text_vectorizer.adapt(X_train)

### Tokenize

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

#method to count the unique words in vocabulary and assign each of those words to indices
tokenizer = Tokenizer()

#build the word index from the training titles only: fitting on the validation
#titles as well would leak information about the held-out split into the vocabulary.
#Words that only occur in the validation titles are skipped by texts_to_sequences.
tokenizer.fit_on_texts(list(X_train))

#convert text into integer sequences
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

#pad (or truncate) every sequence to 100 ids so they stack into one 2-D array
X_train_pad = pad_sequences(X_train_seq, maxlen = 100)
X_test_pad = pad_sequences(X_test_seq, maxlen = 100)

In [None]:
#number of distinct words indexed by the tokenizer (i.e. in whatever texts it was fitted on),
#plus one extra slot for the padding value
size_of_vocabulary = len(tokenizer.word_index) + 1 #+1 for padding
print(size_of_vocabulary)

In [None]:
#word -> integer id lookup built by the tokenizer (reused later for the embedding matrix)
word_index = tokenizer.word_index

#show only the first few entries; displaying the whole dict floods the notebook output
dict(list(word_index.items())[:20])

### Encode Label

In [None]:
from sklearn import preprocessing

#use the LabelEncoder to convert text labels to integers, 0, 1, 2, etc.
encoder = preprocessing.LabelEncoder()

#fit on every label (y holds the labels of both splits), otherwise a category that
#only occurs in y_test would be unknown to the encoder and transform() would raise.
#fit() is enough here: the encoded output of fit_transform() was never used.
encoder.fit(y)
y_train = encoder.transform(y_train)
y_test = encoder.transform(y_test)

### Binarize Label

In [None]:
from keras import utils

#binarize the labels for the neural net
#take the class count from the encoder, not from max(y_train): if the highest class id
#happens to occur only in y_test, max(y_train) + 1 is too small and to_categorical(y_test, ...) fails
num_classes = len(encoder.classes_)
y_train = utils.to_categorical(y_train, num_classes)
y_test = utils.to_categorical(y_test, num_classes)

In [None]:
#shapes of the padded inputs and the one-hot labels that are fed to the networks
for array_name, array in (('X_train', X_train_pad),
                          ('X_test', X_test_pad),
                          ('y_train', y_train),
                          ('y_test', y_test)):
    print(f'{array_name} shape:', array.shape)

# Multilayer Perceptron

In [None]:
#deep learning library
from tensorflow import keras
from keras.models import *
from keras.layers import *
from keras.callbacks import *

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout

batch_size = 32
epochs = 30

#build the model
MLP_model = Sequential()

#input_shape describes ONE example and must not include the sample axis:
#each example is a padded sequence of X_train_pad.shape[1] token ids.
#The previous (15728, 100) declared a 2-D example, which the 2-D X_train_pad batches cannot match.
MLP_model.add(Dense(512, activation = 'relu', input_shape = (X_train_pad.shape[1],)))
MLP_model.add(Dropout(0.2))

MLP_model.add(Dense(128, activation = 'relu'))
MLP_model.add(Dropout(0.3))

#one softmax probability per class
MLP_model.add(Dense(num_classes, activation = 'softmax'))

In [None]:
#print the layer-by-layer summary of the MLP
MLP_model.summary()

In [None]:
#draw the MLP layer graph (with shapes and layer names) and write it to MLP_model_plot.png
from keras.utils.vis_utils import plot_model

plot_model(MLP_model, to_file = 'MLP_model_plot.png', show_shapes = True, show_layer_names = True)

In [None]:
#compile: Adam with a 0.0001 step size, categorical cross-entropy on the one-hot labels, accuracy as the metric
mlp_optimizer = keras.optimizers.Adam(0.0001)
MLP_model.compile(optimizer = mlp_optimizer,
                  loss = 'categorical_crossentropy',
                  metrics = ['acc'])

In [None]:
#callbacks
from keras.callbacks import EarlyStopping, ModelCheckpoint

#stop once val_loss has not improved for 10 epochs
stop_on_plateau = EarlyStopping(monitor = 'val_loss', patience = 10, verbose = 1)

#keep the weights of the epoch with the best val_loss
#NOTE(review): this list is passed to every model's fit() in this notebook, so all of them
#checkpoint to the same 'cnn_model.h5' file - confirm that is intended
best_checkpoint = ModelCheckpoint(filepath = 'cnn_model.h5', monitor = 'val_loss', save_best_only = True)

early_stopping = [stop_on_plateau, best_checkpoint]

In [None]:
import datetime
start = datetime.datetime.now()

#train the MLP, validating on the held-out split after every epoch
mlp_fit_options = dict(batch_size = 32,
                       epochs = 100,
                       validation_data = (np.array(X_test_pad), np.array(y_test)),
                       verbose = 1,
                       callbacks = early_stopping)
MLP_history = MLP_model.fit(np.array(X_train_pad), np.array(y_train), **mlp_fit_options)

In [None]:
#wall-clock time since `start` was recorded
end = datetime.datetime.now()
elapsed = end - start
print(f'Training took a total of {elapsed}')

In [None]:
#write the trained MLP to MLP_model.h5
MLP_model.save('MLP_model.h5')

### Model Evaluation

In [None]:
#training vs. validation curves for the MLP: accuracy on the left, loss on the right
fig , ax = plt.subplots(1,2)
fig.set_size_inches(20, 8)

MLP_train_acc = MLP_history.history['acc']
MLP_train_loss = MLP_history.history['loss']
MLP_val_acc = MLP_history.history['val_acc']
MLP_val_loss = MLP_history.history['val_loss']

#one x position per epoch that actually ran (early stopping may cut training short)
epochs = range(1, len(MLP_train_acc) + 1)

ax[0].plot(epochs , MLP_train_acc , 'g-o' , label = 'Training Accuracy')
ax[0].plot(epochs , MLP_val_acc , 'y-o' , label = 'Validation Accuracy')
ax[0].set_title('MLP Model Training & Validation Accuracy')
ax[0].legend(loc = 'lower right')
ax[0].set_xlabel('Epochs')
ax[0].set_ylabel('Accuracy')

ax[1].plot(epochs , MLP_train_loss , 'g-o' , label = 'Training Loss')
ax[1].plot(epochs , MLP_val_loss , 'y-o' , label = 'Validation Loss')
ax[1].set_title('MLP Model Training & Validation Loss')
ax[1].legend()
ax[1].set_xlabel('Epochs')
ax[1].set_ylabel('Loss')

#save through the figure handle BEFORE plt.show(): a plt.savefig() issued after show()
#no longer targets this figure and writes an empty image
fig.savefig('mlp_model.png')

plt.show()

In [None]:
#loss and accuracy of the MLP on the training split and on the held-out split
mlp_train_scores = MLP_model.evaluate(X_train_pad, y_train)
print('Train loss & accuracy:' , mlp_train_scores)
print('\n')
mlp_test_scores = MLP_model.evaluate(X_test_pad, y_test)
print('Test loss & accuracy:' , mlp_test_scores)

In [None]:
#make prediction: one row of class probabilities per validation example
MLP_yhat_test = MLP_model.predict(X_test_pad)

#classification_report compares class ids, so collapse the one-hot labels and the
#probability rows to the index of their largest entry first
mlp_true_ids = np.argmax(y_test, axis = 1)
mlp_pred_ids = np.argmax(MLP_yhat_test, axis = 1)

#get classification report
print('Model: Multilayer Perceptron', '\n', classification_report(mlp_true_ids, mlp_pred_ids))

In [None]:
#from the sklearn docs: http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
#NOTE: this definition replaces the plot_confusion_matrix name imported from sklearn.metrics at the top of the notebook
def plot_confusion_matrix(cm, classes,
                          title = 'Confusion Matrix',
                          cmap = plt.cm.Blues,
                          normalize = True):
    """
    Draw a confusion matrix as an annotated heat map on the current matplotlib figure.

    Parameters
    ----------
    cm : 2-D array of counts; rows are drawn as the true label, columns as the predicted label.
    classes : sequence of tick labels, one per row/column of `cm`.
    title : figure title.
    cmap : matplotlib colormap used for the cells.
    normalize : when True (the default, matching the previous behavior) every row is divided
        by its row total, so each cell shows a fraction of that row. Rows whose total is 0
        are left as zeros instead of producing NaN. When False the raw values are drawn.

    Returns
    -------
    None. The plot is drawn with pyplot calls; the caller is responsible for show/savefig.
    """
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis = 1, keepdims = True)
        cm = np.divide(cm, row_totals,
                       out = np.zeros(cm.shape, dtype = float),
                       where = row_totals != 0)

    plt.imshow(cm, interpolation = 'nearest', cmap = cmap)
    plt.title(title, fontsize = 30)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation = 90, fontsize = 12)
    plt.yticks(tick_marks, classes, fontsize = 12)

    #integer counts are printed as-is, fractions with two decimals
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'

    #cells darker than half of the maximum get white text so the number stays readable
    thresh = cm.max() / 2.

    #np.ndindex walks every (row, col) pair; it replaces itertools.product, which was never imported
    for i, j in np.ndindex(*cm.shape):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment = 'center',
                 color = 'white' if cm[i, j] > thresh else 'black')

    plt.ylabel('True label', fontsize = 25)
    plt.xlabel('Predicted label', fontsize = 25)

In [None]:
#code from https://github.com/tensorflow/workshops/blob/master/extras/keras-bag-of-words/keras-bow-model.ipynb

#generate a prediction on individual examples
MLP_text_labels = encoder.classes_ 

#predict the first 10 validation examples in one batch
mlp_sample_probs = MLP_model.predict(X_test_pad[:10])

for i in range(len(mlp_sample_probs)):
    predicted_label = MLP_text_labels[np.argmax(mlp_sample_probs[i])]
    #the text and the true label must come from the validation split that was predicted;
    #row i of train_df is a different example because the split shuffles the rows
    actual_label = MLP_text_labels[np.argmax(y_test[i])]
    print(X_test[i][:50], '...')
    print('Actual label: ' + actual_label)
    print('Predicted label: ' + predicted_label + '\n')

In [None]:
#class probabilities for every validation example
y_softmax = MLP_model.predict(X_test_pad)

#true class id = position of the first non-zero entry of each one-hot label row
y_test_1d = [np.nonzero(one_hot_row)[0].item(0) for one_hot_row in y_test]

#predicted class id = position of the largest probability in each row
y_pred_1d = [np.argmax(probability_row) for probability_row in y_softmax]

In [None]:
#get confusion matrix
#(the previous `text_labels` name was never defined; the class names live in encoder.classes_)
class_names = encoder.classes_

#pass every class id explicitly so the matrix always has one row/column per tick label,
#even for classes that do not occur in the validation split
cm = confusion_matrix(y_test_1d, y_pred_1d, labels = np.arange(len(class_names)))
plt.figure(figsize = (40, 40))
plot_confusion_matrix(cm, classes = class_names, title = 'MLP Confusion Matrix')

#save BEFORE plt.show(): a savefig issued after show() writes an empty image
plt.savefig('mlp_cm.png')

plt.show()

In [None]:
#empty results table; one row per model is appended further down
summary_columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1']
summary_table = pd.DataFrame({column: [] for column in summary_columns})

In [None]:
#update summary table
#the sklearn metrics compare class ids, so reduce the one-hot labels and the
#probability rows to the index of their largest entry first
mlp_true_ids = np.argmax(y_test, axis = 1)
mlp_pred_ids = np.argmax(MLP_yhat_test, axis = 1)

summary_table.loc[0] = ['DL Multilayer Perceptron',
                        round(accuracy_score(mlp_true_ids, mlp_pred_ids), 2),
                        round(precision_score(mlp_true_ids, mlp_pred_ids, average = 'macro'), 2), 
                        round(recall_score(mlp_true_ids, mlp_pred_ids, average = 'macro'), 2), 
                        round(f1_score(mlp_true_ids, mlp_pred_ids, average = 'macro'), 2)]
summary_table.head()

# LSTM

In [None]:
#instantiate
LSTM_model = Sequential()

#embedding layer: 300-d vector per token id, 100 ids per padded title
LSTM_model.add(Embedding(size_of_vocabulary, 300, 
                         input_length = 100)) 

#lstm layer (returns the full sequence so it can be max-pooled)
LSTM_model.add(LSTM(128, return_sequences = True, dropout = 0.2))

#global max pooling over the time axis
LSTM_model.add(GlobalMaxPooling1D())

#fully connected layers
LSTM_model.add(Dense(64, activation = 'relu')) 

#output layer: one softmax probability per class, like the MLP above.
#Sizing it from num_classes keeps it in step with the one-hot labels instead of a hard-coded 130,
#and softmax (not sigmoid) matches the single-label task
LSTM_model.add(Dense(num_classes, activation = 'softmax')) 

In [None]:
#print the layer-by-layer summary of the LSTM model
LSTM_model.summary()

In [None]:
#draw the LSTM layer graph (with shapes and layer names) and write it to LSTM_model_plot.png
plot_model(LSTM_model, to_file = 'LSTM_model_plot.png', show_shapes = True, show_layer_names = True)

In [None]:
#add loss function, metrics, optimizer
#the targets are one-hot rows with exactly one class per title, so the loss is categorical
#cross-entropy (as for the MLP); with binary cross-entropy 'acc' is scored per output unit,
#which makes the reported accuracy look far better than it is
LSTM_model.compile(optimizer = 'adam', 
                   loss = 'categorical_crossentropy', 
                   metrics = ['acc']) 

In [None]:
import datetime
start = datetime.datetime.now()

#train the LSTM, validating on the held-out split after every epoch
lstm_fit_options = dict(batch_size = 32,
                        epochs = 30,
                        validation_data = (np.array(X_test_pad), np.array(y_test)),
                        verbose = 1,
                        callbacks = early_stopping)
LSTM_history = LSTM_model.fit(np.array(X_train_pad), np.array(y_train), **lstm_fit_options)

In [None]:
#wall-clock time since `start` was recorded
end = datetime.datetime.now()
elapsed = end - start
print(f'Training took a total of {elapsed}')

In [None]:
#write the trained LSTM model to LSTM_model.h5
LSTM_model.save('LSTM_model.h5')

In [None]:
#training vs. validation curves for the LSTM: accuracy on the left, loss on the right
fig , ax = plt.subplots(1,2)
fig.set_size_inches(20, 8)

LSTM_train_acc = LSTM_history.history['acc']
LSTM_train_loss = LSTM_history.history['loss']
LSTM_val_acc = LSTM_history.history['val_acc']
LSTM_val_loss = LSTM_history.history['val_loss']

#one x position per epoch that actually ran (early stopping may cut training short)
epochs = range(1, len(LSTM_train_acc) + 1)

ax[0].plot(epochs, LSTM_train_acc , 'go-' , label = 'Training Accuracy')
ax[0].plot(epochs , LSTM_val_acc , 'yo-' , label = 'Validation Accuracy')
ax[0].set_title('LSTM Model Training & Validation Accuracy')
ax[0].legend()
ax[0].set_xlabel('Epochs')
ax[0].set_ylabel('Accuracy')

ax[1].plot(epochs, LSTM_train_loss , 'go-' , label = 'Training Loss')
ax[1].plot(epochs, LSTM_val_loss , 'yo-' , label = 'Validation Loss')
ax[1].set_title('LSTM Model Training & Validation Loss')
ax[1].legend()
ax[1].set_xlabel('Epochs')
ax[1].set_ylabel('Loss')

#save through the figure handle BEFORE plt.show(): a plt.savefig() issued after show()
#no longer targets this figure and writes an empty image
fig.savefig('lstm_acc_loss.png')

plt.show()

In [None]:
#evaluation 
#evaluate on the padded arrays the model was trained on; X_train_seq / X_test_seq are
#lists of variable-length id lists and cannot be fed to the network
print('Train loss & accuracy:' , LSTM_model.evaluate(X_train_pad, y_train))
print('\n')
print('Test loss & accuracy:' , LSTM_model.evaluate(X_test_pad, y_test))

In [None]:
#make prediction: one row of class scores per validation example
LSTM_yhat_test = LSTM_model.predict(X_test_pad)

#classification_report compares class ids, so collapse the one-hot labels and the
#score rows to the index of their largest entry first
lstm_true_ids = np.argmax(y_test, axis = 1)
lstm_pred_ids = np.argmax(LSTM_yhat_test, axis = 1)

#get classification report
print('Model: LSTM', '\n', classification_report(lstm_true_ids, lstm_pred_ids))

In [None]:
#generate a prediction on individual examples
LSTM_text_labels = encoder.classes_ 

#predict the first 10 validation examples in one batch
lstm_sample_probs = LSTM_model.predict(X_test_pad[:10])

for i in range(len(lstm_sample_probs)):
    predicted_label = LSTM_text_labels[np.argmax(lstm_sample_probs[i])]
    #the text and the true label must come from the validation split that was predicted;
    #row i of train_df is a different example because the split shuffles the rows
    actual_label = LSTM_text_labels[np.argmax(y_test[i])]
    print(X_test[i][:50], '...')
    print('Actual label: ' + actual_label)
    print('Predicted label: ' + predicted_label + '\n')

In [None]:
#class scores for every validation example
y_softmax = LSTM_model.predict(X_test_pad)

#true class id = position of the first non-zero entry of each one-hot label row
y_test_1d = [np.nonzero(one_hot_row)[0].item(0) for one_hot_row in y_test]

#predicted class id = position of the largest score in each row
y_pred_1d = [np.argmax(score_row) for score_row in y_softmax]

In [None]:
#get confusion matrix
#(the previous `text_labels` name was never defined; the class names live in encoder.classes_)
class_names = encoder.classes_

#pass every class id explicitly so the matrix always has one row/column per tick label,
#even for classes that do not occur in the validation split
cm = confusion_matrix(y_test_1d, y_pred_1d, labels = np.arange(len(class_names)))
plt.figure(figsize = (40, 40))
plot_confusion_matrix(cm, classes = class_names, title = 'LSTM Confusion Matrix')

#save BEFORE plt.show(): a savefig issued after show() writes an empty image
plt.savefig('lstm_cm.png')

plt.show()

In [None]:
#update summary table
#the sklearn metrics compare class ids, so reduce the one-hot labels and the
#score rows to the index of their largest entry first
lstm_true_ids = np.argmax(y_test, axis = 1)
lstm_pred_ids = np.argmax(LSTM_yhat_test, axis = 1)

summary_table.loc[1] = ['DL LSTM',
                        round(accuracy_score(lstm_true_ids, lstm_pred_ids), 2),
                        round(precision_score(lstm_true_ids, lstm_pred_ids, average = 'macro'), 2), 
                        round(recall_score(lstm_true_ids, lstm_pred_ids, average = 'macro'), 2), 
                        round(f1_score(lstm_true_ids, lstm_pred_ids, average = 'macro'), 2)]
summary_table.head()

# Recurrent Neural Network

In [None]:
#initialize
RNN_model = Sequential()

#define model architecture
#recurrent layers need a (timesteps, features) input per example, so the padded token ids
#are embedded first (same embedding set-up as the LSTM model). The previous
#input_shape = (15728, 100) did not match the 2-D padded batches, and that line was also
#missing a closing parenthesis.
RNN_model.add(Embedding(size_of_vocabulary, 300,
                        input_length = 100))
RNN_model.add(SimpleRNN(20, return_sequences = True))

#the last recurrent layer returns only its final state, giving one vector per title
RNN_model.add(SimpleRNN(20))

#output layer: one softmax probability per class (a single softmax unit always outputs 1.0)
RNN_model.add(Dense(num_classes, activation = 'softmax'))

In [None]:
#add loss function, metrics, optimizer
#the targets are one-hot rows with exactly one class per title, so the loss is categorical
#cross-entropy (as for the MLP)
RNN_model.compile(optimizer = keras.optimizers.Adam(0.0001), 
                   loss = 'categorical_crossentropy', 
                   metrics = ['acc']) 

#summary (summary() prints by itself and returns None, so it is not wrapped in print)
RNN_model.summary()

In [None]:
start = datetime.datetime.now()

#fit
#train on the padded arrays: X_train_seq / X_test_seq are lists of variable-length
#id lists, which cannot be turned into a rectangular array for the network
RNN_history = RNN_model.fit(np.array(X_train_pad), np.array(y_train),
                         batch_size = 32,
                         epochs = 30,
                         validation_data = (np.array(X_test_pad), np.array(y_test)),
                         verbose = 1,
                         callbacks = early_stopping)

In [None]:
#wall-clock time since `start` was recorded
end = datetime.datetime.now()
elapsed = end - start
print(f'Training took a total of {elapsed}')

In [None]:
#write the trained RNN model to RNN_model.h5
RNN_model.save('RNN_model.h5')

In [None]:
#training vs. validation curves for the RNN: accuracy on the left, loss on the right
fig , ax = plt.subplots(1,2)
fig.set_size_inches(20, 8)

RNN_train_acc = RNN_history.history['acc']
RNN_train_loss = RNN_history.history['loss']
RNN_val_acc = RNN_history.history['val_acc']
RNN_val_loss = RNN_history.history['val_loss']

#one x position per epoch that actually ran (early stopping may cut training short)
epochs = range(1, len(RNN_train_acc) + 1)

ax[0].plot(epochs, RNN_train_acc , 'go-' , label = 'Training Accuracy')
ax[0].plot(epochs , RNN_val_acc , 'yo-' , label = 'Validation Accuracy')
ax[0].set_title('RNN Model Training & Validation Accuracy')
ax[0].legend()
ax[0].set_xlabel('Epochs')
ax[0].set_ylabel('Accuracy')

ax[1].plot(epochs, RNN_train_loss , 'go-' , label = 'Training Loss')
ax[1].plot(epochs, RNN_val_loss , 'yo-' , label = 'Validation Loss')
ax[1].set_title('RNN Model Training & Validation Loss')
ax[1].legend()
ax[1].set_xlabel('Epochs')
ax[1].set_ylabel('Loss')

#save through the figure handle BEFORE plt.show(): a plt.savefig() issued after show()
#no longer targets this figure and writes an empty image
fig.savefig('rnn_acc_loss.png')

plt.show()

In [None]:
#evaluation 
#evaluate on the padded arrays; X_train_seq / X_test_seq are lists of variable-length
#id lists and cannot be fed to the network
print('Train loss & accuracy:' , RNN_model.evaluate(X_train_pad, y_train))
print('\n')
print('Test loss & accuracy:' , RNN_model.evaluate(X_test_pad, y_test))

In [None]:
#make prediction: one row of class scores per validation example
RNN_yhat_test = RNN_model.predict(X_test_pad)

#classification_report compares class ids, so collapse the one-hot labels and the
#score rows to the index of their largest entry first
rnn_true_ids = np.argmax(y_test, axis = 1)
rnn_pred_ids = np.argmax(RNN_yhat_test, axis = 1)

#get classification report (the heading previously said 'LSTM' for this RNN report)
print('Model: RNN', '\n', classification_report(rnn_true_ids, rnn_pred_ids))

In [None]:
#generate a prediction on individual examples
RNN_text_labels = encoder.classes_ 

#predict the first 10 validation examples in one batch
rnn_sample_probs = RNN_model.predict(X_test_pad[:10])

for i in range(len(rnn_sample_probs)):
    predicted_label = RNN_text_labels[np.argmax(rnn_sample_probs[i])]
    #the text and the true label must come from the validation split that was predicted;
    #row i of train_df is a different example because the split shuffles the rows
    actual_label = RNN_text_labels[np.argmax(y_test[i])]
    print(X_test[i][:50], '...')
    print('Actual label: ' + actual_label)
    print('Predicted label: ' + predicted_label + '\n')

In [None]:
#class scores for every validation example
y_softmax = RNN_model.predict(X_test_pad)

#true class id = position of the first non-zero entry of each one-hot label row
y_test_1d = [np.nonzero(one_hot_row)[0].item(0) for one_hot_row in y_test]

#predicted class id = position of the largest score in each row
y_pred_1d = [np.argmax(score_row) for score_row in y_softmax]

In [None]:
#get confusion matrix
#(the previous `text_labels` name was never defined; the class names live in encoder.classes_)
class_names = encoder.classes_

#pass every class id explicitly so the matrix always has one row/column per tick label,
#even for classes that do not occur in the validation split
cm = confusion_matrix(y_test_1d, y_pred_1d, labels = np.arange(len(class_names)))
plt.figure(figsize = (40, 40))
plot_confusion_matrix(cm, classes = class_names, title = 'RNN Confusion Matrix')

#save BEFORE plt.show(): a savefig issued after show() writes an empty image
plt.savefig('rnn_cm.png')

plt.show()

In [None]:
#update summary table
#the sklearn metrics compare class ids, so reduce the one-hot labels and the
#score rows to the index of their largest entry first
rnn_true_ids = np.argmax(y_test, axis = 1)
rnn_pred_ids = np.argmax(RNN_yhat_test, axis = 1)

summary_table.loc[2] = ['DL Recurrent Neural Network',
                        round(accuracy_score(rnn_true_ids, rnn_pred_ids), 2),
                        round(precision_score(rnn_true_ids, rnn_pred_ids, average = 'macro'), 2), 
                        round(recall_score(rnn_true_ids, rnn_pred_ids, average = 'macro'), 2), 
                        round(f1_score(rnn_true_ids, rnn_pred_ids, average = 'macro'), 2)]
summary_table.head()

# Pretrained Word Vectors With GloVe

Global Vectors for Word Representation, or GloVe, is an “unsupervised learning algorithm for obtaining vector representations for words.” Simply put, GloVe allows us to take a corpus of text, and intuitively transform each word in that corpus into a position in a high-dimensional space. This means that similar words will be placed together.

In [None]:
#load the whole embedding into memory: token -> 300-d float32 vector
embeddings_index = {}
embedding_dim = 300

#the context manager closes the file even if parsing raises part-way through;
#the encoding is explicit so the result does not depend on the platform default
with open('../input/glove840b300dtxt/glove.840B.300d.txt', encoding = 'utf-8') as f:
    for line in tqdm(f):
        #split from the right: the last 300 fields are the vector and whatever precedes
        #them is the token, so a token that itself contains spaces stays in one piece
        values = line.rstrip('\n').rsplit(' ', embedding_dim)
        if len(values) != embedding_dim + 1:
            continue #malformed or truncated line
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype = 'float32')
        except ValueError: #a field that is not a number; skip the line as before
            continue
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
#embedding matrix for the dataset vocabulary: row r holds the GloVe vector of the word with id r;
#words without a GloVe vector (and row 0) stay all-zero
vocabulary_rows = len(word_index) + 1
embedding_matrix = np.zeros((vocabulary_rows, 300))
for token, row in tqdm(word_index.items()):
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        continue
    embedding_matrix[row] = glove_vector

In [None]:
#weight matrix for the Embedding layers: one 300-d row per vocabulary id,
#filled with the GloVe vector where one exists and left at zero otherwise
embedding_matrix = np.zeros((size_of_vocabulary, 300))

for token, token_id in tokenizer.word_index.items():
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        continue
    embedding_matrix[token_id] = glove_vector

In [None]:
#LSTM + bidirectional LSTM on frozen GloVe embeddings, followed by two dense layers
GloVe_model = Sequential()

#embedding layer initialised with the GloVe matrix and kept frozen
GloVe_model.add(Embedding(size_of_vocabulary, 300, 
                          weights = [embedding_matrix], 
                          input_length = 100, 
                          trainable = False)) 

#lstm layers
GloVe_model.add(LSTM(128, return_sequences = True, dropout = 0.2))
GloVe_model.add(SpatialDropout1D(0.3))
GloVe_model.add(Bidirectional(LSTM(300, dropout = 0.3, recurrent_dropout = 0.3)))

#fully connected layers
GloVe_model.add(Dense(1024, activation = 'relu'))
GloVe_model.add(Dropout(0.8))

GloVe_model.add(Dense(512, activation = 'relu'))
GloVe_model.add(Dropout(0.5))

#output layer: one softmax probability per class, like the MLP above.
#Sizing it from num_classes keeps it in step with the one-hot labels instead of a hard-coded 130,
#and softmax (not sigmoid) matches the single-label task
GloVe_model.add(Dense(num_classes, activation = 'softmax')) 

In [None]:
#print the layer-by-layer summary of the GloVe model
GloVe_model.summary()

In [None]:
#compile
#the targets are one-hot rows with exactly one class per title, so the loss is categorical
#cross-entropy (as for the MLP); with binary cross-entropy 'acc' is scored per output unit,
#which makes the reported accuracy look far better than it is
GloVe_model.compile(optimizer = 'adam', 
                    loss = 'categorical_crossentropy',
                    metrics = ['acc']) 

In [None]:
start = datetime.datetime.now()

#train the GloVe model, validating on the held-out split after every epoch
glove_fit_options = dict(batch_size = 32,
                         epochs = 30,
                         verbose = 1,
                         validation_data = (np.array(X_test_pad), np.array(y_test)),
                         callbacks = early_stopping)
GloVe_history = GloVe_model.fit(np.array(X_train_pad), np.array(y_train), **glove_fit_options)

In [None]:
#wall-clock time since `start` was recorded
end = datetime.datetime.now()
elapsed = end - start
print(f'Training took a total of {elapsed}')

In [None]:
#write the trained GloVe model to GloVe_model.h5
GloVe_model.save('GloVe_model.h5')

In [None]:
#training vs. validation curves for the GloVe model: accuracy on the left, loss on the right
fig , ax = plt.subplots(1,2)
fig.set_size_inches(20, 8)

GloVe_train_acc = GloVe_history.history['acc']
GloVe_train_loss = GloVe_history.history['loss']
GloVe_val_acc = GloVe_history.history['val_acc']
GloVe_val_loss = GloVe_history.history['val_loss']

#the x axis must have one point per epoch of THIS model's history
#(the previous `mlp_train_acc` name does not exist, and the MLP may have run a different number of epochs)
epochs = range(1, len(GloVe_train_acc) + 1)

ax[0].plot(epochs , GloVe_train_acc , 'g-o' , label = 'Training Accuracy')
ax[0].plot(epochs , GloVe_val_acc , 'y-o' , label = 'Validation Accuracy')
ax[0].set_title('GloVe Model Training & Validation Accuracy')
ax[0].legend(loc = 'lower right')
ax[0].set_xlabel('Epochs')
ax[0].set_ylabel('Accuracy')

ax[1].plot(epochs, GloVe_train_loss , 'g-o' , label = 'Training Loss')
ax[1].plot(epochs, GloVe_val_loss , 'y-o' , label = 'Validation Loss')
ax[1].set_title('GloVe Model Training & Validation Loss')
ax[1].legend()
ax[1].set_xlabel('Epochs')
ax[1].set_ylabel('Loss') #this panel shows loss, not accuracy

plt.show()

In [None]:
#save the accuracy/loss figure through its handle: by the time this cell runs the figure
#has already been shown, so plt.savefig() would write a new, empty figure instead
fig.savefig('GloVe_model.png')

In [None]:
#loss and accuracy of the GloVe model on the training split and on the held-out split
glove_train_scores = GloVe_model.evaluate(X_train_pad, y_train)
print('Train loss & accuracy:' , glove_train_scores)
print('\n')
glove_test_scores = GloVe_model.evaluate(X_test_pad, y_test)
print('Test loss & accuracy:' , glove_test_scores)

In [None]:
#make prediction: one row of class scores per validation example
GloVe_yhat_test = GloVe_model.predict(X_test_pad)

#classification_report compares class ids, so collapse the one-hot labels and the
#score rows to the index of their largest entry first
glove_true_ids = np.argmax(y_test, axis = 1)
glove_pred_ids = np.argmax(GloVe_yhat_test, axis = 1)

#get classification report
print('Model: GloVe', '\n', classification_report(glove_true_ids, glove_pred_ids))

In [None]:
#generate a prediction on individual examples
GloVe_text_labels = encoder.classes_ 

#predict the first 10 validation examples in one batch
glove_sample_probs = GloVe_model.predict(X_test_pad[:10])

for i in range(len(glove_sample_probs)):
    predicted_label = GloVe_text_labels[np.argmax(glove_sample_probs[i])]
    #the text and the true label must come from the validation split that was predicted;
    #row i of train_df is a different example because the split shuffles the rows
    actual_label = GloVe_text_labels[np.argmax(y_test[i])]
    print(X_test[i][:50], '...')
    print('Actual label: ' + actual_label)
    print('Predicted label: ' + predicted_label + '\n')

In [None]:
#class scores for every validation example
y_softmax = GloVe_model.predict(X_test_pad)

#true class id = position of the first non-zero entry of each one-hot label row
y_test_1d = [np.nonzero(one_hot_row)[0].item(0) for one_hot_row in y_test]

#predicted class id = position of the largest score in each row
y_pred_1d = [np.argmax(score_row) for score_row in y_softmax]

In [None]:
#get confusion matrix
#(the previous `text_labels` name was never defined; the class names live in encoder.classes_)
class_names = encoder.classes_

#pass every class id explicitly so the matrix always has one row/column per tick label,
#even for classes that do not occur in the validation split
cm = confusion_matrix(y_test_1d, y_pred_1d, labels = np.arange(len(class_names)))
plt.figure(figsize = (40, 40))
plot_confusion_matrix(cm, classes = class_names, title = 'GloVe Confusion Matrix')

#save BEFORE plt.show(): a savefig issued after show() writes an empty image
plt.savefig('GloVe_cm.png')

plt.show()

In [None]:
#update summary table
#the sklearn metrics compare class ids, so reduce the one-hot labels and the
#score rows to the index of their largest entry first
glove_true_ids = np.argmax(y_test, axis = 1)
glove_pred_ids = np.argmax(GloVe_yhat_test, axis = 1)

summary_table.loc[3] = ['DL GloVe',
                        round(accuracy_score(glove_true_ids, glove_pred_ids), 2),
                        round(precision_score(glove_true_ids, glove_pred_ids, average = 'macro'), 2), 
                        round(recall_score(glove_true_ids, glove_pred_ids, average = 'macro'), 2), 
                        round(f1_score(glove_true_ids, glove_pred_ids, average = 'macro'), 2)]
summary_table.head()

# GRU

In [None]:
#LSTM followed by two stacked GRU layers on frozen GloVe embeddings, then two dense layers
GRU_model = Sequential()

#embedding layer initialised with the GloVe matrix and kept frozen
GRU_model.add(Embedding(size_of_vocabulary, 300,
                        weights = [embedding_matrix],
                        input_length = 100,
                        trainable = False)) 

#recurrent layers: one LSTM, then two GRUs (only the last one collapses the sequence)
GRU_model.add(LSTM(128, return_sequences = True, dropout = 0.2))
GRU_model.add(SpatialDropout1D(0.3))
GRU_model.add(GRU(300, dropout=0.3, recurrent_dropout=0.3, return_sequences=True))
GRU_model.add(GRU(300, dropout=0.3, recurrent_dropout=0.3))

#fully connected layers
GRU_model.add(Dense(1024, activation = 'relu'))
GRU_model.add(Dropout(0.8))

GRU_model.add(Dense(512, activation = 'relu'))
GRU_model.add(Dropout(0.5))

#output layer: one softmax probability per class, like the MLP above.
#Sizing it from num_classes keeps it in step with the one-hot labels instead of a hard-coded 130,
#and softmax (not sigmoid) matches the single-label task
GRU_model.add(Dense(num_classes, activation = 'softmax')) 

In [None]:
#print the layer-by-layer summary of the GRU model
GRU_model.summary()

In [None]:
#compile
#the targets are one-hot rows with exactly one class per title, so the loss is categorical
#cross-entropy (as for the MLP); with binary cross-entropy 'acc' is scored per output unit,
#which makes the reported accuracy look far better than it is
GRU_model.compile(optimizer = 'adam', 
                    loss = 'categorical_crossentropy',
                    metrics = ['acc']) 

In [None]:
start = datetime.datetime.now()

#train the GRU model, validating on the held-out split after every epoch
gru_fit_options = dict(batch_size = 32,
                       epochs = 30,
                       verbose = 1,
                       validation_data = (np.array(X_test_pad), np.array(y_test)),
                       callbacks = early_stopping)
GRU_history = GRU_model.fit(np.array(X_train_pad), np.array(y_train), **gru_fit_options)

In [None]:
#wall-clock time since `start` was recorded
end = datetime.datetime.now()
elapsed = end - start
print(f'Training took a total of {elapsed}')

In [None]:
#write the trained GRU model to GRU_model.h5
GRU_model.save('GRU_model.h5')

In [None]:
#training vs. validation curves for the GRU model: accuracy on the left, loss on the right
fig , ax = plt.subplots(1,2)
fig.set_size_inches(20, 8)

GRU_train_acc = GRU_history.history['acc']
GRU_train_loss = GRU_history.history['loss']
GRU_val_acc = GRU_history.history['val_acc']
GRU_val_loss = GRU_history.history['val_loss']

#the x axis must have one point per epoch of THIS model's history
#(the previous `mlp_train_acc` name does not exist, and the MLP may have run a different number of epochs)
epochs = range(1, len(GRU_train_acc) + 1)

ax[0].plot(epochs , GRU_train_acc , 'g-o' , label = 'Training Accuracy')
ax[0].plot(epochs , GRU_val_acc , 'y-o' , label = 'Validation Accuracy')
ax[0].set_title('GRU Model Training & Validation Accuracy')
ax[0].legend(loc = 'lower right')
ax[0].set_xlabel('Epochs')
ax[0].set_ylabel('Accuracy')

ax[1].plot(epochs, GRU_train_loss , 'g-o' , label = 'Training Loss')
ax[1].plot(epochs, GRU_val_loss , 'y-o' , label = 'Validation Loss')
ax[1].set_title('GRU Model Training & Validation Loss')
ax[1].legend()
ax[1].set_xlabel('Epochs')
ax[1].set_ylabel('Loss') #this panel shows loss, not accuracy

plt.show()

In [None]:
#make prediction: one row of class scores per validation example
GRU_yhat_test = GRU_model.predict(X_test_pad)

#classification_report compares class ids, so collapse the one-hot labels and the
#score rows to the index of their largest entry first
gru_true_ids = np.argmax(y_test, axis = 1)
gru_pred_ids = np.argmax(GRU_yhat_test, axis = 1)

#get classification report
print('Model: GRU', '\n', classification_report(gru_true_ids, gru_pred_ids))

In [None]:
#generate a prediction on individual examples
GRU_text_labels = encoder.classes_ 

#predict the first 10 validation examples in one batch
gru_sample_probs = GRU_model.predict(X_test_pad[:10])

for i in range(len(gru_sample_probs)):
    predicted_label = GRU_text_labels[np.argmax(gru_sample_probs[i])]
    #the text and the true label must come from the validation split that was predicted;
    #row i of train_df is a different example because the split shuffles the rows
    actual_label = GRU_text_labels[np.argmax(y_test[i])]
    print(X_test[i][:50], '...')
    print('Actual label: ' + actual_label)
    print('Predicted label: ' + predicted_label + '\n')

In [None]:
#class scores for every validation example
y_softmax = GRU_model.predict(X_test_pad)

#true class id = position of the first non-zero entry of each one-hot label row
y_test_1d = [np.nonzero(one_hot_row)[0].item(0) for one_hot_row in y_test]

#predicted class id = position of the largest score in each row
y_pred_1d = [np.argmax(score_row) for score_row in y_softmax]

In [None]:
#get confusion matrix
#(the previous `text_labels` name was never defined; the class names live in encoder.classes_)
class_names = encoder.classes_

#pass every class id explicitly so the matrix always has one row/column per tick label,
#even for classes that do not occur in the validation split
cm = confusion_matrix(y_test_1d, y_pred_1d, labels = np.arange(len(class_names)))
plt.figure(figsize = (40, 40))
plot_confusion_matrix(cm, classes = class_names, title = 'GRU Confusion Matrix')

#save BEFORE plt.show(): a savefig issued after show() writes an empty image
plt.savefig('GRU_cm.png')

plt.show()

In [None]:
#update summary table
#the sklearn metrics compare class ids, so reduce the one-hot labels and the
#score rows to the index of their largest entry first
gru_true_ids = np.argmax(y_test, axis = 1)
gru_pred_ids = np.argmax(GRU_yhat_test, axis = 1)

summary_table.loc[4] = ['DL GRU',
                        round(accuracy_score(gru_true_ids, gru_pred_ids), 2),
                        round(precision_score(gru_true_ids, gru_pred_ids, average = 'macro'), 2), 
                        round(recall_score(gru_true_ids, gru_pred_ids, average = 'macro'), 2), 
                        round(f1_score(gru_true_ids, gru_pred_ids, average = 'macro'), 2)]
summary_table.head()

## spaCy

spaCy is a natural language processing (NLP) library for Python designed specifically for production use and helps you build applications that process and “understand” large volumes of text. It can be used to build information extraction or natural language understanding systems, or to pre-process text for deep learning. It is designed to have fast performance, and with word embedding models built in, it’s perfect for a quick and easy start.

# INTERPRETATION

In [None]:
#model to inspect
#NOTE(review): `model` was never defined in this notebook; GRU_model (the last model trained)
#is assumed here - swap in MLP_model, LSTM_model, RNN_model or GloVe_model as needed
model = GRU_model

#class scores for the padded validation titles (the networks take padded id arrays, not raw text)
y_hats = pd.DataFrame(model.predict(X_test_pad))

#one row per validation title with its true and its predicted label name.
#X_test and y_test are numpy arrays, so the table is assembled directly instead of through
#reset_index()/merge, and the class ids are mapped back to label names with the encoder
df_out = pd.DataFrame({'dataset_title': X_test,
                       'Actual': encoder.inverse_transform(np.argmax(y_test, axis = 1)),
                       'Prediction': encoder.inverse_transform(y_hats.to_numpy().argmax(axis = 1))})
df_out.head()