In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

## Import relevant libraries

In [2]:
# core system imports
import os

import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import string
import random
import joblib
import itertools

from timeit import timeit
from unidecode import unidecode
import matplotlib.pyplot as plt

from gensim.models import Word2Vec

import tensorflow as tf
from tensorflow.keras.callbacks import CSVLogger
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import (
    Dense, 
    Input, 
    LSTM, 
    Embedding, 
    Dropout, 
    GlobalMaxPool1D
)
from tensorflow.keras.models import Sequential

from sklearn import metrics
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import (
    validation_curve,
    learning_curve
)
from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    roc_auc_score,
    f1_score,
    precision_score,
    recall_score,
    accuracy_score,
    precision_recall_fscore_support
)

# Matplotlib config
%matplotlib inline
%alias_magic t timeit


SyntaxError: invalid syntax (Temp/ipykernel_30976/269759905.py, line 42)

In [None]:
# Report how many GPU devices TensorFlow can see on this machine
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

In [None]:
# --- Directories ---------------------------------------------------------
# Pipeline metadata store
_pipeline_root = '../pipeline/'
# Raw data files
_data_root = '../input/'
# Pretrained word embeddings
_embedding_root = '../input/embeddings'

# --- Input files ---------------------------------------------------------
_data_filepath = os.path.join(_data_root, "data.csv")
_stopwords_filepath = os.path.join(_data_root, "stopwords.txt")

# --- Word2Vec artefacts (all stored under the embeddings directory) -------
(
    _embedding_model_filepath,
    _pretrained_vectors_filepath,
    _neg_vectors_filepath,
) = (
    os.path.join(_embedding_root, artefact)
    for artefact in (
        "model_sg",
        "model_sg.wv.vectors.npy",
        "model_sg.trainables.syn1neg.npy",
    )
)


In [None]:
# List the input datasets in the data directory.
# A bare expression is only rendered when it is the last statement of a
# cell, so the listing is printed explicitly instead of being discarded.
print(os.listdir(_data_root))

# List the pretrained word-embedding files
print(os.listdir(_embedding_root))

## Load training and test data

In [None]:
# Read the raw dataset from the CSV file into a DataFrame.
# Later cells read the 'text' and 'Label' columns from it.
data = pd.read_csv(_data_filepath)

In [None]:
# Summary of the loaded frame (columns, dtypes, non-null counts)
data.info()

In [None]:
# Preview the first rows of the raw data
data.head()

In [None]:
# Load the stop-word list: one entry per line, surrounding whitespace stripped.
# NOTE(review): no explicit encoding is passed to open(), so the platform
# default is used - confirm it matches how stopwords.txt was saved.
stopwords_list = []

with open(_stopwords_filepath) as stopwords_file:
    stopwords_list = [entry.strip() for entry in stopwords_file]

In [None]:
# Show the loaded stop words on a single, space-separated line
listToStr = ' '.join(map(str, stopwords_list))
print(listToStr)

In [None]:
# Read saved data from disk
def load_pickle(filename):
    """Load and return the object stored in ``filename`` via ``joblib.load``.

    NOTE(review): joblib files are pickle-based, so loading one can execute
    arbitrary code - only load files from a trusted source.
    """
    return joblib.load(filename)
    
# Save data to disk for future use
def save_pickle(data, filename):
    """Write ``data`` to ``filename`` with ``joblib.dump``.

    Always returns ``True``; any error raised by ``joblib.dump`` propagates
    to the caller.
    """
    joblib.dump(data, filename)
    return True

In [None]:
# convert text to lower case
def _apply_lowercase(text):
    """Return ``text`` converted to lower case.

    The previous body was a copy of the stop-word filter and never changed
    the case of its input, contrary to the function's name.

    Parameters
    ----------
    text : str
        The string to normalise.

    Returns
    -------
    str
        ``text`` with every cased character lower-cased.
    """
    return text.lower()

# removing stopwords
def _stopwords_removal(text, stopwords=None):
    """Remove stop words from ``text`` and return the remaining words.

    The text is split on whitespace and compared word by word. (Iterating
    over the string directly yields single characters, which would delete
    every occurrence of a one-letter stop word such as "a" from inside other
    words and never match multi-letter stop words.)

    Parameters
    ----------
    text : str
        Input sentence or document.
    stopwords : iterable of str, optional
        Words to drop. Defaults to the module-level ``stopwords_list``.

    Returns
    -------
    str
        The kept words joined by single spaces.
    """
    if stopwords is None:
        stopwords = stopwords_list
    # set membership keeps the per-word check cheap for long stop-word lists
    stopword_set = set(stopwords)
    kept_words = [word for word in text.split() if word not in stopword_set]
    return ' '.join(kept_words)

# remove punctuations
def _punctuation_removal(text):
    """Return ``text`` with every character found in ``string.punctuation`` dropped."""
    return ''.join(
        symbol for symbol in text if symbol not in string.punctuation
    )

# Shuffle dataset
def _shuffle_dataset(dataset, random_state=None):
    """Return a row-shuffled copy of ``dataset`` with a fresh 0..n-1 index.

    The previous body ignored its argument and read the global ``data``, and
    its second assignment replaced the shuffled frame with the unshuffled
    one, so nothing was ever shuffled.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose rows should be shuffled. It is not modified.
    random_state : int or numpy.random.RandomState, optional
        Seed for a reproducible shuffle. ``None`` (default) gives a different
        order on every call.

    Returns
    -------
    pandas.DataFrame
        All rows of ``dataset`` in random order, re-indexed from 0.
    """
    # sample(frac=1) draws every row exactly once, i.e. a permutation
    shuffled = dataset.sample(frac=1, random_state=random_state)
    return shuffled.reset_index(drop=True)

In [None]:
# Normalise the free text in three passes:
# lower-case -> strip punctuation / special characters -> drop stop words
data['text'] = (
    data['text']
    .apply(str.lower)
    .apply(_punctuation_removal)
    .apply(_stopwords_removal)
)

# Lower-case the labels as well
data['Label'] = data['Label'].apply(str.lower)

# Shuffle the dataset to prevent bias:
data = _shuffle_dataset(data)

# Print head of the data
data.head()

### Encode output label

In [None]:
# Fit the label encoder on the 'Label' column and store the integer codes in
# a 'label_encoded' column of a new frame, leaving `data` itself unchanged
label_encoder = LabelEncoder()
data_encoded = data.assign(
    label_encoded=label_encoder.fit_transform(data['Label'])
)

data_encoded.head()

### Split Dataset

+ Training: 70% of the dataset
+ Testing: 30% of the dataset

In [None]:
list_classes = ["label_encoded"]

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    data_encoded["text"],
    data_encoded[list_classes],
    test_size=0.3,
    random_state=1,
)

# Convert the label frames to NumPy arrays
Y_train, Y_test = Y_train.values, Y_test.values

# Show dimension of the comments
X_train.shape, X_test.shape

### Tokenize Sentences

To train our model on text data, we first have to convert the text into numeric form. For this we use the `Tokenizer` class from the `keras.preprocessing.text` module.

In [None]:
# Raw sentence arrays for the two splits
list_sentences_train, list_sentences_test = X_train.values, X_test.values

In [None]:
# The vocabulary is learned from the training sentences only; both splits are
# then mapped to integer sequences with that same vocabulary
tokenizer = Tokenizer(lower=True)
tokenizer.fit_on_texts(list(list_sentences_train))

list_tokenized_train, list_tokenized_test = (
    tokenizer.texts_to_sequences(sentences)
    for sentences in (list_sentences_train, list_sentences_test)
)

In [None]:
# size of training vocabulary (number of unique words)
# +1 because the Keras Tokenizer numbers words from 1; index 0 is never
# assigned to a word
vocab_size = len(tokenizer.word_index) + 1
vocab_size

In [None]:
# Print one randomly chosen tokenized training sequence (a list of word indices).
# No seed is set, so the chosen sequence differs between runs.
print(random.choice(list_tokenized_train))

In [None]:
# Token count of every sequence in each split
train_lengths = [len(sequence) for sequence in list_tokenized_train]
test_lengths = [len(sequence) for sequence in list_tokenized_test]

print(len(train_lengths), 'train sequences')
print(len(test_lengths), 'test sequences')

# dtype=int makes np.mean report an integer average
print('Average train sequence length: {}'.format(np.mean(train_lengths, dtype=int)))
print('Average test sequence length: {}'.format(np.mean(test_lengths, dtype=int)))

## Pad tokenized Sentences

You might have observed that the sentences are not of the same lengths, so we need to pad them with zeros (0's) so that the resulting array will have equal length.

We'll use `pad_sequences` from the `keras.preprocessing.sequence` module.

We'll use a maximum sequence length of 300 tokens.

In [3]:
# Pad (or cut) every tokenized sequence to exactly `maxlen` entries.
# Note: X_train / X_test are rebound here from the raw-text Series produced by
# the split to the padded integer arrays that are fed to the model.
maxlen = 300
X_train = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_test = pad_sequences(list_tokenized_test, maxlen=maxlen)

NameError: name 'pad_sequences' is not defined

In [None]:
# Show the first padded training sequence
print(X_train[:1])

## Use pre-trained Word Embedding

In [None]:
# load the saved gensim Word2Vec model from disk
model_sg = Word2Vec.load(_embedding_model_filepath)

# load the saved word-vector array (model_sg.wv.vectors.npy) from disk
vectors = np.load(_pretrained_vectors_filepath)

# load the saved trainable weights array (model_sg.trainables.syn1neg.npy) from disk
vectors_neg = np.load(_neg_vectors_filepath)

In [None]:
# Get the model vocabulary in vector-row order.
# `index2word[i]` is the word whose vector is stored in row i of the vector
# array. Iterating the `wv.vocab` dict instead does not guarantee that order,
# which matters because `vocabs` is later paired with `vectors` by position.
# Taking the list directly also avoids leaking loop variables that then have
# to be deleted (and whose `del` fails when the vocabulary is empty).
vocabs = list(model_sg.wv.index2word)

In [None]:
# Check length of vectors: the three counts are shown side by side so that a
# mismatch between the vector arrays and the vocabulary is easy to spot
len(vectors), len(vectors_neg), len(vocabs)

In [None]:
# Create the word -> vector dictionary.
# Each word's row is looked up through the index stored in the model's
# vocabulary rather than by its position in `vocabs`, so the mapping is
# correct whatever order `vocabs` happens to be in. The comprehension also
# leaves no loop variables behind to clean up.
embeddings = {
    token: vectors[model_sg.wv.vocab[token].index]
    for token in vocabs
}

In [None]:
num_tokens = vocab_size
embedding_dim = 300

# Weight matrix for the words in the training docs.
# Rows start as zeros, so any word without a pretrained vector (and row 0,
# which no word uses) keeps an all-zero representation.
embedding_matrix = np.zeros((num_tokens, embedding_dim))

hits = misses = 0
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings.get(word)
    if embedding_vector is None:
        misses += 1
        continue
    embedding_matrix[i] = embedding_vector
    hits += 1

print("Converted %d words (%d misses)" % (hits, misses))

## Build Model

In [None]:
# Start an empty sequential model; the following cells append its layers one
# by one, so re-running any of them adds that layer a second time
model = Sequential()

In [None]:
# Each input sample is one padded sequence of `maxlen` word indices
model.add(Input(shape=(maxlen, )))

In [None]:
# Embedding layer initialised with the pretrained matrix and kept frozen
embed_size = 300
model.add(
    Embedding(
        input_dim=vocab_size,
        output_dim=embed_size,
        weights=[embedding_matrix],
        trainable=False,
    )
)

In [None]:
# LSTM with 60 units; return_sequences=True keeps the output of every time
# step so the next layer can pool over them
model.add(LSTM(60, return_sequences=True,name='lstm_layer'))

In [None]:
# Collapse the time dimension by taking the maximum of each feature
model.add(GlobalMaxPool1D())

In [None]:
# Drop 10% of the pooled features during training
model.add(Dropout(0.1))

In [None]:
# Fully connected hidden layer with 50 ReLU units
model.add(Dense(50, activation="relu"))

In [None]:
# Drop 10% of the hidden activations during training
model.add(Dropout(0.1))

In [None]:
# Output layer: one softmax unit per encoded label class.
# The unit count is taken from the fitted label encoder instead of being
# hard-coded, so the model always matches the number of classes in the data.
num_classes = len(label_encoder.classes_)
model.add(Dense(num_classes, activation="softmax"))

In [None]:
# The sparse variant of the loss is used because the targets are the integer
# class codes produced by the label encoder, not one-hot vectors
model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

In [None]:
# Print the layer-by-layer summary of the assembled model
model.summary()

### Create plot for model architecture

In [None]:
# Plot model: render the layer graph (with shapes and layer names, laid out
# top to bottom) and save it as a PNG file in the working directory.
# NOTE(review): plot_model normally needs pydot and Graphviz installed -
# confirm they are available in the target environment.
dot_img_file = 'model_1.png'
tf.keras.utils.plot_model(model, to_file=dot_img_file,
                          show_shapes=True,
                          show_layer_names=True,
                          rankdir="TB",
                          expand_nested=True,
                          dpi=96
                         )

### Train model

In [None]:
# saving training history: per-epoch metrics are written to 'training.log';
# append=False starts the file afresh on every training run
csv_logger = CSVLogger('training.log', separator=',', append=False)

In [None]:
# Train the model, logging per-epoch metrics through the CSV logger.
# NOTE(review): the test split is passed as validation data, so the
# 'val_*' metrics are test-set metrics and there is no separate validation set.
# `history` is rebound further down to the DataFrame read from 'training.log'.
batch_size = 16
epochs = 10
history = model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs, 
                    validation_data=(X_test, Y_test), callbacks=[csv_logger])

### Evaluate model

In [None]:
# Evaluate on the test split; with the compile settings above the two values
# returned are the loss ("score") and the accuracy
score, accuracy = model.evaluate(X_test, Y_test, verbose = 1, batch_size = 32)
print("score: %.4f" % (score))
print("acc: %.4f" % (accuracy))

In [None]:
# Softmax output of the model: one row of class probabilities per test sample
y_pred = model.predict(X_test, batch_size=32, verbose=1)

print("sample of probabilistic predictions: \n {}".format(y_pred[:2]))

In [None]:
# map classes to results: the predicted class of each sample is the index of
# its highest probability
predictions = np.argmax(y_pred, axis=1)
print("sample of y_pred: \n {}".format(predictions[:2]))

In [None]:
# Micro-averaged precision / recall / F1 in a single call (support is unused).
# NOTE(review): for single-label multiclass data the micro averages all equal
# plain accuracy, so these four numbers coincide; a macro or per-class view
# would be more informative.
precision, recall, f1, _support = precision_recall_fscore_support(
    Y_test, predictions, average='micro'
)
accuracy = accuracy_score(Y_test, predictions, normalize=True)

summary_template = "Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}, Accuracy: {:.4f}"
print(summary_template.format(precision, recall, f1, accuracy))

In [None]:
# Load training history: read the per-epoch log written by the CSV logger.
# This rebinds `history` from the object returned by model.fit to a DataFrame.
history = pd.read_csv('training.log', sep=',', engine='python')

history.head()

## Plot Model Accuracy & Loss

In [None]:
# Plot the training and validation curves: first accuracy, then loss.
# Each entry is (logged metric, figure title, legend position).
for metric, title, legend_loc in (
    ('accuracy', 'model accuracy', 'lower right'),
    ('loss', 'model loss', 'upper left'),
):
    plt.plot(history[metric])
    plt.plot(history['val_' + metric])
    plt.title(title)
    plt.ylabel(metric)
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc=legend_loc)
    plt.show()

## Confusion Matrix

In [None]:
cnf_matrix = confusion_matrix(Y_test, predictions)

# Per-class counts, one-vs-rest (rows = true class, columns = predicted class);
# converted to float up front so the ratios below are float divisions
TP = np.diag(cnf_matrix).astype(float)
FP = cnf_matrix.sum(axis=0).astype(float) - TP
FN = cnf_matrix.sum(axis=1).astype(float) - TP
TN = float(cnf_matrix.sum()) - (FP + FN + TP)

# Sensitivity, hit rate, recall, or true positive rate
TPR = TP / (TP + FN)
# Specificity or true negative rate
TNR = TN / (TN + FP)
# Precision or positive predictive value
PPV = TP / (TP + FP)
# Negative predictive value
NPV = TN / (TN + FN)
# Fall out or false positive rate
FPR = FP / (FP + TN)
# False negative rate
FNR = FN / (TP + FN)
# False discovery rate
FDR = FP / (TP + FP)
# Overall accuracy
ACC = (TP + TN) / (TP + FP + FN + TN)

In [None]:
# Per-class counts, then the per-class rates derived from them
# (each printed array holds one entry per class)
print(TP, FP, TN, FN)
print(TPR, TNR, PPV, NPV, FPR, FNR, FDR, ACC)

## Plot ROC Curve

In [None]:
# One-vs-rest ROC curve per class, built from the predicted probabilities.
# The confusion-matrix FPR/TPR values give a single operating point per class,
# so connecting them does not produce a ROC curve; sweeping the decision
# threshold over each class's probability does.
y_true = np.ravel(Y_test)
for class_idx, class_name in enumerate(label_encoder.classes_):
    fpr, tpr, thresholds = roc_curve(y_true == class_idx, y_pred[:, class_idx])
    plt.plot(
        fpr,
        tpr,
        marker='.',
        label='LSTM - {} (AUC = {:.3f})'.format(class_name, metrics.auc(fpr, tpr)),
    )

# chance-level reference line
plt.plot([0, 1], [0, 1], linestyle='--', label='No Skill')

# axis labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# show the legend
plt.legend()
# show the plot
plt.show()