In [None]:
# Importing libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf

In [None]:
# Mount Google Drive at /content/drive so that the files read in later cells
# (paths under /content/drive/MyDrive) are reachable.
# The google.colab module is Colab-specific, so this cell needs a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the dataset from the mounted Drive folder into a DataFrame.
# Later cells read its 'processed_text' and 'label' columns.
data = pd.read_csv('/content/drive/MyDrive/AIP Team 5/new_data.csv')

# Preview the first 5 rows of the dataset
data.head()

In [None]:
# Lowercase the text column; the column is overwritten with the lowercased values.
data['processed_text'] = data['processed_text'].str.lower()

In [None]:
# Model input (text column) and target (label column) for the train/test split
X, y = data['processed_text'], data['label']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Hold out part of the data for evaluation.
# - random_state pins the shuffle so Restart & Run All reproduces the same split.
# - stratify=y keeps the label proportions equal in the train and test parts
#   (the training part is oversampled later, which suggests the labels are imbalanced).
# NOTE(review): test_size=0.75 leaves only 25% of the rows for training; a value
# such as 0.25 is more usual — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.75, random_state=42, stratify=y
)

In [None]:
# Number of training samples per label value (shows the class balance before oversampling)
Counter(y_train)

In [None]:
# Fit a TF-IDF vocabulary on the training texts only (X_test is not used here)
vectorizer = TfidfVectorizer()
vectorizer.fit(X_train)

In [None]:
# TF-IDF matrix of the training texts; this is the feature input handed to the oversampler
X_train_tf = vectorizer.transform(X_train)

In [None]:
# Randomly duplicate minority-class rows until minority/majority reaches 0.5.
# random_state makes the choice of duplicated rows reproducible across runs.
overSampler = RandomOverSampler(sampling_strategy=0.5, random_state=42)
X_train_os, y_train_os = overSampler.fit_resample(X_train_tf, y_train)

In [None]:
# Recover the text of every resampled row.
#
# RandomOverSampler only duplicates existing rows, and after fit_resample it
# exposes the positions it picked as `sample_indices_` (same order as y_train_os).
# Looking the rows up in the original text keeps word order and repeated words.
# The previous TfidfVectorizer.inverse_transform(X_train_os) returned only the
# set of vocabulary terms present in each document, in vocabulary order, so the
# sequence model was trained on scrambled text.
#
# Each document is stored as a list of whitespace-separated tokens so that the
# later cell that does ' '.join(...) on every entry keeps working unchanged.
# The lookup goes through X / y_train (not X_train), so this cell can be re-run.
resampled_index = y_train.index[overSampler.sample_indices_]
X_train = [text.split() for text in X.loc[resampled_index]]

In [None]:
# Sanity check: number of resampled texts next to number of resampled labels
len(X_train),len(y_train_os)

In [None]:
# Build a word -> vector lookup from the pre-trained GloVe file.
# The embedding matrix for the model is filled from this dictionary later on.
embedding_dict = {}

glove_file_path = r"/content/drive/MyDrive/Sem 3/Deep Learning/glove.6B.100d.txt"

# State the encoding explicitly instead of relying on the platform default:
# the GloVe vocabulary contains non-ASCII tokens.
with open(glove_file_path, 'r', encoding='utf-8') as f:
    for line in f:
        # Every line holds a word followed by its embedding values
        values = line.split()

        # Skip blank lines (e.g. a trailing newline) instead of failing on values[0]
        if not values:
            continue

        # The first token is the word, the remaining tokens are its vector
        word = values[0]
        vectors = np.asarray(values[1:], 'float32')
        embedding_dict[word] = vectors

# No explicit f.close(): the with-block has already closed the file.

In [None]:
# Join each document's tokens back into a single space-separated string
temp = [' '.join(xt) for xt in X_train]


In [None]:
# X_train now refers to the list of joined strings held in `temp`
X_train = temp

In [None]:
# Spot-check one training text (position 15)
X_train[15]

In [None]:
# from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# NOTE: this cell overwrites X_train / X_test with padded integer arrays, so
# re-running it without first re-running the cells that build the text versions
# would feed those arrays back into the tokenizer.

# Word-level tokenizer; its vocabulary is learned from the training texts only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# word -> integer index mapping learned by the tokenizer
words_to_index = tokenizer.word_index

# Every sequence is padded or truncated to this many tokens
max_review_length = 70

# Encode the texts as lists of word indices
X_train_encoding = tokenizer.texts_to_sequences(X_train)
X_test_encoding = tokenizer.texts_to_sequences(X_test)

# Pad at the end ('post') so that all sequences have the same length
X_train = pad_sequences(X_train_encoding, maxlen=max_review_length, padding='post')
X_test = pad_sequences(X_test_encoding, maxlen=max_review_length, padding='post')

In [None]:
# Vocabulary size: one row per word index plus one extra row
# (presumably for index 0, which the padding uses — Keras Tokenizer indices start at 1)
vocab_len = len(words_to_index)+1

# Embedding matrix with one 100-dimensional row per vocabulary index;
# rows of words that have no pre-trained vector stay all-zero
emb_matrix = np.zeros((vocab_len, 100))

# Indices of vocabulary words that were not found in the GloVe dictionary
not_list = []
for word, index in words_to_index.items():
  embedding_vector = embedding_dict.get(word)

  # no pre-trained vector for this word: remember its index and move on
  if embedding_vector is None:
    not_list.append(index)
    continue

  emb_matrix[index, :] = embedding_vector

# Embedding layer initialised with the matrix above; trainable=True lets the
# vectors be updated during training
embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_len,
    output_dim=100,
    input_length=max_review_length,
    weights=[emb_matrix],
    trainable=True,
)

In [None]:
import tensorflow as tf

# Layer sizes
embedding_dim = 16  # not referenced by the layers below
lstm_dim = 32
dense_dim = 24

# Assemble the network layer by layer:
# embedding -> 2 x bidirectional LSTM -> LSTM -> dense head with a sigmoid output
model = tf.keras.Sequential()
model.add(embedding_layer)
model.add(tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(lstm_dim, return_sequences=True, dropout=0.2)))
model.add(tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(lstm_dim, return_sequences=True, dropout=0.2)))
# the last recurrent layer returns only its final output
model.add(tf.keras.layers.LSTM(lstm_dim, dropout=0.2))
model.add(tf.keras.layers.Dense(dense_dim, activation='relu'))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(dense_dim, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# Binary cross-entropy loss, Adam optimizer, accuracy reported during training
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Show the layer-by-layer summary
model.summary()

In [None]:
# Number of passes over the training data
NUM_EPOCHS = 20

# Fit on the padded training sequences and the oversampled labels;
# the object returned by fit() is kept in history_lstm
history_lstm = model.fit(x=X_train, y=y_train_os, epochs=NUM_EPOCHS)

In [None]:
# Model outputs for the padded test sequences (the final layer is a sigmoid)
prediction = model.predict(X_test)

In [None]:
# Threshold the model outputs at 0.5 to obtain boolean class predictions
y_pred = (prediction > 0.5)

In [None]:
from sklearn.metrics import accuracy_score,f1_score, confusion_matrix,recall_score,precision_score

# Test-set metrics.
# scikit-learn metric functions take (y_true, y_pred) in that order. Accuracy
# and binary F1 give the same number either way, but asymmetric metrics such as
# precision and recall would be silently swapped, so the order is kept
# consistent with the confusion_matrix call below.
print("Accuracy of the model : ", accuracy_score(y_test, y_pred))
print('F1-score: ', f1_score(y_test, y_pred))
print('Confusion matrix:')
confusion_matrix(y_test,y_pred)

In [None]:
# Predict on the training sequences as well, for comparison with the test metrics.
# NOTE: this overwrites `prediction` and `y_pred` from the test-set evaluation.
prediction = model.predict(X_train)
y_pred = (prediction > 0.5)

In [None]:
from sklearn.metrics import accuracy_score,f1_score, confusion_matrix,recall_score,precision_score

# Training-set metrics (computed against the oversampled labels).
# scikit-learn metric functions take (y_true, y_pred) in that order. Accuracy
# and binary F1 give the same number either way, but asymmetric metrics such as
# precision and recall would be silently swapped, so the order is kept
# consistent with the confusion_matrix call below.
print("Accuracy of the model : ", accuracy_score(y_train_os, y_pred))
print('F1-score: ', f1_score(y_train_os, y_pred))
print('Confusion matrix:')
confusion_matrix(y_train_os,y_pred)