In [None]:
# Importing libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf

In [None]:
# Mount Google Drive (Colab only) so the dataset stored there can be read.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the review dataset from the mounted Drive folder.
DATA_CSV_PATH = '/content/drive/MyDrive/AIP Team 5/new_data.csv'
data = pd.read_csv(DATA_CSV_PATH)

# Preview the first 5 rows.
data.head(n=5)

Unnamed: 0,comment,label,processed_text,comment_len,processed_text_len
0,Mohammad harun he is an awesome guy very info...,1,mohammad harun awesome guy informative helpful...,44,22.0
1,amazing guy gaurav was so patience and kind ...,1,amazing guy gaurav patience kind helped start ...,55,27.0
2,Gaurav was very knowledgeable and very helpful...,1,gaurav knowledgeable helpful knows situation a...,36,17.0
3,I called them regarding my flight cancellation...,1,i called regarding flight cancellation narrate...,34,19.0
4,VERY GOOD SERVICE BY GAURAV LOHAT SERVED AS SO...,1,very good service by gaurav lohat served as so...,23,23.0


In [None]:
# Lowercase the pre-processed review text; the column is overwritten in `data`.
lowercased_text = data['processed_text'].str.lower()
data['processed_text'] = lowercased_text

In [None]:
# data['label'] = data['label'].apply(lambda x: 1 if x=='Postive' else 0)

In [None]:
# Partition the rows by class once; every split below draws from these two views.
positives = data[data['label'] == 1]
negatives = data[data['label'] == 0]

# Train: 30000 positive and 20000 negative rows, sampled at random with a fixed seed.
df_train_pos = positives.sample(n=30000, random_state=13)
df_train_neg = negatives.sample(n=20000, random_state=13)

# Test: every row of each class that was not drawn for training.
df_test_pos = positives.drop(index=df_train_pos.index)
df_test_neg = negatives.drop(index=df_train_neg.index)

# Stack positives on top of negatives for each split (rows are not shuffled here).
df_train = pd.concat([df_train_pos, df_train_neg])
df_test = pd.concat([df_test_pos, df_test_neg])


In [None]:
# # 5k sample dataset for trials
# df_5k = df_train.sample(5000,random_state=13)
# X_train = df_5k['processed_text']
# y_train = df_5k['label']

# df_2k = df_train.sample(2000,random_state=13)
# X_test = df_2k['processed_text']
# y_test = df_2k['label']


In [None]:
# Features are the pre-processed review text; targets are the binary labels.
X_train, y_train = df_train['processed_text'], df_train['label']
X_test, y_test = df_test['processed_text'], df_test['label']


In [None]:
# Per-class sample counts, displayed as a (test, train) pair.
tuple(labels.value_counts() for labels in (y_test, y_train))

(1    245522
 0      8459
 Name: label, dtype: int64,
 1    30000
 0    20000
 Name: label, dtype: int64)

In [None]:
# Word2Vec hyper-parameters.
# -- used when the model is constructed
W2V_SIZE = 300        # passed as vector_size; also the width of the embedding matrix
W2V_WINDOW = 7        # passed as window
W2V_MIN_COUNT = 10    # passed as min_count
# -- used when the model is trained
W2V_EPOCH = 32        # passed as epochs

In [None]:
# Split every training review on whitespace into a list of tokens.
# NOTE: X_train must still hold the raw text here; a later cell re-binds it to
# padded integer arrays, so this cell cannot be re-run after that one.
documents = []
for review in X_train:
    documents.append(review.split())
len(documents)

50000

In [None]:
# Create the Word2Vec model (no corpus is passed to the constructor) and then
# build its vocabulary from the tokenised training reviews.
import gensim

w2v_hyperparams = {
    'vector_size': W2V_SIZE,
    'window': W2V_WINDOW,
    'min_count': W2V_MIN_COUNT,
}
w2v_model = gensim.models.word2vec.Word2Vec(**w2v_hyperparams)

w2v_model.build_vocab(documents)

In [None]:
# Train Word2Vec on the tokenised training reviews for W2V_EPOCH epochs.
n_documents = len(documents)
w2v_model.train(documents, total_examples=n_documents, epochs=W2V_EPOCH)

(42598795, 50454304)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Every review is padded (at the end) or cut to this many tokens.
max_review_length = 40

# Learn the word -> integer index from the training text only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
words_to_index = tokenizer.word_index
# One more row than there are known words.
# NOTE(review): presumably reserves index 0 for padding -- confirm against the Keras docs.
vocab_size = len(words_to_index) + 1

# Replace each review by the list of its word indices.
X_train_encoding = tokenizer.texts_to_sequences(X_train)
X_test_encoding = tokenizer.texts_to_sequences(X_test)

# NOTE: from here on X_train / X_test are padded integer arrays, no longer text.
# Re-running this cell (or any earlier cell that reads the text from these names)
# requires re-creating them from df_train / df_test first.
X_train = pad_sequences(X_train_encoding, padding='post', maxlen=max_review_length)
X_test = pad_sequences(X_test_encoding, padding='post', maxlen=max_review_length)

In [None]:
# Row i of the matrix receives the Word2Vec vector of the word the tokenizer
# mapped to index i; rows that receive no vector stay all-zero.
embedding_matrix = np.zeros((vocab_size, W2V_SIZE))
word_vectors = w2v_model.wv
for word, i in tokenizer.word_index.items():
    if word not in word_vectors:
        # Word is absent from the Word2Vec vocabulary: keep the zero row.
        continue
    embedding_matrix[i] = word_vectors[word]
print(embedding_matrix.shape)

(28042, 300)


In [None]:
# Embedding layer initialised from the Word2Vec matrix and frozen (trainable=False).
# NOTE(review): mask_zero is not set, so padded positions are presumably fed to the
# LSTMs like any other token -- confirm this is intended.
embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=W2V_SIZE,
    weights=[embedding_matrix],
    input_length=max_review_length,
    trainable=False,
)

In [None]:
import tensorflow as tf

# Layer widths
embedding_dim = 16  # NOTE(review): not referenced below; the embedding width comes from embedding_layer
lstm_dim = 32
dense_dim = 24

keras_layers = tf.keras.layers

# Assemble the network one layer at a time, in the order the data flows through it.
model = tf.keras.Sequential()
model.add(embedding_layer)
# Two stacked bidirectional LSTMs; each hands the full sequence to the next layer.
model.add(keras_layers.Bidirectional(keras_layers.LSTM(lstm_dim, return_sequences=True, dropout=0.2)))
model.add(keras_layers.Bidirectional(keras_layers.LSTM(lstm_dim, return_sequences=True, dropout=0.2)))
# Last recurrent layer: return_sequences is left at its default, giving one vector per review.
model.add(keras_layers.LSTM(lstm_dim, dropout=0.2))
# Dense head ending in a single sigmoid unit for the binary label.
model.add(keras_layers.Dense(dense_dim, activation='relu'))
model.add(keras_layers.Dropout(0.2))
model.add(keras_layers.Dense(dense_dim, activation='relu'))
model.add(keras_layers.Dense(1, activation='sigmoid'))

# Training configuration
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Layer-by-layer overview
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 40, 300)           8412600   
                                                                 
 bidirectional (Bidirectiona  (None, 40, 64)           85248     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 40, 64)           24832     
 nal)                                                            
                                                                 
 lstm_2 (LSTM)               (None, 32)                12416     
                                                                 
 dense (Dense)               (None, 24)                792       
                                                                 
 dropout (Dropout)           (None, 24)                0

In [None]:
# Number of passes over the training set.
NUM_EPOCHS = 20

# Fit on the padded training sequences; the value returned by fit is kept in history_lstm.
history_lstm = model.fit(x=X_train, y=y_train, epochs=NUM_EPOCHS)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Model outputs for the padded test sequences.
prediction = model.predict(x=X_test)



In [None]:
# Threshold the model outputs at 0.5 to obtain boolean class predictions.
y_pred = np.greater(prediction, 0.5)

In [None]:
from sklearn.metrics import accuracy_score,f1_score, confusion_matrix,recall_score,precision_score

# Test-set metrics.
# scikit-learn metric functions take (y_true, y_pred) in that order. Accuracy and
# binary F1 happen to be symmetric, but the order is kept correct and consistent
# with confusion_matrix so that adding an asymmetric metric (precision, recall)
# later does not silently report the wrong quantity.
print("Accuracy of the model : ", accuracy_score(y_test, y_pred))
print('F1-score: ', f1_score(y_test, y_pred))
print('Confusion matrix:')
# Rows correspond to true labels, columns to predicted labels.
confusion_matrix(y_test, y_pred)

Accuracy of the model :  0.9097019068355507
F1-score:  0.9513700111534725
Confusion matrix:


array([[  6713,   1746],
       [ 21188, 224334]])

In [None]:
# Score the training set as well and threshold the outputs at 0.5.
# NOTE: this re-binds `prediction` and `y_pred`, replacing the test-set values.
prediction = model.predict(x=X_train)
y_pred = np.greater(prediction, 0.5)



In [None]:
from sklearn.metrics import accuracy_score,f1_score, confusion_matrix,recall_score,precision_score

# Training-set metrics (y_pred must hold the training-set predictions at this point).
# scikit-learn metric functions take (y_true, y_pred) in that order. Accuracy and
# binary F1 happen to be symmetric, but the order is kept correct and consistent
# with confusion_matrix so that adding an asymmetric metric (precision, recall)
# later does not silently report the wrong quantity.
print("Accuracy of the model : ", accuracy_score(y_train, y_pred))
print('F1-score: ', f1_score(y_train, y_pred))
print('Confusion matrix:')
# Rows correspond to true labels, columns to predicted labels.
confusion_matrix(y_train, y_pred)

Accuracy of the model :  0.93688
F1-score:  0.9484970136101049
Confusion matrix:


array([[17783,  2217],
       [  939, 29061]])