In [None]:
# Importing libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf

In [None]:
# Mount Google Drive inside the Colab VM so the dataset file can be read from it
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the dataset CSV from the mounted Google Drive
dataset_path = '/content/drive/MyDrive/AIP Team 5/new_data.csv'
data = pd.read_csv(dataset_path)

# preview the first 5 rows of the dataset
data.head(5)

Unnamed: 0,comment,label,processed_text,comment_len,processed_text_len
0,Mohammad harun he is an awesome guy very info...,1,mohammad harun awesome guy informative helpful...,44,22.0
1,amazing guy gaurav was so patience and kind ...,1,amazing guy gaurav patience kind helped start ...,55,27.0
2,Gaurav was very knowledgeable and very helpful...,1,gaurav knowledgeable helpful knows situation a...,36,17.0
3,I called them regarding my flight cancellation...,1,i called regarding flight cancellation narrate...,34,19.0
4,VERY GOOD SERVICE BY GAURAV LOHAT SERVED AS SO...,1,very good service by gaurav lohat served as so...,23,23.0


In [None]:
# Lower-case the already pre-processed text column (missing values stay missing)
lowercased_text = data['processed_text'].str.lower()
data['processed_text'] = lowercased_text

In [None]:
# data['label'] = data['label'].apply(lambda x: 1 if x=='Postive' else 0)

In [None]:
# Partition the rows by class once, so the label filter is not recomputed for every split
positive_rows = data[data['label'] == 1]
negative_rows = data[data['label'] == 0]

# training part: 30000 random positive and 20000 random negative samples (fixed seed)
df_train_pos = positive_rows.sample(n=30000, random_state=13)
df_train_neg = negative_rows.sample(n=20000, random_state=13)

# test part: every row of each class that was not drawn for training
df_test_pos = positive_rows.drop(index=df_train_pos.index)
df_test_neg = negative_rows.drop(index=df_train_neg.index)

# joining positive and negative samples (positives first) for train and test
df_train = pd.concat([df_train_pos, df_train_neg])
df_test = pd.concat([df_test_pos, df_test_neg])


In [None]:
# # 5k sample dataset for trials
# df_5k = df_train.sample(5000,random_state=13)
# X_train = df_5k['processed_text']
# y_train = df_5k['label']

# df_2k = df_train.sample(2000,random_state=13)
# X_test = df_2k['processed_text']
# y_test = df_2k['label']


In [None]:
# Features (pre-processed text) and targets (label) for the train and test splits
X_train, y_train = df_train['processed_text'], df_train['label']
X_test, y_test = df_test['processed_text'], df_test['label']


In [None]:
# Per-class sample counts, shown as a (test, train) pair
tuple(labels.value_counts() for labels in (y_test, y_train))

(1    245522
 0      8459
 Name: label, dtype: int64, 1    30000
 0    20000
 Name: label, dtype: int64)

# Tasks to complete
- Oversampling and undersampling...
- What is the accuracy of the model?...
- Analysing the incorrect predictions...
- Transformer...


# Transformers

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
# from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_review_length = 60

# Always start from the text columns of the split frames rather than from
# X_train / X_test: this cell rebinds X_train / X_test to padded integer
# arrays, so reading the text from them would break on a second run.
# Missing texts become empty strings because the tokenizer only accepts strings.
train_texts = df_train['processed_text'].fillna('').astype(str)
test_texts = df_test['processed_text'].fillna('').astype(str)

tokenizer = Tokenizer()

# encoding the words to numbers; the vocabulary is learned from the training texts only
tokenizer.fit_on_texts(train_texts)
X_train_encoding = tokenizer.texts_to_sequences(train_texts)
X_test_encoding = tokenizer.texts_to_sequences(test_texts)

# using padding to make every review of equal size (zeros are appended at the end)
X_train = pad_sequences(X_train_encoding, maxlen=max_review_length, padding='post')
X_test = pad_sequences(X_test_encoding, maxlen=max_review_length, padding='post')

words_to_index = tokenizer.word_index

In [None]:
class TransformerBlock(layers.Layer):
    """Transformer encoder block: self-attention followed by a feed-forward network.

    Each of the two sub-layers is followed by dropout, a residual (skip)
    connection and layer normalization.

    Args:
        embed_dim: Output width of the feed-forward network; also passed as
            the ``key_dim`` of the attention layer.
        num_heads: Number of attention heads.
        ff_dim: Number of units of the hidden ``Dense`` layer in the
            feed-forward network.
        rate: Dropout rate used after each sub-layer.
        **kwargs: Forwarded to ``layers.Layer`` (for example ``name``).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Run the block on ``inputs``.

        Args:
            inputs: Tensor used as both query and value of the attention layer.
            training: Forwarded to the dropout layers. Defaults to ``None`` so
                the block can be called without the argument.

        Returns:
            The layer-normalized output of the feed-forward sub-layer.
        """
        # Self-attention: the sequence attends to itself.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        # First residual connection + normalization.
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        # Second residual connection + normalization.
        return self.layernorm2(out1 + ffn_output)

In [None]:
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding."""

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        # One embedding table indexed by token id, one indexed by position.
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        # Positions 0 .. (size of the last axis of x) - 1, looked up in the position table.
        seq_len = tf.shape(x)[-1]
        position_vectors = self.pos_emb(tf.range(seq_len))
        token_vectors = self.token_emb(x)
        return token_vectors + position_vectors

In [None]:
# One more than the number of known words: the Keras Tokenizer numbers words
# from 1, and id 0 is what pad_sequences fills in as padding.
vocab_size = 1 + len(words_to_index)

In [None]:
# Hyper-parameters: embedding size per token, number of attention heads,
# hidden layer size of the feed-forward network inside the transformer
embed_dim, num_heads, ff_dim = 16, 2, 24

# Token ids -> token + position embeddings -> one transformer block
inputs = layers.Input(shape=(max_review_length,))
embedding_layer = TokenAndPositionEmbedding(
    maxlen=max_review_length, vocab_size=vocab_size, embed_dim=embed_dim
)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(
    embed_dim=embed_dim, num_heads=num_heads, ff_dim=ff_dim
)
x = transformer_block(x)

# Classification head: average over the sequence, then a small dense network
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(rate=0.1)(x)
x = layers.Dense(units=20, activation="relu")(x)
x = layers.Dropout(rate=0.1)(x)
outputs = layers.Dense(units=2, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)

In [None]:
# Integer class labels with a 2-unit softmax output -> sparse categorical cross-entropy
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

# Train on the whole training split; the per-epoch metrics are kept in `history`
history = model.fit(x=X_train, y=y_train, epochs=20, batch_size=32)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Softmax outputs of the model for every padded test review
prediction = model.predict(x=X_test)



In [None]:
# Vectorized class decision instead of a Python loop over every row:
# class 0 only when its score is strictly larger, otherwise class 1
# (so ties still go to class 1, exactly as before).
class_scores = np.asarray(prediction)
y_pred = np.where(class_scores[:, 0] > class_scores[:, 1], 0, 1)

In [None]:
from sklearn.metrics import accuracy_score,f1_score, confusion_matrix,recall_score,precision_score

# scikit-learn metrics expect the ground truth first and the predictions second;
# all three calls now use the same (y_test, y_pred) order.
print("Accuracy of the model : ", accuracy_score(y_test, y_pred))
print('F1-score: ', f1_score(y_test, y_pred))
print('Confusion matrix:')
# rows are the true classes, columns are the predicted classes
confusion_matrix(y_test,y_pred)

Accuracy of the model :  0.8645764840677058
F1-score:  0.9253232302398037
Confusion matrix:


array([[  6491,   1968],
       [ 32427, 213095]])

In [None]:
# Predictions on the training split.
# NOTE: this rebinds `prediction` and `y_pred`, which held the test-set results before this cell.
prediction = model.predict(X_train)

# Vectorized class decision instead of a Python loop over every row:
# class 0 only when its score is strictly larger, otherwise class 1
# (so ties still go to class 1, exactly as before).
class_scores = np.asarray(prediction)
y_pred = np.where(class_scores[:, 0] > class_scores[:, 1], 0, 1)




In [None]:
from sklearn.metrics import accuracy_score,f1_score, confusion_matrix,recall_score,precision_score

# scikit-learn metrics expect the ground truth first and the predictions second;
# all three calls now use the same (y_train, y_pred) order.
print("Accuracy of the model : ", accuracy_score(y_train, y_pred))
print('F1-score: ', f1_score(y_train, y_pred))
print('Confusion matrix:')
# rows are the true classes, columns are the predicted classes
confusion_matrix(y_train,y_pred)

Accuracy of the model :  0.97876
F1-score:  0.9824607762180017
Confusion matrix:


array([[19194,   806],
       [  256, 29744]])

NameError: ignored