In [1]:
import keras
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
train_data_path = 'data/train_4000.csv'
test_data_path = 'data/test_4000.csv'
val_data_path = 'data/val_4000.csv'

first_n_words = 200

# Read raw data
df_train = pd.read_csv(train_data_path)
df_test = pd.read_csv(test_data_path)
df_val = pd.read_csv(val_data_path)

df_train[['label']] = df_train[['label']].replace(["negative", "positive"],[0, 1])
df_test[['label']] = df_test[['label']].replace(["negative", "positive"],[0, 1])
df_val[['label']] = df_val[['label']].replace(["negative", "positive"],[0, 1])


# Take particular columns
train_sentences = df_train['sequence'].values
test_sentences = df_test['sequence'].values
val_sentences = df_val['sequence'].values
train_labels = df_train['label'].values
test_labels = df_test['label'].values
val_labels = df_val['label'].values


# Hyperparameters of the model
vocab_size = 3000 # choose based on statistics
oov_tok = ''
embedding_dim = 100
max_length = 100 # choose based on statistics, for example 100 to 200
padding_type='post'
trunc_type='post'

# tokenize sentences
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

# convert train dataset to sequence and pad sequences
train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences, padding='post', maxlen=max_length)

# convert Test dataset to sequence and pad sequences
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, padding='post', maxlen=max_length)

# convert Val dataset to sequence and pad sequences
val_sequences = tokenizer.texts_to_sequences(val_sentences)
val_padded = pad_sequences(val_sequences, padding='post', maxlen=max_length)

In [3]:
train_padded

array([[ 473,   94,   54, ...,    0,    0,    0],
       [   1, 2697,    2, ...,    0,    0,    0],
       [   1,   38, 1965, ...,    0,    0,    0],
       ...,
       [  26,   67,   13, ...,    0,    0,    0],
       [  54, 2023, 2156, ...,    0,    0,    0],
       [   1,  107,   14, ...,    0,    0,    0]], dtype=int32)

In [4]:
test_padded

array([[1424,    0,    0, ...,    0,    0,    0],
       [   1,    1,  592, ...,    0,    0,    0],
       [1080,    4,  568, ...,    0,    0,    0],
       ...,
       [   4,  610,    6, ...,    0,    0,    0],
       [1267,   70,    2, ...,    0,    0,    0],
       [ 329,  932,    2, ...,    0,    0,    0]], dtype=int32)

In [5]:
val_padded

array([[ 151,   37, 2315, ...,    0,    0,    0],
       [ 682,   98,    4, ...,    0,    0,    0],
       [   1,   69,  109, ...,    0,    0,    0],
       ...,
       [  35,    1,   27, ...,    0,    0,    0],
       [ 248,  315,  837, ...,    0,    0,    0],
       [  37,   34,   59, ...,    0,    0,    0]], dtype=int32)

In [6]:
# Model initialization
model = keras.Sequential([
    keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    keras.layers.Bidirectional(keras.layers.LSTM(64)),
    keras.layers.Dense(24, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])
# Compile model
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# Model summary
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          300000    
                                                                 
 bidirectional (Bidirectiona  (None, 128)              84480     
 l)                                                              
                                                                 
 dense (Dense)               (None, 24)                3096      
                                                                 
 dense_1 (Dense)             (None, 1)                 25        
                                                                 
Total params: 387,601
Trainable params: 387,601
Non-trainable params: 0
_________________________________________________________________


In [7]:
# Training
num_epochs = 3
history = model.fit(train_padded, train_labels, 
                    epochs=num_epochs, verbose=1, 
                    validation_split=0.1)
model.save('checkpoints/lstm-base-uncased_4000_0_best.h5')

Epoch 1/3


2023-04-07 13:16:59.844515: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/3
Epoch 3/3


In [8]:
# Checking reconstruct the model identically.
model_check = keras.models.load_model("checkpoints/lstm-base-uncased_4000_0_best.h5")

prediction = model_check.predict(test_padded)
# Get labels based on probability 1 if p>= 0.5 else 0

pred_labels = []
for i in prediction:
    if i >= 0.5:
        pred_labels.append(1)
    else:
        pred_labels.append(0)
print("Accuracy of prediction on test set : ", accuracy_score(test_labels,pred_labels))
print("F1 Score of prediction on test set : ", f1_score(test_labels,pred_labels))
print("Precision of prediction on val set : ", precision_score(test_labels,pred_labels))
print("Recall of prediction on val set : ", recall_score(test_labels,pred_labels))

df_test['predicted'] = pred_labels

Accuracy of prediction on test set :  0.704375
F1 Score of prediction on test set :  0.6451612903225806
Precision of prediction on val set :  0.7142857142857143
Recall of prediction on val set :  0.5882352941176471


In [9]:
prediction = model_check.predict(val_padded)
# Get labels based on probability 1 if p>= 0.5 else 0

pred_labels = []
for i in prediction:
    if i >= 0.5:
        pred_labels.append(1)
    else:
        pred_labels.append(0)
print("Accuracy of prediction on val set : ", accuracy_score(val_labels,pred_labels))
print("F1 Score of prediction on val set : ", f1_score(val_labels,pred_labels))
print("Precision of prediction on val set : ", precision_score(val_labels,pred_labels))
print("Recall of prediction on val set : ", recall_score(val_labels,pred_labels))



df_val['predicted'] = pred_labels

Accuracy of prediction on val set :  0.6825
F1 Score of prediction on val set :  0.6231454005934719
Precision of prediction on val set :  0.7094594594594594
Recall of prediction on val set :  0.5555555555555556


In [10]:
prediction = model_check.predict(val_padded)
# Get labels based on probability 1 if p>= 0.5 else 0

pred_labels = []
for i in prediction:
    if i >= 0.5:
        pred_labels.append(1)
    else:
        pred_labels.append(0)
print("Accuracy of prediction on val set : ", accuracy_score(val_labels,pred_labels))
print("F1 Score of prediction on val set : ", f1_score(val_labels,pred_labels))
print("Precision of prediction on val set : ", precision_score(val_labels,pred_labels))
print("Recall of prediction on val set : ", recall_score(val_labels,pred_labels))

df_val['predicted'] = pred_labels

Accuracy of prediction on val set :  0.6825
F1 Score of prediction on val set :  0.6231454005934719
Precision of prediction on val set :  0.7094594594594594
Recall of prediction on val set :  0.5555555555555556


In [11]:
# Check on some given sentences
sentence = ["The movie was very touching and heart whelming", 
            "I have never seen a terrible movie like this", 
            "it was a drastic and abad turn around",
            "game was amazing",
            "nice one",
            "looks bad"]
# convert to a sequence
sequences = tokenizer.texts_to_sequences(sentence)
print('token', tokenizer)
print(sequences)
# pad the sequence
padded = pad_sequences(sequences, padding='post', maxlen=max_length)
# Get labels based on probability 1 if p>= 0.5 else 0
prediction = model_check.predict(padded)
pred_labels = []
for i in prediction:
    if i >= 0.5:
        pred_labels.append(1)
    else:
        pred_labels.append(0)
for i in range(len(sentence)):
    print(sentence[i])
    if pred_labels[i] == 1:
        s = 'Positive'
    else:
        s = 'Negative'
    print("Predicted sentiment : ",s)

token <keras.preprocessing.text.Tokenizer object at 0x168c23c70>
[[2, 722, 15, 125, 1, 5, 946, 1], [8, 23, 181, 546, 4, 2316, 722, 53, 31], [17, 15, 4, 1, 5, 1, 693, 319], [772, 15, 924], [255, 48], [444, 208]]
The movie was very touching and heart whelming
Predicted sentiment :  Positive
I have never seen a terrible movie like this
Predicted sentiment :  Negative
it was a drastic and abad turn around
Predicted sentiment :  Positive
game was amazing
Predicted sentiment :  Positive
nice one
Predicted sentiment :  Positive
looks bad
Predicted sentiment :  Negative
