In [1]:
import pandas as pd
import numpy as np
import gensim
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, SimpleRNN, Dense
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score,classification_report
import gensim.downloader as api

import tensorflow as tf
import random
seed = 42
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

# Read preprocessed data

In [2]:
df = pd.read_csv("dataset_imdb_preprocessed.csv",index_col=0)
df

Unnamed: 0,sentiment,lemmatized_review
0,1,one reviewer ha mentioned watching 1 oz episod...
1,1,wonderful little production filming technique ...
2,1,thought wa wonderful way spend time hot summer...
3,0,basically family little boy jake think zombie ...
4,1,petter mattei love time money visually stunnin...
...,...,...
49995,1,thought movie right good job wa creative origi...
49996,0,bad plot bad dialogue bad acting idiotic direc...
49997,0,catholic taught parochial elementary school nu...
49998,0,going disagree previous comment side maltin on...


# Split

In [3]:
X_train, X_test, y_train, y_test = train_test_split(df['lemmatized_review'], df['sentiment'], test_size=0.2, random_state=seed)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.15, random_state=seed)

# Load model Work2Vec

In [4]:
w2v_model = api.load("word2vec-google-news-300")

# Create Embedding Matrix

In [5]:
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)


X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)
X_test_seq = tokenizer.texts_to_sequences(X_test)


max_length = 100
X_train_seq = pad_sequences(X_train_seq, maxlen=max_length)
X_val_seq = pad_sequences(X_val_seq, maxlen=max_length)
X_test_seq = pad_sequences(X_test_seq, maxlen=max_length)


embedding_dim = 300
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))

for word, i in tokenizer.word_index.items():
    if word in w2v_model:
        embedding_matrix[i] = w2v_model[word]


embedding_layer = Embedding(input_dim=len(tokenizer.word_index) + 1,
                            output_dim=embedding_dim,
                            weights=[embedding_matrix],
                            input_length=max_length,
                            trainable=False)

# Model

In [6]:


model = Sequential()
model.add(embedding_layer)
model.add(Bidirectional(SimpleRNN(100, return_sequences=True))) 
model.add(Bidirectional(SimpleRNN(100)))
model.add(Dense(1, activation='sigmoid'))

model.summary()
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


model.fit(X_train_seq, y_train, epochs=10, batch_size=32, validation_data=(X_val_seq, y_val))


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 300)          23351400  
                                                                 
 bidirectional (Bidirectiona  (None, 100, 200)         80200     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 200)              60200     
 nal)                                                            
                                                                 
 dense (Dense)               (None, 1)                 201       
                                                                 
Total params: 23,492,001
Trainable params: 140,601
Non-trainable params: 23,351,400
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch

<keras.callbacks.History at 0x2a359a2b370>

# evaluate

In [9]:
predictions = model.predict(X_test_seq)
y_pred = (predictions > 0.5).astype(int)


precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Accuracy: {accuracy:.2f}")

print(classification_report(y_test, y_pred))


Precision: 0.69
Recall: 0.89
F1 Score: 0.77
Accuracy: 0.74
              precision    recall  f1-score   support

           0       0.84      0.59      0.69      4961
           1       0.69      0.89      0.77      5039

    accuracy                           0.74     10000
   macro avg       0.76      0.74      0.73     10000
weighted avg       0.76      0.74      0.73     10000

