In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense


2025-12-31 23:03:38.731364: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-12-31 23:03:38.763753: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-31 23:03:40.104844: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-12-31 23:03:46.336747: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off,

In [3]:
data = pd.read_csv("/home/andriel/ML RP2/NLP/IMDB_Dataset.csv")
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
print(data.shape)
print(data['sentiment'].value_counts())


(50000, 2)
sentiment
positive    25000
negative    25000
Name: count, dtype: int64


## Data Preprocessing

In [5]:
# Convert labels to numbers
data['sentiment'] = data['sentiment'].map({'positive': 1, 'negative': 0})

In [6]:
# Convert text to lowercase
data['review'] = data['review'].str.lower()

## Train–Test Split

In [8]:
X = data['review']
y = data['sentiment']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## TF-IDF Vectorization

In [9]:
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [10]:
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [12]:
y_pred_nb = nb_model.predict(X_test_tfidf)

print("TF-IDF + Naive Bayes Accuracy:")
print(accuracy_score(y_test, y_pred_nb))

print(classification_report(y_test, y_pred_nb))

TF-IDF + Naive Bayes Accuracy:
0.8508
              precision    recall  f1-score   support

           0       0.85      0.85      0.85      4961
           1       0.85      0.85      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



In [13]:
sample_review = ["The movie was emotional and very well directed"]
sample_vec = tfidf.transform(sample_review)

prediction = nb_model.predict(sample_vec)
print("Sentiment:", "Positive" if prediction[0] == 1 else "Negative")


Sentiment: Positive


# LSTM (Deep Learning)

In [14]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(data['review'])

sequences = tokenizer.texts_to_sequences(data['review'])


In [15]:
X_pad = pad_sequences(sequences, maxlen=200)
y = data['sentiment']


In [16]:
X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(
    X_pad, y, test_size=0.2, random_state=42
)

In [17]:
lstm_model = Sequential()
lstm_model.add(Embedding(input_dim=5000, output_dim=128, input_length=200))
lstm_model.add(LSTM(64))
lstm_model.add(Dense(1, activation='sigmoid'))


E0000 00:00:1767203277.167974    5373 cuda_executor.cc:1309] INTERNAL: CUDA Runtime error: Failed call to cudaGetRuntimeVersion: Error loading CUDA libraries. GPU will not be used.: Error loading CUDA libraries. GPU will not be used.
W0000 00:00:1767203277.189133    5373 gpu_device.cc:2342] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [18]:
lstm_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)


In [19]:
lstm_model.fit(
    X_train_lstm, y_train_lstm,
    epochs=3,
    batch_size=64,
    validation_split=0.2
)


Epoch 1/3
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m170s[0m 334ms/step - accuracy: 0.8137 - loss: 0.4074 - val_accuracy: 0.8691 - val_loss: 0.3092
Epoch 2/3
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m176s[0m 352ms/step - accuracy: 0.8951 - loss: 0.2627 - val_accuracy: 0.8817 - val_loss: 0.2794
Epoch 3/3
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m178s[0m 355ms/step - accuracy: 0.9095 - loss: 0.2276 - val_accuracy: 0.8799 - val_loss: 0.3018


<keras.src.callbacks.history.History at 0x7aa4e5ed5250>

In [20]:
loss, accuracy = lstm_model.evaluate(X_test_lstm, y_test_lstm)
print("LSTM Accuracy:", accuracy)


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 101ms/step - accuracy: 0.8842 - loss: 0.2878
LSTM Accuracy: 0.8841999769210815


In [21]:
test_review = ["This film was boring and a complete waste of time"]

seq = tokenizer.texts_to_sequences(test_review)
pad_seq = pad_sequences(seq, maxlen=200)

pred = lstm_model.predict(pad_seq)
print("Sentiment:", "Positive" if pred[0][0] > 0.5 else "Negative")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 441ms/step
Sentiment: Negative
