In [1]:
import pandas as pd

In [2]:
# Load the review table from imdb_master.csv.
# encoding_errors="ignore" tells pandas to drop bytes it cannot decode
# instead of raising, so malformed characters are silently removed.
df = pd.read_csv(
    "imdb_master.csv",
    encoding_errors="ignore",
)
# Bare expression: renders the frame as this cell's output.
df

Unnamed: 0.1,Unnamed: 0,type,review,label,file
0,0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt
...,...,...,...,...,...
49995,49995,train,"Seeing as the vote average was pretty low, and...",pos,9998_9.txt
49996,49996,train,"The plot had some wretched, unbelievable twist...",pos,9999_8.txt
49997,49997,train,I am amazed at how this movie(and most others ...,pos,999_10.txt
49998,49998,train,A Christmas Together actually came before my t...,pos,99_8.txt


In [3]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Flatten, MaxPooling2D, Conv2D, Dropout, Input

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Pull the raw model inputs out of the frame as plain Python lists.
# Reviews are cast to str first so every entry is a string before tokenizing.
review_column = df['review'].astype(str)
texts = review_column.to_list()
labels = df['label'].to_list()

In [6]:
from sklearn.preprocessing import LabelEncoder
# Convert the label values into integer class ids.
# LabelEncoder assigns ids in sorted order of the distinct labels; the fitted
# encoder is kept so ids can be mapped back with label_encoder.inverse_transform.
# NOTE(review): `labels` is rebound from a list of raw labels to an integer array here.
label_encoder = LabelEncoder()
label_encoder.fit(labels)
labels = label_encoder.transform(labels)
# Bare expression: shows the encoded array as this cell's output.
labels

array([0, 0, 0, ..., 1, 1, 1], dtype=int64)

In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Settings shared by the tokenizer, the padding step and the model.
max_words = 10_000  # passed to Tokenizer as num_words and to Embedding as input_dim
maxlen = 300        # sequence length used when padding

# Learn the word index from the reviews, then encode each review as a list of
# integer word ids. num_words limits the encoded output to the most frequent words.
# NOTE(review): the vocabulary is fit on every review, including the ones that are
# later held out as the test split, so the test set influences preprocessing.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [8]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Bring every encoded review to exactly `maxlen` ids so they stack into one 2-D array.
# padding/truncating are spelled out at their default value ('pre'): short reviews
# are zero-padded at the front, long reviews keep only their last `maxlen` ids.
data = pad_sequences(
    sequences,
    maxlen=maxlen,
    padding='pre',
    truncating='pre',
)

In [9]:
# Hold out 20% of the padded reviews for the final evaluation.
# The fixed random_state makes the shuffle (and therefore the split) repeatable.
# NOTE(review): this is a fresh random split; the dataset's own `type`
# (train/test) column is not used.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Feed-forward sentiment classifier:
#   token ids -> 32-d embedding per token -> flatten -> Dense(128, relu)
#   -> Dropout(0.5) -> single sigmoid unit (score in [0, 1]).
# The input shape is declared with an explicit Input layer instead of
# Embedding's `input_length` argument, which Keras 3 deprecates and ignores.
# Declaring the shape up front also builds the model immediately, so
# model.summary() works before fit() is called.
model = Sequential([
    Input(shape=(maxlen,), dtype="int32"),  # one padded sequence of word ids
    Embedding(input_dim=max_words, output_dim=32),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])



In [11]:
# Configure training: Adam optimizer, binary cross-entropy loss to match the
# single sigmoid output, and accuracy reported alongside the loss.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [12]:
# Train for 5 epochs in mini-batches of 64, with 20% of the training data set
# aside by Keras for per-epoch validation. The returned History object keeps
# the per-epoch metrics.
# NOTE(review): in the recorded run, training accuracy reaches ~1.00 while
# val_loss rises after the first epoch, i.e. the model overfits; fewer epochs
# or early stopping may be worth trying.
history = model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.2,
)

Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 15ms/step - accuracy: 0.7057 - loss: 0.5189 - val_accuracy: 0.8537 - val_loss: 0.3374
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 16ms/step - accuracy: 0.9434 - loss: 0.1580 - val_accuracy: 0.8644 - val_loss: 0.3375
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 15ms/step - accuracy: 0.9902 - loss: 0.0368 - val_accuracy: 0.8585 - val_loss: 0.4776
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 17ms/step - accuracy: 0.9986 - loss: 0.0088 - val_accuracy: 0.8626 - val_loss: 0.5465
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 16ms/step - accuracy: 0.9996 - loss: 0.0025 - val_accuracy: 0.8618 - val_loss: 0.6245


In [13]:
# Score the trained model on the test split produced by train_test_split.
# evaluate() returns the loss followed by the compiled metrics (accuracy here).
test_metrics = model.evaluate(X_test, y_test)
loss, accuracy = test_metrics
print(f"Test Accuracy: {accuracy:.4f}")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8704 - loss: 0.5989
Test Accuracy: 0.8655


In [14]:
# Score one hand-written review with the trained model.
sample_review = "This movie was absolutely fantastic! The plot was thrilling and the acting was superb."

# Encode the text with the already-fitted tokenizer and pad it to `maxlen`,
# so it has the same format as the sequences the model was trained on.
sequence = tokenizer.texts_to_sequences([sample_review])
padded_sequence = pad_sequences(sequence, maxlen=maxlen)

# predict() returns one row per input with a single sigmoid output; take that scalar.
prediction = model.predict(padded_sequence)[0, 0]

# Report the class using 0.5 as the decision threshold.
is_positive = prediction >= 0.5
if not is_positive:
    print(f"Prediction: Negative ({prediction:.2f})")
else:
    print(f"Prediction: Positive ({prediction:.2f})")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
Prediction: Positive (1.00)
