In [1]:
!pip install kaggle

Collecting kaggle
  Downloading kaggle-1.6.17.tar.gz (82 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting certifi>=2023.7.22 (from kaggle)
  Using cached certifi-2024.8.30-py3-none-any.whl.metadata (2.2 kB)
Collecting requests (from kaggle)
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm (from kaggle)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting python-slugify (from kaggle)
  Downloading python_slugify-8.0.4-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting urllib3 (from kaggle)
  Using cached urllib3-2.2.3-py3-none-any.whl.metadata (6.5 kB)
Collecting bleach (from kaggle)
  Downloading bleach-6.2.0-py3-none-any.whl.metadata (30 kB)
Co


[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## Importing dependencies

In [None]:
import os
import json

from zipfile import ZipFile
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

ModuleNotFoundError: No module named 'pandas'

## Data collection - Kaggle API

In [None]:
# Load the Kaggle API credentials; the context manager closes the file handle
# as soon as the JSON has been parsed instead of leaving it open.
with open("kaggle.json") as credentials_file:
    kaggle_dictionary = json.load(credentials_file)

In [None]:
# Expose the Kaggle username from the credentials file as an environment variable.
os.environ["KAGGLE_USERNAME"] = kaggle_dictionary["username"]

In [None]:
# Expose the Kaggle API key from the credentials file as an environment variable.
os.environ["KAGGLE_KEY"] = kaggle_dictionary["key"]

In [None]:
# Download the IMDB 50k movie-reviews dataset archive with the Kaggle CLI.
!kaggle datasets download lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
imdb-dataset-of-50k-movie-reviews.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
!ls

'IMDB Dataset.csv'   imdb-dataset-of-50k-movie-reviews.zip   kaggle.json   sample_data


## Unzip the dataset file

In [None]:
# Extract the archive from the working directory, which is where the Kaggle CLI
# saved it; a relative path avoids depending on Colab's /content layout.
with ZipFile("imdb-dataset-of-50k-movie-reviews.zip") as zip_ref:
  zip_ref.extractall()

## Loading the dataset

In [None]:
# Read the extracted CSV from the working directory (where extractall() wrote it)
# instead of a Colab-only absolute path.
data = pd.read_csv("IMDB Dataset.csv")

In [None]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
data.shape

(50000, 2)

In [None]:
data['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


In [None]:
# Encode the labels as integers: positive -> 1, negative -> 0.
# An explicit mapping plus an explicit dtype avoids the warning that the in-place
# DataFrame.replace call triggers and does not rely on implicit downcasting.
# Values that are already encoded pass through unchanged, so re-running the cell is safe;
# any unexpected/missing label makes astype raise instead of slipping through silently.
sentiment_codes = {'positive': 1, 'negative': 0}
data['sentiment'] = (
    data['sentiment']
    .map(lambda label: sentiment_codes.get(label, label))
    .astype('int64')
)

  data.replace({'sentiment':{'positive':1,'negative':0}},inplace=True)


In [None]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [None]:
train_data.shape

(40000, 2)

In [None]:
train_data

Unnamed: 0,review,sentiment
39087,That's what I kept asking myself during the ma...,0
30893,I did not watch the entire movie. I could not ...,0
45278,A touching love story reminiscent of In the M...,1
16398,This latter-day Fulci schlocker is a totally a...,0
13653,"First of all, I firmly believe that Norwegian ...",0
...,...,...
11284,`Shadow Magic' recaptures the joy and amazemen...,1
44732,I found this movie to be quite enjoyable and f...,1
38158,Avoid this one! It is a terrible movie. So wha...,0
860,This production was quite a surprise for me. I...,1


In [None]:
test_data.shape

(10000, 2)

## Data Preprocessing

In [None]:
tokenizer=Tokenizer(num_words=5000)

In [None]:
tokenizer

<keras.src.legacy.preprocessing.text.Tokenizer at 0x78bfbd125300>

In [None]:
tokenizer.fit_on_texts(train_data['review'])

In [None]:
X_train=pad_sequences(tokenizer.texts_to_sequences(train_data['review']),maxlen=200)

In [None]:
X_test=pad_sequences(tokenizer.texts_to_sequences(test_data['review']),maxlen=200)

In [None]:
y_train=train_data['sentiment']

In [None]:
y_test=test_data['sentiment']

In [None]:
y_train

Unnamed: 0,sentiment
39087,0
30893,0
45278,1
16398,0
13653,0
...,...
11284,1
44732,1
38158,0
860,1


## Building the LSTM model

In [None]:
def debug_shape(x):
    """Print the ``shape`` attribute of ``x`` and return ``x`` unchanged.

    A pass-through helper for inspecting an intermediate value without
    altering it.
    """
    shape = x.shape
    print("Shape:", shape)
    return x

In [None]:
# build the model
model=Sequential()
# *********************************************************************************
# (batch_size,sequence_length) ----> (batch_size,sequence_length,embedding_dim)
# (64,200) ----> (64,200,128)
# each batch passes 64 padded reviews through the embedding layer at once
# NOTE: the `input_length` argument is deprecated in Keras 3 (it only triggers a warning),
# so it is no longer passed; the sequence length of 200 comes from the padded inputs
# and from the explicit model.build(input_shape=(None, 200)) call.
model.add(Embedding(input_dim=5000,output_dim=128))
# ********************************************************************************
# (batch_size,sequence_length,embedding_dim) ------->  (batch_size,units)
# (64,200,128) ----> (64,128)
# NOTE(review): per the Keras LSTM docs a non-zero recurrent_dropout rules out the
# cuDNN fast path, which may explain slow epochs on GPU - confirm before changing.
model.add(LSTM(128,dropout=0.2,recurrent_dropout=0.2))
# ---------------------------------------------------------------------------------
# (batch_size,units) ----> (batch_size,1)
# (64,128) -----> (64,1)
# a single sigmoid unit outputs the score used for the positive/negative decision
model.add(Dense(1,activation='sigmoid'))
# ---------------------------------------------------------------------------------



In [None]:
# LSTM(128): The LSTM layer has 128 hidden units; with the default return_sequences=False it outputs a single 128-dimensional vector (the final hidden state) for each input sequence.
# dropout=0.2: During training, 20% of the units are randomly dropped for the linear transformation of the inputs, which helps prevent overfitting.
# recurrent_dropout=0.2: During training, 20% of the units are randomly dropped for the linear transformation of the recurrent state (the links between time steps), which helps improve generalization.

In [None]:
model.build(input_shape=(None, 200)) # Batch size is None, sequence length is 200

In [None]:
model.summary()

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy as the reported metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

## Training the model

In [None]:
# Keep the returned History object so the per-epoch metrics remain available after
# training; assigning it also stops the cell from echoing the History repr.
history = model.fit(X_train, y_train, batch_size=64, validation_split=0.2, epochs=5)

Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 254ms/step - accuracy: 0.8457 - loss: 0.3721 - val_accuracy: 0.8349 - val_loss: 0.3854
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 249ms/step - accuracy: 0.8622 - loss: 0.3343 - val_accuracy: 0.8677 - val_loss: 0.3319
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 249ms/step - accuracy: 0.8882 - loss: 0.2742 - val_accuracy: 0.8664 - val_loss: 0.3336
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 260ms/step - accuracy: 0.9017 - loss: 0.2465 - val_accuracy: 0.7806 - val_loss: 0.4570
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 250ms/step - accuracy: 0.8898 - loss: 0.2712 - val_accuracy: 0.8371 - val_loss: 0.3692


<keras.src.callbacks.history.History at 0x78bf38ecd6c0>

In [None]:
loss,accuracy=model.evaluate(X_test,y_test)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 85ms/step - accuracy: 0.8416 - loss: 0.3656


In [None]:
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

Test Loss: 0.3638
Test Accuracy: 0.8419


## Building a predictive system

In [None]:
def predict_sentiment(review, threshold=0.5):
  """Classify a single review as 'positive' or 'negative'.

  Uses the notebook-level ``tokenizer`` and ``model``.

  Args:
    review: Review text to classify.
    threshold: Scores strictly greater than this value are labelled
      'positive'. Defaults to 0.5, the previously hard-coded cut-off.

  Returns:
    The string 'positive' or 'negative'.
  """
  # tokenize the review and pad it to the same length (200) used for the training sequences
  sequence = tokenizer.texts_to_sequences([review])
  padded_sequence = pad_sequences(sequence, maxlen=200)

  # make prediction: one padded row in, one row of scores out
  prediction = model.predict(padded_sequence)
  # the raw score is still printed so the cell output keeps showing it
  print(prediction)

  # interpret the prediction
  return 'positive' if prediction[0][0] > threshold else 'negative'

In [None]:
predict_sentiment("I loved this movie! It was amazing.")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 230ms/step
[[0.99207556]]


'positive'

In [None]:
predict_sentiment("This movie was not that good.")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 92ms/step
[[0.1829626]]


'negative'

In [None]:
predict_sentiment("The movie was bad")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 150ms/step
[[0.12639184]]


'negative'