# Installing The Required Libraris

In [1]:
!pip install kaggle



In [8]:
!pip install tensorflow



In [9]:
!pip install keras



# Importing The Required Dependencies

In [3]:
import os
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense,Embedding,LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Kaggle API Integration

In [4]:
kaggle_dict=json.load(open("/content/ kaggle.json"))

In [5]:
# set up kaggle credentials as environment variables
os.environ["KAGGLE_USERNAME"]=kaggle_dict["username"]
os.environ["KAGGLE_KEY"]=kaggle_dict["key"]

In [6]:
# download dataset by running API command
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
imdb-dataset-of-50k-movie-reviews.zip: Skipping, found more recently modified local copy (use --force to force download)


In [7]:
# unzipping
import zipfile
with zipfile.ZipFile('/content/imdb-dataset-of-50k-movie-reviews.zip','r') as zip:
  zip.extractall()

# Handling With Data

In [8]:
data=pd.read_csv("/content/IMDB Dataset.csv")

In [9]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [10]:
data.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [11]:
data.shape

(50000, 2)

In [12]:
# converting the output attribute to numerical format
data.replace({'sentiment':{'positive':1,'negative':0}},inplace=True)

In [13]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [14]:
# dealing with extra data in review column
import re
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

# Download the stopwords from NLTK
nltk.download('stopwords')

# Initialize the Porter Stemmer
porter_stemmer = PorterStemmer()

# Define a function to clean and stem the reviews
def clean_and_stem(review):
    # Remove non-alphabetic characters
    cleaned_review = re.sub('[^a-zA-Z]', ' ', review)

    # Convert to lowercase
    cleaned_review = cleaned_review.lower()

    # Split into words
    words = cleaned_review.split()

    # Remove stopwords and apply stemming
    filtered_words = []
    for word in words:
        if word not in stopwords.words('english'):
            stemmed_word = porter_stemmer.stem(word)
            filtered_words.append(stemmed_word)

    # Join the words back into a single string
    processed_review = ' '.join(filtered_words)
    return processed_review

# Assuming 'data' is your DataFrame and 'review' is the column to be processed
data['review'] = data['review'].apply(clean_and_stem)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
data.head()

Unnamed: 0,review,sentiment
0,one review mention watch oz episod hook right ...,1
1,wonder littl product br br film techniqu unass...,1
2,thought wonder way spend time hot summer weeke...,1
3,basic famili littl boy jake think zombi closet...,0
4,petter mattei love time money visual stun film...,1


In [15]:
# splitting the data into training and testing data

train_data,test_data=train_test_split(data,test_size=0.2,random_state=42)

In [16]:
print(train_data.shape)
print(test_data.shape)

(40000, 2)
(10000, 2)


In [18]:
# tokenize the data

tokenizer=Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data['review'])
x_train=pad_sequences(tokenizer.texts_to_sequences(train_data['review']),maxlen=200)
x_test=pad_sequences(tokenizer.texts_to_sequences(test_data['review']),maxlen=200)

In [19]:
y_train=train_data['sentiment']
y_test=test_data['sentiment']

# Building a LSTM model

In [20]:
model=Sequential()

model.add(Embedding(input_dim=5000,output_dim=128,input_length=200))
model.add(LSTM(units=128,dropout=0.2,recurrent_dropout=0.2))
model.add(Dense(1,activation='sigmoid'))



In [21]:
# compiling the model
model.compile(optimizer='Adam',loss='binary_crossentropy',metrics=['accuracy'])

In [22]:
# training the model
model.fit(x_train,y_train,batch_size=64,epochs=5,validation_split=0.2)

Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 257ms/step - accuracy: 0.7624 - loss: 0.4823 - val_accuracy: 0.8636 - val_loss: 0.3302
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 255ms/step - accuracy: 0.8787 - loss: 0.3065 - val_accuracy: 0.8604 - val_loss: 0.3437
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 257ms/step - accuracy: 0.8871 - loss: 0.2830 - val_accuracy: 0.8673 - val_loss: 0.3295
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 255ms/step - accuracy: 0.9097 - loss: 0.2330 - val_accuracy: 0.8649 - val_loss: 0.3268
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 258ms/step - accuracy: 0.9204 - loss: 0.2114 - val_accuracy: 0.8599 - val_loss: 0.3448


<keras.src.callbacks.history.History at 0x7befbfcc7e80>

In [23]:
# model evaluation
loss,accuracy=model.evaluate(x_test,y_test)
print("Loss:",loss)
print("Accuracy:",accuracy)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 87ms/step - accuracy: 0.8685 - loss: 0.3321
Loss: 0.3350304067134857
Accuracy: 0.8686000108718872


# Building a Predictive model

In [30]:
def predictive_sentiment(review):
  review=tokenizer.texts_to_sequences([review])
  review=pad_sequences(review,maxlen=200)
  prediction=model.predict(review)
  #prediction=np.argmax(prediction)
  if prediction[0][0]>0.5:
    print("Positive")
  else:
    print("Negative")

input_review=input("Enter the review:")
predictive_sentiment(input_review)

Enter the review:This product exceeded my expectations. Great value for the price!
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step
Positive
