# Sentiment Analysis on IMDb Movie Reviews 🎬

This notebook contains the full workflow for IMDb review sentiment analysis, including:

- Web scraping reviews and ratings
- Data preprocessing (cleaning, stopwords, stemming)
- Feature extraction (Bag-of-Words)
- Model training (Naive Bayes classifiers)
- Evaluation and predictions on new reviews


In [None]:
import requests

In [None]:
import csv
def get_reviews_and_ratings(url):
    """Scrape one IMDb reviews page and label each review by its star rating.

    Args:
        url: Address of an IMDb reviews page.

    Returns:
        A list of ``(text, rating, sentiment)`` tuples. ``rating`` is the
        numeric rating rendered as a string and ``sentiment`` is
        ``'Positive'`` when the rating is >= 6, otherwise ``'Negative'``.
        Reviews that carry no rating get ``'N/A'`` for both fields.
        Review containers without a text node are skipped.
    """
    # NOTE(review): BeautifulSoup is used below but no import for it is visible
    # in this notebook -- confirm `from bs4 import BeautifulSoup` runs earlier.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    # A timeout keeps one unresponsive page from hanging the whole scrape.
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    reviews_with_ratings = []
    for review in soup.find_all('div', class_='lister-item-content'):
        rating = review.find('span', class_='rating-other-user-rating')
        if rating:
            rating_value = float(rating.find('span').text.strip())
            if rating_value >= 6:
                sentiment = 'Positive'
            else:
                sentiment = 'Negative'
            rating = str(rating_value)
        else:
            rating = 'N/A'
            sentiment = 'N/A'
        text_node = review.find('div', class_='text show-more__control')
        if text_node is None:
            # No review body in this container; nothing useful to record.
            continue
        text = text_node.text.strip()
        reviews_with_ratings.append((text, rating, sentiment))
    return reviews_with_ratings

In [None]:
def save_reviews_to_csv(movie_urls, output_file):
    """Scrape every URL in ``movie_urls`` and write the reviews to one CSV file.

    Args:
        movie_urls: Iterable of IMDb reviews-page URLs.
        output_file: Path of the CSV file to create (overwritten if present).

    The file starts with the header ``Review, Rating, Sentiment`` followed by
    one row per review returned by ``get_reviews_and_ratings``.
    """
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Review', 'Rating', 'Sentiment'])
        for url in movie_urls:
            reviews_with_ratings = get_reviews_and_ratings(url)
            # Each item is already a (review, rating, sentiment) tuple.
            writer.writerows(reviews_with_ratings)
# Example list of movie URLs (replace with your own)
# Each title appears exactly once: scraping the same page twice would put
# identical reviews into the dataset (and potentially into both the train and
# test splits).
movie_urls = [
    'https://m.imdb.com/title/tt0111161/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0068646/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0071562/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0468569/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0050083/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0108052/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0167260/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0110912/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0120737/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0060196/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0109830/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0167261/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0137523/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt1375666/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0080684/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0133093/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0099685/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0073486/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt15239678/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0114369/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0816692/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0038650/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0047478/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0102926/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0120815/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0317248/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0118799/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0120689/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0103064/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0076759/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0088763/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0245429/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0253474/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt6751668/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0054215/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt9362722/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0172495/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0110357/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0110413/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0407887/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0120586/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt2582802/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0482571/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0095327/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0056058/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0114814/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0034583/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt1675434/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0095765/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0027977/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0047396/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0078748/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0021749/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt1853728/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0078788/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0209144/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt23849204/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0910970/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0082971/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0405094/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0043014/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0050825/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt4154756/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt4633694/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0051201/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0081505/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0057012/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0087843/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt1856101/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0435761/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt4154796/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt1345836/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0090605/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0169547/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt7286456/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0266543/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0118715/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt4154664/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0073195/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0361748/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0052618/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0087845/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0022100/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt1677720/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0041959/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0082096/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0457430/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0053198/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0086879/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0434409/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0105236/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0405159/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0208092/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0086190/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0047296/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0075686/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0025316/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0120735/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0167404/reviews/?ref_=tt_ql_2',
    'https://m.imdb.com/title/tt0057013/reviews/?ref_=tt_ql_2',
]

In [None]:
# Output CSV file
output_file = 'movie_reviews_with_sentiment.csv'
# Scrape every page in movie_urls and write the reviews to the CSV
save_reviews_to_csv(movie_urls, output_file)
print("Reviews downloaded and saved to", output_file)
# Drop the 'Rating' column: only the review text and its sentiment label are
# needed for training.
import pandas as pd
# Read back the file that was just written. Reusing `output_file` (instead of
# repeating the literal) keeps the write path and the read path in sync.
data = pd.read_csv(output_file)
# Drop by name rather than by position so a reordered header cannot silently
# remove the wrong column.
data = data.drop(columns=['Rating'])
# Save the reduced dataframe to a separate CSV file
data.to_csv("movie_reviews_without_rating.csv", index=False)
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
# Print the full path of every file found under the Kaggle input directory.
# os.walk yields nothing when the directory does not exist, so this is a
# no-op outside Kaggle.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
# Load the dataset
# NOTE(review): this is an absolute '/content/...' path, whereas the file is
# written earlier with the relative name "movie_reviews_without_rating.csv" --
# confirm the two refer to the same file in the environment this runs in.
file_path = '/content/movie_reviews_without_rating.csv'  # Change the path to the location where you've uploaded the file
df = pd.read_csv(file_path)
# Show the top 5 rows
# NOTE(review): a notebook cell only renders its last expression; if these
# statements share a cell with later code, this preview is not displayed.
df.head()
# Take 10 random rows as a quick sanity check (the result is not stored)
df.sample(10)
# Review of 1st row

In [None]:
# Take a reproducible random sample of at most 1000 rows. Capping at len(df)
# avoids the ValueError that sample() raises when asked for more rows than
# exist; the fixed random_state makes reruns pick the same rows.
df=df.sample(n=min(1000,len(df)),random_state=42)

In [None]:
#showing information

In [None]:
# Count missing values per column
df.isnull().sum()
# Drop every row that contains a missing value (mutates df in place)
df.dropna(inplace=True)
# Re-count missing values after the drop
df.isna().sum()
# Demo: strip HTML tags from a single review (the row at position 2) using a
# non-greedy "<...>" pattern
import re
clean=re.compile('<.*?>')
re.sub(clean,'',df.iloc[2].Review)
# Function
def clean_html(text):
    """Return ``text`` with every HTML tag removed.

    A tag is any ``<...>`` span, matched non-greedily so that adjacent tags
    are removed one at a time. The pattern must be ``<.*?>``: with ``<.?>``
    only tags holding at most one character (e.g. ``<b>``) would match, and
    common markup such as ``<br />`` or ``</b>`` would be left in the text.
    """
    return re.sub(r'<.*?>', '', text)
#apply clean_html
df['Review']=df['Review'].apply(clean_html)
#convert everything to lowercase
def convert_lower(text):
    """Return a lowercase copy of ``text``."""
    lowered = text.lower()
    return lowered
#apply convert_lower
df['Review']=df['Review'].apply(convert_lower)
# function to remove special character(commas, full stop, open bracket etc.)
def remove_special(text):
    """Replace every non-alphanumeric character in ``text`` with a space.

    Alphanumeric characters are kept unchanged; anything else (punctuation,
    brackets, whitespace, ...) becomes a single space, so the result has the
    same length as the input.
    """
    # join() builds the result in one pass instead of repeated concatenation.
    return ''.join(ch if ch.isalnum() else ' ' for ch in text)

In [None]:
df['Review']=df['Review'].apply(remove_special)
# remove stop words
import nltk

In [None]:
import nltk
# `stopwords` must be imported explicitly; `import nltk` alone does not bind it.
from nltk.corpus import stopwords
# Fetch the stop-word corpus (one call is enough)
nltk.download('stopwords')
# Preview the English stop-word list
stopwords.words('english')
#function to remove stop words
def remove_stopwords(text):
    """Split ``text`` on whitespace and drop English stop words.

    Args:
        text: A string of whitespace-separated tokens.

    Returns:
        A new list containing the tokens that are not in NLTK's English
        stop-word list, in their original order.
    """
    # Local import so the function works even if the notebook-level import of
    # `stopwords` has not been run.
    from nltk.corpus import stopwords
    # Build the lookup once per call as a set, instead of re-reading the
    # whole stop-word list for every token.
    stop_words = set(stopwords.words('english'))
    return [token for token in text.split() if token not in stop_words]

In [None]:
df['Review']=df['Review'].apply(remove_stopwords)

In [None]:
# Perform stemming (reduce different inflected forms of a word to a single common stem)

In [None]:
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()
def stem_words(text):
    """Return the Porter stem of every token in ``text``.

    Args:
        text: An iterable of word tokens (e.g. the list produced by
            ``remove_stopwords``).

    Returns:
        A new list with one stemmed token per input token, in order.
    """
    # A local list is used instead of a module-level accumulator: the global
    # name `y` is later rebound to the label array, which would break any
    # subsequent call that tried to append to it.
    return [ps.stem(token) for token in text]

In [None]:
stem_words(['I','loved','loving','it'])
df['Review']=df['Review'].apply(stem_words)

In [None]:
#join back
def join_back(list_input):
    """Concatenate the tokens of ``list_input`` into one space-separated string."""
    separator = " "
    return separator.join(list_input)
df['Review']=df['Review'].apply(join_back)
# taking review column
X=df.iloc[:,0:1].values

In [None]:
# Bag-of-Words features: vocabulary capped at 1000 terms.
# NOTE(review): no import of CountVectorizer is visible in this notebook --
# confirm it is imported (sklearn.feature_extraction.text) before this cell.
cv=CountVectorizer(max_features=1000)
# Fit on the cleaned reviews and convert the result to a dense array; this
# replaces the earlier `X` taken directly from the Review column.
X=cv.fit_transform(df['Review']).toarray()

In [None]:
# Labels: the values of the last column of df
# NOTE(review): presumably this is the 'Sentiment' column -- it may still
# contain 'N/A' for reviews scraped without a rating; verify before training.
y=df.iloc[:,-1].values

In [None]:
#X,y
#train set
#test set

In [None]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split
# (and therefore the reported accuracies) reproducible across reruns.
# NOTE(review): no import of train_test_split is visible in this notebook --
# confirm it is imported (sklearn.model_selection) before this cell.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [None]:
#object creation
# Gaussian Naive Bayes: create, fit on the training split, predict the test split
clf1=GaussianNB().fit(X_train,y_train)
y_pred1=clf1.predict(X_test)
# Multinomial Naive Bayes
clf2=MultinomialNB().fit(X_train,y_train)
y_pred2=clf2.predict(X_test)
# Bernoulli Naive Bayes
clf3=BernoulliNB().fit(X_train,y_train)
y_pred3=clf3.predict(X_test)

In [None]:
# Report test-set accuracy for each classifier, in the order they were trained
for label, predictions in (
    ("Gaussian NB acc=", y_pred1),
    ("Multinomial NB acc=", y_pred2),
    ("Bernoulli NB acc=", y_pred3),
):
    print(label, accuracy_score(y_test, predictions))
import nltk
nltk.download('punkt')
import re
import nltk

In [None]:
# Function to preprocess the text of reviews
def preprocess_text(text):
    """Clean one raw review and return it as a space-separated token string.

    Steps, in order: strip HTML tags, lowercase, replace every character that
    is not an ASCII letter or digit with a space, tokenize with NLTK, and
    drop English stop words.

    Args:
        text: The raw review text.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Local import so the function works even if the notebook-level import of
    # `stopwords` has not been run.
    from nltk.corpus import stopwords
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove non-alphanumeric characters and convert to lowercase
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text.lower())
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    # Remove stopwords (set membership is O(1) per token)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Join the tokens back into a string
    return ' '.join(filtered_tokens)

In [None]:
# Function to train the model
def train_model(X_train, y_train):
    """Fit a Bag-of-Words vectorizer and a Multinomial Naive Bayes classifier.

    Args:
        X_train: Iterable of training texts.
        y_train: Labels aligned with ``X_train``.

    Returns:
        A ``(clf, vectorizer)`` tuple: the fitted classifier and the fitted
        ``CountVectorizer``, which must be reused to transform new texts.
    """
    # NOTE(review): the training texts are vectorized as given, while
    # predict_sentiment runs preprocess_text on new reviews first -- consider
    # preprocessing the training texts the same way.
    vectorizer = CountVectorizer()
    X_train_counts = vectorizer.fit_transform(X_train)
    clf = MultinomialNB()
    clf.fit(X_train_counts, y_train)
    return clf, vectorizer

In [None]:
# Function to predict sentiment for new reviews
def predict_sentiment(new_reviews, clf, vectorizer):
    """Predict a sentiment label for each review in ``new_reviews``.

    Args:
        new_reviews: Iterable of raw review strings.
        clf: A fitted classifier exposing ``predict``.
        vectorizer: The fitted vectorizer that was used to train ``clf``.

    Returns:
        The output of ``clf.predict`` for the vectorized reviews, one
        prediction per input review.
    """
    # Preprocess the new reviews
    preprocessed_reviews = [preprocess_text(review) for review in new_reviews]
    # Vectorize with the training-time vectorizer so the feature columns match
    X_new_counts = vectorizer.transform(preprocessed_reviews)
    # Make predictions using the trained model
    return clf.predict(X_new_counts)

In [None]:
# Sample training data (replace with your own data)
# NOTE(review): the sample texts below are placeholders -- one per label in
# y_train -- and should be replaced with real data.
X_train = [
    "I loved this movie, it was fantastic!",
    "This was a terrible film and a complete waste of time.",
    "Great acting and a wonderful story.",
    # Add more training data here
]
y_train = [1, 0, 1]  # 1 for positive sentiment, 0 for negative sentiment
# Train the model
clf, vectorizer = train_model(X_train, y_train)
# Sample new reviews (replace with your own new reviews)
new_reviews = [
    "A wonderful and fantastic experience.",
    "Terrible story, I want my time back.",
]

In [None]:
# Classify the new reviews with the trained model
predicted_sentiments = predict_sentiment(new_reviews, clf, vectorizer)
# Show each review next to its predicted label, separated by a blank line
for review, sentiment in zip(new_reviews, predicted_sentiments):
    label = 'Positive' if sentiment == 1 else 'Negative'
    print(f"Review: {review}")
    print(f"Predicted Sentiment: {label}")
    print()