<a href="https://colab.research.google.com/github/avocadopelvis/imdb-movie-reviews-sentiment-analysis/blob/main/sentiment_analysis_imdb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q kaggle

In [2]:
!mkdir ~/.kaggle

In [3]:
!cp kaggle.json ~/.kaggle/

In [4]:
!chmod 600 ~/.kaggle/kaggle.json

In [5]:
!kaggle datasets download lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
  0% 0.00/25.7M [00:00<?, ?B/s] 35% 9.00M/25.7M [00:00<00:00, 75.4MB/s]
100% 25.7M/25.7M [00:00<00:00, 126MB/s] 


In [6]:
!unzip imdb-dataset-of-50k-movie-reviews.zip

Archive:  imdb-dataset-of-50k-movie-reviews.zip
  inflating: IMDB Dataset.csv        


In [7]:
#load the libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud, STOPWORDS
# from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from bs4 import BeautifulSoup
import spacy
import re, string, unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from textblob import TextBlob, Word
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

### Load the dataset

In [8]:
imdb_data = pd.read_csv('/content/IMDB Dataset.csv')
print(imdb_data.shape)
imdb_data.head(10)

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


### Exploratory Data Analysis

In [9]:
imdb_data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,negative
freq,5,25000


### Sentiment Count

In [10]:
imdb_data['sentiment'].value_counts()

negative    25000
positive    25000
Name: sentiment, dtype: int64

##### We observe that there are equal number of negative and postive sentiment. Thus, the dataset is balanced.

### Splitting the dataset

In [11]:
#train dataset
train_reviews = imdb_data.review[:40000]
train_sentiments = imdb_data.review[:40000]

#test dataset
test_reviews = imdb_data.review[40000:]
test_sentiments = imdb_data.sentiment[40000:]

print(train_reviews.shape, train_sentiments.shape)
print(test_reviews.shape, test_sentiments.shape)

(40000,) (40000,)
(10000,) (10000,)


### Text Normalization

In [12]:
#Text Tokenization
tokenizer = ToktokTokenizer()

#Setting English Stopwords
nltk.download('stopwords')
stopword_list = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Cleaning the dataset

In [13]:
#removing the html strips
def strip_html(text):
  soup = BeautifulSoup(text, 'html.parser')
  return soup.get_text()

#removing the square brackets
def remove_square(text):
  return re.sub('\[[^]]*\]', '', text)

#removing special characters
def remove_special(text, remove_digits=True):
  pattern = r'[^a-zA-z0-9\s]'
  return re.sub(pattern, '', text)

#removing the noisy text
def remove_noise(text):
  text = strip_html(text)
  text = remove_square(text)
  text = remove_special(text)
  return text

imdb_data['review'] = imdb_data['review'].apply(remove_noise)

### Text Stemming

In [14]:
def simple_stemmer(text):
  ps = nltk.porter.PorterStemmer()
  text = ' '.join([ps.stem(word) for word in text.split()])
  return text

imdb_data['review'] = imdb_data['review'].apply(simple_stemmer)

### Removing stopwords

In [15]:
#set stopwords to english
stop = set(stopwords.words('english'))
print(stop)

{'or', 'any', 'not', 'you', 'at', "should've", 'hasn', 'am', 've', "won't", 'above', 'hers', "that'll", 'weren', 'haven', 're', 'their', 'were', 'myself', 'them', 'out', 'as', 'needn', 'herself', 'a', 'again', 'shan', 'having', 'isn', 'your', 'most', 'where', 'he', 'off', 'all', 'did', "you're", "you've", 'yourself', 'can', 'its', "isn't", 'has', 'if', 'which', 'on', 'how', 'ourselves', 'through', 'about', 'mightn', "haven't", 'his', "she's", 'why', "needn't", 'same', 'theirs', 'for', "couldn't", 'will', 'during', 'themselves', "you'll", 'ma', 'hadn', 'me', 't', 'ain', 'the', 'while', "mightn't", 'was', 'm', 'between', 'those', 'under', 'below', 'should', 'other', 'yours', 'too', 'down', 'these', 'by', "wasn't", 'wasn', 's', 'what', 'him', 'is', 'aren', 'up', 'nor', "mustn't", 'are', 'shouldn', 'do', "weren't", 'once', 'didn', 'in', 'now', 'been', "don't", 'an', 'who', 'ours', 'own', 'i', "doesn't", "shan't", 'here', 'couldn', "wouldn't", 'until', 'to', 'this', "hasn't", 'll', 'it', 'h

In [16]:
#removing the stopwords
def remove_stopwords(text, is_lower_case = False):
  tokens = tokenizer.tokenize(text)
  tokens = [token.strip() for token in tokens]
  if is_lower_case:
    filtered_tokens = [token for token in tokens if token not in stopword_list]
  else:
    filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
  filtered_text = ' '.join(filtered_tokens)
  return filtered_text

imdb_data['review'] = imdb_data['review'].apply(remove_stopwords)

### Normalized Train Reviews

In [17]:
norm_train_reviews = imdb_data.review[:40000]
norm_train_reviews[0]

'one review ha mention watch 1 Oz episod youll hook right thi exactli happen meth first thing struck Oz wa brutal unflinch scene violenc set right word GO trust thi show faint heart timid thi show pull punch regard drug sex violenc hardcor classic use wordit call OZ nicknam given oswald maximum secur state penitentari focus mainli emerald citi experiment section prison cell glass front face inward privaci high agenda Em citi home manyaryan muslim gangsta latino christian italian irish moreso scuffl death stare dodgi deal shadi agreement never far awayi would say main appeal show due fact goe show wouldnt dare forget pretti pictur paint mainstream audienc forget charm forget romanceoz doesnt mess around first episod ever saw struck nasti wa surreal couldnt say wa readi watch develop tast Oz got accustom high level graphic violenc violenc injustic crook guard wholl sold nickel inmat wholl kill order get away well manner middl class inmat turn prison bitch due lack street skill prison exp

### As shown above, we can clearly observed that the stopwords such as have been removed while the text have been stemmed as well.

### Normalized Test Reviews

In [18]:
norm_test_reviews = imdb_data.review[40000:]
norm_test_reviews[40001]

'wa excit see sitcom would hope repres indian candian found thi show funni produc cast probabl happi get bad good feed back becaus far concern get talk wa readi stereotyp problem becaus stereotyp exist reason usual true realli wasnt anyth funni stereotyp charact fresh boat dad doesnt understand hi daughter radic feminist muslim daughter way terribl actress young modern indian man tri run hi mosqu polit correct pretti good actor onli see get betterit veri contriv dialog doesnt flow well wa much potenti someth like thi sadli think fail dont realli care watch anoth episodei howev enjoy watch great canadian actress sheila mccarthi alway treat natur everyth doe bad daughter show doesnt act abil'

### Bag of Words Model

In [19]:
#count vectorizer
cv = CountVectorizer(min_df = 0, max_df = 1, binary = False, ngram_range = (1, 3))

#transformed train reviews
cv_train_reviews = cv.fit_transform(norm_train_reviews)
#transformed test reviews
cv_test_reviews = cv.transform(norm_test_reviews)

print('bow_cv_train:', cv_train_reviews.shape)
print('bow_cv_test:', cv_test_reviews.shape)

bow_cv_train: (40000, 6209089)
bow_cv_test: (10000, 6209089)


### Term Frequency-Inverse Document Frequenct Model 

In [20]:
#tfidf vectorizer
tv = TfidfVectorizer(min_df = 0, max_df = 1, use_idf = True, ngram_range = (1, 3))

#transformed train reviews
tv_train_reviews = tv.fit_transform(norm_train_reviews)
#tranformed test reviews
tv_test_reviews = tv.transform(norm_test_reviews)

print('tfidf_train:', tv_train_reviews.shape)
print('tfidf_test:', tv_test_reviews.shape)

tfidf_train: (40000, 6209089)
tfidf_test: (10000, 6209089)


### Labelling the sentiment text

In [21]:
lb = LabelBinarizer()

#transformed sentiment data
sentiment_data = lb.fit_transform(imdb_data['sentiment'])
print(sentiment_data.shape)

(50000, 1)


### Splitting the sentiment data

In [22]:
train_sentiments = sentiment_data[:40000]
test_sentiments = sentiment_data[40000:]

In [23]:
print(train_sentiments)

[[1]
 [1]
 [1]
 ...
 [1]
 [0]
 [0]]


In [24]:
print(test_sentiments)

[[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [0]]


### Modelling the dataset

### Logistic Regression

In [26]:
#training the model
lr = LogisticRegression(penalty = 'l2', max_iter = 500, C = 1, random_state = 42)

#fitting the model for bag of words
lr_bow = lr.fit(cv_train_reviews, train_sentiments)
print(lr_bow)

#fitting the model for tfidf features
lr_tfidf = lr.fit(tv_train_reviews, train_sentiments)
print(lr_tfidf)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1, max_iter=500, random_state=42)


  y = column_or_1d(y, warn=True)


LogisticRegression(C=1, max_iter=500, random_state=42)


### Logistic Regression Model performance on test dataset

In [27]:
#predicting the model for bag of words
lr_bow_predict = lr.predict(cv_test_reviews)
print(lr_bow_predict)

#predicting the model for tfidf features
lr_tfidf_predict = lr.predict(tv_test_reviews)
print(lr_tfidf_predict)

[0 0 0 ... 0 1 1]
[0 0 0 ... 0 1 1]


### Accuracy of the model

In [28]:
#accuracy score for bag of words
lr_bow_score = accuracy_score(test_sentiments, lr_bow_predict)
print('lr_bow_score:', lr_bow_score)

#accuracy score for tfidf features
lr_tfidf_score = accuracy_score(test_sentiments, lr_tfidf_predict)
print('lr_tfidf_score:', lr_tfidf_score)

lr_bow_score: 0.7512
lr_tfidf_score: 0.75


### Classification Report

In [29]:
#classification report for bag of words
lr_bow_report = classification_report(test_sentiments, lr_bow_predict, target_names = ['Postive', 'Negative'])
print(lr_bow_report)

              precision    recall  f1-score   support

     Postive       0.75      0.75      0.75      4993
    Negative       0.75      0.75      0.75      5007

    accuracy                           0.75     10000
   macro avg       0.75      0.75      0.75     10000
weighted avg       0.75      0.75      0.75     10000



In [31]:
#classification report for tfidf features
lr_tfidf_report = classification_report(test_sentiments, lr_tfidf_predict, target_names= ['Postive', 'Negative'])
print(lr_tfidf_report)

              precision    recall  f1-score   support

     Postive       0.74      0.77      0.75      4993
    Negative       0.76      0.73      0.75      5007

    accuracy                           0.75     10000
   macro avg       0.75      0.75      0.75     10000
weighted avg       0.75      0.75      0.75     10000

