<a href="https://colab.research.google.com/github/avocadopelvis/imdb-movie-reviews-sentiment-analysis/blob/main/sentiment_analysis_imdb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the Kaggle CLI; %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q kaggle

In [2]:
# -p makes the cell safe to re-run: no error if ~/.kaggle already exists.
!mkdir -p ~/.kaggle

In [3]:
# Copy kaggle.json from the current working directory into ~/.kaggle/.
# No visible cell creates this file -- presumably it is the Kaggle API token and must be uploaded to the session first.
!cp kaggle.json ~/.kaggle/

In [4]:
# Restrict kaggle.json to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [5]:
# Download the dataset archive (imdb-dataset-of-50k-movie-reviews.zip) with the Kaggle CLI.
!kaggle datasets download lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
 97% 25.0M/25.7M [00:00<00:00, 75.2MB/s]
100% 25.7M/25.7M [00:00<00:00, 102MB/s] 


In [6]:
# -o overwrites existing files without prompting, so re-running the cell does not
# block on unzip's interactive "replace?" question.
!unzip -o imdb-dataset-of-50k-movie-reviews.zip

Archive:  imdb-dataset-of-50k-movie-reviews.zip
  inflating: IMDB Dataset.csv        


In [7]:
#load the libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud, STOPWORDS
# from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from bs4 import BeautifulSoup
import spacy
import re, string, unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from textblob import TextBlob, Word
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

### Load the dataset

In [8]:
# Read the unzipped CSV; '/content' is the directory the archive was downloaded to above.
imdb_data = pd.read_csv('/content/IMDB Dataset.csv')
# Print (rows, columns), then show the first 10 rows as the cell's output.
print(imdb_data.shape)
imdb_data.head(10)

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


### Exploratory Data Analysis

In [9]:
# Per-column summary of the frame (count / unique / top / freq in the output).
imdb_data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,negative
freq,5,25000


### Sentiment Count

In [10]:
# Count how many reviews carry each sentiment label.
imdb_data['sentiment'].value_counts()

negative    25000
positive    25000
Name: sentiment, dtype: int64

##### We observe that there are an equal number of negative and positive sentiments. Thus, the dataset is balanced.

### Splitting the dataset

In [11]:
# Positional hold-out split: the first 40,000 rows train, the remaining rows test.
# NOTE(review): this cell runs before the cleaning cells reassign
# imdb_data['review'], so these Series presumably still hold the raw text --
# re-split after normalization if the cleaned reviews are what should be modelled.

#train dataset
train_reviews = imdb_data.review[:40000]
# Labels must come from the 'sentiment' column (previously sliced from 'review',
# which made the training labels a copy of the review text).
train_sentiments = imdb_data.sentiment[:40000]

#test dataset
test_reviews = imdb_data.review[40000:]
test_sentiments = imdb_data.sentiment[40000:]

# Sanity check: reviews and labels must have matching lengths in each split.
print(train_reviews.shape, train_sentiments.shape)
print(test_reviews.shape, test_sentiments.shape)

(40000,) (40000,)
(10000,) (10000,)


### Text Normalization

In [12]:
#Text Tokenization
# Shared tokenizer instance; remove_stopwords() calls tokenizer.tokenize() on each review.
tokenizer = ToktokTokenizer()

#Setting English Stopwords
# Fetch the NLTK stopwords corpus, then load the English list.
nltk.download('stopwords')
# stopword_list is the collection remove_stopwords() filters tokens against.
stopword_list = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Cleaning the dataset

In [13]:
#stripping the html tags
def strip_html(text):
  """Return the text content of `text` after parsing it as HTML.

  The string is parsed with BeautifulSoup's 'html.parser' backend and only
  the extracted text (markup dropped) is returned.
  """
  return BeautifulSoup(text, 'html.parser').get_text()

#removing text enclosed in square brackets (brackets included)
def remove_square(text):
  """Delete every square-bracketed span from `text`, brackets included.

  Args:
    text: Input string.

  Returns:
    `text` with each '[...]' span (up to the first closing bracket) removed.
    Text without brackets is returned unchanged.
  """
  # Raw string: in a plain literal '\[' and '\]' are invalid escape sequences
  # that newer Python versions warn about. The compiled pattern is unchanged.
  bracketed_span = r'\[[^]]*\]'
  return re.sub(bracketed_span, '', text)

#removing special characters
def remove_special(text, remove_digits=True):
  """Strip every character that is not an ASCII letter, a digit or whitespace.

  Args:
    text: Input string.
    remove_digits: Accepted for backward compatibility but ignored; digits
      are always kept, exactly as before.

  Returns:
    `text` with all other characters deleted.
  """
  # The upper-case range must be A-Z: the previous 'A-z' also covered the ASCII
  # characters between 'Z' and 'a' ([ \ ] ^ _ `), so those were never removed.
  pattern = r'[^a-zA-Z0-9\s]'
  return re.sub(pattern, '', text)

#removing the noisy text
def remove_noise(text):
  """Run the full cleaning chain on one review and return the result.

  The steps are applied in this order: strip_html, remove_square,
  remove_special (called with its default arguments).
  """
  for clean_step in (strip_html, remove_square, remove_special):
    text = clean_step(text)
  return text

# Overwrites the 'review' column in place with the cleaned text, so the raw
# reviews are no longer available from imdb_data after this cell runs.
imdb_data['review'] = imdb_data['review'].apply(remove_noise)

### Text Stemming

In [14]:
def simple_stemmer(text):
  """Stem each whitespace-separated word of `text` with NLTK's Porter stemmer.

  The stemmed words are joined back together with single spaces, so any
  original run of whitespace collapses to one space.
  """
  stemmer = nltk.porter.PorterStemmer()
  stemmed_words = map(stemmer.stem, text.split())
  return ' '.join(stemmed_words)

# Replaces the 'review' column with its stemmed version (in-place overwrite).
imdb_data['review'] = imdb_data['review'].apply(simple_stemmer)

### Removing stopwords

In [15]:
#set stopwords to english
# NOTE(review): remove_stopwords() filters against `stopword_list`, not `stop`;
# in the cells visible here `stop` is only printed.
stop = set(stopwords.words('english'))
print(stop)

{'hasn', 'off', 'wasn', "shan't", 'their', 'itself', 'do', 'only', "you've", 'so', 'the', 'below', 's', 'were', "you'd", "mightn't", 'against', 'during', 'under', "aren't", 'further', "mustn't", 'weren', 'can', 'was', 'wouldn', 'm', 'than', 'y', 'hadn', 'who', 'doesn', "weren't", 'at', 'now', 'again', 'own', 'couldn', 'for', 'what', 'having', 'himself', 'aren', 'whom', "it's", 'll', 'does', 'did', "needn't", 'will', 'up', 'once', 'mustn', 'are', 'as', 'have', 'yourselves', 'any', "haven't", 'won', 'because', 'our', 'before', "shouldn't", 'ain', 'mightn', 'an', 'he', 'should', 're', 've', "won't", 'such', 'isn', 'if', 'very', 'needn', 'myself', 'its', 'themselves', "isn't", 'and', 'had', 'ours', 'a', 'she', 'into', "hadn't", 'same', 'yours', "wasn't", 'too', 'd', 'hers', 'over', 'it', 'me', 'from', 'some', "hasn't", 'don', 'these', "didn't", 'doing', 'in', 'until', "you'll", 'theirs', 'shouldn', 'herself', 'why', 'this', 'that', "that'll", 'haven', 'they', 'there', 'her', 'each', 'about

In [24]:
#removing the stopwords
def remove_stopwords(text, is_lower_case = False):
  """Drop stopword tokens from `text` and return the rest joined by spaces.

  Args:
    text: Input string; it is split with the module-level `tokenizer`.
    is_lower_case: If True, tokens are compared to the stopwords as-is.
      If False (default), each token is lower-cased for the comparison
      only; the kept tokens retain their original casing.

  Returns:
    The surviving tokens joined with single spaces.
  """
  # Build a set once per call: each membership test is then a hash lookup
  # instead of a scan over the whole stopword list for every token.
  stopword_set = set(stopword_list)
  tokens = [token.strip() for token in tokenizer.tokenize(text)]
  if is_lower_case:
    kept_tokens = [token for token in tokens if token not in stopword_set]
  else:
    kept_tokens = [token for token in tokens if token.lower() not in stopword_set]
  return ' '.join(kept_tokens)

# Replaces the 'review' column with stopword-filtered text; is_lower_case keeps
# its default (False), so tokens are lower-cased before the stopword lookup.
imdb_data['review'] = imdb_data['review'].apply(remove_stopwords)