In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
import string
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
import spacy
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [5]:
df = pd.read_csv('/content/IMDB Dataset.csv')

In [6]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
df.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [9]:
df.columns

Index(['review', 'sentiment'], dtype='object')

# **Pre-Processing**

In [10]:
df['review'].head()

0    One of the other reviewers has mentioned that ...
1    A wonderful little production. <br /><br />The...
2    I thought this was a wonderful way to spend ti...
3    Basically there's a family where a little boy ...
4    Petter Mattei's "Love in the Time of Money" is...
Name: review, dtype: object

### **Lowercasing**

In [11]:
df['review'] = df['review'].str.lower()

In [12]:
df['review']

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. <br /><br />the...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

### **Remove HTML Tags**

In [13]:
def remove_html_tags(text):
  pattern = re.compile('<.*?>')
  return pattern.sub(r'',text)

In [14]:
df['review'] = df['review'].apply(remove_html_tags)

In [15]:
df['review']

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. the filming tec...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

### **Remove Punctuation**

In [16]:
remove_punc = string.punctuation
def remove_punctuation(text):
  return text.translate(str.maketrans('','',remove_punc))

In [17]:
df['review'] = df['review'].apply(remove_punctuation)

In [18]:
df['review']

0        one of the other reviewers has mentioned that ...
1        a wonderful little production the filming tech...
2        i thought this was a wonderful way to spend ti...
3        basically theres a family where a little boy j...
4        petter matteis love in the time of money is a ...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot bad dialogue bad acting idiotic direc...
49997    i am a catholic taught in parochial elementary...
49998    im going to have to disagree with the previous...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

### **Removing Stop-Words**

In [19]:
def remove_stop_words(text):
    stop_words = set(stopwords.words('english'))
    word_tokens = text.split()
    filtered_sentence = [word for word in word_tokens if word.lower() not in stop_words and word not in string.punctuation]
    return ' '.join(filtered_sentence)

In [20]:
df['review'] = df['review'].apply(remove_stop_words)

In [21]:
df['review']

0        one reviewers mentioned watching 1 oz episode ...
1        wonderful little production filming technique ...
2        thought wonderful way spend time hot summer we...
3        basically theres family little boy jake thinks...
4        petter matteis love time money visually stunni...
                               ...                        
49995    thought movie right good job wasnt creative or...
49996    bad plot bad dialogue bad acting idiotic direc...
49997    catholic taught parochial elementary schools n...
49998    im going disagree previous comment side maltin...
49999    one expects star trek movies high art fans exp...
Name: review, Length: 50000, dtype: object

### **Tokenization**

In [22]:
df['review'] = df['review'].apply(word_tokenize)

In [23]:
df['review']

0        [one, reviewers, mentioned, watching, 1, oz, e...
1        [wonderful, little, production, filming, techn...
2        [thought, wonderful, way, spend, time, hot, su...
3        [basically, theres, family, little, boy, jake,...
4        [petter, matteis, love, time, money, visually,...
                               ...                        
49995    [thought, movie, right, good, job, wasnt, crea...
49996    [bad, plot, bad, dialogue, bad, acting, idioti...
49997    [catholic, taught, parochial, elementary, scho...
49998    [im, going, disagree, previous, comment, side,...
49999    [one, expects, star, trek, movies, high, art, ...
Name: review, Length: 50000, dtype: object

### **Lemmatization**

#### Using NLTK

In [24]:
lemmatizer = WordNetLemmatizer()

In [25]:
def lemmatize_text_nltk(words):
    return [lemmatizer.lemmatize(word) for word in words]

In [26]:
df['review'].apply(lemmatize_text_nltk)

0        [one, reviewer, mentioned, watching, 1, oz, ep...
1        [wonderful, little, production, filming, techn...
2        [thought, wonderful, way, spend, time, hot, su...
3        [basically, there, family, little, boy, jake, ...
4        [petter, matteis, love, time, money, visually,...
                               ...                        
49995    [thought, movie, right, good, job, wasnt, crea...
49996    [bad, plot, bad, dialogue, bad, acting, idioti...
49997    [catholic, taught, parochial, elementary, scho...
49998    [im, going, disagree, previous, comment, side,...
49999    [one, expects, star, trek, movie, high, art, f...
Name: review, Length: 50000, dtype: object

### Using Spacy

In [27]:
nlp = spacy.load("en_core_web_sm")

In [28]:
def lemmatize_text_spacy(words):
    doc = nlp(' '.join(words))
    return [token.lemma_ for token in doc]

In [29]:
df['review'] = df['review'].apply(lemmatize_text_spacy)

In [30]:
df['review']

0        [one, reviewer, mention, watch, 1, oz, episode...
1        [wonderful, little, production, filming, techn...
2        [think, wonderful, way, spend, time, hot, summ...
3        [basically, there, s, family, little, boy, jak...
4        [petter, matteis, love, time, money, visually,...
                               ...                        
49995    [think, movie, right, good, job, be, not, crea...
49996    [bad, plot, bad, dialogue, bad, act, idiotic, ...
49997    [catholic, teach, parochial, elementary, schoo...
49998    [I, m, go, disagree, previous, comment, side, ...
49999    [one, expect, star, trek, movie, high, art, fa...
Name: review, Length: 50000, dtype: object

# **Text Representation**

### Total number of words in reviews and total number of unique words(vocabulary)

In [31]:
all_reviews = ' '.join(df['review'].astype(str).tolist())
words = re.findall(r'\b\w+\b', all_reviews.lower())

In [32]:
total_words = len(words)
unique_words = len(set(words))

In [33]:
print(f'Total number of words in the corpus: {total_words}')
print(f'Total number of unique words (vocabulary): {unique_words}')

Total number of words in the corpus: 6121468
Total number of unique words (vocabulary): 205726
