In [1]:
import pandas as pd
import numpy as np
import nltk

In [2]:
imbd_data = pd.read_csv('/Practice code/DataSets/IMDB Dataset.csv')

In [4]:
imbd_data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
imbd_data.shape

(50000, 2)

In [6]:
imbd_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


# Text Preprocessing

## Removing Punctuations, Numbers, and Special Characters

In [7]:
import re

In [8]:
def data_cleaning(text):
    clean_text = re.sub('[^A-Za-z]+'," ",text)
    return clean_text

In [10]:
imbd_data['clean_review'] = imbd_data['review'].apply(lambda text : data_cleaning(text))

In [11]:
imbd_data['clean_review']

0        One of the other reviewers has mentioned that ...
1        A wonderful little production br br The filmin...
2        I thought this was a wonderful way to spend ti...
3        Basically there s a family where a little boy ...
4        Petter Mattei s Love in the Time of Money is a...
                               ...                        
49995    I thought this movie did a down right good job...
49996    Bad plot bad dialogue bad acting idiotic direc...
49997    I am a Catholic taught in parochial elementary...
49998    I m going to have to disagree with the previou...
49999    No one expects the Star Trek movies to be high...
Name: clean_review, Length: 50000, dtype: object

## converting clean_review to lower case

In [12]:
imbd_data['clean_review'] = imbd_data['clean_review'].apply(lambda text : text.lower())

In [13]:
imbd_data['clean_review']

0        one of the other reviewers has mentioned that ...
1        a wonderful little production br br the filmin...
2        i thought this was a wonderful way to spend ti...
3        basically there s a family where a little boy ...
4        petter mattei s love in the time of money is a...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot bad dialogue bad acting idiotic direc...
49997    i am a catholic taught in parochial elementary...
49998    i m going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: clean_review, Length: 50000, dtype: object

## Tokenization

In [14]:
imbd_data['review_token'] = imbd_data['clean_review'].apply(lambda text : text.split())

In [15]:
imbd_data['review_token']

0        [one, of, the, other, reviewers, has, mentione...
1        [a, wonderful, little, production, br, br, the...
2        [i, thought, this, was, a, wonderful, way, to,...
3        [basically, there, s, a, family, where, a, lit...
4        [petter, mattei, s, love, in, the, time, of, m...
                               ...                        
49995    [i, thought, this, movie, did, a, down, right,...
49996    [bad, plot, bad, dialogue, bad, acting, idioti...
49997    [i, am, a, catholic, taught, in, parochial, el...
49998    [i, m, going, to, have, to, disagree, with, th...
49999    [no, one, expects, the, star, trek, movies, to...
Name: review_token, Length: 50000, dtype: object

## Removal of Stop words

In [16]:
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adars\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
stop_words = stopwords.words('english')

imbd_data['review_token'] = imbd_data['review_token'].apply(lambda token_text:[word for word in token_text if word not in stop_words])

In [19]:
imbd_data['review_token']

0        [one, reviewers, mentioned, watching, oz, epis...
1        [wonderful, little, production, br, br, filmin...
2        [thought, wonderful, way, spend, time, hot, su...
3        [basically, family, little, boy, jake, thinks,...
4        [petter, mattei, love, time, money, visually, ...
                               ...                        
49995    [thought, movie, right, good, job, creative, o...
49996    [bad, plot, bad, dialogue, bad, acting, idioti...
49997    [catholic, taught, parochial, elementary, scho...
49998    [going, disagree, previous, comment, side, mal...
49999    [one, expects, star, trek, movies, high, art, ...
Name: review_token, Length: 50000, dtype: object

## Text Normalization

### a.     Stemming



In [20]:
from nltk.stem import PorterStemmer    
ps = PorterStemmer()

In [21]:
imbd_data['stem_text'] = imbd_data['review_token'].apply(lambda text: [ps.stem(word) for word in text]) # stemming
imbd_data

Unnamed: 0,review,sentiment,clean_review,review_token,stem_text
0,One of the other reviewers has mentioned that ...,positive,one of the other reviewers has mentioned that ...,"[one, reviewers, mentioned, watching, oz, epis...","[one, review, mention, watch, oz, episod, hook..."
1,A wonderful little production. <br /><br />The...,positive,a wonderful little production br br the filmin...,"[wonderful, little, production, br, br, filmin...","[wonder, littl, product, br, br, film, techniq..."
2,I thought this was a wonderful way to spend ti...,positive,i thought this was a wonderful way to spend ti...,"[thought, wonderful, way, spend, time, hot, su...","[thought, wonder, way, spend, time, hot, summe..."
3,Basically there's a family where a little boy ...,negative,basically there s a family where a little boy ...,"[basically, family, little, boy, jake, thinks,...","[basic, famili, littl, boy, jake, think, zombi..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei s love in the time of money is a...,"[petter, mattei, love, time, money, visually, ...","[petter, mattei, love, time, money, visual, st..."
...,...,...,...,...,...
49995,I thought this movie did a down right good job...,positive,i thought this movie did a down right good job...,"[thought, movie, right, good, job, creative, o...","[thought, movi, right, good, job, creativ, ori..."
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,bad plot bad dialogue bad acting idiotic direc...,"[bad, plot, bad, dialogue, bad, acting, idioti...","[bad, plot, bad, dialogu, bad, act, idiot, dir..."
49997,I am a Catholic taught in parochial elementary...,negative,i am a catholic taught in parochial elementary...,"[catholic, taught, parochial, elementary, scho...","[cathol, taught, parochi, elementari, school, ..."
49998,I'm going to have to disagree with the previou...,negative,i m going to have to disagree with the previou...,"[going, disagree, previous, comment, side, mal...","[go, disagre, previou, comment, side, maltin, ..."


### b.     Lemmatization

In [22]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

wl=WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\adars\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [23]:
imbd_data['lemma_text'] = imbd_data['review_token'].apply(lambda review_token : [wl.lemmatize(word,pos='v') for word in review_token])

imbd_data

Unnamed: 0,review,sentiment,clean_review,review_token,stem_text,lemma_text
0,One of the other reviewers has mentioned that ...,positive,one of the other reviewers has mentioned that ...,"[one, reviewers, mentioned, watching, oz, epis...","[one, review, mention, watch, oz, episod, hook...","[one, reviewers, mention, watch, oz, episode, ..."
1,A wonderful little production. <br /><br />The...,positive,a wonderful little production br br the filmin...,"[wonderful, little, production, br, br, filmin...","[wonder, littl, product, br, br, film, techniq...","[wonderful, little, production, br, br, film, ..."
2,I thought this was a wonderful way to spend ti...,positive,i thought this was a wonderful way to spend ti...,"[thought, wonderful, way, spend, time, hot, su...","[thought, wonder, way, spend, time, hot, summe...","[think, wonderful, way, spend, time, hot, summ..."
3,Basically there's a family where a little boy ...,negative,basically there s a family where a little boy ...,"[basically, family, little, boy, jake, thinks,...","[basic, famili, littl, boy, jake, think, zombi...","[basically, family, little, boy, jake, think, ..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei s love in the time of money is a...,"[petter, mattei, love, time, money, visually, ...","[petter, mattei, love, time, money, visual, st...","[petter, mattei, love, time, money, visually, ..."
...,...,...,...,...,...,...
49995,I thought this movie did a down right good job...,positive,i thought this movie did a down right good job...,"[thought, movie, right, good, job, creative, o...","[thought, movi, right, good, job, creativ, ori...","[think, movie, right, good, job, creative, ori..."
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,bad plot bad dialogue bad acting idiotic direc...,"[bad, plot, bad, dialogue, bad, acting, idioti...","[bad, plot, bad, dialogu, bad, act, idiot, dir...","[bad, plot, bad, dialogue, bad, act, idiotic, ..."
49997,I am a Catholic taught in parochial elementary...,negative,i am a catholic taught in parochial elementary...,"[catholic, taught, parochial, elementary, scho...","[cathol, taught, parochi, elementari, school, ...","[catholic, teach, parochial, elementary, schoo..."
49998,I'm going to have to disagree with the previou...,negative,i m going to have to disagree with the previou...,"[going, disagree, previous, comment, side, mal...","[go, disagre, previou, comment, side, maltin, ...","[go, disagree, previous, comment, side, maltin..."


In [24]:
imbd_data['stem_text'] = imbd_data['stem_text'].apply(lambda x: ' '.join(x))

In [26]:
imbd_data['lemma_text'] = imbd_data['lemma_text'].apply(lambda x: ' '.join(x))

In [27]:
imbd_data.head()

Unnamed: 0,review,sentiment,clean_review,review_token,stem_text,lemma_text
0,One of the other reviewers has mentioned that ...,positive,one of the other reviewers has mentioned that ...,"[one, reviewers, mentioned, watching, oz, epis...",one review mention watch oz episod hook right ...,one reviewers mention watch oz episode hook ri...
1,A wonderful little production. <br /><br />The...,positive,a wonderful little production br br the filmin...,"[wonderful, little, production, br, br, filmin...",wonder littl product br br film techniqu unass...,wonderful little production br br film techniq...
2,I thought this was a wonderful way to spend ti...,positive,i thought this was a wonderful way to spend ti...,"[thought, wonderful, way, spend, time, hot, su...",thought wonder way spend time hot summer weeke...,think wonderful way spend time hot summer week...
3,Basically there's a family where a little boy ...,negative,basically there s a family where a little boy ...,"[basically, family, little, boy, jake, thinks,...",basic famili littl boy jake think zombi closet...,basically family little boy jake think zombie ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei s love in the time of money is a...,"[petter, mattei, love, time, money, visually, ...",petter mattei love time money visual stun film...,petter mattei love time money visually stun fi...
