In [1]:
import numpy as np
import pandas as pd

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


In [2]:
df = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
df.shape

(50000, 2)

In [4]:
df['review'][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

### Process in NLP project

1. data collecting

2. data cleaning
    - lowercasing
    - removing leading and trailing spaces
    - removing html tags
    - removing urls
    - expanding abbreviations
    - spelling correction
    - punctuations
    - remove special characters(@, #, %, etc...)


3. data preprocessing
    - tokenization
        - It is the process of breaking a sentence into tokens which are easier to deal with.
    - stop word removal
        - stop words are words which do not have any semantic value and are just there to help form the sentence. Examples are I, have, and, or, could, should, etc...
    - Stemming
        - It is the process of converting a word into it's base form. For example: dancing, danced, danced are all converted to dance.
        


4. EDA(Exploratory Data Analysis)

5. Make Features

6. Vectorization := Machine Learning Models work on numbers. So we need to convert our textual data into numbers. For this we have 3 methods:
    - Bag Of Words (BOW)
    - TFIDF
    - Word2Vec
    
    
7. Modelling
8. Evaluation
9. Deploying
10. Monitoring

# Data Cleaning

## `Drop duplicate rows`

In [5]:
df.duplicated().sum()

418

In [6]:
df = df.drop_duplicates()

In [7]:
df.duplicated().sum()

0

## `lowercasing`

In [8]:
df['review'] = df['review'].str.lower()
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


## `removing leading and trailing spaces`

In [9]:
df['review'] = df['review'].str.strip(' ')
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


## `removing html tags`

In [10]:
import re # regular expression
def remove_html(text):
    text = re.sub(r'<.*?>', '', text)
    return text

In [11]:
remove_html('<h1>Heading</h1><br/>My name is Abhishek Jha')

'HeadingMy name is Abhishek Jha'

In [12]:
df['review'] = df['review'].apply(remove_html)

In [13]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


#### `This is one way of removing html elements but this is not optimal as we have passed a function which is not vectorized.`
#### `Thus we may use the vectorized pandas replace() method.`

In [14]:
df['review'].str.replace(r'<.*?>', '', regex = True)
# It is mandatory that we pass regex = True as in previous versions of pandas it was set to True by default.

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. the filming tec...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 49582, dtype: object

In [15]:
df['review'] = df['review'].str.replace(r'<.*?>', '', regex = True)

In [16]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


## `removing urls`

In [17]:
def remove_url(text):
    text=re.sub(r"https?://\S+|www\.\S+",'',text)
    return text

In [18]:
remove_url('My name is Abhishek Jha and My website is www.google.com.')

'My name is Abhishek Jha and My website is '

In [19]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


#### `Find all the reviews which contain a url`

In [20]:
mask = df['review'].str.contains(r"https?://\S+|www\.\S+", regex = True)
reviews_with_url = df[mask]
reviews_with_url

Unnamed: 0,review,sentiment
742,mario lewis of the competitive enterprise inst...,negative
907,following directly from where the story left o...,positive
1088,this quasi j-horror film followed a young woma...,negative
1137,i really think i should make my case and have ...,positive
1141,this show has to be my favorite out of all the...,positive
...,...,...
48887,trite and unoriginal. it's like someone watche...,negative
49063,"trick or treat, quickie review this zany romp ...",positive
49596,"this is absolutely the best 80s cartoon ever, ...",positive
49637,if you liked the richard chamberlain version o...,positive


In [21]:
reviews_with_url['review'].iloc[2] # at position 1088 in original dataframe

"this quasi j-horror film followed a young woman as she returns to her childhood village on the island of shikoku to sell the family house and meet up with old friends. she finds that one, the daughter of the village priestess, drowned several years earlier. she and fumiko (another childhood friend) then learn that sayori's mother is trying to bring her back to life with black magic. already the bonds between the dead and living are getting weak and the friends and villagers are seeing ghosts. nothing was exceptional or even very good about this movie. unlike stellar j-horror films, the suspense doesn't really build, the result doesn't seem overly threatening and the ending borders on the absurd.this movie is like plain white rice cooked a little too long so that it is bordering on mushy. sometimes you get this at poor asian restaurants or cook your own white rice a little too long. you end up eating it, because you need it with the meal, because what is chinese or japanese food withou

#### `We need to remove url from all these reviews`

In [22]:
df['review'] = df['review'].apply(remove_url)
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


#### `Now let us check if we still have urls in reviews or not`

In [23]:
mask = df['review'].str.contains(r"https?://\S+|www\.\S+", regex = True)
reviews_with_url = df[mask]
reviews_with_url

Unnamed: 0,review,sentiment


## `Approach 2 of removing urls`

In [24]:
df['review'] = df['review'].str.replace(r"https?://\S+|www\.\S+", '', regex = True)
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


## `Expanding Abbreviations`

In [25]:
def expand_abb(data):
    data = re.sub(r"he's", "he is", data)
    data = re.sub(r"there's", "there is", data)
    data = re.sub(r"We're", "We are", data)
    data = re.sub(r"That's", "That is", data)
    data = re.sub(r"won't", "will not", data)
    data = re.sub(r"they're", "they are", data)
    data = re.sub(r"Can't", "Cannot", data)
    data = re.sub(r"wasn't", "was not", data)
    data = re.sub(r"don\x89Ûªt", "do not", data)
    data= re.sub(r"aren't", "are not", data)
    data = re.sub(r"isn't", "is not", data)
    data = re.sub(r"What's", "What is", data)
    data = re.sub(r"haven't", "have not", data)
    data = re.sub(r"hasn't", "has not", data)
    data = re.sub(r"There's", "There is", data)
    data = re.sub(r"He's", "He is", data)
    data = re.sub(r"It's", "It is", data)
    data = re.sub(r"You're", "You are", data)
    data = re.sub(r"I'M", "I am", data)
    data = re.sub(r"shouldn't", "should not", data)
    data = re.sub(r"wouldn't", "would not", data)
    data = re.sub(r"i'm", "I am", data)
    data = re.sub(r"I\x89Ûªm", "I am", data)
    data = re.sub(r"I'm", "I am", data)
    data = re.sub(r"Isn't", "is not", data)
    data = re.sub(r"Here's", "Here is", data)
    data = re.sub(r"you've", "you have", data)
    data = re.sub(r"you\x89Ûªve", "you have", data)
    data = re.sub(r"we're", "we are", data)
    data = re.sub(r"what's", "what is", data)
    data = re.sub(r"couldn't", "could not", data)
    data = re.sub(r"we've", "we have", data)
    data = re.sub(r"it\x89Ûªs", "it is", data)
    data = re.sub(r"doesn\x89Ûªt", "does not", data)
    data = re.sub(r"It\x89Ûªs", "It is", data)
    data = re.sub(r"Here\x89Ûªs", "Here is", data)
    data = re.sub(r"who's", "who is", data)
    data = re.sub(r"I\x89Ûªve", "I have", data)
    data = re.sub(r"y'all", "you all", data)
    data = re.sub(r"can\x89Ûªt", "cannot", data)
    data = re.sub(r"would've", "would have", data)
    data = re.sub(r"it'll", "it will", data)
    data = re.sub(r"we'll", "we will", data)
    data = re.sub(r"wouldn\x89Ûªt", "would not", data)
    data = re.sub(r"We've", "We have", data)
    data = re.sub(r"he'll", "he will", data)
    data = re.sub(r"Y'all", "You all", data)
    data = re.sub(r"Weren't", "Were not", data)
    data = re.sub(r"Didn't", "Did not", data)
    data = re.sub(r"they'll", "they will", data)
    data = re.sub(r"they'd", "they would", data)
    data = re.sub(r"DON'T", "DO NOT", data)
    data = re.sub(r"That\x89Ûªs", "That is", data)
    data = re.sub(r"they've", "they have", data)
    data = re.sub(r"i'd", "I would", data)
    data = re.sub(r"should've", "should have", data)
    data = re.sub(r"You\x89Ûªre", "You are", data)
    data = re.sub(r"where's", "where is", data)
    data = re.sub(r"Don\x89Ûªt", "Do not", data)
    data = re.sub(r"we'd", "we would", data)
    data = re.sub(r"i'll", "I will", data)
    data = re.sub(r"weren't", "were not", data)
    data = re.sub(r"They're", "They are", data)
    data = re.sub(r"Can\x89Ûªt", "Cannot", data)
    data = re.sub(r"you\x89Ûªll", "you will", data)
    data = re.sub(r"I\x89Ûªd", "I would", data)
    data = re.sub(r"let's", "let us", data)
    data = re.sub(r"it's", "it is", data)
    data = re.sub(r"can't", "cannot", data)
    data = re.sub(r"don't", "do not", data)
    data = re.sub(r"you're", "you are", data)
    data = re.sub(r"i've", "I have", data)
    data = re.sub(r"that's", "that is", data)
    data = re.sub(r"i'll", "I will", data)
    data = re.sub(r"doesn't", "does not",data)
    data = re.sub(r"i'd", "I would", data)
    data = re.sub(r"didn't", "did not", data)
    data = re.sub(r"ain't", "am not", data)
    data = re.sub(r"you'll", "you will", data)
    data = re.sub(r"I've", "I have", data)
    data = re.sub(r"Don't", "do not", data)
    data = re.sub(r"I'll", "I will", data)
    data = re.sub(r"I'd", "I would", data)
    data = re.sub(r"Let's", "Let us", data)
    data = re.sub(r"you'd", "You would", data)
    data = re.sub(r"It's", "It is", data)
    data = re.sub(r"Ain't", "am not", data)
    data = re.sub(r"Haven't", "Have not", data)
    data = re.sub(r"Could've", "Could have", data)
    data = re.sub(r"youve", "you have", data)  
    data = re.sub(r"donå«t", "do not", data)
    
    return data

In [26]:
expand_abb("I'll join on Monday. y'all need not worry.")

'I will join on Monday. you all need not worry.'

In [27]:
df['review'] = df['review'].apply(expand_abb)
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there is a family where a little boy...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


## `correcting spelling mistakes`

In [28]:
from textblob import TextBlob

In [29]:
text = 'I lik to driv at nigt. It is veri gud'
TextBlob(text).correct().string

'I like to drive at night. It is very god'

In [30]:
def spelling_corrector(text):
    return TextBlob(text).correct().string

In [31]:
try:
    df['review'] = df['review'].apply(spelling_corrector)
except KeyboardInterrupt as ex:
    print('The process was interrupted by key interrupt in between as the process is taking very long.')

The process was interrupted by key interrupt in between as the process is taking very long.


In [32]:
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there is a family where a little boy...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
...,...,...
49995,i thought this movie did a down right good job...,positive
49996,"bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,i am a catholic taught in parochial elementary...,negative
49998,I am going to have to disagree with the previo...,negative


## `removing punctuation marks`

In [33]:
import string

In [34]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [35]:
def remove_punctuation(text):
    for punctuation_mark in string.punctuation:
        if punctuation_mark in text:
            text = text.replace(punctuation_mark, '')
    return text

In [36]:
remove_punctuation("$%&Abhishek\'Jha()*@[\\]^is a very good boy")

'AbhishekJhais a very good boy'

In [37]:
df['review'] = df['review'].apply(remove_punctuation)
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there is a family where a little boy...,negative
4,petter matteis love in the time of money is a ...,positive
...,...,...
49995,i thought this movie did a down right good job...,positive
49996,bad plot bad dialogue bad acting idiotic direc...,negative
49997,i am a catholic taught in parochial elementary...,negative
49998,I am going to have to disagree with the previo...,negative


# `preprocessing`

    Preprocessing is the process of converting data into a form which can be easily used by a machine learning model. ML models work on numbers
    so it is necessary that we convert our data into machine learning model comfortable data form.
    
    
`Tokenization`

`stop words removal`

### `Tokenization`

In [38]:
from nltk.tokenize import word_tokenize

In [39]:
df['review'].apply(word_tokenize)

0        [one, of, the, other, reviewers, has, mentione...
1        [a, wonderful, little, production, the, filmin...
2        [i, thought, this, was, a, wonderful, way, to,...
3        [basically, there, is, a, family, where, a, li...
4        [petter, matteis, love, in, the, time, of, mon...
                               ...                        
49995    [i, thought, this, movie, did, a, down, right,...
49996    [bad, plot, bad, dialogue, bad, acting, idioti...
49997    [i, am, a, catholic, taught, in, parochial, el...
49998    [I, am, going, to, have, to, disagree, with, t...
49999    [no, one, expects, the, star, trek, movies, to...
Name: review, Length: 49582, dtype: object

In [40]:
df['tokenized_review'] = df['review'].apply(word_tokenize)
df

Unnamed: 0,review,sentiment,tokenized_review
0,one of the other reviewers has mentioned that ...,positive,"[one, of, the, other, reviewers, has, mentione..."
1,a wonderful little production the filming tech...,positive,"[a, wonderful, little, production, the, filmin..."
2,i thought this was a wonderful way to spend ti...,positive,"[i, thought, this, was, a, wonderful, way, to,..."
3,basically there is a family where a little boy...,negative,"[basically, there, is, a, family, where, a, li..."
4,petter matteis love in the time of money is a ...,positive,"[petter, matteis, love, in, the, time, of, mon..."
...,...,...,...
49995,i thought this movie did a down right good job...,positive,"[i, thought, this, movie, did, a, down, right,..."
49996,bad plot bad dialogue bad acting idiotic direc...,negative,"[bad, plot, bad, dialogue, bad, acting, idioti..."
49997,i am a catholic taught in parochial elementary...,negative,"[i, am, a, catholic, taught, in, parochial, el..."
49998,I am going to have to disagree with the previo...,negative,"[I, am, going, to, have, to, disagree, with, t..."


### `removing stopwords`

In [41]:
from nltk.corpus import stopwords

In [42]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [43]:
len(stopwords.words('english'))

179

In [44]:
def remove_stopwords(str_list):
    words = stopwords.words('english')
    
    l = []
    
    for word in str_list:
        if word not in words:
            l.append(word)
            
    return l

In [45]:
remove_stopwords(['i', 'thought', 'this', 'was', 'a', 'wonderful', 'way', 'and', 'was', 'about', 'to', 'proceed'])

['thought', 'wonderful', 'way', 'proceed']

In [46]:
df['tokenized_review'] = df['tokenized_review'].apply(remove_stopwords)
df

Unnamed: 0,review,sentiment,tokenized_review
0,one of the other reviewers has mentioned that ...,positive,"[one, reviewers, mentioned, watching, 1, oz, e..."
1,a wonderful little production the filming tech...,positive,"[wonderful, little, production, filming, techn..."
2,i thought this was a wonderful way to spend ti...,positive,"[thought, wonderful, way, spend, time, hot, su..."
3,basically there is a family where a little boy...,negative,"[basically, family, little, boy, jake, thinks,..."
4,petter matteis love in the time of money is a ...,positive,"[petter, matteis, love, time, money, visually,..."
...,...,...,...
49995,i thought this movie did a down right good job...,positive,"[thought, movie, right, good, job, creative, o..."
49996,bad plot bad dialogue bad acting idiotic direc...,negative,"[bad, plot, bad, dialogue, bad, acting, idioti..."
49997,i am a catholic taught in parochial elementary...,negative,"[catholic, taught, parochial, elementary, scho..."
49998,I am going to have to disagree with the previo...,negative,"[I, going, disagree, previous, comment, side, ..."


#### `Now we need to join the tokenized reviews and then replace it with reviews.`

In [47]:
df['tokenized_review'].str.join(" ")

0        one reviewers mentioned watching 1 oz episode ...
1        wonderful little production filming technique ...
2        thought wonderful way spend time hot summer we...
3        basically family little boy jake thinks zombie...
4        petter matteis love time money visually stunni...
                               ...                        
49995    thought movie right good job creative original...
49996    bad plot bad dialogue bad acting idiotic direc...
49997    catholic taught parochial elementary schools n...
49998    I going disagree previous comment side maltin ...
49999    one expects star trek movies high art fans exp...
Name: tokenized_review, Length: 49582, dtype: object

In [48]:
df['review'] = df['tokenized_review'].str.join(" ")
df

Unnamed: 0,review,sentiment,tokenized_review
0,one reviewers mentioned watching 1 oz episode ...,positive,"[one, reviewers, mentioned, watching, 1, oz, e..."
1,wonderful little production filming technique ...,positive,"[wonderful, little, production, filming, techn..."
2,thought wonderful way spend time hot summer we...,positive,"[thought, wonderful, way, spend, time, hot, su..."
3,basically family little boy jake thinks zombie...,negative,"[basically, family, little, boy, jake, thinks,..."
4,petter matteis love time money visually stunni...,positive,"[petter, matteis, love, time, money, visually,..."
...,...,...,...
49995,thought movie right good job creative original...,positive,"[thought, movie, right, good, job, creative, o..."
49996,bad plot bad dialogue bad acting idiotic direc...,negative,"[bad, plot, bad, dialogue, bad, acting, idioti..."
49997,catholic taught parochial elementary schools n...,negative,"[catholic, taught, parochial, elementary, scho..."
49998,I going disagree previous comment side maltin ...,negative,"[I, going, disagree, previous, comment, side, ..."


## `EDA and making new features`

    Now we will make new features in our dataframe. This is called feature engineering.

##### `We can make a feature called as num_of_words. This can be related to the review. I mean generally negative reviews are lengthy 🙄`

In [50]:
df['num_of_words_in_review'] = df['tokenized_review'].apply(len)
df

Unnamed: 0,review,sentiment,tokenized_review,num_of_words_in_review
0,one reviewers mentioned watching 1 oz episode ...,positive,"[one, reviewers, mentioned, watching, 1, oz, e...",166
1,wonderful little production filming technique ...,positive,"[wonderful, little, production, filming, techn...",84
2,thought wonderful way spend time hot summer we...,positive,"[thought, wonderful, way, spend, time, hot, su...",87
3,basically family little boy jake thinks zombie...,negative,"[basically, family, little, boy, jake, thinks,...",64
4,petter matteis love time money visually stunni...,positive,"[petter, matteis, love, time, money, visually,...",125
...,...,...,...,...
49995,thought movie right good job creative original...,positive,"[thought, movie, right, good, job, creative, o...",79
49996,bad plot bad dialogue bad acting idiotic direc...,negative,"[bad, plot, bad, dialogue, bad, acting, idioti...",55
49997,catholic taught parochial elementary schools n...,negative,"[catholic, taught, parochial, elementary, scho...",114
49998,I going disagree previous comment side maltin ...,negative,"[I, going, disagree, previous, comment, side, ...",114


##### `We can also make a feature called as review_length which stores the length of review. This could also be a good feature. Although we shall decide later.`

In [52]:
df['review_length'] = df['review'].str.len()
df

Unnamed: 0,review,sentiment,tokenized_review,num_of_words_in_review,review_length
0,one reviewers mentioned watching 1 oz episode ...,positive,"[one, reviewers, mentioned, watching, 1, oz, e...",166,1142
1,wonderful little production filming technique ...,positive,"[wonderful, little, production, filming, techn...",84,656
2,thought wonderful way spend time hot summer we...,positive,"[thought, wonderful, way, spend, time, hot, su...",87,592
3,basically family little boy jake thinks zombie...,negative,"[basically, family, little, boy, jake, thinks,...",64,440
4,petter matteis love time money visually stunni...,positive,"[petter, matteis, love, time, money, visually,...",125,863
...,...,...,...,...,...
49995,thought movie right good job creative original...,positive,"[thought, movie, right, good, job, creative, o...",79,514
49996,bad plot bad dialogue bad acting idiotic direc...,negative,"[bad, plot, bad, dialogue, bad, acting, idioti...",55,394
49997,catholic taught parochial elementary schools n...,negative,"[catholic, taught, parochial, elementary, scho...",114,819
49998,I going disagree previous comment side maltin ...,negative,"[I, going, disagree, previous, comment, side, ...",114,829
