In [1]:
import pandas as pd
import numpy as np
import nltk

In [2]:
imbd_data = pd.read_csv('/Practice code/DataSets/IMDB Dataset.csv',nrows=10000)

In [3]:
imbd_data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
imbd_data.shape

(10000, 2)

In [5]:
imbd_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     10000 non-null  object
 1   sentiment  10000 non-null  object
dtypes: object(2)
memory usage: 156.4+ KB


# Text Preprocessing

## Removing Punctuations, Numbers, and Special Characters

In [6]:
import re

In [7]:
def data_cleaning(text):
    clean_text = re.sub('[^A-Za-z]+'," ",text)
    return clean_text

In [8]:
imbd_data['clean_review'] = imbd_data['review'].apply(lambda text : data_cleaning(text))

In [9]:
imbd_data['clean_review']

0       One of the other reviewers has mentioned that ...
1       A wonderful little production br br The filmin...
2       I thought this was a wonderful way to spend ti...
3       Basically there s a family where a little boy ...
4       Petter Mattei s Love in the Time of Money is a...
                              ...                        
9995    Fun entertaining movie about WWII German spy J...
9996    Give me a break How can anyone say that this i...
9997    This movie is a bad movie But after watching a...
9998    This is a movie that was probably made to ente...
9999    Smashing film about film making Shows the inte...
Name: clean_review, Length: 10000, dtype: object

## converting clean_review to lower case

In [10]:
imbd_data['clean_review'] = imbd_data['clean_review'].apply(lambda text : text.lower())

In [11]:
imbd_data['clean_review']

0       one of the other reviewers has mentioned that ...
1       a wonderful little production br br the filmin...
2       i thought this was a wonderful way to spend ti...
3       basically there s a family where a little boy ...
4       petter mattei s love in the time of money is a...
                              ...                        
9995    fun entertaining movie about wwii german spy j...
9996    give me a break how can anyone say that this i...
9997    this movie is a bad movie but after watching a...
9998    this is a movie that was probably made to ente...
9999    smashing film about film making shows the inte...
Name: clean_review, Length: 10000, dtype: object

## Tokenization

In [12]:
imbd_data['review_token'] = imbd_data['clean_review'].apply(lambda text : text.split())

In [13]:
imbd_data['review_token']

0       [one, of, the, other, reviewers, has, mentione...
1       [a, wonderful, little, production, br, br, the...
2       [i, thought, this, was, a, wonderful, way, to,...
3       [basically, there, s, a, family, where, a, lit...
4       [petter, mattei, s, love, in, the, time, of, m...
                              ...                        
9995    [fun, entertaining, movie, about, wwii, german...
9996    [give, me, a, break, how, can, anyone, say, th...
9997    [this, movie, is, a, bad, movie, but, after, w...
9998    [this, is, a, movie, that, was, probably, made...
9999    [smashing, film, about, film, making, shows, t...
Name: review_token, Length: 10000, dtype: object

## Removal of Stop words

In [14]:
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adars\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
stop_words = stopwords.words('english')

imbd_data['review_token'] = imbd_data['review_token'].apply(lambda token_text:[word for word in token_text if word not in stop_words])

In [16]:
imbd_data['review_token']

0       [one, reviewers, mentioned, watching, oz, epis...
1       [wonderful, little, production, br, br, filmin...
2       [thought, wonderful, way, spend, time, hot, su...
3       [basically, family, little, boy, jake, thinks,...
4       [petter, mattei, love, time, money, visually, ...
                              ...                        
9995    [fun, entertaining, movie, wwii, german, spy, ...
9996    [give, break, anyone, say, good, hockey, movie...
9997    [movie, bad, movie, watching, endless, series,...
9998    [movie, probably, made, entertain, middle, sch...
9999    [smashing, film, film, making, shows, intense,...
Name: review_token, Length: 10000, dtype: object

## Text Normalization

### a.     Stemming



In [17]:
from nltk.stem import PorterStemmer    
ps = PorterStemmer()

In [18]:
imbd_data['stem_text'] = imbd_data['review_token'].apply(lambda text: [ps.stem(word) for word in text]) # stemming
imbd_data

Unnamed: 0,review,sentiment,clean_review,review_token,stem_text
0,One of the other reviewers has mentioned that ...,positive,one of the other reviewers has mentioned that ...,"[one, reviewers, mentioned, watching, oz, epis...","[one, review, mention, watch, oz, episod, hook..."
1,A wonderful little production. <br /><br />The...,positive,a wonderful little production br br the filmin...,"[wonderful, little, production, br, br, filmin...","[wonder, littl, product, br, br, film, techniq..."
2,I thought this was a wonderful way to spend ti...,positive,i thought this was a wonderful way to spend ti...,"[thought, wonderful, way, spend, time, hot, su...","[thought, wonder, way, spend, time, hot, summe..."
3,Basically there's a family where a little boy ...,negative,basically there s a family where a little boy ...,"[basically, family, little, boy, jake, thinks,...","[basic, famili, littl, boy, jake, think, zombi..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei s love in the time of money is a...,"[petter, mattei, love, time, money, visually, ...","[petter, mattei, love, time, money, visual, st..."
...,...,...,...,...,...
9995,"Fun, entertaining movie about WWII German spy ...",positive,fun entertaining movie about wwii german spy j...,"[fun, entertaining, movie, wwii, german, spy, ...","[fun, entertain, movi, wwii, german, spi, juli..."
9996,Give me a break. How can anyone say that this ...,negative,give me a break how can anyone say that this i...,"[give, break, anyone, say, good, hockey, movie...","[give, break, anyon, say, good, hockey, movi, ..."
9997,This movie is a bad movie. But after watching ...,negative,this movie is a bad movie but after watching a...,"[movie, bad, movie, watching, endless, series,...","[movi, bad, movi, watch, endless, seri, bad, h..."
9998,This is a movie that was probably made to ente...,negative,this is a movie that was probably made to ente...,"[movie, probably, made, entertain, middle, sch...","[movi, probabl, made, entertain, middl, school..."


### b.     Lemmatization

In [19]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

wl=WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\adars\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [20]:
imbd_data['lemma_text'] = imbd_data['review_token'].apply(lambda review_token : [wl.lemmatize(word,pos='v') for word in review_token])

imbd_data

Unnamed: 0,review,sentiment,clean_review,review_token,stem_text,lemma_text
0,One of the other reviewers has mentioned that ...,positive,one of the other reviewers has mentioned that ...,"[one, reviewers, mentioned, watching, oz, epis...","[one, review, mention, watch, oz, episod, hook...","[one, reviewers, mention, watch, oz, episode, ..."
1,A wonderful little production. <br /><br />The...,positive,a wonderful little production br br the filmin...,"[wonderful, little, production, br, br, filmin...","[wonder, littl, product, br, br, film, techniq...","[wonderful, little, production, br, br, film, ..."
2,I thought this was a wonderful way to spend ti...,positive,i thought this was a wonderful way to spend ti...,"[thought, wonderful, way, spend, time, hot, su...","[thought, wonder, way, spend, time, hot, summe...","[think, wonderful, way, spend, time, hot, summ..."
3,Basically there's a family where a little boy ...,negative,basically there s a family where a little boy ...,"[basically, family, little, boy, jake, thinks,...","[basic, famili, littl, boy, jake, think, zombi...","[basically, family, little, boy, jake, think, ..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei s love in the time of money is a...,"[petter, mattei, love, time, money, visually, ...","[petter, mattei, love, time, money, visual, st...","[petter, mattei, love, time, money, visually, ..."
...,...,...,...,...,...,...
9995,"Fun, entertaining movie about WWII German spy ...",positive,fun entertaining movie about wwii german spy j...,"[fun, entertaining, movie, wwii, german, spy, ...","[fun, entertain, movi, wwii, german, spi, juli...","[fun, entertain, movie, wwii, german, spy, jul..."
9996,Give me a break. How can anyone say that this ...,negative,give me a break how can anyone say that this i...,"[give, break, anyone, say, good, hockey, movie...","[give, break, anyon, say, good, hockey, movi, ...","[give, break, anyone, say, good, hockey, movie..."
9997,This movie is a bad movie. But after watching ...,negative,this movie is a bad movie but after watching a...,"[movie, bad, movie, watching, endless, series,...","[movi, bad, movi, watch, endless, seri, bad, h...","[movie, bad, movie, watch, endless, series, ba..."
9998,This is a movie that was probably made to ente...,negative,this is a movie that was probably made to ente...,"[movie, probably, made, entertain, middle, sch...","[movi, probabl, made, entertain, middl, school...","[movie, probably, make, entertain, middle, sch..."


## Joining of text

In [21]:
imbd_data['stem_text'] = imbd_data['stem_text'].apply(lambda x: ' '.join(x))

In [22]:
imbd_data['lemma_text'] = imbd_data['lemma_text'].apply(lambda x: ' '.join(x))

In [23]:
imbd_data.head()

Unnamed: 0,review,sentiment,clean_review,review_token,stem_text,lemma_text
0,One of the other reviewers has mentioned that ...,positive,one of the other reviewers has mentioned that ...,"[one, reviewers, mentioned, watching, oz, epis...",one review mention watch oz episod hook right ...,one reviewers mention watch oz episode hook ri...
1,A wonderful little production. <br /><br />The...,positive,a wonderful little production br br the filmin...,"[wonderful, little, production, br, br, filmin...",wonder littl product br br film techniqu unass...,wonderful little production br br film techniq...
2,I thought this was a wonderful way to spend ti...,positive,i thought this was a wonderful way to spend ti...,"[thought, wonderful, way, spend, time, hot, su...",thought wonder way spend time hot summer weeke...,think wonderful way spend time hot summer week...
3,Basically there's a family where a little boy ...,negative,basically there s a family where a little boy ...,"[basically, family, little, boy, jake, thinks,...",basic famili littl boy jake think zombi closet...,basically family little boy jake think zombie ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei s love in the time of money is a...,"[petter, mattei, love, time, money, visually, ...",petter mattei love time money visual stun film...,petter mattei love time money visually stun fi...


# Text Vector Generation


## Bag of words

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
imbd_vectorizer = CountVectorizer()

imbd_features   = imbd_vectorizer.fit_transform(imbd_data['stem_text'])
imbd_features.get_shape()

(10000, 34935)

### build a model on above feature

In [28]:
X = imbd_features    #### Features
y = imbd_data['sentiment']  #### Target

#Using the train_test_split to create train and test sets.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 47, test_size = 0.25)

print('Training set :', X_train.shape)
print('Testing set :', X_test.shape)

#Importing the Decision tree classifier from the sklearn library.
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(criterion = 'entropy')


#Training the decision tree classifier. 
clf.fit(X_train, y_train)

#Predicting labels on the test set.
y_pred =  clf.predict(X_test)

#Importing the accuracy metric from sklearn.metrics library

from sklearn.metrics import accuracy_score
print('Accuracy Score on train data: ', accuracy_score(y_true=y_train, y_pred=clf.predict(X_train)))
print('Accuracy Score on test data: ', accuracy_score(y_true=y_test, y_pred=y_pred))

Training set : (7500, 34935)
Testing set : (2500, 34935)
Accuracy Score on train data:  1.0
Accuracy Score on test data:  0.7068


## TF IDF

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_imbd_vectorizer = TfidfVectorizer()
tfidf_imbd_features = tfidf_imbd_vectorizer.fit_transform(imbd_data['stem_text'])


In [26]:
tfidf_imbd_features.shape

(10000, 34935)

### build a model on above feature

In [27]:
X = imbd_features    #### Features
y = imbd_data['sentiment']  #### Target

#Using the train_test_split to create train and test sets.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 47, test_size = 0.25)

print('Training set :', X_train.shape)
print('Testing set :', X_test.shape)

#Importing the Decision tree classifier from the sklearn library.
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(criterion = 'entropy')

#Training the decision tree classifier. 
clf.fit(X_train, y_train)

#Predicting labels on the test set.
y_pred =  clf.predict(X_test)

#Importing the accuracy metric from sklearn.metrics library

from sklearn.metrics import accuracy_score
print('Accuracy Score on train data: ', accuracy_score(y_true=y_train, y_pred=clf.predict(X_train)))
print('Accuracy Score on test data: ', accuracy_score(y_true=y_test, y_pred=y_pred))

Training set : (7500, 34935)
Testing set : (2500, 34935)
Accuracy Score on train data:  1.0
Accuracy Score on test data:  0.7072
