In [1]:
import pandas as pd
import numpy as np
import nltk

In [2]:
review_data = pd.read_csv('IMDB Dataset.csv',nrows=50000)

In [3]:
review_data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
review_data.shape

(50000, 2)

In [5]:
features = review_data['review']
target = review_data['sentiment']

# Text Preprocessing
Removing Punctuations, Numbers, and Special Characters


In [6]:
import re

In [7]:
def data_cleaning(review):
    clean_text = re.sub('[^A-Za-z]+'," ",review)
    return clean_text

In [8]:
review_data['review'] = review_data['review'].apply(lambda review : data_cleaning(review))

In [9]:
review_data['review'] = review_data['review'].apply(lambda review: review.lower())

# Tokenization

In [10]:
review_data

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production br br the filmin...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there s a family where a little boy ...,negative
4,petter mattei s love in the time of money is a...,positive
...,...,...
49995,i thought this movie did a down right good job...,positive
49996,bad plot bad dialogue bad acting idiotic direc...,negative
49997,i am a catholic taught in parochial elementary...,negative
49998,i m going to have to disagree with the previou...,negative


In [11]:
review_data['token_text'] = review_data['review'].apply(lambda review : review.split())

In [12]:
review_data

Unnamed: 0,review,sentiment,token_text
0,one of the other reviewers has mentioned that ...,positive,"[one, of, the, other, reviewers, has, mentione..."
1,a wonderful little production br br the filmin...,positive,"[a, wonderful, little, production, br, br, the..."
2,i thought this was a wonderful way to spend ti...,positive,"[i, thought, this, was, a, wonderful, way, to,..."
3,basically there s a family where a little boy ...,negative,"[basically, there, s, a, family, where, a, lit..."
4,petter mattei s love in the time of money is a...,positive,"[petter, mattei, s, love, in, the, time, of, m..."
...,...,...,...
49995,i thought this movie did a down right good job...,positive,"[i, thought, this, movie, did, a, down, right,..."
49996,bad plot bad dialogue bad acting idiotic direc...,negative,"[bad, plot, bad, dialogue, bad, acting, idioti..."
49997,i am a catholic taught in parochial elementary...,negative,"[i, am, a, catholic, taught, in, parochial, el..."
49998,i m going to have to disagree with the previou...,negative,"[i, m, going, to, have, to, disagree, with, th..."


# Removal of Stopwords

In [13]:
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Abhijeet\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
stop_words = stopwords.words('english')
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [15]:
def remove_stopwords(token_text):
    words=[]
    for word in token_text:
        if word not in stop_words:
            words.append(word)
            
    return words
    

In [16]:
review_data['token_text'] = review_data['token_text'].apply(lambda token_text : remove_stopwords(token_text))

In [17]:
review_data['token_text'] = review_data['token_text'].apply(lambda token_text:[word for word in token_text if word not in stop_words])


In [18]:
review_data

Unnamed: 0,review,sentiment,token_text
0,one of the other reviewers has mentioned that ...,positive,"[one, reviewers, mentioned, watching, oz, epis..."
1,a wonderful little production br br the filmin...,positive,"[wonderful, little, production, br, br, filmin..."
2,i thought this was a wonderful way to spend ti...,positive,"[thought, wonderful, way, spend, time, hot, su..."
3,basically there s a family where a little boy ...,negative,"[basically, family, little, boy, jake, thinks,..."
4,petter mattei s love in the time of money is a...,positive,"[petter, mattei, love, time, money, visually, ..."
...,...,...,...
49995,i thought this movie did a down right good job...,positive,"[thought, movie, right, good, job, creative, o..."
49996,bad plot bad dialogue bad acting idiotic direc...,negative,"[bad, plot, bad, dialogue, bad, acting, idioti..."
49997,i am a catholic taught in parochial elementary...,negative,"[catholic, taught, parochial, elementary, scho..."
49998,i m going to have to disagree with the previou...,negative,"[going, disagree, previous, comment, side, mal..."


# Text Normalization

## Stemming


In [19]:
from nltk.stem import PorterStemmer    
ps = PorterStemmer() 

In [20]:
def porter_stemming(token_text):
    
    stem_words=[]
    for word in token_text:
        stem_word = ps.stem(word)
        stem_words.append(stem_word)
        
    return stem_words
        

review_data['stem_text'] = review_data['token_text'].apply(lambda token_text : porter_stemming(token_text))

In [21]:
review_data['stem_text'] = review_data['token_text'].apply(lambda text : [ps.stem(word) for word in text])


In [22]:
review_data

Unnamed: 0,review,sentiment,token_text,stem_text
0,one of the other reviewers has mentioned that ...,positive,"[one, reviewers, mentioned, watching, oz, epis...","[one, review, mention, watch, oz, episod, hook..."
1,a wonderful little production br br the filmin...,positive,"[wonderful, little, production, br, br, filmin...","[wonder, littl, product, br, br, film, techniq..."
2,i thought this was a wonderful way to spend ti...,positive,"[thought, wonderful, way, spend, time, hot, su...","[thought, wonder, way, spend, time, hot, summe..."
3,basically there s a family where a little boy ...,negative,"[basically, family, little, boy, jake, thinks,...","[basic, famili, littl, boy, jake, think, zombi..."
4,petter mattei s love in the time of money is a...,positive,"[petter, mattei, love, time, money, visually, ...","[petter, mattei, love, time, money, visual, st..."
...,...,...,...,...
49995,i thought this movie did a down right good job...,positive,"[thought, movie, right, good, job, creative, o...","[thought, movi, right, good, job, creativ, ori..."
49996,bad plot bad dialogue bad acting idiotic direc...,negative,"[bad, plot, bad, dialogue, bad, acting, idioti...","[bad, plot, bad, dialogu, bad, act, idiot, dir..."
49997,i am a catholic taught in parochial elementary...,negative,"[catholic, taught, parochial, elementary, scho...","[cathol, taught, parochi, elementari, school, ..."
49998,i m going to have to disagree with the previou...,negative,"[going, disagree, previous, comment, side, mal...","[go, disagre, previou, comment, side, maltin, ..."


# Lemmatization

In [23]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

wl=WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Abhijeet\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [24]:
def wordnet_lemma(token_text):
    lemma_words=[]
    for word in token_text:
        lemma_word = wl.lemmatize(word,pos='v')
        lemma_words.append(lemma_word)
        
    return lemma_words

In [25]:
review_data['lemma_text'] = review_data['token_text'].apply(lambda token_text : wordnet_lemma(token_text))

In [26]:
review_data['lemma_text'] = review_data['token_text'].apply(lambda token_text : [wl.lemmatize(word,pos='v') for word in token_text])

In [27]:
review_data

Unnamed: 0,review,sentiment,token_text,stem_text,lemma_text
0,one of the other reviewers has mentioned that ...,positive,"[one, reviewers, mentioned, watching, oz, epis...","[one, review, mention, watch, oz, episod, hook...","[one, reviewers, mention, watch, oz, episode, ..."
1,a wonderful little production br br the filmin...,positive,"[wonderful, little, production, br, br, filmin...","[wonder, littl, product, br, br, film, techniq...","[wonderful, little, production, br, br, film, ..."
2,i thought this was a wonderful way to spend ti...,positive,"[thought, wonderful, way, spend, time, hot, su...","[thought, wonder, way, spend, time, hot, summe...","[think, wonderful, way, spend, time, hot, summ..."
3,basically there s a family where a little boy ...,negative,"[basically, family, little, boy, jake, thinks,...","[basic, famili, littl, boy, jake, think, zombi...","[basically, family, little, boy, jake, think, ..."
4,petter mattei s love in the time of money is a...,positive,"[petter, mattei, love, time, money, visually, ...","[petter, mattei, love, time, money, visual, st...","[petter, mattei, love, time, money, visually, ..."
...,...,...,...,...,...
49995,i thought this movie did a down right good job...,positive,"[thought, movie, right, good, job, creative, o...","[thought, movi, right, good, job, creativ, ori...","[think, movie, right, good, job, creative, ori..."
49996,bad plot bad dialogue bad acting idiotic direc...,negative,"[bad, plot, bad, dialogue, bad, acting, idioti...","[bad, plot, bad, dialogu, bad, act, idiot, dir...","[bad, plot, bad, dialogue, bad, act, idiotic, ..."
49997,i am a catholic taught in parochial elementary...,negative,"[catholic, taught, parochial, elementary, scho...","[cathol, taught, parochi, elementari, school, ...","[catholic, teach, parochial, elementary, schoo..."
49998,i m going to have to disagree with the previou...,negative,"[going, disagree, previous, comment, side, mal...","[go, disagre, previou, comment, side, maltin, ...","[go, disagree, previous, comment, side, maltin..."


In [29]:
review_data['cleaned_text'] = review_data['stem_text'].apply(lambda text: ' '.join(text))

In [30]:
review_data

Unnamed: 0,review,sentiment,token_text,stem_text,lemma_text,cleaned_text
0,one of the other reviewers has mentioned that ...,positive,"[one, reviewers, mentioned, watching, oz, epis...","[one, review, mention, watch, oz, episod, hook...","[one, reviewers, mention, watch, oz, episode, ...",one review mention watch oz episod hook right ...
1,a wonderful little production br br the filmin...,positive,"[wonderful, little, production, br, br, filmin...","[wonder, littl, product, br, br, film, techniq...","[wonderful, little, production, br, br, film, ...",wonder littl product br br film techniqu unass...
2,i thought this was a wonderful way to spend ti...,positive,"[thought, wonderful, way, spend, time, hot, su...","[thought, wonder, way, spend, time, hot, summe...","[think, wonderful, way, spend, time, hot, summ...",thought wonder way spend time hot summer weeke...
3,basically there s a family where a little boy ...,negative,"[basically, family, little, boy, jake, thinks,...","[basic, famili, littl, boy, jake, think, zombi...","[basically, family, little, boy, jake, think, ...",basic famili littl boy jake think zombi closet...
4,petter mattei s love in the time of money is a...,positive,"[petter, mattei, love, time, money, visually, ...","[petter, mattei, love, time, money, visual, st...","[petter, mattei, love, time, money, visually, ...",petter mattei love time money visual stun film...
...,...,...,...,...,...,...
49995,i thought this movie did a down right good job...,positive,"[thought, movie, right, good, job, creative, o...","[thought, movi, right, good, job, creativ, ori...","[think, movie, right, good, job, creative, ori...",thought movi right good job creativ origin fir...
49996,bad plot bad dialogue bad acting idiotic direc...,negative,"[bad, plot, bad, dialogue, bad, acting, idioti...","[bad, plot, bad, dialogu, bad, act, idiot, dir...","[bad, plot, bad, dialogue, bad, act, idiotic, ...",bad plot bad dialogu bad act idiot direct anno...
49997,i am a catholic taught in parochial elementary...,negative,"[catholic, taught, parochial, elementary, scho...","[cathol, taught, parochi, elementari, school, ...","[catholic, teach, parochial, elementary, schoo...",cathol taught parochi elementari school nun ta...
49998,i m going to have to disagree with the previou...,negative,"[going, disagree, previous, comment, side, mal...","[go, disagre, previou, comment, side, maltin, ...","[go, disagree, previous, comment, side, maltin...",go disagre previou comment side maltin one sec...


# 2. Text Vector Generation(Words Embedding)

Bag of words o TF_IDF

In [31]:
from sklearn.feature_extraction.text import CountVectorizer
review_vectorizer = CountVectorizer()

review_features   = review_vectorizer.fit_transform(review_data['cleaned_text'])
review_features.get_shape()

(50000, 68997)

# Build Sentiment Analysis using Decision Tree


In [32]:
X = review_features    
y = review_data['sentiment']

In [33]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 47, test_size = 0.25)

In [34]:
print('Training set :', X_train.shape)
print('Testing set :', X_test.shape)

Training set : (37500, 68997)
Testing set : (12500, 68997)


In [35]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(criterion = 'entropy')

In [36]:
clf.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy')

In [37]:
y_pred =  clf.predict(X_test)

In [38]:
from sklearn.metrics import accuracy_score
print('Accuracy Score on train data: ', accuracy_score(y_true=y_train, y_pred=clf.predict(X_train)))
print('Accuracy Score on test data: ', accuracy_score(y_true=y_test, y_pred=y_pred))

Accuracy Score on train data:  1.0
Accuracy Score on test data:  0.71616


# TF IDF

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_review_vectorizer = TfidfVectorizer()
tfidf_review_features = tfidf_review_vectorizer.fit_transform(review_data['cleaned_text'])

In [40]:
tfidf_review_features.shape

(50000, 68997)

In [42]:
X = review_features    
y = review_data['sentiment']  


from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 47, test_size = 0.25)

print('Training set :', X_train.shape)
print('Testing set :', X_test.shape)


from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(criterion = 'entropy')

 
clf.fit(X_train, y_train)


y_pred =  clf.predict(X_test)




Training set : (37500, 68997)
Testing set : (12500, 68997)


In [44]:
from sklearn.metrics import accuracy_score
print('Accuracy Score on train data: ', accuracy_score(y_true=y_train, y_pred=clf.predict(X_train)))
print('Accuracy Score on test data: ', accuracy_score(y_true=y_test, y_pred=y_pred))

Accuracy Score on train data:  1.0
Accuracy Score on test data:  0.7204
