In [8]:
import pandas as pd
import numpy as np
import string
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
import re
import matplotlib.pyplot as plt




from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize


%matplotlib inline

[nltk_data] Downloading package stopwords to /home/asseli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/asseli/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
df = pd.read_csv('IMDB Dataset.csv')

In [10]:
re.sub(r'\W', ' ', df.review[0])

'One of the other reviewers has mentioned that after watching just 1 Oz episode you ll be hooked  They are right  as this is exactly what happened with me  br    br   The first thing that struck me about Oz was its brutality and unflinching scenes of violence  which set in right from the word GO  Trust me  this is not a show for the faint hearted or timid  This show pulls no punches with regards to drugs  sex or violence  Its is hardcore  in the classic use of the word  br    br   It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary  It focuses mainly on Emerald City  an experimental section of the prison where all the cells have glass fronts and face inwards  so privacy is not high on the agenda  Em City is home to many  Aryans  Muslims  gangstas  Latinos  Christians  Italians  Irish and more    so scuffles  death stares  dodgy dealings and shady agreements are never far away  br    br   I would say the main appeal of the show is due to the fa

## 1. Exploratory data-analysis


In [11]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [12]:
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


## 2. Data preprocessing

- Text normalization
- Getting rid of redundant characters and symbols, punctuation, HTML
- Stopword removal
- Stemming
- Lemmatization

In [13]:
# 1. remove HTML
html_expr = re.compile('<.*?>') 
df.review = df.review.apply(lambda x: re.sub(html_expr, '', x))
df.review[0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.I would say the main appeal of the show is due to the fact that it goes where other shows wo

In [14]:
# 2. remove punctuation
df.review = df.review.apply(lambda x: re.sub(r'[^\w\s]',' ', x))
df.review[0]

'One of the other reviewers has mentioned that after watching just 1 Oz episode you ll be hooked  They are right  as this is exactly what happened with me The first thing that struck me about Oz was its brutality and unflinching scenes of violence  which set in right from the word GO  Trust me  this is not a show for the faint hearted or timid  This show pulls no punches with regards to drugs  sex or violence  Its is hardcore  in the classic use of the word It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary  It focuses mainly on Emerald City  an experimental section of the prison where all the cells have glass fronts and face inwards  so privacy is not high on the agenda  Em City is home to many  Aryans  Muslims  gangstas  Latinos  Christians  Italians  Irish and more    so scuffles  death stares  dodgy dealings and shady agreements are never far away I would say the main appeal of the show is due to the fact that it goes where other shows wo

In [15]:
# 3. remove numbers
df.review = df.review.apply(lambda x: re.sub(r'\d+', '', x))
df.review[0]

'One of the other reviewers has mentioned that after watching just  Oz episode you ll be hooked  They are right  as this is exactly what happened with me The first thing that struck me about Oz was its brutality and unflinching scenes of violence  which set in right from the word GO  Trust me  this is not a show for the faint hearted or timid  This show pulls no punches with regards to drugs  sex or violence  Its is hardcore  in the classic use of the word It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary  It focuses mainly on Emerald City  an experimental section of the prison where all the cells have glass fronts and face inwards  so privacy is not high on the agenda  Em City is home to many  Aryans  Muslims  gangstas  Latinos  Christians  Italians  Irish and more    so scuffles  death stares  dodgy dealings and shady agreements are never far away I would say the main appeal of the show is due to the fact that it goes where other shows wou

In [16]:
# 4. lowercase
df.review = df.review.apply(lambda x: x.lower())
df.review[0]

'one of the other reviewers has mentioned that after watching just  oz episode you ll be hooked  they are right  as this is exactly what happened with me the first thing that struck me about oz was its brutality and unflinching scenes of violence  which set in right from the word go  trust me  this is not a show for the faint hearted or timid  this show pulls no punches with regards to drugs  sex or violence  its is hardcore  in the classic use of the word it is called oz as that is the nickname given to the oswald maximum security state penitentary  it focuses mainly on emerald city  an experimental section of the prison where all the cells have glass fronts and face inwards  so privacy is not high on the agenda  em city is home to many  aryans  muslims  gangstas  latinos  christians  italians  irish and more    so scuffles  death stares  dodgy dealings and shady agreements are never far away i would say the main appeal of the show is due to the fact that it goes where other shows wou

In [17]:
# 5. whitespace
df.review = df.review.apply(lambda x: x.split()) 
df.review[0]

['one',
 'of',
 'the',
 'other',
 'reviewers',
 'has',
 'mentioned',
 'that',
 'after',
 'watching',
 'just',
 'oz',
 'episode',
 'you',
 'll',
 'be',
 'hooked',
 'they',
 'are',
 'right',
 'as',
 'this',
 'is',
 'exactly',
 'what',
 'happened',
 'with',
 'me',
 'the',
 'first',
 'thing',
 'that',
 'struck',
 'me',
 'about',
 'oz',
 'was',
 'its',
 'brutality',
 'and',
 'unflinching',
 'scenes',
 'of',
 'violence',
 'which',
 'set',
 'in',
 'right',
 'from',
 'the',
 'word',
 'go',
 'trust',
 'me',
 'this',
 'is',
 'not',
 'a',
 'show',
 'for',
 'the',
 'faint',
 'hearted',
 'or',
 'timid',
 'this',
 'show',
 'pulls',
 'no',
 'punches',
 'with',
 'regards',
 'to',
 'drugs',
 'sex',
 'or',
 'violence',
 'its',
 'is',
 'hardcore',
 'in',
 'the',
 'classic',
 'use',
 'of',
 'the',
 'word',
 'it',
 'is',
 'called',
 'oz',
 'as',
 'that',
 'is',
 'the',
 'nickname',
 'given',
 'to',
 'the',
 'oswald',
 'maximum',
 'security',
 'state',
 'penitentary',
 'it',
 'focuses',
 'mainly',
 'on',
 '

In [18]:
# 6. stopwords
stop_words = set(stopwords.words('english'))
df.review = df.review.apply(lambda x: [w for w in x if w not in stop_words])
df.review[0]

['one',
 'reviewers',
 'mentioned',
 'watching',
 'oz',
 'episode',
 'hooked',
 'right',
 'exactly',
 'happened',
 'first',
 'thing',
 'struck',
 'oz',
 'brutality',
 'unflinching',
 'scenes',
 'violence',
 'set',
 'right',
 'word',
 'go',
 'trust',
 'show',
 'faint',
 'hearted',
 'timid',
 'show',
 'pulls',
 'punches',
 'regards',
 'drugs',
 'sex',
 'violence',
 'hardcore',
 'classic',
 'use',
 'word',
 'called',
 'oz',
 'nickname',
 'given',
 'oswald',
 'maximum',
 'security',
 'state',
 'penitentary',
 'focuses',
 'mainly',
 'emerald',
 'city',
 'experimental',
 'section',
 'prison',
 'cells',
 'glass',
 'fronts',
 'face',
 'inwards',
 'privacy',
 'high',
 'agenda',
 'em',
 'city',
 'home',
 'many',
 'aryans',
 'muslims',
 'gangstas',
 'latinos',
 'christians',
 'italians',
 'irish',
 'scuffles',
 'death',
 'stares',
 'dodgy',
 'dealings',
 'shady',
 'agreements',
 'never',
 'far',
 'away',
 'would',
 'say',
 'main',
 'appeal',
 'show',
 'due',
 'fact',
 'goes',
 'shows',
 'dare',
 

In [19]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
# lemmatize string
def lemmatize_word(text):
    word_tokens = word_tokenize(text)
    # provide context i.e. part-of-speech
    lemmas = [lemmatizer.lemmatize(word, pos ='v') for word in word_tokens]
    return lemmas


df.review = df.review.apply(lambda x: [lemmatizer.lemmatize(w, pos='v') for w in x])
df.review[0]

['one',
 'reviewers',
 'mention',
 'watch',
 'oz',
 'episode',
 'hook',
 'right',
 'exactly',
 'happen',
 'first',
 'thing',
 'strike',
 'oz',
 'brutality',
 'unflinching',
 'scenes',
 'violence',
 'set',
 'right',
 'word',
 'go',
 'trust',
 'show',
 'faint',
 'hearted',
 'timid',
 'show',
 'pull',
 'punch',
 'regard',
 'drug',
 'sex',
 'violence',
 'hardcore',
 'classic',
 'use',
 'word',
 'call',
 'oz',
 'nickname',
 'give',
 'oswald',
 'maximum',
 'security',
 'state',
 'penitentary',
 'focus',
 'mainly',
 'emerald',
 'city',
 'experimental',
 'section',
 'prison',
 'cells',
 'glass',
 'front',
 'face',
 'inwards',
 'privacy',
 'high',
 'agenda',
 'em',
 'city',
 'home',
 'many',
 'aryans',
 'muslims',
 'gangstas',
 'latinos',
 'christians',
 'italians',
 'irish',
 'scuffle',
 'death',
 'star',
 'dodgy',
 'deal',
 'shady',
 'agreements',
 'never',
 'far',
 'away',
 'would',
 'say',
 'main',
 'appeal',
 'show',
 'due',
 'fact',
 'go',
 'show',
 'dare',
 'forget',
 'pretty',
 'picture

In [20]:
# 7. stemming

stemmer = PorterStemmer()

df.review = df.review.apply(lambda x: " ".join([stemmer.stem(w) for w in x]))
df.review[0]


'one review mention watch oz episod hook right exactli happen first thing strike oz brutal unflinch scene violenc set right word go trust show faint heart timid show pull punch regard drug sex violenc hardcor classic use word call oz nicknam give oswald maximum secur state penitentari focu mainli emerald citi experiment section prison cell glass front face inward privaci high agenda em citi home mani aryan muslim gangsta latino christian italian irish scuffl death star dodgi deal shadi agreement never far away would say main appeal show due fact go show dare forget pretti pictur paint mainstream audienc forget charm forget romanc oz mess around first episod ever saw strike nasti surreal say readi watch develop tast oz get accustom high level graphic violenc violenc injustic crook guard sell nickel inmat kill order get away well manner middl class inmat turn prison bitch due lack street skill prison experi watch oz may becom comfort uncomfort view that get touch darker side'

In [21]:
df.head()

Unnamed: 0,review,sentiment
0,one review mention watch oz episod hook right ...,positive
1,wonder littl product film techniqu unassum old...,positive
2,think wonder way spend time hot summer weekend...,positive
3,basic famili littl boy jake think zombi closet...,negative
4,petter mattei love time money visual stun film...,positive


## 3. Feature extraction

### BagOfWords

In [25]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.2)

vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(train.review)
X_test_bow = vectorizer.transform(test.review)

In [26]:
X_train_bow.shape, X_test_bow.shape

((40000, 64597), (10000, 64597))

### TFIDF

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(train.review)
X_test_tfidf = tfidf_vectorizer.transform(test.review)

In [28]:
X_train_tfidf.shape, X_test_tfidf.shape

((40000, 64597), (10000, 64597))

### Labels

In [29]:
y_train, y_test = train.sentiment, test.sentiment

## 4. Models

### Logistic Regression

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [31]:
# Logistic Regression when using Bag of Words
lr = LogisticRegression(max_iter=99999)
lr_bow_model = lr.fit(X_train_bow, y_train)
lr_bow_predictions= lr_bow_model.predict(X_test_bow)
lr_bow_acc = accuracy_score(y_test, lr_bow_predictions)
print(f'Accuracy for BoW features using Logistic Regression: {lr_bow_acc}')

Accuracy for BoW features using Logistic Regression: 0.8815


In [32]:
# Logistic Regression when using TFIDF
lr_tfidf_model = lr.fit(X_train_tfidf, y_train)
lr_tfidf_predictions= lr_tfidf_model.predict(X_test_tfidf)
lr_tfidf_acc = accuracy_score(y_test, lr_tfidf_predictions)
print(f'Accuracy for TFIDF features using Logistic Regression: {lr_tfidf_acc}')

Accuracy for TFIDF features using Logistic Regression: 0.8964


### Support Vector Machine

In [33]:
from sklearn.svm import SVC


In [34]:
svc = SVC(kernel='rbf', gamma='auto', max_iter=999)
svc_bow = svc.fit(X_train_bow, y_train)



In [35]:
# BoW
# svc = SVC()
# svc_bow = svc.fit(X_train_bow, y_train)
svc_bow_predictions = svc_bow.predict(X_test_bow)
svc_bow_acc = accuracy_score(y_test, svc_bow_predictions)
print(f'Accuracy for BoW features using Support Vector Machine: {svc_bow_acc}')

Accuracy for BoW features using Support Vector Machine: 0.4998


In [36]:
# Support Vector Machine when using TFIDF
# BoW
svc_tfidf = svc.fit(X_train_tfidf, y_train)
svc_tfidf_predictions = svc_bow.predict(X_test_tfidf)
svc_tfidf_acc = accuracy_score(y_test, svc_tfidf_predictions)
print(f'Accuracy for TFIDF features using Support Vector Machine: {svc_tfidf_acc}')



Accuracy for TFIDF features using Support Vector Machine: 0.5913


## Random Forest

In [37]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

In [38]:
# Random Forest using Bag of Words features
rf_bow = rf.fit(X_train_bow, y_train)
rf_bow_pred = rf_bow.predict(X_test_bow)
rf_bow_acc = accuracy_score(y_test, rf_bow_pred)
print(f'Accuracy for BoW features using Random Forest: {rf_bow_acc}')

Accuracy for BoW features using Random Forest: 0.8538


In [39]:
# Random Forest using Bag of Words features
rf_tfidf = rf.fit(X_train_tfidf, y_train)
rf_tfidf_pred = rf_bow.predict(X_test_tfidf)
rf_tfidf_acc = accuracy_score(y_test, rf_tfidf_pred)
print(f'Accuracy for TFIDF features using Random Forest: {rf_tfidf_acc}')

Accuracy for TFIDF features using Random Forest: 0.8553


## Multilayer Perceptron classifier

In [40]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier()

In [41]:
mlp_bow = mlp.fit(X_train_bow, y_train)
mlp_bow_pred = mlp.predict(X_test_bow)
mlp_bow_acc = accuracy_score(y_test, mlp_bow_pred)
print(f'Accuracy for BOW features using MLP: {mlp_bow_acc}')

Accuracy for BOW features using MLP: 0.8664


## 5. Model evaluation

In [42]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [2]:
a = confusion_matrix(y_test, lr_bow_predictions)

NameError: name 'confusion_matrix' is not defined

In [3]:
ConfusionDisplay(a).plot()
plt.show()

NameError: name 'ConfusionDisplay' is not defined