In [1]:
import numpy as np
import pandas as pd
from nltk import word_tokenize
from string import punctuation
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('../IMDB Dataset.csv')
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


# Data processing

## Remove html tags

In [3]:
def remove_html(text):
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

df['review'] = df['review'].apply(remove_html)
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


## Tokenize words

In [4]:
df['tokenized_review'] = df['review'].apply(word_tokenize)
df

Unnamed: 0,review,sentiment,tokenized_review
0,One of the other reviewers has mentioned that ...,positive,"[One, of, the, other, reviewers, has, mentione..."
1,A wonderful little production. The filming tec...,positive,"[A, wonderful, little, production, ., The, fil..."
2,I thought this was a wonderful way to spend ti...,positive,"[I, thought, this, was, a, wonderful, way, to,..."
3,Basically there's a family where a little boy ...,negative,"[Basically, there, 's, a, family, where, a, li..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"[Petter, Mattei, 's, ``, Love, in, the, Time, ..."
...,...,...,...
49995,I thought this movie did a down right good job...,positive,"[I, thought, this, movie, did, a, down, right,..."
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,"[Bad, plot, ,, bad, dialogue, ,, bad, acting, ..."
49997,I am a Catholic taught in parochial elementary...,negative,"[I, am, a, Catholic, taught, in, parochial, el..."
49998,I'm going to have to disagree with the previou...,negative,"[I, 'm, going, to, have, to, disagree, with, t..."


## Remove punctuation

In [5]:
print(punctuation)
pattern = f"[{re.escape(punctuation)}]"

df['tokenized_review'] = df['tokenized_review'].apply(lambda words: [word for word in words if not re.search(pattern, word)])
df

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


Unnamed: 0,review,sentiment,tokenized_review
0,One of the other reviewers has mentioned that ...,positive,"[One, of, the, other, reviewers, has, mentione..."
1,A wonderful little production. The filming tec...,positive,"[A, wonderful, little, production, The, filmin..."
2,I thought this was a wonderful way to spend ti...,positive,"[I, thought, this, was, a, wonderful, way, to,..."
3,Basically there's a family where a little boy ...,negative,"[Basically, there, a, family, where, a, little..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"[Petter, Mattei, Love, in, the, Time, of, Mone..."
...,...,...,...
49995,I thought this movie did a down right good job...,positive,"[I, thought, this, movie, did, a, down, right,..."
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,"[Bad, plot, bad, dialogue, bad, acting, idioti..."
49997,I am a Catholic taught in parochial elementary...,negative,"[I, am, a, Catholic, taught, in, parochial, el..."
49998,I'm going to have to disagree with the previou...,negative,"[I, going, to, have, to, disagree, with, the, ..."


## Normalization

In [6]:
df['tokenized_review'] = df['tokenized_review'].apply(lambda words: [word.lower() for word in words])
df

Unnamed: 0,review,sentiment,tokenized_review
0,One of the other reviewers has mentioned that ...,positive,"[one, of, the, other, reviewers, has, mentione..."
1,A wonderful little production. The filming tec...,positive,"[a, wonderful, little, production, the, filmin..."
2,I thought this was a wonderful way to spend ti...,positive,"[i, thought, this, was, a, wonderful, way, to,..."
3,Basically there's a family where a little boy ...,negative,"[basically, there, a, family, where, a, little..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"[petter, mattei, love, in, the, time, of, mone..."
...,...,...,...
49995,I thought this movie did a down right good job...,positive,"[i, thought, this, movie, did, a, down, right,..."
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,"[bad, plot, bad, dialogue, bad, acting, idioti..."
49997,I am a Catholic taught in parochial elementary...,negative,"[i, am, a, catholic, taught, in, parochial, el..."
49998,I'm going to have to disagree with the previou...,negative,"[i, going, to, have, to, disagree, with, the, ..."


## Lemmatization

In [7]:
def lemmatize_review(tokens):
    return [wnl.lemmatize(word) for word in tokens]
    
nltk.download('omw-1.4')
nltk.download('wordnet')
wnl = WordNetLemmatizer()
df['lemmatized_review'] = df['tokenized_review'].apply(lemmatize_review)
df

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\zoika\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\zoika\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,review,sentiment,tokenized_review,lemmatized_review
0,One of the other reviewers has mentioned that ...,positive,"[one, of, the, other, reviewers, has, mentione...","[one, of, the, other, reviewer, ha, mentioned,..."
1,A wonderful little production. The filming tec...,positive,"[a, wonderful, little, production, the, filmin...","[a, wonderful, little, production, the, filmin..."
2,I thought this was a wonderful way to spend ti...,positive,"[i, thought, this, was, a, wonderful, way, to,...","[i, thought, this, wa, a, wonderful, way, to, ..."
3,Basically there's a family where a little boy ...,negative,"[basically, there, a, family, where, a, little...","[basically, there, a, family, where, a, little..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"[petter, mattei, love, in, the, time, of, mone...","[petter, mattei, love, in, the, time, of, mone..."
...,...,...,...,...
49995,I thought this movie did a down right good job...,positive,"[i, thought, this, movie, did, a, down, right,...","[i, thought, this, movie, did, a, down, right,..."
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,"[bad, plot, bad, dialogue, bad, acting, idioti...","[bad, plot, bad, dialogue, bad, acting, idioti..."
49997,I am a Catholic taught in parochial elementary...,negative,"[i, am, a, catholic, taught, in, parochial, el...","[i, am, a, catholic, taught, in, parochial, el..."
49998,I'm going to have to disagree with the previou...,negative,"[i, going, to, have, to, disagree, with, the, ...","[i, going, to, have, to, disagree, with, the, ..."


In [8]:
df = df.drop(columns=["tokenized_review", "review"])
df

Unnamed: 0,sentiment,lemmatized_review
0,positive,"[one, of, the, other, reviewer, ha, mentioned,..."
1,positive,"[a, wonderful, little, production, the, filmin..."
2,positive,"[i, thought, this, wa, a, wonderful, way, to, ..."
3,negative,"[basically, there, a, family, where, a, little..."
4,positive,"[petter, mattei, love, in, the, time, of, mone..."
...,...,...
49995,positive,"[i, thought, this, movie, did, a, down, right,..."
49996,negative,"[bad, plot, bad, dialogue, bad, acting, idioti..."
49997,negative,"[i, am, a, catholic, taught, in, parochial, el..."
49998,negative,"[i, going, to, have, to, disagree, with, the, ..."


## Remove stop words

In [9]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

df['lemmatized_review'] = df['lemmatized_review'].apply(lambda x: [word for word in x if word not in stop_words])
df

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zoika\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,sentiment,lemmatized_review
0,positive,"[one, reviewer, ha, mentioned, watching, 1, oz..."
1,positive,"[wonderful, little, production, filming, techn..."
2,positive,"[thought, wa, wonderful, way, spend, time, hot..."
3,negative,"[basically, family, little, boy, jake, think, ..."
4,positive,"[petter, mattei, love, time, money, visually, ..."
...,...,...
49995,positive,"[thought, movie, right, good, job, wa, creativ..."
49996,negative,"[bad, plot, bad, dialogue, bad, acting, idioti..."
49997,negative,"[catholic, taught, parochial, elementary, scho..."
49998,negative,"[going, disagree, previous, comment, side, mal..."


In [10]:
df['sentiment'] = df['sentiment'].apply(lambda x: 1 if x == "positive" else 0)
df

Unnamed: 0,sentiment,lemmatized_review
0,1,"[one, reviewer, ha, mentioned, watching, 1, oz..."
1,1,"[wonderful, little, production, filming, techn..."
2,1,"[thought, wa, wonderful, way, spend, time, hot..."
3,0,"[basically, family, little, boy, jake, think, ..."
4,1,"[petter, mattei, love, time, money, visually, ..."
...,...,...
49995,1,"[thought, movie, right, good, job, wa, creativ..."
49996,0,"[bad, plot, bad, dialogue, bad, acting, idioti..."
49997,0,"[catholic, taught, parochial, elementary, scho..."
49998,0,"[going, disagree, previous, comment, side, mal..."


In [11]:
df['lemmatized_review'] = df['lemmatized_review'].apply(lambda x: ' '.join(x))
df

Unnamed: 0,sentiment,lemmatized_review
0,1,one reviewer ha mentioned watching 1 oz episod...
1,1,wonderful little production filming technique ...
2,1,thought wa wonderful way spend time hot summer...
3,0,basically family little boy jake think zombie ...
4,1,petter mattei love time money visually stunnin...
...,...,...
49995,1,thought movie right good job wa creative origi...
49996,0,bad plot bad dialogue bad acting idiotic direc...
49997,0,catholic taught parochial elementary school nu...
49998,0,going disagree previous comment side maltin on...


In [12]:
df.to_csv("dataset_imdb_preprocessed.csv")