In [10]:
# Step 0. Load libraries and custom functions
# Matrices and datasets ------------------------------------------------
import pandas as pd
import numpy as np
# Graphics -------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns
# Machine Learning -----------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import confusion_matrix
# Deep Learning --------------------------------------------------------
import keras
from keras import layers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [11]:
# Step 1. Load data
# 1.1 Read csv and get basic info
df_raw = pd.read_csv('../data/01_IMDB_Dataset_HuggingFace.csv')
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [12]:
# 1.2 Get a sample
df_raw.sample(10)

Unnamed: 0,review,sentiment
7996,"This film is an impressionistic, poetic take o...",positive
20155,While essentially a remake of the original Chi...,positive
24954,Dr. McCoy and Mr. Spock find themselves trappe...,positive
48048,Neatly skipping over everything from the coup ...,positive
3397,Has the proliferation of relatively high quali...,positive
18955,First I must say that I enjoyed the first Unde...,negative
1549,"""Ah Ritchie's made another gangster film with ...",positive
27010,I found Code 46 very disappointing. I thought ...,negative
22028,I just saw this film at the 2001 Toronto inter...,positive
12982,"America's next top model is a good show, it he...",positive


In [13]:
# 1.3 Verify if there are duplicates
df_raw['review'].value_counts()

review
Loved today's show!!! It was a variety and not solely cooking (which would have been great too). Very stimulating and captivating, always keeping the viewer peeking around the corner to see what was coming up next. She is as down to earth and as personable as you get, like one of us which made the show all the more enjoyable. Special guests, who are friends as well made for a nice surprise too. Loved the 'first' theme and that the audience was invited to play along too. I must admit I was shocked to see her come in under her time limits on a few things, but she did it and by golly I'll be writing those recipes down. Saving time in the kitchen means more time with family. Those who haven't tuned in yet, find out what channel and the time, I assure you that you won't be disappointed.                                                                                                                                                                                                         

In [14]:
# 1.3 Preprocess data in order to avoid html tags and show result
df_interim = df_raw.copy()
df_interim['user_review'] = df_interim['review'].str.replace(r'(<.*?>)','',regex=True)
df_interim['user_review'] = df_interim['user_review'].str.replace(r'\s+',' ',regex=True)
df_interim['user_review'] = df_interim['user_review'].str.replace(r'\s,\s',', ',regex=True)
df_interim['label'] = df_interim['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
df_interim = df_interim.drop_duplicates()
df_interim = df_interim.drop([44855],axis=0)
df_interim

Unnamed: 0,review,sentiment,user_review,label
0,One of the other reviewers has mentioned that ...,positive,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,positive,A wonderful little production. The filming tec...,1
2,I thought this was a wonderful way to spend ti...,positive,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,negative,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...,...,...
49995,I thought this movie did a down right good job...,positive,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,negative,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,negative,I'm going to have to disagree with the previou...,0


In [15]:
df = df_interim.drop(['review','sentiment'], axis=1).copy()
df

Unnamed: 0,user_review,label
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. The filming tec...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0


In [16]:
# Step 3. Create a basic data analysis
# 3.1 Describe data
df.describe(include='all')

Unnamed: 0,user_review,label
count,49581,49581.0
unique,49581,
top,One of the other reviewers has mentioned that ...,
freq,1,
mean,,0.501886
std,,0.500001
min,,0.0
25%,,0.0
50%,,1.0
75%,,1.0


In [19]:
# 3.2 Preprocess text data
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(df['user_review'])
sequences = tokenizer.texts_to_sequences(df['user_review'])
X = pad_sequences(sequences, maxlen=200)