In [1]:
# Step 0. Load libraries and custom functions
# Matrices and datasets ------------------------------------------------
import pandas as pd
import numpy as np
# Graphics -------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns
# Machine Learning -----------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import confusion_matrix
# Deep Learning --------------------------------------------------------
import keras
from keras import layers
from keras.layers import TextVectorization
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Custom functions -----------------------------------------------------
def sentence_fixed_split(x: str, words: int):
    """
    Split a sentence into chunks of at most `words` words.

    The sentence is tokenised on single spaces (' ') only, so consecutive
    spaces yield empty tokens that still count as words.

    param x: sentence as a single string (it is split on ' ' internally)
    param words: maximum number of words per chunk
    return: list of space-joined chunks when `words` > 1 and the sentence has
        more than `words` words (the last chunk may be shorter); otherwise
        the original string `x`, unchanged
    """
    # Tokenise once and reuse the result instead of re-splitting on every use.
    tokens = x.split(' ')
    # `words > 1` also keeps the range() step below strictly positive.
    if words > 1 and len(tokens) > words:
        return [' '.join(tokens[i:i + words]) for i in range(0, len(tokens), words)]
    # Short sentences (and any words <= 1) are passed through as a plain string.
    return x

In [None]:
# Step 1. Load data
# 1.1 Read the reviews csv and print the column / dtype / non-null summary
raw_csv_path = '../data/01_IMDB_Dataset_HuggingFace.csv'
df_raw = pd.read_csv(raw_csv_path)
df_raw.info()

In [None]:
# 1.2 Look at ten randomly chosen rows of the raw data
df_raw.sample(n=10)

In [None]:
# 1.3 Verify if there are duplicates
# Count how many times each review text occurs; any count above 1 is a
# repeated review.
df_raw['review'].value_counts()

In [None]:
# 1.4 Clean the review text (html tags, spacing, repeated letters) and split long reviews
df_interim = df_raw.copy()
# Word count of the untouched review, splitting on single spaces
df_interim['original_len'] = df_interim['review'].map(lambda text: len(text.split(' ')))
# Regex substitutions, applied to the raw review in exactly this order
cleanup_rules = [
    (r'(<.*?>)', ''),               # suppress html tags
    (r'\s+', ' '),                  # collapse any run of whitespace into one space
    (r'\s,\s', ', '),               # whitespace-comma-whitespace becomes comma-space
    (r'\\', ''),                    # remove backslashes
    (r'([a-zA-Z])\1{2,}', r'\1'),   # three or more repeats of a letter become one letter
]
cleaned_review = df_interim['review']
for pattern, replacement in cleanup_rules:
    cleaned_review = cleaned_review.str.replace(pattern, replacement, regex=True)
# Trim whitespace at the beginning and at the end of the review
df_interim['user_review'] = cleaned_review.str.strip()
# Integer label: 1 for 'positive', 0 for anything else
df_interim['label'] = df_interim['sentiment'].map(lambda sentiment: int(sentiment == 'positive'))
# Drop fully duplicated rows, then the row with index label 44855, which is
# a particular repeated row that drop_duplicates does not remove
df_interim = df_interim.drop_duplicates().drop(index=[44855])
# Split reviews longer than 1000 words into chunks, then give each chunk its own row
df_interim['reviews'] = df_interim['user_review'].apply(sentence_fixed_split, words=1000)
df_interim = df_interim.explode('reviews')

In [None]:
# Working frame: drop the raw review, the text sentiment and the unsplit
# cleaned review from df_interim
df = df_interim.drop(columns=['review', 'sentiment', 'user_review']).copy()
df

In [None]:
# Step 3. Create a basic data analysis
# 3.1 Describe data
# include='all' asks for a summary of every column, not only the numeric ones
df.describe(include='all')

In [None]:
# 3.2 Get statistics based on the length
# Word count of each row's text, splitting on single spaces
df['review_len'] = [len(text.split(' ')) for text in df['reviews']]
df['review_len'].describe()

In [None]:
# 3.3 Plot review length histogram
fig, ax = plt.subplots()
df['review_len'].hist(ax=ax)
plt.show()

In [None]:
# 3.4 Preprocess text data
# Tokenise the 'reviews' column (the fixed-length chunks produced by explode).
# 'user_review' is not a column of df: it is dropped when df is built from
# df_interim, so indexing it here fails with a KeyError. 'reviews' has exactly
# one entry per row of df, which keeps X aligned with df['label'].
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(df['reviews'])
sequences = tokenizer.texts_to_sequences(df['reviews'])
# Bring every sequence to a common length of 200 token ids
X = pad_sequences(sequences, maxlen=200)

In [None]:
# Step 4. Create model and evaluate
# 4.1 Hold out 20% of the rows for validation (fixed seed for repeatability)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    df['label'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Show the dimensions of the training split returned by train_test_split
X_train.shape

In [None]:
# 4.2 Build the model
# NOTE(review): keras.Input() is called without a shape. X is padded with
# maxlen=200 earlier, so presumably shape=(200,) is intended — confirm.
# NOTE(review): 'imputs' looks like a typo for 'inputs'; left unchanged in
# case later cells reference this name.
imputs = keras.Input()