In [51]:
# Step 0. Load libraries and custom functions
# Matrices and datasets ------------------------------------------------
import pandas as pd
import numpy as np
# Graphics -------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns
# Text processors
import re
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from wordcloud import WordCloud
# Machine Learning -----------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
# Deep Learning --------------------------------------------------------
import keras
import tensorflow as tf
from keras import layers
from keras.layers import TextVectorization
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Custom functions -----------------------------------------------------
def sentence_fixed_split(x:list, words: int):
    """
    Split a list of sentences into a list of fixed length sentences.
    param x: sentence as a list of words
    param words: number of fixed words required
    return: list of fixed length sentences
    """
    words_lenght = len(x.split(' '))
    if words_lenght>1 and words > 1 and words_lenght > words:
        return [' '.join(x.split(' ')[i:i+words]) for i in range(0, len(x.split(' ')), words)]
    else:
        return x

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alexismena/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Step 1. Load data
# 1.1 Read csv and get basic info
df_raw = pd.read_csv('../data/01_IMDB_Dataset_HuggingFace.csv')
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [3]:
# 1.2 Get a sample
df_raw.sample(10, random_state=2024)

Unnamed: 0,review,sentiment
16441,"Like most musicals of the era, one must check ...",negative
26819,I saw this show about 3-4 years ago. It was da...,positive
9242,"One of, if not THE most visually beautiful fil...",positive
16234,Although this lovely work of art does use some...,positive
19184,Don't waste your time or money on this one. Th...,negative
34763,"Twelve Monkeys is an insane time-travelling, a...",positive
36781,Just when you thought it was safe to go back i...,negative
9082,at first i thought it was bad because i had gr...,negative
39944,"I wouldn't exactly call this a good movie, in ...",negative
38247,"This movie is very bad. In fact, the only reas...",negative


In [4]:
# 1.3 Verify if there are duplicates
df_raw['review'].duplicated().sum()

418

In [5]:
# 1.4 Preprocess data based on observed information
df_interim = df_raw.copy()
df_interim = df_interim[~df_interim.review.duplicated()]
df_interim.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49582 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     49582 non-null  object
 1   sentiment  49582 non-null  object
dtypes: object(2)
memory usage: 1.1+ MB


In [6]:
# 1.5 Create the final dataframe
df_final = df_interim.copy()

In [None]:
# 1.6 Create your train, validation and test datasets
test_split = 0.3
val_split = 0.5
train_df, test_df = train_test_split(
    df_final,
    test_size=test_split,
    stratify=df_final['sentiment'].values,
    random_state=2024
)

val_df = test_df.sample(frac=val_split)
test_df = test_df.drop(val_df.index)
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [None]:
# Step 2. Create a basic analysis
# 2.1 Get basic info
train_df.info()

In [None]:
# 2.2 Get a summary of the data
train_df.describe(include='all')

In [None]:
# 2.3 Get words statistics
train_df['review'].apply(lambda x: len(x.split())).describe()

In [None]:
# 2.4 Show histogram of word count
train_df['review'].apply(lambda x: len(x.split(' '))).hist(bins=50)
plt.show()

In [None]:
# 2.5 Show histogram of word count stratified by sentiment
(train_df
 .assign(word_count = train_df['review'].apply(lambda x: len(x.split())))
 .hist('word_count', by='sentiment', grid=True, sharey=True, bins=30)
)
plt.show()

In [None]:
# 2.6 Show boxplot of word count stratified by sentiment
(train_df
 .assign(word_count = train_df['review'].apply(lambda x: len(x.split())))
 .boxplot('word_count', by='sentiment')
)
plt.show()

In [None]:
# 2.7 List total words in our vocabulary
train_df['review'].str.split(expand=True).stack().value_counts().count()

In [None]:
# 2.8 List most frequent words in reviews
train_df['review'].str.split(expand=True).stack().value_counts()[:20]

In [None]:
# 2.9 View the wordcloud of reviews without stopwords
text = ' '.join(i.lower() for i in train_df['review'])
wordcloud = WordCloud(max_words=100, 
                      stopwords=None, 
                      background_color='White', 
                      collocations=False).generate(text)
plt.figure(figsize=(12,12))
plt.imshow(wordcloud, interpolation='bilinear')
plt.show()


After analyzing the dataset, we need to create a processing pipeline that:
- removes stopwords
- removes html tags
- removes punctuations
- uses a compact vocabulary
- lowercases the words
- corrects mispelled words like soooo, muuuuch, etc (more than 3 vowels)
- encode sentiment into 1 or 0 

For our traning task, we can split longest reviews into segments or just 
truncate the review up to 500 words and eliminate the rest, but it 
results in losing some information, so we will try both aproaches

In [None]:
# Step 3. Train and execute model iter 1.
# Create a copy
train_ds    = train_df.copy()
val_ds      = val_df.copy()
test_ds     = test_df.copy()
# Encode sentiment response variable
train_ds['sentiment']   = train_ds['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
val_ds['sentiment']     = val_ds['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
test_ds['sentiment']    = test_ds['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
# Tokenize words on the dataset
tokenizer = Tokenizer(num_words=20000, oov_token='<OOV>')
tokenizer.fit_on_texts(train_ds['review'])
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(train_ds['review'])
padded_sequences_train = pad_sequences(sequences, maxlen=200, truncating='post')
sequences = tokenizer.texts_to_sequences(val_ds['review'])
padded_sequences_val = pad_sequences(sequences, maxlen=200, truncating='post')
sequences = tokenizer.texts_to_sequences(test_ds['review'])
padded_sequences_test = pad_sequences(sequences, maxlen=200, truncating='post')
# Create model
inputs = keras.Input(shape=(200,))
x = layers.Dense(26, activation='relu')(inputs)
x = layers.Dropout(0.4)(x)
outputs = layers.Dense(1, activation='sigmoid')(x)
model = keras.Model(inputs, outputs)
model.compile(optimizer='rmsprop',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
model.summary()
model.fit(padded_sequences_train, train_ds['sentiment'], epochs=10, batch_size=32, validation_data=(padded_sequences_val, val_ds['sentiment']))

In [9]:
# Train and execute model iter 2
# Recode data and format text
df_final['sentiment'] = df_final['sentiment'].apply(lambda x: 1 if x=='positive' else 0)
df_final['review'] = df_final['review'].apply(lambda x: x.lower())
df_final['review'] = df_final['review'].str.replace(r'(<.*?>)','',regex=True)
df_final['review'] = df_final['review'].str.strip()
# Preprocess the data
tokenizer = Tokenizer(num_words=20000,oov_token='<OOV>')
tokenizer.fit_on_texts(df_final['review'])
vocab = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(df_final['review'])
X = pad_sequences(sequences, maxlen=200, truncating='post')

#Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    df_final['sentiment'], 
                                                    test_size=0.2, 
                                                    random_state=2024)
# Build the model
inputs = keras.Input(shape=(200,))
x = layers.Dense(64, activation='relu')(inputs)
x = layers.Dropout(0.3)(x)
x = layers.Dense(64, activation='relu')(x)
x = layers.Dropout(0.2)(x)
x = layers.Dense(64, activation='relu')(x)
outputs = layers.Dense(1, activation='sigmoid')(x)

model = keras.Model(inputs=inputs, outputs=outputs, name='sentiment_model')
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(X_train, y_train, validation_split=0.1, epochs=10, batch_size=32)



Epoch 1/10


2024-02-23 13:03:34.664130: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x2d90b5e10>

In [10]:
y_pred = model.predict(X_test).round()
accuracy = accuracy_score(y_test, y_pred)
print(f'Test accuracy: {accuracy:.4f}')

Test accuracy: 1.0000


In [47]:
mudf = pd.DataFrame({'id':[1], 'textio':['This night, it was... fantastic!, so you now; or was it?']})
mudf['textio'] = mudf['textio'].str.replace(r'''['!,.;]''', '', regex=True)
mudf['textio'] = mudf['textio'].str.replace(r'[?]', ' question_mark ', regex=True)
print(mudf['textio'].to_list())

['This night it was fantastic so you now or was it question_mark ']


In [54]:
# Iter 3
# Read dataset
sw = stopwords.words('english')
df_train = pd.read_csv('../data/01_IMDB_Train.csv')
df_val = pd.read_csv('../data/01_IMDB_Valid.csv')
df_test = pd.read_csv('../data/01_IMDB_Test.csv')
df_train.info()
# Clean data
def clean_text_fn(dataframe):
    dataframe['text'] = dataframe['text'].apply(lambda x: x.lower())
    dataframe['text'] = dataframe['text'].str.replace(r'(<.*?>)','',regex=True)
    dataframe['text'] = dataframe['text'].str.replace(r'''['!,.;]''', '', regex=True)
    dataframe['text'] = dataframe['text'].str.replace(r'[?]', ' question_mark ', regex=True)
    dataframe['text'] = dataframe['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (sw)]))
    dataframe['text'] = dataframe['text'].str.strip()
    return dataframe
train_ds = clean_text_fn(df_train)
val_ds = clean_text_fn(df_val)
test_ds = clean_text_fn(df_test)
# Tokenize data
countvectorizer = CountVectorizer()
countvectorizer.fit(train_ds['text'])
X_train = countvectorizer.transform(train_ds['text'])
X_val = countvectorizer.transform(val_ds['text'])
X_test = countvectorizer.transform(test_ds['text'])

# tokenizer = Tokenizer(num_words=20000, oov_token='<OOV>')
# tokenizer.fit_on_texts(train_ds['text'])
# word_index = tokenizer.word_index
# def tokenize_and_pad(dataframe):
#     sequences = tokenizer.texts_to_sequences(dataframe['text'])
#     padded_sequences = pad_sequences(sequences, maxlen=200, padding='post')
#     return padded_sequences
# X_train = tokenize_and_pad(train_ds)
# X_val = tokenize_and_pad(val_ds)
# X_test = tokenize_and_pad(test_ds)


y_train = train_ds['label']
y_val = val_ds['label']
y_test = test_ds['label']
# Build model
inputs = keras.Input(shape=(X_train.shape[1],))
x = layers.Dense(64, activation='relu')(inputs)
x = layers.Dropout(0.3)(x)
x = layers.Dense(64, activation='relu')(x)
x = layers.Dropout(0.2)(x)
x = layers.Dense(64, activation='relu')(x)
outputs = layers.Dense(1, activation='sigmoid')(x)

model = keras.Model(inputs=inputs, outputs=outputs, name='sentiment_model')
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(X_train, y_train, validation_data=(X_val,y_val), epochs=10, batch_size=32, validation_batch_size=32)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    40000 non-null  object
 1   label   40000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 625.1+ KB
Epoch 1/10


2024-02-23 17:00:56.193188: W tensorflow/core/framework/op_kernel.cc:1827] INVALID_ARGUMENT: TypeError: 'SparseTensor' object is not subscriptable
Traceback (most recent call last):

  File "/Users/alexismena/Documents/Deep_Learning_Projects/TensorFlow_Basics/.venv/lib/python3.10/site-packages/tensorflow/python/ops/script_ops.py", line 268, in __call__
    return func(device, token, args)

  File "/Users/alexismena/Documents/Deep_Learning_Projects/TensorFlow_Basics/.venv/lib/python3.10/site-packages/tensorflow/python/ops/script_ops.py", line 146, in __call__
    outputs = self._call(device, args)

  File "/Users/alexismena/Documents/Deep_Learning_Projects/TensorFlow_Basics/.venv/lib/python3.10/site-packages/tensorflow/python/ops/script_ops.py", line 153, in _call
    ret = self._func(*args)

  File "/Users/alexismena/Documents/Deep_Learning_Projects/TensorFlow_Basics/.venv/lib/python3.10/site-packages/tensorflow/python/autograph/impl/api.py", line 643, in wrapper
    return func(*args,

InvalidArgumentError: Graph execution error:

Detected at node EagerPyFunc defined at (most recent call last):
<stack traces unavailable>
Detected at node EagerPyFunc defined at (most recent call last):
<stack traces unavailable>
2 root error(s) found.
  (0) INVALID_ARGUMENT:  TypeError: 'SparseTensor' object is not subscriptable
Traceback (most recent call last):

  File "/Users/alexismena/Documents/Deep_Learning_Projects/TensorFlow_Basics/.venv/lib/python3.10/site-packages/tensorflow/python/ops/script_ops.py", line 268, in __call__
    return func(device, token, args)

  File "/Users/alexismena/Documents/Deep_Learning_Projects/TensorFlow_Basics/.venv/lib/python3.10/site-packages/tensorflow/python/ops/script_ops.py", line 146, in __call__
    outputs = self._call(device, args)

  File "/Users/alexismena/Documents/Deep_Learning_Projects/TensorFlow_Basics/.venv/lib/python3.10/site-packages/tensorflow/python/ops/script_ops.py", line 153, in _call
    ret = self._func(*args)

  File "/Users/alexismena/Documents/Deep_Learning_Projects/TensorFlow_Basics/.venv/lib/python3.10/site-packages/tensorflow/python/autograph/impl/api.py", line 643, in wrapper
    return func(*args, **kwargs)

  File "/Users/alexismena/Documents/Deep_Learning_Projects/TensorFlow_Basics/.venv/lib/python3.10/site-packages/keras/src/engine/data_adapter.py", line 519, in py_method
    return [slice_array(inp) for inp in flat_inputs]

  File "/Users/alexismena/Documents/Deep_Learning_Projects/TensorFlow_Basics/.venv/lib/python3.10/site-packages/keras/src/engine/data_adapter.py", line 519, in <listcomp>
    return [slice_array(inp) for inp in flat_inputs]

  File "/Users/alexismena/Documents/Deep_Learning_Projects/TensorFlow_Basics/.venv/lib/python3.10/site-packages/keras/src/engine/data_adapter.py", line 515, in slice_array
    return training_utils.slice_arrays(

  File "/Users/alexismena/Documents/Deep_Learning_Projects/TensorFlow_Basics/.venv/lib/python3.10/site-packages/keras/src/engine/training_utils.py", line 47, in slice_arrays
    entries = [[x[i : i + 1] for i in indices] for x in arrays]

  File "/Users/alexismena/Documents/Deep_Learning_Projects/TensorFlow_Basics/.venv/lib/python3.10/site-packages/keras/src/engine/training_utils.py", line 47, in <listcomp>
    entries = [[x[i : i + 1] for i in indices] for x in arrays]

  File "/Users/alexismena/Documents/Deep_Learning_Projects/TensorFlow_Basics/.venv/lib/python3.10/site-packages/keras/src/engine/training_utils.py", line 47, in <listcomp>
    entries = [[x[i : i + 1] for i in indices] for x in arrays]

TypeError: 'SparseTensor' object is not subscriptable


	 [[{{node EagerPyFunc}}]]
	 [[IteratorGetNext]]
	 [[IteratorGetNext/_8]]
  (1) INVALID_ARGUMENT:  TypeError: 'SparseTensor' object is not subscriptable
Traceback (most recent call last):

  File "/Users/alexismena/Documents/Deep_Learning_Projects/TensorFlow_Basics/.venv/lib/python3.10/site-packages/tensorflow/python/ops/script_ops.py", line 268, in __call__
    return func(device, token, args)

  File "/Users/alexismena/Documents/Deep_Learning_Projects/TensorFlow_Basics/.venv/lib/python3.10/site-packages/tensorflow/python/ops/script_ops.py", line 146, in __call__
    outputs = self._call(device, args)

  File "/Users/alexismena/Documents/Deep_Learning_Projects/TensorFlow_Basics/.venv/lib/python3.10/site-packages/tensorflow/python/ops/script_ops.py", line 153, in _call
    ret = self._func(*args)

  File "/Users/alexismena/Documents/Deep_Learning_Projects/TensorFlow_Basics/.venv/lib/python3.10/site-packages/tensorflow/python/autograph/impl/api.py", line 643, in wrapper
    return func(*args, **kwargs)

  File "/Users/alexismena/Documents/Deep_Learning_Projects/TensorFlow_Basics/.venv/lib/python3.10/site-packages/keras/src/engine/data_adapter.py", line 519, in py_method
    return [slice_array(inp) for inp in flat_inputs]

  File "/Users/alexismena/Documents/Deep_Learning_Projects/TensorFlow_Basics/.venv/lib/python3.10/site-packages/keras/src/engine/data_adapter.py", line 519, in <listcomp>
    return [slice_array(inp) for inp in flat_inputs]

  File "/Users/alexismena/Documents/Deep_Learning_Projects/TensorFlow_Basics/.venv/lib/python3.10/site-packages/keras/src/engine/data_adapter.py", line 515, in slice_array
    return training_utils.slice_arrays(

  File "/Users/alexismena/Documents/Deep_Learning_Projects/TensorFlow_Basics/.venv/lib/python3.10/site-packages/keras/src/engine/training_utils.py", line 47, in slice_arrays
    entries = [[x[i : i + 1] for i in indices] for x in arrays]

  File "/Users/alexismena/Documents/Deep_Learning_Projects/TensorFlow_Basics/.venv/lib/python3.10/site-packages/keras/src/engine/training_utils.py", line 47, in <listcomp>
    entries = [[x[i : i + 1] for i in indices] for x in arrays]

  File "/Users/alexismena/Documents/Deep_Learning_Projects/TensorFlow_Basics/.venv/lib/python3.10/site-packages/keras/src/engine/training_utils.py", line 47, in <listcomp>
    entries = [[x[i : i + 1] for i in indices] for x in arrays]

TypeError: 'SparseTensor' object is not subscriptable


	 [[{{node EagerPyFunc}}]]
	 [[IteratorGetNext]]
0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_227791]

In [58]:
# Iter 4
# Read dataset
sw = stopwords.words('english')
df_train = pd.read_csv('../data/01_IMDB_Train.csv')
df_val = pd.read_csv('../data/01_IMDB_Valid.csv')
df_test = pd.read_csv('../data/01_IMDB_Test.csv')
df_train.info()

# Clean data
def clean_text_fn(dataframe):
    dataframe['text'] = dataframe['text'].apply(lambda x: x.lower())
    dataframe['text'] = dataframe['text'].str.replace(r'(<.*?>)','',regex=True)
    dataframe['text'] = dataframe['text'].str.replace(r'''['!,.;]''', '', regex=True)
    dataframe['text'] = dataframe['text'].str.replace(r'[?]', ' question_mark ', regex=True)
    dataframe['text'] = dataframe['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (sw)]))
    dataframe['text'] = dataframe['text'].str.strip()
    return dataframe
train_ds = clean_text_fn(df_train)
val_ds = clean_text_fn(df_val)
test_ds = clean_text_fn(df_test)

# Tokenize, pad
tokenizer = Tokenizer()

Unnamed: 0,text,label
0,grew (b 1965) watching loving thunderbirds mat...,0
1,put movie dvd player sat coke chips expectatio...,0
2,people know particular time past like feel nee...,0
3,even though great interest biblical movies bor...,0
4,im die hard dads army fan nothing ever change ...,1
...,...,...
39995,"""western union"" something forgotten classic we...",1
39996,movie incredible piece work explores every noo...,1
39997,wife watched movie plan visit sicily stromboli...,0
39998,first watched flatliners amazed necessary feat...,1


In [59]:
# Iter 5: Damn, it's been tedious this...
# Read dataset
sw = stopwords.words('english')
df_train = pd.read_csv('../data/01_IMDB_Train.csv')
df_val = pd.read_csv('../data/01_IMDB_Valid.csv')
df_test = pd.read_csv('../data/01_IMDB_Test.csv')
df_train.info()

# Create tensorflow dataset
def make_data(dataset):
    return tf.data.Dataset.from_tensor_slices(
        (
            {'text':dataset['text']},
            {'label':dataset['label']}
        )
    )
train = make_data(df_train).batch(32)
val = make_data(df_val).batch(32)
test = make_data(df_test).batch(32)

# Vectorize datat
def custom_standardization_fn(string_tensor):
    stripped_string = tf.strings.strip(string_tensor)
    lowercase_string = tf.strings.lower(stripped_string)
    tagless_string = tf.strings.regex_replace(lowercase_string, '<.*?>', '')
    single_space_string = tf.strings.regex_replace(tagless_string, '\s+',' ')
    comma_space_string = tf.strings.regex_replace(single_space_string, '\s,\s',', ')
    simple_vowel_a_string = tf.strings.regex_replace(comma_space_string, 'a{3,}', 'a')
    simple_vowel_e_string = tf.strings.regex_replace(simple_vowel_a_string, 'e{3,}', 'e')
    simple_vowel_i_string = tf.strings.regex_replace(simple_vowel_e_string, 'i{3, }', 'i')
    simple_vowel_o_string = tf.strings.regex_replace(simple_vowel_i_string, 'o{3, }', 'o')
    simple_vowel_u_string = tf.strings.regex_replace(simple_vowel_o_string, 'u{3, }', 'u')
    stripped_string_again = tf.strings.strip(simple_vowel_u_string)
    return tf.strings.regex_replace(stripped_string_again,
                                    f"[{re.escape(string.punctuation)}]",'')

def create_vectorize_layer(train, feature):
    vectorize_layer = TextVectorization(
        standardize=custom_standardization_fn,
        max_tokens=10000,
        output_mode='int',
        output_sequence_length=500
    )
    vectorize_layer.adapt(train.map(lambda x, y: x[feature]))
    return vectorize_layer

vectorize_text = create_vectorize_layer(train, 'text')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    40000 non-null  object
 1   label   40000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 625.1+ KB


NameError: name 'train_df' is not defined

In [28]:
train_ds

Unnamed: 0,text,label
0,grew (b. 1965) watching loving thunderbirds. m...,0
1,"put movie dvd player, sat coke chips, expectat...",0
2,people know particular time past like feel nee...,0
3,"even though great interest biblical movies, bo...",0
4,im die hard dads army fan nothing ever change ...,1
...,...,...
39995,"""western union"" something forgotten classic we...",1
39996,movie incredible piece work. explores every no...,1
39997,wife watched movie plan visit sicily stromboli...,0
39998,"first watched flatliners, amazed. necessary fe...",1


In [None]:
# 1.3 Preprocess data in order to avoid html tags and show result
# df_interim = df_raw.copy()
# # Get original lenght of words
# df_interim['original_len'] = df_interim['review'].apply(lambda x: len(x.split(' '))) 
# # Supress html tags
# df_interim['user_review'] = df_interim['review'].str.replace(r'(<.*?>)','',regex=True)
# # In case of many white spaces, replace with only one white space
# df_interim['user_review'] = df_interim['user_review'].str.replace(r'\s+',' ',regex=True)
# # In case of a space followed by a comma, replace with a comma followed by a space
# df_interim['user_review'] = df_interim['user_review'].str.replace(r'\s,\s',', ',regex=True)
# # Replace backslashes
# df_interim['user_review'] = df_interim['user_review'].str.replace(r'\\','',regex=True)
# # In case of three or more consecutive letters, replace with only two or less consecutive letters
# df_interim['user_review'] = df_interim['user_review'].str.replace(r'([a-zA-Z])\1{2,}', r'\1', regex=True)
# # Strip white spaces at the beginning and at the end of the review
# df_interim['user_review'] = df_interim['user_review'].str.strip()
# # Convert labels into integers
# df_interim['label'] = df_interim['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
# # Drop duplicates
# df_interim = df_interim.drop_duplicates()
# # There's a particular repeated row
# df_interim = df_interim.drop([44855],axis=0)
# # Apply custom function to split long sentences into more 
# df_interim['reviews'] = df_interim['user_review'].apply(lambda x: sentence_fixed_split(x,1000))
# df_interim = df_interim.explode('reviews')

In [None]:
# Step 3. Create the deep learning model 
# 3.1 Prepare data to encode dependent variable as a numeric
train_df['sentiment'] = train_df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
val_df['sentiment'] = val_df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
test_df['sentiment'] = test_df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)
train_ds = tf.data.Dataset.from_tensor_slices((
    tf.cast(train_df['review'], tf.string),
    tf.cast(train_df['sentiment'], tf.int32)
))

val_ds = tf.data.Dataset.from_tensor_slices((
    tf.cast(val_df['review'], tf.string),
    tf.cast(val_df['sentiment'], tf.int32)
))

test_ds = tf.data.Dataset.from_tensor_slices((
    tf.cast(test_df['review'], tf.string),
    tf.cast(test_df['sentiment'], tf.int32)
))


In [None]:
# 3.2 Create custom functions for strings on the model pipeline
def custom_standardization_fn(string_tensor):
    stripped_string = tf.strings.strip(string_tensor)
    lowercase_string = tf.strings.lower(stripped_string)
    tagless_string = tf.strings.regex_replace(lowercase_string, '<.*?>', '')
    single_space_string = tf.strings.regex_replace(tagless_string, '\s+',' ')
    comma_space_string = tf.strings.regex_replace(single_space_string, '\s,\s',', ')
    simple_vowel_a_string = tf.strings.regex_replace(comma_space_string, 'a{3,}', 'a')
    simple_vowel_e_string = tf.strings.regex_replace(simple_vowel_a_string, 'e{3,}', 'e')
    simple_vowel_i_string = tf.strings.regex_replace(simple_vowel_e_string, 'i{3, }', 'i')
    simple_vowel_o_string = tf.strings.regex_replace(simple_vowel_i_string, 'o{3, }', 'o')
    simple_vowel_u_string = tf.strings.regex_replace(simple_vowel_o_string, 'u{3, }', 'u')
    stripped_string_again = tf.strings.strip(simple_vowel_u_string)
    return tf.strings.regex_replace(stripped_string_again,
                                    f"[{re.escape(string.punctuation)}]",'')
    
def custom_split_fn(string_tensor):
    return tf.strings.split(string_tensor)

In [None]:
# 3.3 Create the text vectorizer and apply to the data splits
MAX_VOCAB = 20000
MAX_LENGTH = 500
BATCH_SIZE = 32

text_vectorizer = TextVectorization(
    max_tokens=MAX_VOCAB,
    standardize=custom_standardization_fn,
    split=custom_split_fn,
    output_mode='multi_hot'
)

text_vectorizer.adapt(train_df.review.values)
X_train = text_vectorizer(train_df.review.values)
X_val = text_vectorizer(val_df.review.values)
X_test = text_vectorizer(test_df.review.values)
y_train = tf.convert_to_tensor(train_df.sentiment.values)
y_val = tf.convert_to_tensor(val_df.sentiment.values)
y_test = tf.convert_to_tensor(test_df.sentiment.values)


In [None]:
vocab = text_vectorizer.get_vocabulary()

In [None]:
def get_model(max_tokens=MAX_VOCAB, hidden_dim=16):
    inputs = keras.Input(shape=(max_tokens,))
    x = layers.Dense(hidden_dim, activation='relu')(inputs)
    x = layers.Dropout(0.4)(x)
    outputs = layers.Dense(1, activation='sigmoid')(x)
    model = keras.Model(inputs, outputs)
    model.compile(optimizer='rmsprop',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

In [None]:
model = get_model()
model.summary()

In [None]:
model.fit(x=X_train,
          y=y_train,
          epochs=10)

In [None]:
from tensorflow.keras.layers import Input, Embedding, Dense, GlobalAveragePooling1D
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split

# Load the IMDB dataset from CSV
data_path = '../data/01_IMDB_Dataset_HuggingFace.csv'  # Replace with the actual path
df = pd.read_csv(data_path)

# Preprocess the data
max_length = 256
X = df['review'].values
y = df['sentiment'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenize and pad sequences
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, maxlen=max_length)
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, maxlen=max_length)

# Define the model architecture
input_layer = Input(shape=(max_length,))
embedding_layer = Embedding(input_dim=10000, output_dim=16)(input_layer)
average_pooling_layer = GlobalAveragePooling1D()(embedding_layer)
dense_layer = Dense(16, activation='relu')(average_pooling_layer)
output_layer = Dense(1, activation='sigmoid')(dense_layer)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate on test data
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test accuracy: {test_accuracy:.4f}")
