## Import libraries

In [1]:
import numpy as np
import pandas as pd 
import re
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import (Embedding, Bidirectional, LSTM, 
    Dense, Dropout, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Flatten, 
    Attention, MultiHeadAttention, Input, GRU, Concatenate)
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.metrics import Accuracy, Recall, Precision
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from tensorflow.keras.models import Model
from nltk import FreqDist
import pickle

In [None]:
# Load the reviews CSV into a DataFrame and preview the first rows.
df = pd.read_csv("./Data/IMDB Dataset.csv")
df.head()

In [None]:
# Print the column names, dtypes and non-null counts of the loaded data.
df.info()

In [None]:
# Keep only the review text and its label.
# The processing cell further down reads df['review_pt'] (and removes
# Portuguese stop words), so that is the column that has to survive here;
# selecting 'review_fr' dropped it and made the later lookup raise KeyError.
df = df[['review_pt', 'sentiment']]
df.head()

## Process data

In [None]:
def cleaning_texts(texts):
    """Normalize raw review strings so they can be split on whitespace.

    Each text is lower-cased; every character that is not an ASCII letter,
    one of the accented letters listed in the pattern, or an apostrophe is
    replaced by a space; runs of whitespace are then collapsed to a single
    space and leading/trailing whitespace is removed.

    Args:
        texts: Iterable of raw strings.

    Returns:
        List of cleaned strings, in input order.
    """
    # The allowed set only lists lower-case letters, so the text must be
    # lower-cased BEFORE filtering; otherwise 'É', 'Ç', ... are blanked out.
    # 'ã' and 'õ' are allowed because the pipeline removes Portuguese stop
    # words (e.g. "não"), which would otherwise be split into fragments.
    disallowed = re.compile(r"[^a-záéíóúüñàâäèêëîïôœùûçãõ']")
    whitespace_run = re.compile(r"\s+")

    clean_texts = []
    for text in texts:
        text = disallowed.sub(' ', text.lower())
        clean_texts.append(whitespace_run.sub(' ', text).strip())
    return clean_texts

In [None]:
def tokenize_texts(texts):
    """Split every text on whitespace.

    Args:
        texts: Iterable of strings.

    Returns:
        List with one list of tokens per input text, in input order.
    """
    return [sentence.split() for sentence in texts]

In [None]:
def remove_stopwords(texts):
    """Drop Portuguese stop words from already-tokenized texts.

    Args:
        texts: Iterable of token lists.

    Returns:
        List with one filtered token list per input text, in input order.
    """
    # The NLTK stop-word corpus is requested on every call.
    nltk.download("stopwords")
    portuguese_stop_words = set(stopwords.words("portuguese"))
    return [
        [token for token in tokens if token not in portuguese_stop_words]
        for tokens in texts
    ]

In [None]:
def vectorize_texts(texts, vocabulary):
    """Replace every token by its integer id from ``vocabulary``.

    Tokens that are not present in ``vocabulary`` are encoded as 0.

    Args:
        texts: Iterable of token lists.
        vocabulary: Mapping from token to integer id.

    Returns:
        List with one list of ids per input text, in input order.
    """
    return [
        [vocabulary[token] if token in vocabulary else 0 for token in tokens]
        for tokens in texts
    ]

In [None]:
# Run the preprocessing pipeline on the review column:
# raw strings -> cleaned strings -> token lists -> token lists without stop words.
texts = df['review_pt'].to_list()
clean_texts = cleaning_texts(texts)
tokenized_texts = tokenize_texts(clean_texts)
tokenized_texts = remove_stopwords(tokenized_texts)

## Word Cloud

In [None]:
# Flatten all token lists into one space-separated string and build the word cloud from it.
corpus = ' '.join(' '.join(text) for text in tokenized_texts)
wordcloud = WordCloud(width=800, height=400, background_color='black').generate(corpus)

In [None]:
# Render the word cloud as an image without axes.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

## Top words

In [None]:
# Count every token of the corpus and keep the 20 most frequent ones.
frequency = FreqDist(token for tokens in tokenized_texts for token in tokens)
top_20 = frequency.most_common(20)
# Unzip the (word, count) pairs into two parallel tuples for plotting.
words, freq = zip(*top_20)

In [None]:
# Horizontal bar chart of the 20 most frequent words.
plt.figure(figsize=(12, 6))
plt.barh(words, freq)
plt.xlabel('Frecuencia')
plt.ylabel('Palabras')
plt.title('Top 20 Palabras en el Conjunto de Textos')
# Flip the y axis so the first (most frequent) word is drawn at the top.
plt.gca().invert_yaxis() 
plt.show()

## Word2Vec Model

In [None]:
# Fit 50-dimensional Word2Vec embeddings on the tokenized reviews
# (context window of 10, words occurring fewer than 10 times are ignored)
# and save the model to disk.
# NOTE(review): gensim's constructor already trains when it receives a corpus,
# so the explicit train() call appears to run 20 further epochs on top of that
# — confirm this is intended rather than passing epochs=20 to the constructor.
model_word2vec = Word2Vec(tokenized_texts, vector_size=50, window=10, min_count=10, workers=4)
model_word2vec.train(tokenized_texts,total_examples=model_word2vec.corpus_count, epochs=20)
model_word2vec.save("./API/Models/Word2Vec_pt")

## Vectorize texts

In [None]:
# Reload the saved Word2Vec model and pickle its word -> index mapping to disk.
model_word2vec = Word2Vec.load("./API/Models/Word2Vec_pt")
vocabulary = model_word2vec.wv.key_to_index
with open("./API/vocabulary/vocabulary_pt", "wb") as file:
    pickle.dump(vocabulary, file)

In [None]:
# Sequence length fed to the networks and number of rows of the embedding table.
maxlen = 150
max_words = len(vocabulary)

# Encode tokens as vocabulary ids, pad/truncate every review to `maxlen` ids,
# and binarize the labels (1 for 'positive', 0 for anything else).
# NOTE(review): vectorize_texts encodes unknown words as 0 and pad_sequences
# fills with 0 by default; if `vocabulary` also assigns id 0 to a real word,
# the three become indistinguishable — confirm against the vocabulary.
vectorized_texts = vectorize_texts(tokenized_texts, vocabulary)
X = pad_sequences(vectorized_texts, maxlen=maxlen)
y = df['sentiment'].apply(lambda x : 1 if x == 'positive' else 0).to_numpy()

## Split dataset into training and test

In [None]:
# Hold out 20% of the samples as the test set; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Build embedding matrix

In [None]:
# Row `row` of the matrix receives the Word2Vec vector of the word whose
# vocabulary id is `row`; rows that are never assigned stay all-zero.
embedding_matrix = np.zeros((max_words, 50))
for token, row in vocabulary.items():
    if row >= max_words:
        # Id falls outside the embedding table.
        continue
    embedding_matrix[row] = model_word2vec.wv[token]

## Plot confusion matrix

In [None]:
def plot_confusion_matrix(y_true, y_pred):
    """Plot the confusion matrix of ``y_pred`` against ``y_true``.

    The matrix is drawn with the Blues colormap and every cell is annotated
    with its count. The figure is shown immediately; nothing is returned.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
    """
    matrix = confusion_matrix(y_true, y_pred)

    _, axes = plt.subplots(figsize=(3, 5))
    axes.matshow(matrix, cmap=plt.cm.Blues, alpha=0.8)
    # Matrix rows go on the y axis and columns on the x axis.
    for (row, col), count in np.ndenumerate(matrix):
        axes.text(x=col, y=row, s=count, va='center', ha='center', size='large')

    plt.xlabel('Predictions', fontsize=12)
    plt.ylabel('Actuals', fontsize=12)
    plt.title('Confusion Matrix', fontsize=12)
    plt.show()

## Bidirectional LSTM Model

In [None]:
# Two stacked bidirectional LSTM layers on top of the frozen pre-computed
# embedding matrix; the per-timestep outputs are flattened and passed to a
# dense head with a single sigmoid output.
model_lstm = Sequential([
    Input(shape=(maxlen,)),
    Embedding(
        input_dim=max_words,
        output_dim=50,
        weights=[embedding_matrix],
        trainable=False,  # embedding weights are not updated during training
    ),
    Bidirectional(LSTM(64, return_sequences=True)),
    Bidirectional(LSTM(64, return_sequences=True)),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Binary-classification setup: Adam optimizer, binary cross-entropy loss,
# with accuracy, recall and precision tracked as metrics.
model_lstm.compile(
    optimizer='adam', 
    loss='binary_crossentropy', 
    metrics=['accuracy', Recall(), Precision()]
)

# Print the layer-by-layer summary of the model.
model_lstm.summary()

In [None]:
# Write the model to disk only when the validation accuracy improves.
# NOTE(review): the file name carries an `_fr` suffix while the Word2Vec and
# vocabulary artifacts saved by this notebook use `_pt` — confirm the naming.
checkpoint = ModelCheckpoint(
    './API/Models/model_lstm_fr.h5', 
    monitor='val_accuracy',
    save_best_only=True,
    mode='max', 
    verbose=0
)  

# Train for 10 epochs, using 20% of the training data for validation.
model_lstm_history = model_lstm.fit(
    X_train, y_train, validation_split=0.2, 
    epochs=10, batch_size=128, callbacks=[checkpoint]
)

In [None]:
# Reload the checkpointed model and turn its sigmoid outputs into 0/1 labels:
# round the predictions, transpose them and take the first row so the result
# is a flat vector of integer labels.
# (The closing bracket of the final [0] index was missing, which made this
# cell a syntax error.)
model_lstm = load_model("./API/Models/model_lstm_fr.h5")
y_pred_lstm = np.round(model_lstm.predict(X_test).T).astype(int)[0]

In [None]:
# Confusion matrix of the LSTM predictions on the test set.
plot_confusion_matrix(y_test, y_pred_lstm)

In [None]:
# Per-class precision, recall and F1 of the LSTM predictions on the test set.
print(classification_report(y_test,y_pred_lstm))

## GRU Model

In [None]:
# Two stacked GRU layers on top of the frozen pre-computed embedding matrix;
# the per-timestep outputs are flattened and passed to a dense head with a
# single sigmoid output.
model_gru = Sequential([
    Input(shape=(maxlen,)),
    Embedding(
        input_dim=max_words,
        output_dim=50,
        weights=[embedding_matrix],
        trainable=False,  # embedding weights are not updated during training
    ),
    GRU(64, return_sequences=True),
    GRU(64, return_sequences=True),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Binary-classification setup: Adam optimizer, binary cross-entropy loss,
# with accuracy, recall and precision tracked as metrics.
model_gru.compile(
    optimizer='adam', 
    loss='binary_crossentropy', 
    metrics=['accuracy', Recall(), Precision()]
)

# Print the layer-by-layer summary of the model.
model_gru.summary()

In [None]:
# Write the model to disk only when the validation accuracy improves.
checkpoint = ModelCheckpoint(
    './API/Models/model_gru_fr.h5', 
    monitor='val_accuracy',
    save_best_only=True,
    mode='max', 
    verbose=0
)  

# Train for 10 epochs, using 20% of the training data for validation.
model_gru_history = model_gru.fit(
    X_train, y_train, validation_split=0.2, 
    epochs=10, batch_size=128, callbacks=[checkpoint]
)

In [None]:
# Reload the checkpointed model and turn its sigmoid outputs into 0/1 labels
# (round, transpose, take the first row to get a flat label vector).
model_gru = load_model("./API/Models/model_gru_fr.h5")
y_pred_gru = np.round(model_gru.predict(X_test).T).astype(int)[0]

In [None]:
# Confusion matrix of the GRU predictions on the test set.
plot_confusion_matrix(y_test, y_pred_gru)

In [None]:
# Per-class precision, recall and F1 of the GRU predictions on the test set.
print(classification_report(y_test,y_pred_gru))

## CNN Model

In [None]:
# Two Conv1D + max-pooling stages on top of the frozen pre-computed embedding
# matrix, followed by a flatten and a dense head with a single sigmoid output.
model_cnn = Sequential([
    Input(shape=(maxlen,)),
    Embedding(
        input_dim=max_words,
        output_dim=50,
        weights=[embedding_matrix],
        trainable=False,  # embedding weights are not updated during training
    ),
    Conv1D(256, kernel_size=8, activation='relu'),
    MaxPooling1D(pool_size=2),
    Conv1D(64, kernel_size=4, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Binary-classification setup: Adam optimizer, binary cross-entropy loss,
# with accuracy, recall and precision tracked as metrics.
model_cnn.compile(
    optimizer='adam', 
    loss='binary_crossentropy', 
    metrics=['accuracy', Recall(), Precision()]
)

# Print the layer-by-layer summary of the model.
model_cnn.summary()

In [None]:
# Write the model to disk only when the validation accuracy improves.
checkpoint = ModelCheckpoint(
    './API/Models/model_cnn_fr.h5', 
    monitor='val_accuracy',
    save_best_only=True,
    mode='max', 
    verbose=0
)  

# Train for 10 epochs, using 20% of the training data for validation.
model_cnn_history = model_cnn.fit(
    X_train, y_train, validation_split=0.2, 
    epochs=10, batch_size=128, callbacks=[checkpoint]
)

In [None]:
# Reload the checkpointed model and turn its sigmoid outputs into 0/1 labels
# (round, transpose, take the first row to get a flat label vector).
model_cnn = load_model("./API/Models/model_cnn_fr.h5")
y_pred_cnn = np.round(model_cnn.predict(X_test).T).astype(int)[0]

In [None]:
# Confusion matrix of the CNN predictions on the test set.
plot_confusion_matrix(y_test, y_pred_cnn)

In [None]:
# Per-class precision, recall and F1 of the CNN predictions on the test set.
print(classification_report(y_test,y_pred_cnn))

## Attention Model

In [None]:
# Functional model: frozen pre-computed embeddings, then two rounds of
# multi-head attention (each sequence attends to itself) followed by max
# pooling, and finally a dense head with a single sigmoid output.
inputs = Input(shape=(maxlen,))
embedded = Embedding(
    input_dim=max_words,
    output_dim=50,
    weights=[embedding_matrix],
    trainable=False,  # embedding weights are not updated during training
)(inputs)

attended_1 = MultiHeadAttention(num_heads=16, key_dim=16, value_dim=16)(embedded, embedded)
pooled_1 = MaxPooling1D(pool_size=2)(attended_1)

attended_2 = MultiHeadAttention(num_heads=8, key_dim=16, value_dim=16)(pooled_1, pooled_1)
pooled_2 = MaxPooling1D(pool_size=2)(attended_2)

flattened = Flatten()(pooled_2)
hidden = Dense(units=128, activation='relu')(flattened)
x = Dense(units=1, activation='sigmoid')(hidden)
model_attention = Model(inputs, x)

In [None]:
# Binary-classification setup: Adam optimizer, binary cross-entropy loss,
# with accuracy, recall and precision tracked as metrics.
model_attention.compile(
    optimizer='adam', 
    loss='binary_crossentropy', 
    metrics=['accuracy', Recall(), Precision()]
)
# Print the layer-by-layer summary of the model.
model_attention.summary()

In [None]:
# Write the model to disk only when the validation accuracy improves.
checkpoint = ModelCheckpoint(
    './API/Models/model_attention_fr.h5', 
    monitor='val_accuracy',
    save_best_only=True,
    mode='max', 
    verbose=0
)  

# Train for 10 epochs, using 20% of the training data for validation.
model_attention_history = model_attention.fit(
    X_train, y_train, validation_split=0.2, 
    epochs=10, batch_size=128, callbacks=[checkpoint]
)

In [None]:
# Reload the checkpointed model and turn its sigmoid outputs into 0/1 labels
# (round, transpose, take the first row to get a flat label vector).
model_attention = load_model("./API/Models/model_attention_fr.h5")
y_pred_attention = np.round(model_attention.predict(X_test).T).astype(int)[0]

In [None]:
# Confusion matrix of the attention model's predictions on the test set.
plot_confusion_matrix(y_test, y_pred_attention)

In [None]:
# Per-class precision, recall and F1 of the attention model on the test set.
print(classification_report(y_test,y_pred_attention))

## LSTM+Attention Model

In [None]:
# Functional model: two bidirectional LSTMs read the same frozen embeddings
# in parallel; the first attention layer uses lstm_1 as query and lstm_2 as
# value, the second lets that result attend to itself, and a dense head with
# a single sigmoid output produces the prediction.
inputs = Input(shape=(maxlen,))
embedded = Embedding(
    input_dim=max_words,
    output_dim=50,
    weights=[embedding_matrix],
    trainable=False,  # embedding weights are not updated during training
)(inputs)

lstm_1 = Bidirectional(LSTM(units=64, return_sequences=True))(embedded)
lstm_2 = Bidirectional(LSTM(units=64, return_sequences=True))(embedded)

cross_attended = MultiHeadAttention(num_heads=8, key_dim=16, value_dim=16)(lstm_1, lstm_2)
self_attended = MultiHeadAttention(num_heads=4, key_dim=16, value_dim=16)(cross_attended, cross_attended)

flattened = Flatten()(self_attended)
hidden = Dense(units=128, activation='relu')(flattened)
x = Dense(units=1, activation='sigmoid')(hidden)
model_lstm_attention = Model(inputs, x)

In [None]:
# Binary-classification setup: Adam optimizer, binary cross-entropy loss,
# with accuracy, recall and precision tracked as metrics.
model_lstm_attention.compile(
    optimizer='adam', 
    loss='binary_crossentropy', 
    metrics=['accuracy', Recall(), Precision()]
)
# Print the layer-by-layer summary of the model.
model_lstm_attention.summary()

In [None]:
# Write the model to disk only when the validation accuracy improves.
checkpoint = ModelCheckpoint(
    './API/Models/model_lstm_attention_fr.h5', 
    monitor='val_accuracy',
    save_best_only=True,
    mode='max', 
    verbose=0
)  

# Train for 10 epochs, using 20% of the training data for validation.
model_lstm_attention_history = model_lstm_attention.fit(
    X_train, y_train, validation_split=0.2, 
    epochs=10, batch_size=128, callbacks=[checkpoint]
)

In [None]:
# Reload the checkpointed model and turn its sigmoid outputs into 0/1 labels
# (round, transpose, take the first row to get a flat label vector).
model_lstm_attention = load_model("./API/Models/model_lstm_attention_fr.h5")
y_pred_lstm_attention = np.round(model_lstm_attention.predict(X_test).T).astype(int)[0]

In [None]:
# Confusion matrix of the LSTM+attention predictions on the test set.
plot_confusion_matrix(y_test, y_pred_lstm_attention)

In [None]:
# Per-class precision, recall and F1 of the LSTM+attention model on the test set.
print(classification_report(y_test,y_pred_lstm_attention))