
<b>INSTITUTO FEDERAL DE GOIÁS<br/>
PÓS-GRADUAÇÃO EM INTELIGÊNCIA ARTIFICIAL APLICADA<br/></b>
Disciplina: Processamento de Linguagem Natural  <br/>
Professor: Daniel Xavier de Sousa <br/>
Alunos: Wagner Silva, Cleibson, Marcos Rodrigues


---

In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm

import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, roc_curve, auc

import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
import seaborn as sbn

[nltk_data] Downloading package stopwords to /home/marcos/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/marcos/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
2023-09-06 23:58:35.699177: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-06 23:58:35.749555: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.


In [2]:
df = pd.read_csv(f'../datasets/buscape_preprocessed.csv')

In [3]:
df.shape

(133438, 2)

In [4]:
df.head(3)

Unnamed: 0,review_text,rating
0,estou muito satisfeito o visor e melhor do que...,1
1,muito boa o que gostei preco o que nao gostei ...,1
2,rapida otima qualidade de impressao e facil de...,1


In [5]:
df = df[:30000]

In [6]:
df['rating'].value_counts()

rating
1    27265
0     2735
Name: count, dtype: int64

In [7]:
documents = df['review_text'].tolist()
labels = df['rating']

In [8]:
# Steamming
stemmer = SnowballStemmer("portuguese")

def custom_tokenizer(text):
    tokens = nltk.word_tokenize(text)
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return stemmed_tokens

tfidf_vectorizer = TfidfVectorizer(
    # Steamming
    tokenizer=custom_tokenizer,
    # Removing stopwords
    stop_words=stopwords.words('portuguese'),
    ngram_range=(1,2),
    min_df=8,
    max_df=0.3,
    max_features=5000
)
#tfidf_vectorizer.get_feature_names_out()

In [9]:
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)



In [10]:
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

In [11]:
X_train_documents, X_test_documents, y_train_labels, y_test_labels = train_test_split(tfidf_matrix, encoded_labels, test_size=0.2, random_state=42)

In [15]:
def begin_trainning(train_seq, input_shape, dropout, learning_rate, X_train_documents, y_train_labels):
    
    model = Sequential()
    model.add(Dense(24, activation='relu', input_shape=(input_shape,)))
    model.add(Dropout(dropout)),
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['acc'])
    
    k_folds = 10
    kfolds=KFold(n_splits=k_folds, shuffle=True, random_state=42)
    
    train_losses = []
    train_f1_scores = []
    train_accuracies = []
    train_history_accuracies = []
    
    val_losses = []
    val_f1_scores = []
    val_accuracies = []

    val_tprs = []
    val_aucs = []
    val_mean_fpr = np.linspace(0, 1, 100)
    
    for fold, (train_idx, test_idx) in enumerate(kfolds.split(X_train_documents), start=1):
        print(f'fold --> {fold}')

        X_train, X_test = X_train_documents[train_idx], X_train_documents[test_idx]
        y_train, y_test = y_train_labels[train_idx], y_train_labels[test_idx]

        model_history = model.fit(
            X_train.toarray(),
            y_train,
            epochs=5,
            batch_size=32,
            callbacks=[EarlyStopping('loss', patience=3)],
            validation_data=(X_test.toarray(), y_test),
            verbose=0
        )
        
        train_loss = model_history.history['loss']
        train_losses.append(train_loss)
        
        train_history_accuracie = model_history.history['acc']
        train_history_accuracies.append(train_history_accuracie)
        
        train_pred = (model.predict(X_train.toarray()) > 0.5).astype(int)
        
        train_f1_sc = f1_score(y_train, train_pred)
        train_f1_scores.append(train_f1_sc)
        
        train_acc= accuracy_score(y_train, train_pred)
        train_accuracies.append(train_acc)
        
        # -----------------------------------------------------------------------------------
        
        val_loss = model_history.history['val_loss']
        val_losses.append(val_loss)
    
        val_pred = (model.predict(X_test.toarray()) > 0.5).astype(int)
        
        val_f1_sc = f1_score(y_test, val_pred)
        val_f1_scores.append(val_f1_sc)
        
        val_accuracy = accuracy_score(y_test, val_pred)
        val_accuracies.append(val_accuracy)
        
        fpr, tpr, thresholds = roc_curve(y_test, val_pred)
        val_roc_auc = auc(fpr, tpr)
        val_tprs.append(np.interp(val_mean_fpr, fpr, tpr))
        val_aucs.append(val_roc_auc)
        
        # -----------------------------------------------------------------------------------
        
    print('--'*20)
    print(f'mean train losses: {np.mean(train_losses)}')
    print(f'mean train f1 scores: {np.mean(train_f1_scores)}')
    print(f'mean train accuracies: {np.mean(train_accuracies)}')
    print('--'*20)
    print(f'mean val losses: {np.mean(val_losses)}')
    print(f'mean val f1 scores: {np.mean(val_f1_scores)}')
    print(f'mean val accuracies: {np.mean(val_accuracies)}')
    print('--'*20)
    
    # -----------------------------------------------------------------------------------
    
    print(f'Saving train {train_seq} metrics...')
    data = {
        'Metric': [
            'train_mean_losses',
            'train_mean_f1_scores',
            'train_mean_accuracies',
            'val_mean_losses',
            'val_mean_f1_scores',
            'val_mean_accuracies'
        ],
        'Value': [
            np.mean(train_losses),
            np.mean(train_f1_scores),
            np.mean(train_accuracies),
            np.mean(val_losses),
            np.mean(val_f1_scores),
            np.mean(val_accuracies)
        ]                               
    }
    
    data_df = pd.DataFrame(data)
    data_df.to_csv(f'./reports/tfidf_mlp/training_{train_seq}.csv', index=False)
    print(f'Finished saving train {train_seq} metrics...')
    
    # -----------------------------------------------------------------------------------
    
    print('Saving graphs...')
    
    plot_eval_train_test_graph(model_history, 'loss', train_seq)
    plot_eval_train_test_graph(model_history, 'acc', train_seq)
    plot_train_folds(train_losses, 'loss')
    plot_train_folds(train_history_accuracies, 'acc')
    
    predict = np.round(model.predict(X_test.toarray()))
    accuracy = accuracy_score(y_test, predict)
    val_conf_matrix = confusion_matrix(y_test, predict)
    plot_confunsion_matrix(val_conf_matrix, 'val')
    
    val_mean_tpr = np.mean(val_tprs, axis=0)
    val_mean_auc = auc(val_mean_fpr, val_mean_tpr)
    plot_roc_auc_curve('val', val_mean_tpr, val_mean_auc, val_mean_fpr, train_seq)
    
    print('Finished Saving graphs...')

In [16]:
def plot_eval_train_test_graph(history, metric, train_seq):
    
    train_arr = history.history[f'{metric}']
    val_arr = history.history[f'val_{metric}']
    
    epochs = range(1, len(train_arr) + 1)
    
    graph_path = f'./graphs/01_tfidf_mlp_v_2/train_test_{metric}/train_seq_{train_seq}.png'
    
    plt.figure(figsize=(10, 6))
    plt.plot(epochs, train_arr, 'b', label=f'Training {metric}')
    plt.plot(epochs, val_arr, 'r', label='Test Loss')

    plt.title(f'Train and Test {metric}')
    plt.xlabel('Epochs')
    plt.ylabel(f'{metric}')
    plt.legend()

    plt.grid()
    plt.savefig(f'{graph_path}')
    #plt.show()
    plt.close()
    
def plot_train_folds(history, metric):
    graph_path = f'./graphs/01_tfidf_mlp_v_2/folds_train_{metric}/train_seq_{train_seq}.png'
    
    plt.figure(figsize=(10, 6))
    for fold_num, fold_loss in enumerate(history):
        plt.plot(fold_loss, label=f'Fold {fold_num+1}')
    plt.xlabel('Epoch')
    plt.ylabel(f'Training {metric}')
    plt.title(f'Training {metric} per Epoch for Each Fold')
    plt.legend()
    plt.savefig(f'{graph_path}')
    #plt.show()
    plt.close()
    
def plot_confunsion_matrix(cm, metric):
    graph_path = f'./graphs/01_tfidf_mlp_v_2/confusion_matrix_{metric}/train_seq_{train_seq}.png'
    plt.figure(figsize=(6,4))
    sbn.heatmap(cm, annot=True, linewidth=0.2, annot_kws={'size':12}, fmt='.0f')
    plt.title(f"Confusion Matrix - {metric}", fontsize=12)
    plt.xlabel('Real')
    plt.ylabel('Predict')
    plt.savefig(f'{graph_path}')
    #plt.show()
    plt.close()
    
def plot_roc_auc_curve(desc, mean_tpr, mean_auc, mean_fpr, train_seq):
    graph_path = f'./graphs/01_tfidf_mlp_v_2/roc_curve_{desc}/train_seq_{train_seq}.png'
    plt.figure()
    plt.plot(mean_fpr, mean_tpr, color='b', label=f'{desc} Mean ROC curve (AUC = {mean_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'{desc} Mean Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.savefig(graph_path)
    #plt.show()
    plt.close()

In [17]:
input_shape = X_train_documents.shape[1]
dropouts = [0.05, 0.15, 0.25, 0.35, 0.50]
learning_rates = [5e-4, 1e-3, 5e-3, 1e-2]
train_seq = 0
for index, dropout in enumerate(dropouts, start=1):
    for learning_rate in learning_rates:
        train_seq += 1
        print('--'*40)
        
        print(f'train {train_seq} initialized...')
        print(f'hyperparams: dropout: {dropout}, learning rate: {learning_rate}')
        
        begin_trainning(train_seq, input_shape, dropout, learning_rate, X_train_documents, y_train_labels)
        
        print('--'*40)

--------------------------------------------------------------------------------
train 1 initialized...
hyperparams: dropout: 0.05, learning rate: 0.0005
fold --> 1
fold --> 2
fold --> 3
fold --> 4
fold --> 5
fold --> 6
fold --> 7
fold --> 8
fold --> 9
fold --> 10
----------------------------------------
mean train losses: 0.04681286988779902
mean train f1 scores: 0.9953706084958582
mean train accuracies: 0.9915324074074074
----------------------------------------
mean val losses: 0.05303874962963164
mean val f1 scores: 0.9899908441147843
mean val accuracies: 0.9817083333333333
----------------------------------------
Saving train 1 metrics...
Finished saving train 1 metrics...
Saving graphs...
Finished Saving graphs...
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
train 2 initialized...
hyperparams: dropout: 0.05, learning rate: 0.001
fold --> 1
fold --> 2
fold --> 3
fol

  arr = asanyarray(a)


Finished Saving graphs...
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
train 20 initialized...
hyperparams: dropout: 0.5, learning rate: 0.01
fold --> 1
fold --> 2
fold --> 3
fold --> 4
fold --> 5
fold --> 6
fold --> 7
fold --> 8
fold --> 9
fold --> 10
----------------------------------------
mean train losses: 0.05763594560325146
mean train f1 scores: 0.996166003135798
mean train accuracies: 0.9929953703703702
----------------------------------------
mean val losses: 0.05468141617253423
mean val f1 scores: 0.9896770307218274
mean val accuracies: 0.9811250000000001
----------------------------------------
Saving train 20 metrics...
Finished saving train 20 metrics...
Saving graphs...
Finished Saving graphs...
--------------------------------------------------------------------------------


In [108]:
training_df = pd.read_csv('./reports/tfidf_mlp/training_1.csv')
training_df

Unnamed: 0,Metric,Value
0,train_mean_losses,0.37151
1,train_mean_f1_scores,0.953467
2,train_mean_accuracies,0.91125
3,val_mean_losses,0.319463
4,val_mean_f1_scores,0.953097
5,val_mean_accuracies,0.9105


In [109]:
training_df = pd.read_csv('./reports/tfidf_mlp/training_2.csv')
training_df

Unnamed: 0,Metric,Value
0,train_mean_losses,0.293503
1,train_mean_f1_scores,0.966494
2,train_mean_accuracies,0.93675
3,val_mean_losses,0.23314
4,val_mean_f1_scores,0.964149
5,val_mean_accuracies,0.93225
