
<b>INSTITUTO FEDERAL DE GOIÁS<br/>
PÓS-GRADUAÇÃO EM INTELIGÊNCIA ARTIFICIAL APLICADA<br/></b>
Disciplina: Processamento de Linguagem Natural  <br/>
Professor: Daniel Xavier de Sousa <br/>
Alunos: Wagner Silva, Cleibson, Marcos Rodrigues


---

In [156]:
import pandas as pd
import numpy as np

from tqdm import tqdm

import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold

'''
confusion_matrix, 
roc_curve, 
auc, 
accuracy_score, 
log_loss, 
f1_score, 
recall_score, 
precision_score, 
classification_report, 
average_precision_score, 
precision_recall_curve, 
roc_curve, auc
'''

from sklearn.metrics import f1_score, accuracy_score

import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

import matplotlib.pyplot as plt
import seaborn as sbn

[nltk_data] Downloading package stopwords to /home/marcos/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/marcos/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [157]:
df = pd.read_csv(f'../datasets/buscape_preprocessed.csv')

In [158]:
df.shape

(133438, 2)

In [159]:
df.head(3)

Unnamed: 0,review_text,rating
0,estou muito satisfeito o visor e melhor do que...,1
1,muito boa o que gostei preco o que nao gostei ...,1
2,rapida otima qualidade de impressao e facil de...,1


In [160]:
df = df[:10000]

In [161]:
df['rating'].value_counts()

rating
1    9086
0     914
Name: count, dtype: int64

In [162]:
documents = df['review_text'].tolist()
labels = df['rating']

In [163]:
# Steamming
stemmer = SnowballStemmer("portuguese")

def custom_tokenizer(text):
    tokens = nltk.word_tokenize(text)
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return stemmed_tokens

tfidf_vectorizer = TfidfVectorizer(
    # Steamming
    tokenizer=custom_tokenizer,
    # Removing stopwords
    stop_words=stopwords.words('portuguese'),
    ngram_range=(1,2),
    min_df=8,
    max_df=0.3,
    max_features=5000
)
#tfidf_vectorizer.get_feature_names_out()

In [164]:
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)



In [165]:
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

In [166]:
X_train_documents, X_test_documents, y_train_labels, y_test_labels = train_test_split(tfidf_matrix, encoded_labels, test_size=0.2, random_state=42)

In [209]:
def begin_trainning(train_seq, input_shape, dropout, learning_rate, X_train_documents, y_train_labels):
    
    model = Sequential()
    model.add(Dense(24, activation='relu', input_shape=(input_shape,)))
    model.add(Dropout(dropout)),
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['acc'])
    
    k_folds = 2
    kfolds=KFold(n_splits=k_folds, shuffle=True, random_state=42)
    
    train_losses = []
    train_f1_scores = []
    train_accuracies = []
    
    val_losses = []
    val_f1_scores = []
    val_accuracies = []

    for fold, (train_idx, test_idx) in enumerate(kfolds.split(X_train_documents), start=1):
        print(f'fold --> {fold}')

        X_train, X_test = X_train_documents[train_idx], X_train_documents[test_idx]
        y_train, y_test = y_train_labels[train_idx], y_train_labels[test_idx]

        model_history = model.fit(
            X_train.toarray(),
            y_train,
            epochs=2,
            batch_size=32,
            callbacks=[EarlyStopping('loss', patience=3)],
            validation_data=(X_test.toarray(), y_test),
            verbose=0
        )
        
        train_loss = model_history.history['loss']
        train_losses.append(train_loss)
        
        train_pred = (model.predict(X_train.toarray()) > 0.5).astype(int)
        
        train_f1_sc = f1_score(y_train, train_pred)
        train_f1_scores.append(train_f1_sc)
        
        train_acc= accuracy_score(y_train, train_pred)
        train_accuracies.append(train_acc)
        
        # -----------------------------------------------------------------------------------
        
        val_loss = model_history.history['val_loss']
        val_losses.append(val_loss)
    
        val_pred = (model.predict(X_test.toarray()) > 0.5).astype(int)
        
        val_f1_sc = f1_score(y_test, val_pred)
        val_f1_scores.append(val_f1_sc)
        
        val_accuracy = accuracy_score(y_test, val_pred)
        val_accuracies.append(val_accuracy)
        
    print('--'*20)
    print(f'mean train losses: {np.mean(train_losses)}')
    print(f'mean train f1 scores: {np.mean(train_f1_scores)}')
    print(f'mean train accuracies: {np.mean(train_accuracies)}')
    print('--'*20)
    print(f'mean val losses: {np.mean(val_losses)}')
    print(f'mean val f1 scores: {np.mean(val_f1_scores)}')
    print(f'mean val accuracies: {np.mean(val_accuracies)}')
    print('--'*20)
    
    print('saving graphs...')
    plot_eval_train_test_graph(model_history, 'loss', train_seq)
    plot_eval_train_test_graph(model_history, 'acc', train_seq)
    print('finished saving graphs...')

In [213]:
def plot_eval_train_test_graph(history, metric, train_seq):
    
    train_arr = history.history[f'{metric}']
    val_arr = history.history[f'val_{metric}']
    
    epochs = range(1, len(train_arr) + 1)
    
    graph_path = f'./graphs/01_tfidf_mlp_v_2/train_test_{metric}/train_seq_{train_seq}_eval_train_test_{metric}.png'
    
    # Create a plot
    plt.figure(figsize=(10, 6))
    plt.plot(epochs, train_arr, 'b', label=f'Training {metric}')
    plt.plot(epochs, val_arr, 'r', label='Test Loss')

    # Labeling the plot
    plt.title(f'Train and Test {metric}')
    plt.xlabel('Epochs')
    plt.ylabel(f'{metric}')
    plt.legend()

    # Show the plot
    plt.grid()
    plt.savefig(f'{graph_path}')
    plt.close()

In [214]:
input_shape = X_train_documents.shape[1]
dropouts = [0.05, 0.15, 0.25, 0.35, 0.50]
learning_rates = [5e-4, 1e-3, 5e-3, 1e-2]
train_seq = 0
for index, dropout in enumerate(dropouts, start=1):
    for learning_rate in learning_rates:
        train_seq += 1
        print('--'*60)
        
        print(f'train {train_seq} initialized...')
        print(f'hyperparams: dropout: {dropout}, learning rate: {learning_rate}')
        
        begin_trainning(train_seq, input_shape, dropout, learning_rate, X_train_documents, y_train_labels)

        print(f'train {train_seq} finished')
        
        print('--'*60)

------------------------------------------------------------------------------------------------------------------------
train 1 initialized...
hyperparams: dropout: 0.05, learning rate: 0.0005
fold --> 1
fold --> 2
----------------------------------------
mean train losses: 0.38163502141833305
mean train f1 scores: 0.9529670182591282
mean train accuracies: 0.91025
----------------------------------------
mean val losses: 0.3300510346889496
mean val f1 scores: 0.9528477149932673
mean val accuracies: 0.9099999999999999
----------------------------------------
saving graphs...
finished saving graphs...
train 1 finished
------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------
train 2 initialized...
hyperparams: dropout: 0.05, learning rate: 0.001
fold --> 1
fold --> 2
---------------------------------------