# Pipeline de Treinamento RNN para Detecção de Texto
## (AI vs Human) - Multi Dataset

In [17]:
import pandas as pd
import os
import pickle
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from rnn import RNN  

## Função para Carregar Datasets

In [18]:
def load_dataset(dataset_version):
    """Read the cleaned CSV splits of one dataset version.

    Files are looked up in ``../../datasets/<dataset_version>/`` (relative to
    the current working directory) under the names
    ``Dataset<version>_{train,test,validation}_clean.csv``.

    Args:
        dataset_version: value interpolated into the directory and file names.

    Returns:
        Tuple ``(train, test, validation)`` of DataFrames. ``validation`` is
        None when the validation CSV does not exist; the train and test CSVs
        are always read.
    """
    data_dir = f'../../datasets/{dataset_version}/'
    train_csv = os.path.join(data_dir, f'Dataset{dataset_version}_train_clean.csv')
    test_csv = os.path.join(data_dir, f'Dataset{dataset_version}_test_clean.csv')
    val_csv = os.path.join(data_dir, f'Dataset{dataset_version}_validation_clean.csv')

    train_df = pd.read_csv(train_csv, sep=',')
    test_df = pd.read_csv(test_csv, sep=',')

    # The validation split is optional: fall back to None when the file is absent.
    val_df = pd.read_csv(val_csv, sep=',') if os.path.exists(val_csv) else None

    return train_df, test_df, val_df

## Pré-processamento com TF-IDF

In [19]:
def tfidf_preprocessing(train, test, validation):
    """Vectorize the text splits with TF-IDF and encode the labels as 0/1.

    The vectorizer is fitted on ``train['text']`` only and then applied to the
    test (and, when given, validation) text, so the vocabulary and IDF weights
    come exclusively from the training split.

    Args:
        train: DataFrame with a 'text' column and a 'Label' column whose values
            are 'AI' or 'Human'.
        test: DataFrame with the same two columns.
        validation: DataFrame with the same two columns, or None when there is
            no validation split.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test, tfidf, X_val, y_val)``:
        dense TF-IDF arrays, label arrays encoded as AI=1 / Human=0, and the
        fitted vectorizer. ``X_val`` and ``y_val`` are None when ``validation``
        is None.

    Raises:
        ValueError: if a 'Label' value is neither 'AI' nor 'Human'.
    """
    label_codes = {'AI': 1, 'Human': 0}

    def _encode_labels(frame, split_name):
        # Series.map yields NaN for values missing from the dict. Left
        # unchecked, those NaNs would flow into training as targets, so fail
        # loudly and name the offending values instead.
        codes = frame['Label'].map(label_codes)
        unmapped = codes.isna()
        if unmapped.any():
            bad_values = frame['Label'][unmapped.to_numpy()].unique().tolist()
            raise ValueError(
                f"{split_name} split has 'Label' values other than "
                f"'AI'/'Human': {bad_values}"
            )
        return codes.values

    tfidf = TfidfVectorizer(max_features=1000, stop_words='english')
    X_train = tfidf.fit_transform(train['text']).toarray()
    X_test = tfidf.transform(test['text']).toarray()

    y_train = _encode_labels(train, 'train')
    y_test = _encode_labels(test, 'test')

    # Only process the validation split when one was provided.
    if validation is not None:
        X_val = tfidf.transform(validation['text']).toarray()
        y_val = _encode_labels(validation, 'validation')
    else:
        X_val, y_val = None, None

    return X_train, y_train, X_test, y_test, tfidf, X_val, y_val

## Função de Treinamento da RNN

In [20]:
def train_evaluate_rnn(X_train, y_train, X_test, y_test, X_val, y_val, input_size,
                       hidden_size=64, lr=0.08, dropout_rate=0.2, epochs=100):
    """Train an RNN on the training split and print its test-set metrics.

    Each sample is reshaped into a sequence of length 1, i.e. the 2-D feature
    matrices become arrays of shape ``(n_samples, 1, n_features)``.

    Args:
        X_train: 2-D feature matrix of the training split.
        y_train: training labels.
        X_test: 2-D feature matrix of the test split.
        y_test: test labels.
        X_val: accepted for interface compatibility; not used by this function.
        y_val: accepted for interface compatibility; not used by this function.
        input_size: forwarded to the ``RNN`` constructor.
        hidden_size: forwarded to the ``RNN`` constructor.
        lr: forwarded to the ``RNN`` constructor.
        dropout_rate: forwarded to the ``RNN`` constructor.
        epochs: forwarded to ``RNN.train``.

    The keyword defaults are the values that were previously hard-coded, so
    existing calls behave as before.

    Returns:
        The ``RNN`` instance after training.
    """
    # NOTE(review): the runs recorded in this notebook end with a loss of about
    # 0.693 after 100 epochs, so the default hyperparameters probably need tuning.

    # Reshape to the sequential layout (n_samples, 1, n_features).
    X_train_seq = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
    X_test_seq = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])

    print("\nIniciando treinamento da RNN...")

    rnn = RNN(
        input_size=input_size,
        hidden_size=hidden_size,
        output_size=1,
        lr=lr,
        dropout_rate=dropout_rate
    )

    rnn.train(X_train_seq, y_train, epochs=epochs)

    # presumably predict() returns 0/1 class labels, since they are fed
    # straight into the classification metrics — confirm against rnn.RNN.
    rnn_pred = rnn.predict(X_test_seq)
    print("\nRelatório de Classificação:")
    # zero_division=0 reports 0.0 for a class that is never predicted without
    # emitting an undefined-metric warning; the printed numbers do not change.
    print(classification_report(y_test, rnn_pred, zero_division=0))
    print("Matriz de Confusão:")
    print(confusion_matrix(y_test, rnn_pred))

    return rnn

## Pipeline Principal

### Dataset 3

In [21]:
# Configuration
DATASET_VERSION = 3
SAVE_DIR = 'modelos_rnn/'
os.makedirs(SAVE_DIR, exist_ok=True)

# Load the train / test / (optional) validation splits for this dataset version
train, test, validation = load_dataset(DATASET_VERSION)

# Pre-processing: TF-IDF features plus the fitted vectorizer
X_train, y_train, X_test, y_test, tfidf, X_val, y_val = tfidf_preprocessing(train, test, validation)

# Train the RNN
rnn_model = train_evaluate_rnn(X_train, y_train, X_test, y_test, X_val, y_val, input_size=X_train.shape[1])

# Save the model and the vectorizer. os.path.join keeps the paths valid even
# if SAVE_DIR is changed to a value without a trailing slash.
with open(os.path.join(SAVE_DIR, f'rnn_dataset{DATASET_VERSION}.pkl'), 'wb') as f:
    pickle.dump(rnn_model, f)
with open(os.path.join(SAVE_DIR, f'tfidf_dataset{DATASET_VERSION}.pkl'), 'wb') as f:
    pickle.dump(tfidf, f)

# Report the directory actually used instead of a hard-coded copy of it.
print(f"\nPipeline concluído! Modelos salvos em '{SAVE_DIR}'.")


def preprocessing_inference(df, vectorizer):
    """Vectorize ``df['Text']`` and reshape it into length-1 sequences.

    Args:
        df: DataFrame with a 'Text' column (capital T).
        vectorizer: object whose ``transform`` result provides ``toarray()``.

    Returns:
        Array of shape ``(n_samples, 1, n_features)``.
    """
    dense = vectorizer.transform(df['Text']).toarray()
    n_rows, n_cols = dense.shape[:2]
    return dense.reshape(n_rows, 1, n_cols)

# Load the model back from the path it was saved to.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
with open(os.path.join(SAVE_DIR, f'rnn_dataset{DATASET_VERSION}.pkl'), 'rb') as f:
    rnn_model = pickle.load(f)

# Load the tab-separated evaluation inputs and their expected labels.
# NOTE(review): rows of the two files are presumably aligned by position — confirm.
df_input = pd.read_csv('../../datasets/val/dataset1_inputs.csv', sep='\t')
df_output = pd.read_csv('../../datasets/val/dataset1_outputs.csv', sep='\t')

# Predictions (uses the in-memory `tfidf` vectorizer, not the pickled copy)
X_new = preprocessing_inference(df_input, tfidf)
predictions = rnn_model.predict(X_new)
df_output['Predicted'] = np.where(predictions >= 0.5, 'AI', 'Human')

# Metrics
print("\nRelatório Final:")
# zero_division=0 reports 0.0 for a class that is never predicted without
# emitting an undefined-metric warning; the printed numbers do not change.
print(classification_report(df_output['Label'], df_output['Predicted'], zero_division=0))
print("Matriz de Confusão Final:")
print(confusion_matrix(df_output['Label'], df_output['Predicted']))


Iniciando treinamento da RNN...
Epoch 0: Loss = 0.6931356813983852, Accuracy = 51.79%
Epoch 99: Loss = 0.6929723339739446, Accuracy = 64.75%

Relatório de Classificação:
              precision    recall  f1-score   support

           0       0.67      0.55      0.61      1500
           1       0.62      0.73      0.67      1500

    accuracy                           0.64      3000
   macro avg       0.65      0.64      0.64      3000
weighted avg       0.65      0.64      0.64      3000

Matriz de Confusão:
[[ 830  670]
 [ 405 1095]]

Pipeline concluído! Modelos salvos em 'modelos_rnn/'.

Relatório Final:
              precision    recall  f1-score   support

          AI       0.53      0.53      0.53        15
       Human       0.53      0.53      0.53        15

    accuracy                           0.53        30
   macro avg       0.53      0.53      0.53        30
weighted avg       0.53      0.53      0.53        30

Matriz de Confusão Final:
[[8 7]
 [7 8]]


### Dataset 4

In [22]:
# Configuration
DATASET_VERSION = 4
SAVE_DIR = 'modelos_rnn/'
os.makedirs(SAVE_DIR, exist_ok=True)

# Load the train / test / (optional) validation splits for this dataset version
train, test, validation = load_dataset(DATASET_VERSION)

# Pre-processing: TF-IDF features plus the fitted vectorizer
X_train, y_train, X_test, y_test, tfidf, X_val, y_val = tfidf_preprocessing(train, test, validation)

# Train the RNN
rnn_model = train_evaluate_rnn(X_train, y_train, X_test, y_test, X_val, y_val, input_size=X_train.shape[1])

# Save the model and the vectorizer. os.path.join keeps the paths valid even
# if SAVE_DIR is changed to a value without a trailing slash.
with open(os.path.join(SAVE_DIR, f'rnn_dataset{DATASET_VERSION}.pkl'), 'wb') as f:
    pickle.dump(rnn_model, f)
with open(os.path.join(SAVE_DIR, f'tfidf_dataset{DATASET_VERSION}.pkl'), 'wb') as f:
    pickle.dump(tfidf, f)

# Report the directory actually used instead of a hard-coded copy of it.
print(f"\nPipeline concluído! Modelos salvos em '{SAVE_DIR}'.")


def preprocessing_inference(df, vectorizer):
    """Vectorize ``df['Text']`` and reshape it into length-1 sequences.

    Args:
        df: DataFrame with a 'Text' column (capital T).
        vectorizer: object whose ``transform`` result provides ``toarray()``.

    Returns:
        Array of shape ``(n_samples, 1, n_features)``.
    """
    dense = vectorizer.transform(df['Text']).toarray()
    n_rows, n_cols = dense.shape[:2]
    return dense.reshape(n_rows, 1, n_cols)

# Load the model back from the path it was saved to.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
with open(os.path.join(SAVE_DIR, f'rnn_dataset{DATASET_VERSION}.pkl'), 'rb') as f:
    rnn_model = pickle.load(f)

# Load the tab-separated evaluation inputs and their expected labels.
# NOTE(review): rows of the two files are presumably aligned by position — confirm.
df_input = pd.read_csv('../../datasets/val/dataset1_inputs.csv', sep='\t')
df_output = pd.read_csv('../../datasets/val/dataset1_outputs.csv', sep='\t')

# Predictions (uses the in-memory `tfidf` vectorizer, not the pickled copy)
X_new = preprocessing_inference(df_input, tfidf)
predictions = rnn_model.predict(X_new)
df_output['Predicted'] = np.where(predictions >= 0.5, 'AI', 'Human')

# Metrics
print("\nRelatório Final:")
# zero_division=0 reports 0.0 for a class that is never predicted without
# emitting an undefined-metric warning; the printed numbers do not change.
print(classification_report(df_output['Label'], df_output['Predicted'], zero_division=0))
print("Matriz de Confusão Final:")
print(confusion_matrix(df_output['Label'], df_output['Predicted']))


Iniciando treinamento da RNN...
Epoch 0: Loss = 0.6931392447928713, Accuracy = 50.70%
Epoch 99: Loss = 0.6931103941661526, Accuracy = 53.52%

Relatório de Classificação:
              precision    recall  f1-score   support

           0       0.54      0.54      0.54     22500
           1       0.54      0.54      0.54     22500

    accuracy                           0.54     45000
   macro avg       0.54      0.54      0.54     45000
weighted avg       0.54      0.54      0.54     45000

Matriz de Confusão:
[[12071 10429]
 [10456 12044]]

Pipeline concluído! Modelos salvos em 'modelos_rnn/'.

Relatório Final:
              precision    recall  f1-score   support

          AI       0.43      0.20      0.27        15
       Human       0.48      0.73      0.58        15

    accuracy                           0.47        30
   macro avg       0.45      0.47      0.43        30
weighted avg       0.45      0.47      0.43        30

Matriz de Confusão Final:
[[ 3 12]
 [ 4 11]]


### Dataset 5

In [23]:
# Configuration
DATASET_VERSION = 5
SAVE_DIR = 'modelos_rnn/'
os.makedirs(SAVE_DIR, exist_ok=True)

# Load the train / test / (optional) validation splits for this dataset version
train, test, validation = load_dataset(DATASET_VERSION)

# Pre-processing: TF-IDF features plus the fitted vectorizer
X_train, y_train, X_test, y_test, tfidf, X_val, y_val = tfidf_preprocessing(train, test, validation)

# Train the RNN
rnn_model = train_evaluate_rnn(X_train, y_train, X_test, y_test, X_val, y_val, input_size=X_train.shape[1])

# Save the model and the vectorizer. os.path.join keeps the paths valid even
# if SAVE_DIR is changed to a value without a trailing slash.
with open(os.path.join(SAVE_DIR, f'rnn_dataset{DATASET_VERSION}.pkl'), 'wb') as f:
    pickle.dump(rnn_model, f)
with open(os.path.join(SAVE_DIR, f'tfidf_dataset{DATASET_VERSION}.pkl'), 'wb') as f:
    pickle.dump(tfidf, f)

# Report the directory actually used instead of a hard-coded copy of it.
print(f"\nPipeline concluído! Modelos salvos em '{SAVE_DIR}'.")


def preprocessing_inference(df, vectorizer):
    """Vectorize ``df['Text']`` and reshape it into length-1 sequences.

    Args:
        df: DataFrame with a 'Text' column (capital T).
        vectorizer: object whose ``transform`` result provides ``toarray()``.

    Returns:
        Array of shape ``(n_samples, 1, n_features)``.
    """
    dense = vectorizer.transform(df['Text']).toarray()
    n_rows, n_cols = dense.shape[:2]
    return dense.reshape(n_rows, 1, n_cols)

# Load the model back from the path it was saved to.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
with open(os.path.join(SAVE_DIR, f'rnn_dataset{DATASET_VERSION}.pkl'), 'rb') as f:
    rnn_model = pickle.load(f)

# Load the tab-separated evaluation inputs and their expected labels.
# NOTE(review): rows of the two files are presumably aligned by position — confirm.
df_input = pd.read_csv('../../datasets/val/dataset1_inputs.csv', sep='\t')
df_output = pd.read_csv('../../datasets/val/dataset1_outputs.csv', sep='\t')

# Predictions (uses the in-memory `tfidf` vectorizer, not the pickled copy)
X_new = preprocessing_inference(df_input, tfidf)
predictions = rnn_model.predict(X_new)
df_output['Predicted'] = np.where(predictions >= 0.5, 'AI', 'Human')

# Metrics
print("\nRelatório Final:")
# zero_division=0 reports 0.0 for a class that is never predicted without
# emitting an undefined-metric warning; the printed numbers do not change.
print(classification_report(df_output['Label'], df_output['Predicted'], zero_division=0))
print("Matriz de Confusão Final:")
print(confusion_matrix(df_output['Label'], df_output['Predicted']))


Iniciando treinamento da RNN...
Epoch 0: Loss = nan, Accuracy = 47.17%
Epoch 99: Loss = nan, Accuracy = 51.02%

Relatório de Classificação:
              precision    recall  f1-score   support

           0       0.45      1.00      0.62       100
           1       0.00      0.00      0.00       121

    accuracy                           0.45       221
   macro avg       0.23      0.50      0.31       221
weighted avg       0.20      0.45      0.28       221

Matriz de Confusão:
[[100   0]
 [121   0]]

Pipeline concluído! Modelos salvos em 'modelos_rnn/'.

Relatório Final:
              precision    recall  f1-score   support

          AI       0.00      0.00      0.00        15
       Human       0.50      1.00      0.67        15

    accuracy                           0.50        30
   macro avg       0.25      0.50      0.33        30
weighted avg       0.25      0.50      0.33        30

Matriz de Confusão Final:
[[ 0 15]
 [ 0 15]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Dataset 6

In [24]:
# Configuration
DATASET_VERSION = 6
SAVE_DIR = 'modelos_rnn/'
os.makedirs(SAVE_DIR, exist_ok=True)

# Load the train / test / (optional) validation splits for this dataset version
train, test, validation = load_dataset(DATASET_VERSION)

# Pre-processing: TF-IDF features plus the fitted vectorizer
X_train, y_train, X_test, y_test, tfidf, X_val, y_val = tfidf_preprocessing(train, test, validation)

# Train the RNN
rnn_model = train_evaluate_rnn(X_train, y_train, X_test, y_test, X_val, y_val, input_size=X_train.shape[1])

# Save the model and the vectorizer. os.path.join keeps the paths valid even
# if SAVE_DIR is changed to a value without a trailing slash.
with open(os.path.join(SAVE_DIR, f'rnn_dataset{DATASET_VERSION}.pkl'), 'wb') as f:
    pickle.dump(rnn_model, f)
with open(os.path.join(SAVE_DIR, f'tfidf_dataset{DATASET_VERSION}.pkl'), 'wb') as f:
    pickle.dump(tfidf, f)

# Report the directory actually used instead of a hard-coded copy of it.
print(f"\nPipeline concluído! Modelos salvos em '{SAVE_DIR}'.")


def preprocessing_inference(df, vectorizer):
    """Vectorize ``df['Text']`` and reshape it into length-1 sequences.

    Args:
        df: DataFrame with a 'Text' column (capital T).
        vectorizer: object whose ``transform`` result provides ``toarray()``.

    Returns:
        Array of shape ``(n_samples, 1, n_features)``.
    """
    dense = vectorizer.transform(df['Text']).toarray()
    n_rows, n_cols = dense.shape[:2]
    return dense.reshape(n_rows, 1, n_cols)

# Load the model back from the path it was saved to.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
with open(os.path.join(SAVE_DIR, f'rnn_dataset{DATASET_VERSION}.pkl'), 'rb') as f:
    rnn_model = pickle.load(f)

# Load the tab-separated evaluation inputs and their expected labels.
# NOTE(review): rows of the two files are presumably aligned by position — confirm.
df_input = pd.read_csv('../../datasets/val/dataset1_inputs.csv', sep='\t')
df_output = pd.read_csv('../../datasets/val/dataset1_outputs.csv', sep='\t')

# Predictions (uses the in-memory `tfidf` vectorizer, not the pickled copy)
X_new = preprocessing_inference(df_input, tfidf)
predictions = rnn_model.predict(X_new)
df_output['Predicted'] = np.where(predictions >= 0.5, 'AI', 'Human')

# Metrics
print("\nRelatório Final:")
# zero_division=0 reports 0.0 for a class that is never predicted without
# emitting an undefined-metric warning; the printed numbers do not change.
print(classification_report(df_output['Label'], df_output['Predicted'], zero_division=0))
print("Matriz de Confusão Final:")
print(confusion_matrix(df_output['Label'], df_output['Predicted']))


Iniciando treinamento da RNN...
Epoch 0: Loss = 0.6932054546919564, Accuracy = 41.45%
Epoch 99: Loss = 0.6924002234256741, Accuracy = 51.80%

Relatório de Classificação:
              precision    recall  f1-score   support

           0       0.52      1.00      0.68       421
           1       0.00      0.00      0.00       391

    accuracy                           0.52       812
   macro avg       0.26      0.50      0.34       812
weighted avg       0.27      0.52      0.35       812

Matriz de Confusão:
[[421   0]
 [391   0]]

Pipeline concluído! Modelos salvos em 'modelos_rnn/'.

Relatório Final:
              precision    recall  f1-score   support

          AI       0.00      0.00      0.00        15
       Human       0.50      1.00      0.67        15

    accuracy                           0.50        30
   macro avg       0.25      0.50      0.33        30
weighted avg       0.25      0.50      0.33        30

Matriz de Confusão Final:
[[ 0 15]
 [ 0 15]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
