# Pipeline de Treinamento RNN para Detecção de Texto
## (AI vs Human) - Multi Dataset

In [13]:
import pandas as pd
import os
import pickle
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
from rnn import RNN  

## Função para Carregar Datasets

In [14]:
def load_dataset(dataset_version):
    base_path = f'../../datasets/{dataset_version}/'
    train_file = f'Dataset{dataset_version}_train_clean.csv'
    test_file = f'Dataset{dataset_version}_test_clean.csv'
    validation_file = f'Dataset{dataset_version}_validation_clean.csv'
    
    train = pd.read_csv(os.path.join(base_path, train_file), sep=',')
    test = pd.read_csv(os.path.join(base_path, test_file), sep=',')
    
    # Verifica se o arquivo de validação existe
    validation_path = os.path.join(base_path, validation_file)
    if os.path.exists(validation_path):
        validation = pd.read_csv(validation_path, sep=',')
    else:
        validation = None  # Define como None se não existir
    
    return train, test, validation

## Pré-processamento com UNIgram

In [24]:
NAME = "unigram_be.keras"
from tensorflow.keras.layers import TextVectorization

# Configuração do TextVectorization para unigramas
text_vectorization_singlegram_be = TextVectorization(
    max_tokens=900,  # Número máximo de tokens
    output_mode="multi_hot",  # Representação multi-hot
    standardize="lower_and_strip_punctuation",  # Normalização do texto
)
def preprocess_text(text_ds):
    text_ds_2 = text_ds['text']
    text_vectorization_singlegram_be.adapt(text_ds_2)
    vectorized_text = text_ds_2.map(lambda x: text_vectorization_singlegram_be(x))
    return np.array(list(vectorized_text)) 

def unigram_preprocessing(train, test, validation):
    # Pré-processa apenas a coluna de texto
    X_train = preprocess_text(train)
    X_test = preprocess_text(test)
    
    # Converte os rótulos para valores numéricos
    y_train = train['Label'].map({'AI': 1, 'Human': 0}).values
    y_test = test['Label'].map({'AI': 1, 'Human': 0}).values
    
    # Processa validação apenas se existir
    X_val = None
    y_val = None
    if validation is not None:
        X_val = preprocess_text(validation)
        y_val = validation['Label'].map({'AI': 1, 'Human': 0}).values
    
    return X_train, y_train, X_test, y_test, X_val, y_val

## Função de Treinamento da RNN

In [16]:
def train_evaluate_rnn(X_train, y_train, X_test, y_test, X_val, y_val, input_size):
    # Reshape para formato sequencial
    X_train_seq = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
    X_test_seq = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])
    
    print("\nIniciando treinamento da RNN...")

    rnn = RNN(
        input_size=input_size,
        hidden_size=64,
        output_size=1,
        lr=0.08,
        dropout_rate=0.2
    )
    
    rnn.train(X_train_seq, y_train, epochs=100)
    
    rnn_pred = rnn.predict(X_test_seq)
    print("\nRelatório de Classificação:")
    print(classification_report(y_test, rnn_pred))
    print("Matriz de Confusão:")
    print(confusion_matrix(y_test, rnn_pred))
    
    return rnn

## Pipeline Principal

### Dataset 3

In [17]:
# Configurações
DATASET_VERSION = 3
SAVE_DIR = 'modelos_rnn/'
os.makedirs(SAVE_DIR, exist_ok=True)

# Carregar dados
train, test, validation = load_dataset(DATASET_VERSION)

# Pré-processamento
X_train, y_train, X_test, y_test, X_val, y_val = unigram_preprocessing(train, test, validation)
print(f"Forma de X_train: {X_train.shape}")
# Treinar RNN
rnn_model = train_evaluate_rnn(X_train, y_train, X_test, y_test, X_val, y_val, input_size=X_train.shape[1])

# Salvar modelos
with open(f'{SAVE_DIR}rnn_dataset{DATASET_VERSION}.pkl', 'wb') as f:
    pickle.dump(rnn_model, f)


print("\nPipeline concluído! Modelos salvos em 'modelos_rnn/'.")


def preprocessing_inference(df, vectorizer):
    text_data = df['Text']  
    X = vectorizer(text_data).numpy()  
    return X.reshape(X.shape[0], 1, X.shape[1])  # Ajusta para o formato sequencial

# Certifique-se de que o TextVectorization foi adaptado
text_vectorization_singlegram_be.adapt(train['text'])  

# Carregar modelo
with open(f'{SAVE_DIR}rnn_dataset{DATASET_VERSION}.pkl', 'rb') as f:
    rnn_model = pickle.load(f)

# Carregar dados
df_input = pd.read_csv('../../datasets/val/dataset1_inputs.csv', sep='\t')
df_output = pd.read_csv('../../datasets/val/dataset1_outputs.csv', sep='\t')

# Previsões
X_new = preprocessing_inference(df_input, text_vectorization_singlegram_be)
predictions = rnn_model.predict(X_new)
df_output['Predicted'] = np.where(predictions >= 0.5, 'AI', 'Human')

# Métricas
print("\nRelatório Final:")
print(classification_report(df_output['Label'], df_output['Predicted']))
print("Matriz de Confusão Final:")
print(confusion_matrix(df_output['Label'], df_output['Predicted']))

Forma de X_train: (14000, 5000)

Iniciando treinamento da RNN...
Epoch 0: Loss = 0.6930487900649178, Accuracy = 50.50%
Epoch 99: Loss = 0.3420467668577256, Accuracy = 95.27%

Relatório de Classificação:
              precision    recall  f1-score   support

           0       0.69      0.80      0.74      1500
           1       0.76      0.63      0.69      1500

    accuracy                           0.72      3000
   macro avg       0.72      0.72      0.72      3000
weighted avg       0.72      0.72      0.72      3000

Matriz de Confusão:
[[1204  296]
 [ 549  951]]

Pipeline concluído! Modelos salvos em 'modelos_rnn/'.
     ID                                               Text
0  D1-1  The cell cycle, or cell-division cycle, is the...
1  D1-2  The cell cycle is the process by which a cell ...
2  D1-3  Photons, in many atomic models in physics, are...
3  D1-4  A photon is a fundamental particle of light an...
4  D1-5  According to the theory of plate tectonics, Ea...

Relatório Fin

### Dataset 4

In [None]:
# Configurações

DATASET_VERSION = 4
SAVE_DIR = 'modelos_rnn/'
os.makedirs(SAVE_DIR, exist_ok=True)

# Carregar dados
train, test, validation = load_dataset(DATASET_VERSION)

# Pré-processamento
X_train, y_train, X_test, y_test, X_val, y_val = unigram_preprocessing(train, test, validation)
print(f"Forma de X_train: {X_train.shape}")
# Treinar RNN
rnn_model = train_evaluate_rnn(X_train, y_train, X_test, y_test, X_val, y_val, input_size=X_train.shape[1])

# Salvar modelos
with open(f'{SAVE_DIR}rnn_dataset{DATASET_VERSION}.pkl', 'wb') as f:
    pickle.dump(rnn_model, f)


print("\nPipeline concluído! Modelos salvos em 'modelos_rnn/'.")


def preprocessing_inference(df, vectorizer):
    print(df.head())
    text_data = df['Text']  
    X = vectorizer(text_data).numpy()  
    return X.reshape(X.shape[0], 1, X.shape[1])  # Ajusta para o formato sequencial

# Certifique-se de que o TextVectorization foi adaptado
text_vectorization_singlegram_be.adapt(train['text'])  

# Carregar modelo
with open(f'{SAVE_DIR}rnn_dataset{DATASET_VERSION}.pkl', 'rb') as f:
    rnn_model = pickle.load(f)

# Carregar dados
df_input = pd.read_csv('../../datasets/val/dataset1_inputs.csv', sep='\t')
df_output = pd.read_csv('../../datasets/val/dataset1_outputs.csv', sep='\t')

# Previsões
X_new = preprocessing_inference(df_input, text_vectorization_singlegram_be)
predictions = rnn_model.predict(X_new)
df_output['Predicted'] = np.where(predictions >= 0.5, 'AI', 'Human')

# Métricas
print("\nRelatório Final:")
print(classification_report(df_output['Label'], df_output['Predicted']))
print("Matriz de Confusão Final:")
print(confusion_matrix(df_output['Label'], df_output['Predicted']))

KeyboardInterrupt: 

### Dataset 5

In [23]:
# Configurações
DATASET_VERSION = 5
SAVE_DIR = 'modelos_rnn/'
os.makedirs(SAVE_DIR, exist_ok=True)

# Carregar dados
train, test, validation = load_dataset(DATASET_VERSION)

# Pré-processamento
X_train, y_train, X_test, y_test, X_val, y_val = unigram_preprocessing(train, test, validation)
print(f"Forma de X_train: {X_train.shape}")
# Treinar RNN
rnn_model = train_evaluate_rnn(X_train, y_train, X_test, y_test, X_val, y_val, input_size=X_train.shape[1])

# Salvar modelos
with open(f'{SAVE_DIR}rnn_dataset{DATASET_VERSION}.pkl', 'wb') as f:
    pickle.dump(rnn_model, f)


print("\nPipeline concluído! Modelos salvos em 'modelos_rnn/'.")


def preprocessing_inference(df, vectorizer):
    print(df.head())
    text_data = df['Text']  
    X = vectorizer(text_data).numpy()  
    return X.reshape(X.shape[0], 1, X.shape[1])  # Ajusta para o formato sequencial

# Certifique-se de que o TextVectorization foi adaptado
text_vectorization_singlegram_be.adapt(train['text'])  

# Carregar modelo
with open(f'{SAVE_DIR}rnn_dataset{DATASET_VERSION}.pkl', 'rb') as f:
    rnn_model = pickle.load(f)

# Carregar dados
df_input = pd.read_csv('../../datasets/val/dataset1_inputs.csv', sep='\t')
df_output = pd.read_csv('../../datasets/val/dataset1_outputs.csv', sep='\t')

# Previsões
X_new = preprocessing_inference(df_input, text_vectorization_singlegram_be)
predictions = rnn_model.predict(X_new)
df_output['Predicted'] = np.where(predictions >= 0.5, 'AI', 'Human')

# Métricas
print("\nRelatório Final:")
print(classification_report(df_output['Label'], df_output['Predicted']))
print("Matriz de Confusão Final:")
print(confusion_matrix(df_output['Label'], df_output['Predicted']))

RuntimeError: When using `output_mode=multi_hot` and `pad_to_max_tokens=False`, the vocabulary size cannot be changed after the layer is called. Old vocab size is 5000, new vocab size is 925

### Dataset 6

In [22]:
# Configurações
DATASET_VERSION = 6
SAVE_DIR = 'modelos_rnn/'
os.makedirs(SAVE_DIR, exist_ok=True)

# Carregar dados
train, test, validation = load_dataset(DATASET_VERSION)

# Pré-processamento
X_train, y_train, X_test, y_test, X_val, y_val = unigram_preprocessing(train, test, validation)
print(f"Forma de X_train: {X_train.shape}")
# Treinar RNN
rnn_model = train_evaluate_rnn(X_train, y_train, X_test, y_test, X_val, y_val, input_size=X_train.shape[1])

# Salvar modelos
with open(f'{SAVE_DIR}rnn_dataset{DATASET_VERSION}.pkl', 'wb') as f:
    pickle.dump(rnn_model, f)


print("\nPipeline concluído! Modelos salvos em 'modelos_rnn/'.")


def preprocessing_inference(df, vectorizer):
    print(df.head())
    text_data = df['Text']  
    X = vectorizer(text_data).numpy()  
    return X.reshape(X.shape[0], 1, X.shape[1])  # Ajusta para o formato sequencial

# Certifique-se de que o TextVectorization foi adaptado
text_vectorization_singlegram_be.adapt(train['text'])  

# Carregar modelo
with open(f'{SAVE_DIR}rnn_dataset{DATASET_VERSION}.pkl', 'rb') as f:
    rnn_model = pickle.load(f)

# Carregar dados
df_input = pd.read_csv('../../datasets/val/dataset1_inputs.csv', sep='\t')
df_output = pd.read_csv('../../datasets/val/dataset1_outputs.csv', sep='\t')

# Previsões
X_new = preprocessing_inference(df_input, text_vectorization_singlegram_be)
predictions = rnn_model.predict(X_new)
df_output['Predicted'] = np.where(predictions >= 0.5, 'AI', 'Human')

# Métricas
print("\nRelatório Final:")
print(classification_report(df_output['Label'], df_output['Predicted']))
print("Matriz de Confusão Final:")
print(confusion_matrix(df_output['Label'], df_output['Predicted']))

Forma de X_train: (3245, 5000)

Iniciando treinamento da RNN...
Epoch 0: Loss = 0.6939806997847989, Accuracy = 39.69%
Epoch 99: Loss = 0.13701189397312408, Accuracy = 99.35%

Relatório de Classificação:
              precision    recall  f1-score   support

           0       0.78      0.94      0.85       421
           1       0.92      0.71      0.80       391

    accuracy                           0.83       812
   macro avg       0.85      0.83      0.83       812
weighted avg       0.85      0.83      0.83       812

Matriz de Confusão:
[[396  25]
 [112 279]]

Pipeline concluído! Modelos salvos em 'modelos_rnn/'.
     ID                                               Text
0  D1-1  The cell cycle, or cell-division cycle, is the...
1  D1-2  The cell cycle is the process by which a cell ...
2  D1-3  Photons, in many atomic models in physics, are...
3  D1-4  A photon is a fundamental particle of light an...
4  D1-5  According to the theory of plate tectonics, Ea...

Relatório Final:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
