# Pipeline de Treinamento RNN para Detecção de Texto
## (AI vs Human) - Multi Dataset

In [6]:
import pandas as pd
import os
import pickle
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import SimpleRNN, Dense, Dropout
from tensorflow.keras.optimizers import SGD

## Função para Carregar Datasets (mantida igual)

In [7]:
def load_dataset(dataset_version):
    base_path = f'../../datasets/{dataset_version}/'
    train_file = f'Dataset{dataset_version}_train_clean.csv'
    test_file = f'Dataset{dataset_version}_test_clean.csv'
    validation_file = f'Dataset{dataset_version}_validation_clean.csv'
    
    train = pd.read_csv(os.path.join(base_path, train_file), sep=',')
    test = pd.read_csv(os.path.join(base_path, test_file), sep=',')
    
    validation_path = os.path.join(base_path, validation_file)
    if os.path.exists(validation_path):
        validation = pd.read_csv(validation_path, sep=',')
    else:
        validation = None
    
    return train, test, validation

## Pré-processamento com TF-IDF (mantido igual)

In [8]:
def tfidf_preprocessing(train, test, validation):
    tfidf = TfidfVectorizer(max_features=1000, stop_words='english')
    X_train = tfidf.fit_transform(train['text']).toarray()
    X_test = tfidf.transform(test['text']).toarray()
    
    y_train = train['Label'].map({'AI':1, 'Human':0}).values
    y_test = test['Label'].map({'AI':1, 'Human':0}).values
    
    if validation is not None:
        X_val = tfidf.transform(validation['text']).toarray()
        y_val = validation['Label'].map({'AI':1, 'Human':0}).values
    else:
        X_val, y_val = None, None
    
    return X_train, y_train, X_test, y_test, tfidf, X_val, y_val

## Nova Função de Treinamento com Keras RNN

In [9]:
def train_evaluate_rnn(X_train, y_train, X_test, y_test, X_val, y_val, input_size):
    # Reshape para formato sequencial
    X_train_seq = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
    X_test_seq = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])
    
    print("\nIniciando treinamento da RNN...")

    model = Sequential([
        SimpleRNN(64, input_shape=(1, input_size), activation='tanh', dropout=0.2),
        Dense(1, activation='sigmoid')
    ])
    
    optimizer = SGD(learning_rate=0.08)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    
    validation_data = (X_val.reshape(X_val.shape[0], 1, X_val.shape[1]), y_val) if X_val is not None else None
    
    model.fit(
        X_train_seq, y_train,
        epochs=100,
        validation_data=validation_data,
        verbose=1
    )
    
    # Avaliação
    loss, accuracy = model.evaluate(X_test_seq, y_test, verbose=0)
    print(f"\nTest Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")
    
    # Previsões
    rnn_pred = (model.predict(X_test_seq) > 0.5).astype(int)
    print("\nRelatório de Classificação:")
    print(classification_report(y_test, rnn_pred))
    print("Matriz de Confusão:")
    print(confusion_matrix(y_test, rnn_pred))
    
    return model

## Pipeline Principal Modificado

In [12]:
DATASET_VERSION = 3
SAVE_DIR = 'modelos_rnn/'
os.makedirs(SAVE_DIR, exist_ok=True)

# Carregar dados
train, test, validation = load_dataset(DATASET_VERSION)

# Pré-processamento
X_train, y_train, X_test, y_test, tfidf, X_val, y_val = tfidf_preprocessing(train, test, validation)

# Treinar RNN
rnn_model = train_evaluate_rnn(X_train, y_train, X_test, y_test, X_val, y_val, input_size=X_train.shape[1])

# Salvar modelos
rnn_model.save(f'{SAVE_DIR}rnn_dataset{DATASET_VERSION}.keras')
with open(f'{SAVE_DIR}tfidf_dataset{DATASET_VERSION}_rnn_simples.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

print("\nPipeline concluído! Modelos salvos em 'modelos_rnn/'.")


Iniciando treinamento da RNN...


  super().__init__(**kwargs)


Epoch 1/100
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.6242 - loss: 0.6740
Epoch 2/100
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8738 - loss: 0.5036
Epoch 3/100
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9033 - loss: 0.3166
Epoch 4/100
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9208 - loss: 0.2395
Epoch 5/100
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9260 - loss: 0.2057
Epoch 6/100
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9383 - loss: 0.1859
Epoch 7/100
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9410 - loss: 0.1725
Epoch 8/100
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9415 - loss: 0.1663
Epoch 9/100
[1m438/438[0m [32

## Código de Inferência Modificado

In [11]:
def preprocessing_inference(df, vectorizer):
    X = vectorizer.transform(df['Text']).toarray()
    return X.reshape(X.shape[0], 1, X.shape[1])

# Carregar modelo
rnn_model = load_model(f'{SAVE_DIR}rnn_dataset{DATASET_VERSION}.keras')

# Carregar dados
df_input = pd.read_csv('../../datasets/val/dataset1_inputs.csv', sep='\t')
df_output = pd.read_csv('../../datasets/val/dataset1_outputs.csv', sep='\t')

# Previsões
X_new = preprocessing_inference(df_input, tfidf)
predictions = (rnn_model.predict(X_new) > 0.5).astype(int)
df_output['Predicted'] = np.where(predictions == 1, 'AI', 'Human')

# Métricas
print("\nRelatório Final:")
print(classification_report(df_output['Label'], df_output['Predicted']))
print("Matriz de Confusão Final:")
print(confusion_matrix(df_output['Label'], df_output['Predicted']))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 118ms/step

Relatório Final:
              precision    recall  f1-score   support

          AI       0.60      0.20      0.30        15
       Human       0.52      0.87      0.65        15

    accuracy                           0.53        30
   macro avg       0.56      0.53      0.47        30
weighted avg       0.56      0.53      0.47        30

Matriz de Confusão Final:
[[ 3 12]
 [ 2 13]]
