# Pipeline de Treinamento GRU para Detecção de Texto Gerado por IA
## (AI vs Human) - Multi Dataset

In [6]:
import pandas as pd
import os
import pickle
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import GRU, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

In [7]:
def load_dataset(dataset_version):
    """Read the cleaned CSV splits of one dataset version.

    Files are looked up under ``../../datasets/<dataset_version>/`` and are
    named ``Dataset<dataset_version>_<split>_clean.csv``.

    Args:
        dataset_version: Identifier interpolated into the directory and
            file names.

    Returns:
        A ``(train, test, validation)`` tuple of DataFrames. ``validation``
        is ``None`` when the validation file does not exist on disk.
    """
    data_dir = f'../../datasets/{dataset_version}/'
    train_path = os.path.join(data_dir, f'Dataset{dataset_version}_train_clean.csv')
    test_path = os.path.join(data_dir, f'Dataset{dataset_version}_test_clean.csv')
    validation_path = os.path.join(data_dir, f'Dataset{dataset_version}_validation_clean.csv')

    train_df = pd.read_csv(train_path, sep=',')
    test_df = pd.read_csv(test_path, sep=',')

    # The validation split is optional: not every dataset version ships one.
    validation_df = (
        pd.read_csv(validation_path, sep=',')
        if os.path.exists(validation_path)
        else None
    )

    return train_df, test_df, validation_df

In [8]:
# NOTE(review): NAME is not referenced anywhere else in the visible notebook;
# presumably a leftover model file name — confirm before relying on it.
NAME = "unigram_be.keras"
# NOTE(review): this import sits mid-notebook; the other imports live in the
# first code cell.
from tensorflow.keras.layers import TextVectorization

# TextVectorization configuration for unigrams.
# This single module-level layer is shared by preprocess_text() and by the
# inference cell, so whichever text it was adapted on last determines the
# vocabulary used by every later call.
text_vectorization_singlegram_be = TextVectorization(
    max_tokens=5000,  # Maximum number of tokens
    output_mode="multi_hot",  # Multi-hot representation
    standardize="lower_and_strip_punctuation",  # Text normalization
)
# Label strings found in the 'Label' column and the class ids fed to the model.
_LABEL_TO_ID = {'AI': 1, 'Human': 0}


def _encode_labels(df):
    """Map the 'Label' column of ``df`` ('AI' / 'Human') to a 1/0 array.

    Raises:
        ValueError: if any label is missing or is not 'AI' / 'Human'.
            Without this check such rows would silently become NaN targets.
    """
    encoded = df['Label'].map(_LABEL_TO_ID)
    if encoded.isna().any():
        unexpected = sorted(set(df['Label'][encoded.isna()].astype(str)))
        raise ValueError(f"Unexpected values in 'Label' column: {unexpected}")
    return encoded.values


def preprocess_text(text_ds, adapt=True):
    """Turn the 'text' column of ``text_ds`` into a multi-hot feature matrix.

    Uses the shared module-level ``text_vectorization_singlegram_be`` layer.

    Args:
        text_ds: DataFrame with a 'text' column.
        adapt: When True (the default, kept for backward compatibility) the
            layer's vocabulary is re-learned from ``text_ds`` before
            vectorizing. Pass False to encode with the vocabulary the layer
            already has; this is required for test/validation data so their
            feature columns mean the same thing as the training columns.

    Returns:
        2-D NumPy array with one row per input text.
    """
    texts = text_ds['text']
    if adapt:
        text_vectorization_singlegram_be.adapt(texts)
    # One batched call instead of one eager layer call per row.
    return np.asarray(text_vectorization_singlegram_be(texts.to_numpy()))


def unigram_preprocessing(train, test, validation):
    """Vectorize the text and encode the labels of every available split.

    The vocabulary is learned from ``train`` only. Previously the shared
    vectorizer was re-adapted on each split, so train, test and validation
    were encoded with three different vocabularies: columns did not line up
    (and could even differ in number), and the vocabulary leaked information
    from the evaluation data.

    Args:
        train: DataFrame with 'text' and 'Label' columns.
        test: DataFrame with 'text' and 'Label' columns.
        validation: DataFrame with 'text' and 'Label' columns, or None.

    Returns:
        ``(X_train, y_train, X_test, y_test, X_val, y_val)``; ``X_val`` and
        ``y_val`` are None when ``validation`` is None.

    Raises:
        ValueError: if a 'Label' value is not 'AI' or 'Human'.
    """
    # Fit the vocabulary on the training text, then reuse it unchanged.
    X_train = preprocess_text(train, adapt=True)
    X_test = preprocess_text(test, adapt=False)

    # Convert the string labels to numeric class ids.
    y_train = _encode_labels(train)
    y_test = _encode_labels(test)

    # The validation split is optional.
    X_val = None
    y_val = None
    if validation is not None:
        X_val = preprocess_text(validation, adapt=False)
        y_val = _encode_labels(validation)

    return X_train, y_train, X_test, y_test, X_val, y_val

## Função de Treinamento Modificada para GRU

In [9]:
def train_evaluate_gru(X_train, y_train, X_test, y_test, X_val, y_val, input_size):
    """Train a single-layer GRU binary classifier and report test metrics.

    Each 2-D feature matrix ``(n_samples, n_features)`` is reshaped to
    ``(n_samples, 1, n_features)``, i.e. every sample is fed to the GRU as a
    sequence of length one.

    Args:
        X_train, y_train: Training features (2-D array) and binary targets.
        X_test, y_test: Test features (2-D array) and binary targets.
        X_val, y_val: Validation features/targets, or None when there is no
            validation split.
        input_size: Number of features per sample (last dimension of the GRU
            input).

    Returns:
        The fitted Keras model.

    Side effects:
        Prints training progress, test loss/accuracy, the classification
        report and the confusion matrix.
    """
    def _as_sequence(features):
        # (n_samples, n_features) -> (n_samples, 1, n_features)
        return features.reshape(features.shape[0], 1, features.shape[1])

    X_train_seq = _as_sequence(X_train)
    X_test_seq = _as_sequence(X_test)

    print("\nIniciando treinamento da GRU...")

    model = Sequential([
        GRU(64,
            input_shape=(1, input_size),
            dropout=0.2,
            recurrent_dropout=0.2),
        Dense(1, activation='sigmoid')
    ])

    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(learning_rate=0.001),
        metrics=['accuracy']
    )

    has_validation = X_val is not None
    validation_data = (_as_sequence(X_val), y_val) if has_validation else None

    # 'val_loss' only exists when validation data is passed to fit(). Without
    # it the callback would never trigger and all 100 epochs would run, so
    # fall back to the training loss in that case.
    early_stop = EarlyStopping(
        monitor='val_loss' if has_validation else 'loss',
        patience=5,
        restore_best_weights=True,
    )

    model.fit(
        X_train_seq, y_train,
        epochs=100,
        validation_data=validation_data,
        callbacks=[early_stop],
        verbose=1
    )

    # Evaluation on the held-out test split
    loss, accuracy = model.evaluate(X_test_seq, y_test, verbose=0)
    print(f"\nTest Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")

    # predict() returns shape (n_samples, 1); flatten so the metrics receive
    # 1-D label vectors matching y_test.
    gru_pred = (model.predict(X_test_seq) > 0.5).astype(int).ravel()
    print("\nRelatório de Classificação:")
    print(classification_report(y_test, gru_pred))
    print("Matriz de Confusão:")
    print(confusion_matrix(y_test, gru_pred))

    return model

## Pipeline Principal para GRU

In [None]:
DATASET_VERSION = 3
SAVE_DIR = 'modelos_gru/'
os.makedirs(SAVE_DIR, exist_ok=True)

# Load the train / test / (optional) validation splits
train, test, validation = load_dataset(DATASET_VERSION)

# Preprocessing: multi-hot unigram features and numeric labels
X_train, y_train, X_test, y_test, X_val, y_val = unigram_preprocessing(train, test, validation)

# Train the GRU and print its test metrics
gru_model = train_evaluate_gru(X_train, y_train, X_test, y_test, X_val, y_val, input_size=X_train.shape[1])

# Save the trained model
gru_model.save(f'{SAVE_DIR}gru_dataset{DATASET_VERSION}.keras')

# Save the vectorizer vocabulary next to the model. The previous code pickled
# an undefined name (`tfidf`), which raised NameError and left a truncated
# file behind. Re-adapt on the training text first so the saved vocabulary is
# guaranteed to be the training vocabulary, whatever split the shared
# vectorizer was adapted on last.
# NOTE(review): the 'tfidf_' prefix in the file name is historical; the file
# now holds the TextVectorization vocabulary (a list of tokens).
text_vectorization_singlegram_be.adapt(train['text'])
with open(f'{SAVE_DIR}tfidf_dataset{DATASET_VERSION}.pkl', 'wb') as f:
    pickle.dump(text_vectorization_singlegram_be.get_vocabulary(), f)

print("\nPipeline concluído! Modelos salvos em 'modelos_gru/'.")

2025-04-02 16:45:52.903783: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 64212024 exceeds 10% of free system memory.


## Código de Inferência para GRU

In [None]:
def preprocessing_inference(df, vectorizer):
    """Vectorize the 'Text' column of ``df`` and shape it for the GRU.

    Args:
        df: DataFrame with a 'Text' column.
            NOTE(review): the training splits are read through a lowercase
            'text' column — confirm the inference files really use 'Text'.
        vectorizer: Callable applied to the whole column; its result must
            expose ``.numpy()`` returning a 2-D array.

    Returns:
        Array of shape ``(n_samples, 1, n_features)``.
    """
    vectorized = vectorizer(df['Text'])
    features = vectorized.numpy()
    n_samples = features.shape[0]
    n_features = features.shape[1]
    # Insert a length-1 sequence axis between samples and features.
    return features.reshape(n_samples, 1, n_features)

# Load the trained model saved by the training pipeline cell
gru_model = load_model(f'{SAVE_DIR}gru_dataset{DATASET_VERSION}.keras')

# Load the inference inputs and their expected labels (tab-separated files).
# Rows of the two files are assumed to be aligned by position — TODO confirm.
df_input = pd.read_csv('../../datasets/val/dataset1_inputs.csv', sep='\t')
df_output = pd.read_csv('../../datasets/val/dataset1_outputs.csv', sep='\t')

# Make sure the shared vectorizer carries the training vocabulary before
# encoding new text (relies on `train` from the pipeline cell).
text_vectorization_singlegram_be.adapt(train['text'])

# Predictions
X_new = preprocessing_inference(df_input, text_vectorization_singlegram_be)
predictions = (gru_model.predict(X_new) > 0.5).astype(int)
# predict() yields shape (n_samples, 1); flatten so a plain 1-D array of
# labels is stored in the column.
df_output['Predicted'] = np.where(predictions.ravel() == 1, 'AI', 'Human')

# Metrics
print("\nRelatório Final:")
print(classification_report(df_output['Label'], df_output['Predicted']))
print("Matriz de Confusão Final:")
print(confusion_matrix(df_output['Label'], df_output['Predicted']))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 258ms/step

Relatório Final:
              precision    recall  f1-score   support

          AI       0.50      0.20      0.29        15
       Human       0.50      0.80      0.62        15

    accuracy                           0.50        30
   macro avg       0.50      0.50      0.45        30
weighted avg       0.50      0.50      0.45        30

Matriz de Confusão Final:
[[ 3 12]
 [ 3 12]]
