# Pipeline de Treinamento LSTM para Detecção de Texto
## (AI vs Human) - Multi Dataset

In [6]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding, Input
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.optimizers import Adam

In [7]:
def _encode_labels(frame):
    """Encode the 'Label' column of ``frame`` as integers (AI -> 1, Human -> 0).

    Raises:
        ValueError: if the column contains any value other than 'AI' or
            'Human'. ``Series.map`` would otherwise turn such values into NaN
            and silently produce float targets containing NaN.
    """
    encoded = frame['Label'].map({'AI': 1, 'Human': 0})
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted(frame.loc[unmapped, 'Label'].astype(str).unique())
        raise ValueError(f"Unknown value(s) in 'Label' column: {unknown}")
    return encoded.values


def tfidf_preprocessing(train, test, validation=None):
    """Vectorize the text splits with TF-IDF and encode their labels.

    The vectorizer is fitted on the training split only and then applied to
    the test (and, if given, validation) split.

    Args:
        train: DataFrame with 'text' and 'Label' columns.
        test: DataFrame with 'text' and 'Label' columns.
        validation: optional DataFrame with the same columns, or None.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test, tfidf, X_val, y_val)``.
        The X arrays are dense TF-IDF matrices, the y arrays hold 1 for 'AI'
        and 0 for 'Human', and ``tfidf`` is the fitted vectorizer. ``X_val``
        and ``y_val`` are None when no validation split is supplied.

    Raises:
        ValueError: if any split contains a label other than 'AI' or 'Human'.
    """
    has_validation = validation is not None

    # Validate labels first so bad data fails before the vectorizer is fitted.
    y_train = _encode_labels(train)
    y_test = _encode_labels(test)
    y_val = _encode_labels(validation) if has_validation else None

    tfidf = TfidfVectorizer(max_features=1000, stop_words='english')
    X_train = tfidf.fit_transform(train['text']).toarray()
    X_test = tfidf.transform(test['text']).toarray()
    X_val = tfidf.transform(validation['text']).toarray() if has_validation else None

    return X_train, y_train, X_test, y_test, tfidf, X_val, y_val

In [8]:
def load_dataset(dataset_version):
    """Read the cleaned CSV splits of one dataset version.

    Files are looked up under ``../../datasets/<dataset_version>/``. The
    train and test files are always read; the validation file is read only
    if it exists on disk.

    Args:
        dataset_version: value interpolated into the directory and file names.

    Returns:
        Tuple ``(train, test, validation)`` of DataFrames, where
        ``validation`` is None when the validation file is absent.
    """
    base_path = f'../../datasets/{dataset_version}/'
    train_csv, test_csv, val_csv = (
        os.path.join(base_path, file_name)
        for file_name in (
            f'Dataset{dataset_version}_train_clean.csv',
            f'Dataset{dataset_version}_test_clean.csv',
            f'Dataset{dataset_version}_validation_clean.csv',
        )
    )

    train = pd.read_csv(train_csv, sep=',')
    test = pd.read_csv(test_csv, sep=',')
    # The validation split is optional.
    validation = pd.read_csv(val_csv, sep=',') if os.path.exists(val_csv) else None

    return train, test, validation

## Função de Treinamento Modificada para LSTM

In [9]:
def train_evaluate_lstm(X_train, y_train, X_test, y_test, X_val, y_val, input_size):
    """Train a single-layer LSTM binary classifier and print test-set metrics.

    Every feature row is fed to the network as a sequence of length 1, so the
    2-D arrays are reshaped from (samples, features) to (samples, 1, features).

    Args:
        X_train, X_test: 2-D feature arrays of shape (samples, features).
        y_train, y_test: binary targets for the sigmoid output.
        X_val, y_val: optional validation features/targets; both may be None.
        input_size: number of features per row (the model's input width).

    Returns:
        The fitted Keras model. With ``restore_best_weights=True`` it carries
        the weights of the best monitored epoch when early stopping fires.
    """
    def as_sequence(features):
        # (samples, features) -> (samples, 1, features)
        return features.reshape(features.shape[0], 1, features.shape[1])

    X_train_seq = as_sequence(X_train)
    X_test_seq = as_sequence(X_test)

    print("\nIniciando treinamento da LSTM...")

    # An explicit Input layer replaces the `input_shape=` layer argument,
    # which current Keras versions warn about.
    model = Sequential([
        Input(shape=(1, input_size)),
        LSTM(64, dropout=0.2, recurrent_dropout=0.2),
        Dense(1, activation='sigmoid'),
    ])

    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(learning_rate=0.001),
        metrics=['accuracy']
    )

    has_validation = X_val is not None
    validation_data = (as_sequence(X_val), y_val) if has_validation else None

    # 'val_loss' only exists when validation data is supplied. Without it the
    # callback would never trigger and all 100 epochs would run, so fall back
    # to the training loss in that case.
    early_stop = EarlyStopping(
        monitor='val_loss' if has_validation else 'loss',
        patience=5,
        restore_best_weights=True,
    )

    model.fit(
        X_train_seq, y_train,
        epochs=100,
        validation_data=validation_data,
        callbacks=[early_stop],
        verbose=1
    )

    # Evaluation on the held-out test split
    loss, accuracy = model.evaluate(X_test_seq, y_test, verbose=0)
    print(f"\nTest Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")

    # Threshold the sigmoid output at 0.5 and flatten (samples, 1) to
    # (samples,) so the metric functions receive 1-D label arrays.
    lstm_pred = (model.predict(X_test_seq) > 0.5).astype(int).ravel()
    print("\nRelatório de Classificação:")
    print(classification_report(y_test, lstm_pred))
    print("Matriz de Confusão:")
    print(confusion_matrix(y_test, lstm_pred))

    return model

## Pipeline Principal para LSTM

In [12]:
# Configuration: which dataset version to train on and where artifacts go
DATASET_VERSION = 3
SAVE_DIR = 'modelos_lstm/'
os.makedirs(SAVE_DIR, exist_ok=True)

# Read the splits and convert them to TF-IDF features plus encoded labels
train, test, validation = load_dataset(DATASET_VERSION)
(X_train, y_train, X_test, y_test,
 tfidf, X_val, y_val) = tfidf_preprocessing(train, test, validation)

# Fit the LSTM and print its test-set metrics
n_features = X_train.shape[1]
lstm_model = train_evaluate_lstm(
    X_train, y_train, X_test, y_test, X_val, y_val,
    input_size=n_features,
)

# Persist the model and the fitted vectorizer side by side
model_path = f'{SAVE_DIR}lstm_dataset{DATASET_VERSION}.keras'
vectorizer_path = f'{SAVE_DIR}tfidf_dataset{DATASET_VERSION}.pkl'
lstm_model.save(model_path)
with open(vectorizer_path, 'wb') as f:
    pickle.dump(tfidf, f)

print("\nPipeline concluído! Modelos salvos em 'modelos_lstm/'.")


Iniciando treinamento da LSTM...
Epoch 1/100


  super().__init__(**kwargs)


[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.8364 - loss: 0.5101
Epoch 2/100
[1m 44/438[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 4ms/step - accuracy: 0.9508 - loss: 0.1564

  current = self.get_monitor_value(logs)


[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9441 - loss: 0.1604
Epoch 3/100
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9476 - loss: 0.1387
Epoch 4/100
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9490 - loss: 0.1324
Epoch 5/100
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9501 - loss: 0.1321
Epoch 6/100
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9526 - loss: 0.1185
Epoch 7/100
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9520 - loss: 0.1278
Epoch 8/100
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9510 - loss: 0.1283
Epoch 9/100
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9561 - loss: 0.1208
Epoch 10/100
[1m438/438[0m [32m━━━━━━━━━━

In [13]:
def preprocessing_inference(df, vectorizer):
    """Turn the 'Text' column of ``df`` into LSTM-ready input.

    Args:
        df: DataFrame with a 'Text' column.
        vectorizer: already-fitted object exposing ``transform``; its result
            must provide ``toarray()``.

    Returns:
        Array of shape (samples, 1, features): each row becomes a sequence
        of length 1.
    """
    features = vectorizer.transform(df['Text']).toarray()
    n_samples, n_features = features.shape[0], features.shape[1]
    return features.reshape(n_samples, 1, n_features)

# Load the trained model from disk
lstm_model = load_model(f'{SAVE_DIR}lstm_dataset{DATASET_VERSION}.keras')

# Load the vectorizer that was saved together with that model, instead of
# relying on whatever `tfidf` object is still in kernel memory; this keeps
# the features and the model tied to the same dataset version.
# Only unpickle files written by this notebook: pickle can execute code.
with open(f'{SAVE_DIR}tfidf_dataset{DATASET_VERSION}.pkl', 'rb') as f:
    inference_vectorizer = pickle.load(f)

# Load the external evaluation data (tab-separated)
# NOTE(review): assumes the rows of the inputs and outputs files are in the
# same order — confirm.
df_input = pd.read_csv('../../datasets/val/dataset1_inputs.csv', sep='\t')
df_output = pd.read_csv('../../datasets/val/dataset1_outputs.csv', sep='\t')

# Predictions: threshold at 0.5 and flatten (samples, 1) to (samples,) so a
# 1-D array is assigned as the new column
X_new = preprocessing_inference(df_input, inference_vectorizer)
predictions = (lstm_model.predict(X_new) > 0.5).astype(int).ravel()
df_output['Predicted'] = np.where(predictions == 1, 'AI', 'Human')

# Metrics
print("\nRelatório Final:")
print(classification_report(df_output['Label'], df_output['Predicted']))
print("Matriz de Confusão Final:")
print(confusion_matrix(df_output['Label'], df_output['Predicted']))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 242ms/step

Relatório Final:
              precision    recall  f1-score   support

          AI       0.67      0.13      0.22        15
       Human       0.52      0.93      0.67        15

    accuracy                           0.53        30
   macro avg       0.59      0.53      0.44        30
weighted avg       0.59      0.53      0.44        30

Matriz de Confusão Final:
[[ 2 13]
 [ 1 14]]
