In [19]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [20]:
# ==============================================
# Pré-processamento com Pandas/Scikit-learn
# ==============================================

# Carregar e combinar datasets
#inputs_df = pd.read_csv('Dataset6_clean_input.csv')
#outputs_df = pd.read_csv('Dataset6_clean_output.csv')
#df = pd.merge(inputs_df, outputs_df, on='ID')
df = pd.read_csv('../Tarefa_1/Dataset6_clean.csv' , sep=',')
# Vetorização do texto
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(df['Text']).toarray()
y = df['Label'].map({'AI':1, 'Human':0}).values

# Divisão treino-teste
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
    )

print(X_train)
print(X_test)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [39]:
# ==============================================
# Modelos Implementados com Numpy
# ==============================================

class LogisticRegression:
    def __init__(self, lr=0.01, n_iters=1000):
        self.lr = lr
        self.n_iters = n_iters
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0

        for _ in range(self.n_iters):
            linear = np.dot(X, self.weights) + self.bias
            y_pred = self._sigmoid(linear)
            
            dw = (1/n_samples) * np.dot(X.T, (y_pred - y))
            db = (1/n_samples) * np.sum(y_pred - y)
            
            self.weights -= self.lr * dw
            self.bias -= self.lr * db

    def predict(self, X):
        linear = np.dot(X, self.weights) + self.bias
        y_pred = self._sigmoid(linear)
        return (y_pred > 0.5).astype(int)

    def _sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import numpy as np

class NeuralNetwork:
    def __init__(self, input_size, hidden_size, output_size, lr=0.01, dropout_rate=0.0):
        self.W1 = np.random.randn(input_size, hidden_size) * 0.01
        self.b1 = np.zeros((1, hidden_size))
        self.W2 = np.random.randn(hidden_size, output_size) * 0.01
        self.b2 = np.zeros((1, output_size))
        self.lr = lr
        self.dropout_rate = dropout_rate  # Fraction of neurons to drop (0.0 means no dropout)
        self.loss_history = []  # To monitor loss over epochs

    def _relu(self, Z):
        return np.maximum(0, Z)

    def _relu_deriv(self, Z):
        return (Z > 0).astype(float)

    def _sigmoid(self, Z):
        return 1 / (1 + np.exp(-Z))

    def forward(self, X, training=True):
        # First layer forward
        self.z1 = np.dot(X, self.W1) + self.b1
        self.a1 = self._relu(self.z1)
        
        # Apply dropout during training
        if training and self.dropout_rate > 0.0:
            # Create dropout mask: 1 means keep, 0 means drop.
            self.dropout_mask = (np.random.rand(*self.a1.shape) > self.dropout_rate).astype(float)
            # Apply inverted dropout scaling so that no scaling is needed at test time.
            self.a1 *= self.dropout_mask
            self.a1 /= (1.0 - self.dropout_rate)
        else:
            # If not training, no dropout mask is used.
            self.dropout_mask = None
        
        # Second layer forward
        self.z2 = np.dot(self.a1, self.W2) + self.b2
        self.output = self._sigmoid(self.z2)
        return self.output

    def compute_loss(self, y, output):
        # Binary cross-entropy loss with a small epsilon for numerical stability
        epsilon = 1e-15
        y = y.reshape(-1, 1)
        loss = -np.mean(y * np.log(output + epsilon) + (1 - y) * np.log(1 - output + epsilon))
        return loss

    def backward(self, X, y, output):
        m = y.shape[0]
        y = y.reshape(-1, 1)
        
        # Compute gradients for the output layer
        dz2 = output - y
        dW2 = (1/m) * np.dot(self.a1.T, dz2)
        db2 = (1/m) * np.sum(dz2, axis=0, keepdims=True)
        
        # Backpropagate to the hidden layer
        dz1 = np.dot(dz2, self.W2.T) * self._relu_deriv(self.z1)
        # If dropout was applied, propagate the mask
        if self.dropout_rate > 0.0 and self.dropout_mask is not None:
            dz1 *= self.dropout_mask
        
        dW1 = (1/m) * np.dot(X.T, dz1)
        db1 = (1/m) * np.sum(dz1, axis=0, keepdims=True)
        
        return dW1, db1, dW2, db2

    def train(self, X, y, epochs=1000, print_loss=True):
        for epoch in range(epochs):
            # Forward pass with dropout enabled
            output = self.forward(X, training=True)
            
            # Compute and store loss
            loss = self.compute_loss(y, output)
            self.loss_history.append(loss)
            
            # Optionally print loss and accuracy at intervals
            if print_loss and (epoch % 100 == 0 or epoch == epochs - 1):
                predictions = self.predict(X)
                accuracy = np.mean(predictions.flatten() == y.flatten())
                print(f"Epoch {epoch}: Loss = {loss}, Accuracy = {accuracy*100:.2f}%")
            
            # Backward pass and weight update
            dW1, db1, dW2, db2 = self.backward(X, y, output)
            self.W1 -= self.lr * dW1
            self.b1 -= self.lr * db1
            self.W2 -= self.lr * dW2
            self.b2 -= self.lr * db2

    def predict(self, X):
        # During prediction, disable dropout by setting training=False
        output = self.forward(X, training=False)
        return (output > 0.5).astype(int)


In [40]:
# Treinar Regressão Logística
print("Training Logistic Regression...")
lr = LogisticRegression(lr=0.08, n_iters=1000)
lr.fit(X_train, y_train)
lr_acc = np.mean(lr.predict(X_test) == y_test)
lr_pred = lr.predict(X_test)
print(classification_report(y_test,lr_pred))
print(confusion_matrix(y_test,lr_pred))
print(f"LR Test Accuracy: {lr_acc:.2f}")


Training Logistic Regression...
              precision    recall  f1-score   support

           0       0.92      1.00      0.96       422
           1       0.99      0.91      0.95       390

    accuracy                           0.95       812
   macro avg       0.96      0.95      0.95       812
weighted avg       0.96      0.95      0.95       812

[[420   2]
 [ 36 354]]
LR Test Accuracy: 0.95


In [44]:
# ==============================================
# Treino e Avaliação
# ==============================================

# Treinar Rede Neural
print("\nTraining Neural Network...")
nn = NeuralNetwork(input_size=X_train.shape[1], hidden_size=64, 
                   output_size=1, lr=0.08)
nn.train(X_train, y_train, epochs=1000)
nn_pred = nn.predict(X_test)
print(classification_report(y_test,nn_pred))
print(confusion_matrix(y_test,nn_pred))


Training Neural Network...
Epoch 0: Loss = 0.6931757891583803, Accuracy = 45.02%
Epoch 100: Loss = 0.6924810097283001, Accuracy = 51.77%
Epoch 200: Loss = 0.6923632564757726, Accuracy = 51.77%
Epoch 300: Loss = 0.6921573640763262, Accuracy = 51.77%
Epoch 400: Loss = 0.6917250808366587, Accuracy = 51.77%
Epoch 500: Loss = 0.6908074167676949, Accuracy = 51.77%
Epoch 600: Loss = 0.6888703905370681, Accuracy = 51.77%
Epoch 700: Loss = 0.6847995957659427, Accuracy = 51.77%
Epoch 800: Loss = 0.6763488311519362, Accuracy = 54.58%
Epoch 900: Loss = 0.6592872855392184, Accuracy = 75.84%
Epoch 999: Loss = 0.6270542090362031, Accuracy = 90.60%
              precision    recall  f1-score   support

           0       0.84      1.00      0.91       422
           1       1.00      0.80      0.89       390

    accuracy                           0.90       812
   macro avg       0.92      0.90      0.90       812
weighted avg       0.92      0.90      0.90       812

[[421   1]
 [ 79 311]]
