In [27]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [28]:
# ==============================================
# Pré-processamento com Pandas/Scikit-learn
# ==============================================

# Carregar e combinar datasets
#inputs_df = pd.read_csv('Dataset6_clean_input.csv')
#outputs_df = pd.read_csv('Dataset6_clean_output.csv')
#df = pd.merge(inputs_df, outputs_df, on='ID')
df = pd.read_csv('../Tarefa_1/Dataset6_clean.csv' , sep=',')
# Vetorização do texto
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(df['Text']).toarray()
y = df['Label'].map({'AI':1, 'Human':0}).values

# Divisão treino-teste
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
    )

print(X_train)
print(X_test)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [29]:
# ==============================================
# Modelos Implementados com Numpy
# ==============================================

class LogisticRegression:
    def __init__(self, lr=0.01, n_iters=1000):
        self.lr = lr
        self.n_iters = n_iters
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0

        for _ in range(self.n_iters):
            linear = np.dot(X, self.weights) + self.bias
            y_pred = self._sigmoid(linear)
            
            dw = (1/n_samples) * np.dot(X.T, (y_pred - y))
            db = (1/n_samples) * np.sum(y_pred - y)
            
            self.weights -= self.lr * dw
            self.bias -= self.lr * db

    def predict(self, X):
        linear = np.dot(X, self.weights) + self.bias
        y_pred = self._sigmoid(linear)
        return (y_pred > 0.5).astype(int)

    def _sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import numpy as np

class NeuralNetwork:
    def __init__(self, input_size, hidden_size, output_size, lr=0.01, dropout_rate=0.0):
        self.W1 = np.random.randn(input_size, hidden_size) * 0.01
        self.b1 = np.zeros((1, hidden_size))
        self.W2 = np.random.randn(hidden_size, output_size) * 0.01
        self.b2 = np.zeros((1, output_size))
        self.lr = lr
        self.dropout_rate = dropout_rate  # Fraction of neurons to drop (0.0 means no dropout)
        self.loss_history = []  # To monitor loss over epochs

    def _relu(self, Z):
        return np.maximum(0, Z)

    def _relu_deriv(self, Z):
        return (Z > 0).astype(float)

    def _sigmoid(self, Z):
        return 1 / (1 + np.exp(-Z))

    def forward(self, X, training=True):
        # First layer forward
        self.z1 = np.dot(X, self.W1) + self.b1
        self.a1 = self._relu(self.z1)
        
        # Apply dropout during training
        if training and self.dropout_rate > 0.0:
            # Create dropout mask: 1 means keep, 0 means drop.
            self.dropout_mask = (np.random.rand(*self.a1.shape) > self.dropout_rate).astype(float)
            # Apply inverted dropout scaling so that no scaling is needed at test time.
            self.a1 *= self.dropout_mask
            self.a1 /= (1.0 - self.dropout_rate)
        else:
            # If not training, no dropout mask is used.
            self.dropout_mask = None
        
        # Second layer forward
        self.z2 = np.dot(self.a1, self.W2) + self.b2
        self.output = self._sigmoid(self.z2)
        return self.output

    def compute_loss(self, y, output):
        # Binary cross-entropy loss with a small epsilon for numerical stability
        epsilon = 1e-15
        y = y.reshape(-1, 1)
        loss = -np.mean(y * np.log(output + epsilon) + (1 - y) * np.log(1 - output + epsilon))
        return loss

    def backward(self, X, y, output):
        m = y.shape[0]
        y = y.reshape(-1, 1)
        
        # Compute gradients for the output layer
        dz2 = output - y
        dW2 = (1/m) * np.dot(self.a1.T, dz2)
        db2 = (1/m) * np.sum(dz2, axis=0, keepdims=True)
        
        # Backpropagate to the hidden layer
        dz1 = np.dot(dz2, self.W2.T) * self._relu_deriv(self.z1)
        # If dropout was applied, propagate the mask
        if self.dropout_rate > 0.0 and self.dropout_mask is not None:
            dz1 *= self.dropout_mask
        
        dW1 = (1/m) * np.dot(X.T, dz1)
        db1 = (1/m) * np.sum(dz1, axis=0, keepdims=True)
        
        return dW1, db1, dW2, db2

    def train(self, X, y, epochs=1000, print_loss=True):
        for epoch in range(epochs):
            # Forward pass with dropout enabled
            output = self.forward(X, training=True)
            
            # Compute and store loss
            loss = self.compute_loss(y, output)
            self.loss_history.append(loss)
            
            # Optionally print loss and accuracy at intervals
            if print_loss and (epoch % 100 == 0 or epoch == epochs - 1):
                predictions = self.predict(X)
                accuracy = np.mean(predictions.flatten() == y.flatten())
                print(f"Epoch {epoch}: Loss = {loss}, Accuracy = {accuracy*100:.2f}%")
            
            # Backward pass and weight update
            dW1, db1, dW2, db2 = self.backward(X, y, output)
            self.W1 -= self.lr * dW1
            self.b1 -= self.lr * db1
            self.W2 -= self.lr * dW2
            self.b2 -= self.lr * db2

    def predict(self, X):
        # During prediction, disable dropout by setting training=False
        output = self.forward(X, training=False)
        return (output > 0.5).astype(int)


In [30]:
class RNN:
    def __init__(self, input_size, hidden_size, output_size, lr=0.01, dropout_rate=0.0):
        """
        Initializes the RNN with given parameters.
        
        Parameters:
        - input_size: Number of features per time step.
        - hidden_size: Number of hidden units.
        - output_size: Number of output units (for binary classification, this is 1).
        - lr: Learning rate.
        - dropout_rate: Fraction of hidden units to drop during training (0.0 means no dropout).
        """
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.lr = lr
        self.dropout_rate = dropout_rate

        # Weight initialization:
        # Input to hidden weights
        self.W_xh = np.random.randn(input_size, hidden_size) * 0.01
        # Hidden to hidden weights (recurrent connections)
        self.W_hh = np.random.randn(hidden_size, hidden_size) * 0.01
        # Bias for hidden layer
        self.b_h = np.zeros((1, hidden_size))
        # Hidden to output weights
        self.W_hy = np.random.randn(hidden_size, output_size) * 0.01
        # Bias for output layer
        self.b_y = np.zeros((1, output_size))
        
        self.loss_history = []  # To store loss values over epochs

    def _tanh(self, x):
        """Tanh activation function."""
        return np.tanh(x)

    def _tanh_deriv(self, x):
        """Derivative of tanh activation function: 1 - tanh(x)^2."""
        return 1 - np.tanh(x) ** 2

    def _sigmoid(self, x):
        """Sigmoid activation function."""
        return 1 / (1 + np.exp(-x))
    
    def forward(self, X, training=True):
        """
        Forward pass through the RNN.
        
        Parameters:
        - X: Input array with shape (batch_size, seq_length, input_size)
        - training: Boolean flag indicating whether we are training (enables dropout)
        
        Returns:
        - output: Final predictions with shape (batch_size, output_size)
        """
        batch_size, seq_length, _ = X.shape
        # Initialize arrays to store hidden states and pre-activation values
        self.h = np.zeros((batch_size, seq_length, self.hidden_size))
        self.z = np.zeros((batch_size, seq_length, self.hidden_size))
        
        # Initialize previous hidden state as zeros
        h_prev = np.zeros((batch_size, self.hidden_size))
        
        # Process each time step
        for t in range(seq_length):
            # Linear combination for hidden state at time t
            self.z[:, t, :] = np.dot(X[:, t, :], self.W_xh) + np.dot(h_prev, self.W_hh) + self.b_h
            h_current = self._tanh(self.z[:, t, :])
            
            # Apply dropout to the hidden state if in training mode
            if training and self.dropout_rate > 0.0:
                # Create dropout mask: 1 indicates keep neuron, 0 indicates drop
                dropout_mask = (np.random.rand(*h_current.shape) > self.dropout_rate).astype(float)
                h_current *= dropout_mask
                # Inverted dropout scaling to maintain the expected value at test time
                h_current /= (1.0 - self.dropout_rate)
            
            self.h[:, t, :] = h_current
            h_prev = h_current
        
        # Use the last hidden state to compute the output
        self.output_linear = np.dot(h_prev, self.W_hy) + self.b_y
        self.output = self._sigmoid(self.output_linear)
        return self.output

    def compute_loss(self, y, output):
        """
        Computes binary cross-entropy loss.
        
        Parameters:
        - y: True labels with shape (batch_size,)
        - output: Predicted outputs with shape (batch_size, 1)
        
        Returns:
        - loss: Average binary cross-entropy loss over the batch
        """
        epsilon = 1e-15  # Small constant for numerical stability
        y = y.reshape(-1, 1)
        loss = -np.mean(y * np.log(output + epsilon) + (1 - y) * np.log(1 - output + epsilon))
        return loss

    def backward(self, X, y, output):
        """
        Performs backpropagation through time (BPTT) to compute gradients.
        
        Parameters:
        - X: Input array with shape (batch_size, seq_length, input_size)
        - y: True labels with shape (batch_size,)
        - output: Predicted outputs from the forward pass
        
        Returns:
        - dW_xh: Gradient for input-to-hidden weights
        - dW_hh: Gradient for hidden-to-hidden weights
        - db_h: Gradient for hidden biases
        - dW_hy: Gradient for hidden-to-output weights
        - db_y: Gradient for output biases
        """
        batch_size, seq_length, _ = X.shape
        y = y.reshape(-1, 1)
        
        # Gradients for output layer
        d_output = output - y  # (batch_size, 1)
        dW_hy = np.dot(self.h[:, -1, :].T, d_output) / batch_size
        db_y = np.sum(d_output, axis=0, keepdims=True) / batch_size
        
        # Initialize gradients for recurrent weights and biases
        dW_xh = np.zeros_like(self.W_xh)
        dW_hh = np.zeros_like(self.W_hh)
        db_h = np.zeros_like(self.b_h)
        
        # Gradient from the output layer to the last hidden state
        dh = np.dot(d_output, self.W_hy.T)  # (batch_size, hidden_size)
        dh_next = np.zeros((batch_size, self.hidden_size))
        
        # Backpropagation through time (iterate backwards over time steps)
        for t in reversed(range(seq_length)):
            # Total gradient for the hidden state at time t
            dh_total = dh + dh_next
            dz = dh_total * self._tanh_deriv(self.z[:, t, :])
            
            # Gradients for the input-to-hidden weights
            dW_xh += np.dot(X[:, t, :].T, dz) / batch_size
            
            # Determine the previous hidden state (zero if t == 0)
            h_prev = self.h[:, t-1, :] if t > 0 else np.zeros((batch_size, self.hidden_size))
            dW_hh += np.dot(h_prev.T, dz) / batch_size
            db_h += np.sum(dz, axis=0, keepdims=True) / batch_size
            
            # Propagate gradient to previous time step
            dh_next = np.dot(dz, self.W_hh.T)
        
        return dW_xh, dW_hh, db_h, dW_hy, db_y

    def train(self, X, y, epochs=1000, print_loss=True):
        """
        Trains the RNN over a specified number of epochs.
        
        Parameters:
        - X: Input array with shape (batch_size, seq_length, input_size)
        - y: Labels with shape (batch_size,)
        - epochs: Number of training iterations
        - print_loss: Flag to print loss and accuracy at intervals
        """
        for epoch in range(epochs):
            # Forward pass (with dropout enabled during training)
            output = self.forward(X, training=True)
            
            # Compute and store loss
            loss = self.compute_loss(y, output)
            self.loss_history.append(loss)
            
            # Optionally print loss and accuracy every 100 epochs (or on the final epoch)
            if print_loss and (epoch % 100 == 0 or epoch == epochs - 1):
                predictions = self.predict(X)
                accuracy = np.mean(predictions.flatten() == y.flatten())
                print(f"Epoch {epoch}: Loss = {loss}, Accuracy = {accuracy*100:.2f}%")
            
            # Backward pass and weight update
            dW_xh, dW_hh, db_h, dW_hy, db_y = self.backward(X, y, output)
            self.W_xh -= self.lr * dW_xh
            self.W_hh -= self.lr * dW_hh
            self.b_h -= self.lr * db_h
            self.W_hy -= self.lr * dW_hy
            self.b_y -= self.lr * db_y

    def predict(self, X):
        """
        Predicts the output for a given input.
        
        Parameters:
        - X: Input array with shape (batch_size, seq_length, input_size)
        
        Returns:
        - Binary predictions with shape (batch_size, output_size)
        """
        # Disable dropout during prediction
        output = self.forward(X, training=False)
        return (output > 0.5).astype(int)

In [31]:
# Treinar Regressão Logística
print("Training Logistic Regression...")
lr = LogisticRegression(lr=0.08, n_iters=1000)
lr.fit(X_train, y_train)
lr_acc = np.mean(lr.predict(X_test) == y_test)
lr_pred = lr.predict(X_test)
print(classification_report(y_test,lr_pred))
print(confusion_matrix(y_test,lr_pred))
print(f"LR Test Accuracy: {lr_acc:.2f}")


Training Logistic Regression...
              precision    recall  f1-score   support

           0       0.92      1.00      0.96       422
           1       0.99      0.91      0.95       390

    accuracy                           0.95       812
   macro avg       0.96      0.95      0.95       812
weighted avg       0.96      0.95      0.95       812

[[420   2]
 [ 36 354]]
LR Test Accuracy: 0.95


In [34]:
# Treinar Rede Neural
print("\nTraining Neural Network...")
nn = NeuralNetwork(input_size=X_train.shape[1], hidden_size=64, 
                   output_size=1, lr=0.08, dropout_rate=0.2)
nn.train(X_train, y_train, epochs=1000)
nn_pred = nn.predict(X_test)
print(classification_report(y_test,nn_pred))
print(confusion_matrix(y_test,nn_pred))


Training Neural Network...
Epoch 0: Loss = 0.6931665233546033, Accuracy = 47.83%
Epoch 100: Loss = 0.6924588774727781, Accuracy = 51.77%
Epoch 200: Loss = 0.6923222058228619, Accuracy = 51.77%
Epoch 300: Loss = 0.6920778759246174, Accuracy = 51.77%
Epoch 400: Loss = 0.6916446413645758, Accuracy = 51.77%
Epoch 500: Loss = 0.6907545403856307, Accuracy = 51.77%
Epoch 600: Loss = 0.6890333150173179, Accuracy = 51.77%
Epoch 700: Loss = 0.685667152598539, Accuracy = 51.77%
Epoch 800: Loss = 0.6793924252709512, Accuracy = 53.16%
Epoch 900: Loss = 0.6671298750840063, Accuracy = 70.48%
Epoch 999: Loss = 0.6458058621854854, Accuracy = 88.94%
              precision    recall  f1-score   support

           0       0.83      1.00      0.90       422
           1       1.00      0.77      0.87       390

    accuracy                           0.89       812
   macro avg       0.91      0.88      0.89       812
weighted avg       0.91      0.89      0.89       812

[[421   1]
 [ 89 301]]


In [33]:
X_train_seq = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
X_test_seq = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])

print("\nTraining RNN...")
# Ensure that X_train_seq has shape (batch_size, seq_length, input_size)
rnn = RNN(input_size=X_train_seq.shape[2], hidden_size=64, 
          output_size=1, lr=0.08, dropout_rate=0.2)
rnn.train(X_train_seq, y_train, epochs=1000)
rnn_pred = rnn.predict(X_test_seq)
print(classification_report(y_test, rnn_pred))
print(confusion_matrix(y_test, rnn_pred))


Training RNN...
Epoch 0: Loss = 0.6931534880106127, Accuracy = 48.47%
Epoch 100: Loss = 0.6922954745947354, Accuracy = 51.77%
Epoch 200: Loss = 0.6918956117584041, Accuracy = 51.77%
Epoch 300: Loss = 0.6911021904864952, Accuracy = 51.77%
Epoch 400: Loss = 0.6894707604885866, Accuracy = 51.77%
Epoch 500: Loss = 0.6859929063216463, Accuracy = 51.77%
Epoch 600: Loss = 0.6786207268005767, Accuracy = 53.19%
Epoch 700: Loss = 0.6636614135041534, Accuracy = 72.36%
Epoch 800: Loss = 0.6342197246335989, Accuracy = 90.26%
Epoch 900: Loss = 0.5819973560504849, Accuracy = 95.16%
Epoch 999: Loss = 0.5041302239109895, Accuracy = 96.73%
              precision    recall  f1-score   support

           0       0.94      0.99      0.97       422
           1       0.99      0.93      0.96       390

    accuracy                           0.96       812
   macro avg       0.97      0.96      0.96       812
weighted avg       0.96      0.96      0.96       812

[[419   3]
 [ 27 363]]
