## Experimentação no laboratório de Machine Learning

#### Construção do modelo de Regressão Logística para o dataset Iris

In [48]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score

In [49]:
iris = datasets.load_iris()
X = iris.data
y = iris.target

In [50]:
minmax_scaler = MinMaxScaler()
X_normalized = minmax_scaler.fit_transform(X)

standard_scaler = StandardScaler()
X_standardized = standard_scaler.fit_transform(X)

In [51]:
X_normalized_bias = np.c_[np.ones(X_normalized.shape[0]), X_normalized]
X_standardized_bias = np.c_[np.ones(X_standardized.shape[0]), X_standardized]

In [52]:
def sigmoid(z):
    z_clipped = np.clip(z, -500, 500)
    return 1 / (1 + np.exp(-z_clipped))

In [53]:
def cost(theta, X, y):
  m = len(y)

  y = y.reshape(-1, 1) if y.ndim == 1 else y

  h = sigmoid(np.dot(X, theta))
  epsilon = 1e-10

  h = np.clip(h, epsilon, 1 - epsilon)

  J = (-1/m) * np.sum(y * np.log(h) + (1 - y) * np.log(1 - h))

  grad = (1/m) * np.dot(X.T, (h - y))

  return J, grad

In [54]:
initial_theta = np.zeros(X.shape[1]+1).reshape(X.shape[1]+1, 1)

def gradient_descent(
    X,
    y,
    theta = initial_theta,
    alpha = 0.01,
    num_iterations = 1500):
  m, n_plus_1 = X.shape
  theta = np.zeros((n_plus_1, 1))

  J_history = []

  y = y.reshape(-1, 1) if y.ndim == 1 else y

  for iteration in range(num_iterations):
    J, grad = cost(theta, X, y)
    theta = theta - alpha * grad
    J_history.append(J)

  return theta, J_history

In [55]:
def train(X, y, alpha=0.01, num_iterations=1500):
    num_classes = len(np.unique(y))
    all_theta = []

    for c in range(num_classes):
        y_c = (y == c).astype(int)
        theta_c, _ = gradient_descent(X, y_c, alpha, num_iterations)
        all_theta.append(theta_c)

    return np.array(all_theta).reshape(num_classes, X.shape[1], 1)

In [56]:
def predict(X, all_theta):
  num_classes = all_theta.shape[0]
  probabilities = np.array([sigmoid(np.dot(X, all_theta[c])) for c in range(num_classes)])
  predictions = np.argmax(probabilities, axis=0)

  return predictions.flatten()

In [57]:
K = 5
skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=42)

formatted_datasets = {
    "Normalized": X_normalized_bias,
    "Standardized": X_standardized_bias
}

results = {}

for name, X_data in formatted_datasets.items():
  fold_accuracies = []
  fold_confusion_matrices = []


  for fold, (train_index, test_index) in enumerate(skf.split(X_data, y)):
    X_train, X_test = X_data[train_index], X_data[test_index]
    y_train, y_test = y[train_index], y[test_index]

    trained_all_theta = train(X_train, y_train)
    y_pred = predict(X_test, trained_all_theta)

    accuracy = accuracy_score(y_test, y_pred)
    fold_accuracies.append(accuracy)

    cm = confusion_matrix(y_test, y_pred)
    fold_confusion_matrices.append(cm)

    print(f"Fold {fold+1}:")
    print(f"  Acurácia: {accuracy:.4f}")
    print("  Matriz de Confusão:")
    print(cm)
    print("-" * 30)

    avg_accuracy = np.mean(fold_accuracies)
    results[name] = {
        "acuracias": fold_accuracies,
        "acuracia_media": avg_accuracy,
        "matrizes_confusao": fold_confusion_matrices
    }
    print(f"Acurácia Média para o Dataset {name}: {avg_accuracy:.4f}")

Fold 1:
  Acurácia: 0.8333
  Matriz de Confusão:
[[10  0  0]
 [ 0  9  1]
 [ 0  4  6]]
------------------------------
Acurácia Média para o Dataset Normalized: 0.8333
Fold 2:
  Acurácia: 0.7667
  Matriz de Confusão:
[[10  0  0]
 [ 4  3  3]
 [ 0  0 10]]
------------------------------
Acurácia Média para o Dataset Normalized: 0.8000
Fold 3:
  Acurácia: 0.7000
  Matriz de Confusão:
[[9 1 0]
 [0 9 1]
 [0 7 3]]
------------------------------
Acurácia Média para o Dataset Normalized: 0.7667
Fold 4:
  Acurácia: 0.7667
  Matriz de Confusão:
[[10  0  0]
 [ 1  3  6]
 [ 0  0 10]]
------------------------------
Acurácia Média para o Dataset Normalized: 0.7667
Fold 5:
  Acurácia: 0.7667
  Matriz de Confusão:
[[10  0  0]
 [ 0 10  0]
 [ 0  7  3]]
------------------------------
Acurácia Média para o Dataset Normalized: 0.7667
Fold 1:
  Acurácia: 0.8667
  Matriz de Confusão:
[[10  0  0]
 [ 0  6  4]
 [ 0  0 10]]
------------------------------
Acurácia Média para o Dataset Standardized: 0.8667
Fold 2:
  A

In [58]:
for name, data in results.items():
    print(f"\nDataset: {name}")
    print(f"Acurácias Individuais por Fold: {[f'{acc:.4f}' for acc in data['acuracias']]}")
    print(f"Acurácia Média: {data['acuracia_media']:.4f}")
    print("Matrizes de Confusão para cada Fold:")
    for i, cm in enumerate(data['matrizes_confusao']):
        print(f"Fold {i+1}:\n{cm}")


Dataset: Normalized
Acurácias Individuais por Fold: ['0.8333', '0.7667', '0.7000', '0.7667', '0.7667']
Acurácia Média: 0.7667
Matrizes de Confusão para cada Fold:
Fold 1:
[[10  0  0]
 [ 0  9  1]
 [ 0  4  6]]
Fold 2:
[[10  0  0]
 [ 4  3  3]
 [ 0  0 10]]
Fold 3:
[[9 1 0]
 [0 9 1]
 [0 7 3]]
Fold 4:
[[10  0  0]
 [ 1  3  6]
 [ 0  0 10]]
Fold 5:
[[10  0  0]
 [ 0 10  0]
 [ 0  7  3]]

Dataset: Standardized
Acurácias Individuais por Fold: ['0.8667', '0.8333', '0.6000', '0.7667', '0.7667']
Acurácia Média: 0.7667
Matrizes de Confusão para cada Fold:
Fold 1:
[[10  0  0]
 [ 0  6  4]
 [ 0  0 10]]
Fold 2:
[[10  0  0]
 [ 0 10  0]
 [ 0  5  5]]
Fold 3:
[[10  0  0]
 [ 1  8  1]
 [ 0 10  0]]
Fold 4:
[[10  0  0]
 [ 0  4  6]
 [ 0  1  9]]
Fold 5:
[[10  0  0]
 [ 0 10  0]
 [ 0  7  3]]
