In [269]:
import numpy as np
import pandas as pd
import seaborn as sns

# Código 1)

Aqui estamos fazendo o aumento do conjunto de treinamento e estamos implementando a função da distância de manhattan.

Para comparar o uso das duas distâncias, iremos utilizar a função do **kfold_cross_validation** visto na última aula e implementada para realizar a etapa 2 das observações propostas pelo relatório.

In [270]:
treino = [
    ([1, 2], 0),
    ([2, 3], 0),
    ([3, 3], 0),
    ([5, 4], 0),
    ([6, 5], 1),
    ([7, 7], 1),
    ([8, 6], 1),
    ([9, 9], 2),
    ([10, 10], 2),
    ([13, 11], 2),
    ([12, 13], 2),
    ([20, 19], 3),
    ([23, 20], 3),
    ([28, 22], 3),
    ([25, 21], 3),
]

def calcular_distancia_euclides(p1, p2):
  return ((p1[0] - p2[0])**2 + (p1[1] - p2[1])**2)**0.5

def calcular_distancia_manhattan(p1, p2):
  return abs(p1[0] - p2[0]) + abs(p1[1] - p2[1])

Para fazer a comparação do uso das duas distâncias, nós usaremos o **Kfold Cross Validation** para calcular a acurácia utilizando as duas funções, com distâncias diferentes para o mesmo dataset.

In [271]:
X = np.array([ponto for ponto, _ in treino])
y = np.array([classe for _, classe in treino])

In [272]:
def knn(novo_ponto, treino, k = 3, distancia_fn = "euclides"):
  distancias = []
  for ponto, classe in treino:
    if distancia_fn == "euclides":
      distancia = calcular_distancia_euclides(novo_ponto, ponto)
    elif distancia_fn == "manhattan":
      distancia = calcular_distancia_manhattan(novo_ponto, ponto)
    else:
      raise ValueError("Função de distância inválida")

    distancias.append((distancia, classe))

  distancias.sort(key = lambda x: x[0])
  k_vizinhos = distancias[:k]

  contagem = {}

  for _, classe in k_vizinhos:
    if classe not in contagem:
      contagem[classe] = 0
    contagem[classe] += 1

  return max(contagem, key = contagem.get)

def k_fold_cross_validation_with_distance(X, y, k_folds=10, k_neighbors=5, distancia_fn = "euclides"):
    n = len(X)
    fold_size = n // k_folds
    indices = np.arange(n)
    np.random.shuffle(indices)

    scores = []
    for fold in range(k_folds):
        start = fold * fold_size
        end   = (fold + 1) * fold_size if fold < k_folds - 1 else n

        val_idx   = indices[start:end]
        train_idx = np.concatenate([indices[:start], indices[end:]])

        X_train, y_train = X[train_idx], y[train_idx]
        X_val,   y_val   = X[val_idx],   y[val_idx]

        treino_fold = list(zip(X_train.tolist(), y_train.tolist()))
        correct = 0
        for xi, yi in zip(X_val, y_val):
            pred = knn(xi.tolist(),
                       treino_fold,
                       k_neighbors,
                       distancia_fn)
            if pred == yi:
                correct += 1

        acc = correct / len(X_val)
        scores.append(acc)

    return scores

In [273]:
distances = ["euclides", "manhattan"]
results = []

for dist in distances:
    scores = k_fold_cross_validation_with_distance(
        X, y,
        k_folds=10,
        k_neighbors=4,
        distancia_fn=dist
    )

    results.append({
        "distancia":    dist,
        "mean_accuracy": np.mean(scores),
        "std_accuracy":  np.std(scores)
    })

pd.DataFrame(results)

Unnamed: 0,distancia,mean_accuracy,std_accuracy
0,euclides,0.733333,0.416333
1,manhattan,0.966667,0.1


O que podemos concluir da comparação entre as distâncias de Euclides e Manhattan, é que quando variamos o número de "k_vizinhos" para cada distância, o modelo nos da uma acurácia diferente onde não temos uma certeza de qual é melhor que o outro, por conta da forma que o cálculo é feito.

Quando aumentamos o número de vizinhos, a distância euclidiana **considerará mais pontos distantes** por conta da sua fórmula, entretanto, a de Manhattan faz isso de outra forma pelo somatório dos módulos, considerando menos pontos.

# Código 2)

In [274]:
def load_dataset(dataset = 'local'):
  if dataset == 'local':
    return pd.read_csv('titanic.csv')

  return sns.load_dataset('titanic')

titanic_df = load_dataset('titanic')
local_titanic_df = load_dataset()

In [275]:
def preprocess_titanic_df(df, columns_map):
  selected_cols = list(columns_map.values())
  df = df[selected_cols].dropna().copy()

  sex_col = columns_map['sex']
  df[sex_col] = df[sex_col].map({'male': 1, 'female': 0})

  return df

columns_map_original = {
    'survived': 'survived',
    'pclass': 'pclass',
    'sex': 'sex',
    'age': 'age',
    'fare': 'fare'
}

columns_map_local = {
    'survived': 'Survived',
    'pclass': 'Pclass',
    'sex': 'Sex',
    'age': 'Age',
    'fare': 'Fare'
}


titanic_df = preprocess_titanic_df(titanic_df, columns_map_original)
local_titanic_df = preprocess_titanic_df(local_titanic_df, columns_map_local)

In [276]:
def extract_and_normalize_features(df, feature_cols, target_col):
  X = df[feature_cols].values
  y = df[target_col].values

  X_normalized = (X - X.mean(axis=0)) / X.std(axis=0)

  return X_normalized, y

feature_cols_original = ['pclass', 'sex', 'age', 'fare']
target_col_original = 'survived'

feature_cols_local = ['Pclass', 'Sex', 'Age', 'Fare']
target_col_local = 'Survived'

X_original, y_original = extract_and_normalize_features(
    titanic_df,
    feature_cols_original,
    target_col_original)

X_local, y_local = extract_and_normalize_features(
    local_titanic_df,
    feature_cols_local,
    target_col_local)

y_local = y_local.astype(int)

In [277]:
def euclidean_distance(x1, x2):
  return np.sqrt(np.sum((x1 - x2)**2))

In [278]:
def knn_predict(X_train, y_train, x_test, k = 10):
  distances = [euclidean_distance(x_test, x_train) for x_train in X_train]
  k_indices = np.argsort(distances)[:k]
  k_nearest_labels = y_train[k_indices]
  counts = np.bincount(k_nearest_labels)
  return np.argmax(counts)


def k_fold_cross_validation(X, y, k_folds=10, k_neighbors=5):
    fold_size = len(X) // k_folds
    indices = np.arange(len(X))
    np.random.shuffle(indices)

    scores = []

    for fold in range(k_folds):
        start = fold * fold_size
        end = (start + fold_size) if fold != k_folds - 1 else len(X)

        val_indices = indices[start:end]
        train_indices = np.concatenate((indices[:start], indices[end:]))

        X_train, y_train = X[train_indices], y[train_indices]
        X_val, y_val = X[val_indices], y[val_indices]

        correct = 0
        for i in range(len(X_val)):
            prediction = knn_predict(X_train, y_train, X_val[i], k_neighbors)
            if prediction == y_val[i]:
                correct += 1

        accuracy = correct / len(X_val)
        scores.append(accuracy)
        print(f"Fold {fold + 1}: Accuracy = {accuracy:.4f}")

    print("Average accuracy:", np.mean(scores))

In [279]:
k_fold_cross_validation(X_original, y_original, k_folds = 10, k_neighbors = 5)

Fold 1: Accuracy = 0.8592
Fold 2: Accuracy = 0.8028
Fold 3: Accuracy = 0.7042
Fold 4: Accuracy = 0.8451
Fold 5: Accuracy = 0.7042
Fold 6: Accuracy = 0.8451
Fold 7: Accuracy = 0.8592
Fold 8: Accuracy = 0.8169
Fold 9: Accuracy = 0.7746
Fold 10: Accuracy = 0.8667
Average accuracy: 0.8077934272300469


In [280]:
k_fold_cross_validation(X_local, y_local, k_folds = 10, k_neighbors = 5)

Fold 1: Accuracy = 0.8028
Fold 2: Accuracy = 0.7887
Fold 3: Accuracy = 0.9155
Fold 4: Accuracy = 0.7324
Fold 5: Accuracy = 0.8028
Fold 6: Accuracy = 0.8451
Fold 7: Accuracy = 0.7887
Fold 8: Accuracy = 0.9155
Fold 9: Accuracy = 0.7606
Fold 10: Accuracy = 0.7467
Average accuracy: 0.8098779342723004


# Código 3)

Implementação do KNN do scikit-learn para fazer a comparação com o código 2

In [281]:
import numpy as np

class KNNClassifier:
    def __init__(self, n_neighbors=5, metric='euclidean'):
        self.n_neighbors = n_neighbors
        self.metric = metric

    def fit(self, X, y):
        self.X_train = np.array(X)
        self.y_train = np.array(y)

    def _compute_distance(self, x1, x2):
        if self.metric == 'euclidean':
            return np.sqrt(np.sum((x1 - x2) ** 2))
        elif self.metric == 'manhattan':
            return np.sum(np.abs(x1 - x2))
        else:
            raise ValueError("Distance not supported")

    def predict(self, X):
        X = np.array(X)
        predictions = []

        for x in X:
            distances = [self._compute_distance(x, x_train) for x_train in self.X_train]
            k_indices = np.argsort(distances)[:self.n_neighbors]
            k_nearest_labels = self.y_train[k_indices]

            counts = np.bincount(k_nearest_labels)
            prediction = np.argmax(counts)
            predictions.append(prediction)

        return np.array(predictions)

In [282]:
knn = KNNClassifier(n_neighbors=5, metric='euclidean')
knn.fit(X_original, y_original)

pred_original = knn.predict(X_original)

accuracy_original = np.mean(pred_original == y_original)

print(accuracy_original)

0.8683473389355743


In [283]:
knn = KNNClassifier(n_neighbors=5, metric='euclidean')
knn.fit(X_local, y_local)

pred_local = knn.predict(X_local)

accuracy_local = np.mean(pred_local == y_local)

print(accuracy_local)

0.8683473389355743
