In [410]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity
from scipy.spatial.distance import pdist, squareform
from scipy.spatial import distance
import pandas as pd
import numpy as np
from collections import defaultdict

In [411]:
wine = pd.read_csv('/Users/azulmakk/Universidad/Analisis Multivariado/TP/winequality-red.csv', header=None,
                   names=['pH', 'alcohol', 'fixed.acidity', 'free sulfur dioxide','quality'])
wine.iloc[:, :-1] = wine.iloc[:, :-1].apply(pd.to_numeric, errors='coerce')

wine = wine[wine['quality'].apply(lambda x: str(x).isdigit())]

selected_quality = ["5", "6", "7"]
sample = wine[wine['quality'].isin(selected_quality)]

sample_size = 300
sample = sample.sample(n=sample_size, random_state=42)
sample['quality'] = sample['quality'].astype('category')

In [412]:
def calculo_indicadores(predicciones, etiquetas):
    matriz_confusion = confusion_matrix(etiquetas, predicciones, labels=np.unique(etiquetas))
    
    accuracy = accuracy_score(etiquetas, predicciones)
    
    precision = precision_score(etiquetas, predicciones, average='weighted', zero_division=0)
    
    recall = recall_score(etiquetas, predicciones, average='weighted', zero_division=0)
    
    f1 = f1_score(etiquetas, predicciones, average='weighted', zero_division=0)

    indicadores = {"Matriz de confusion": matriz_confusion,"Accuracy": accuracy, "Precision": precision, "Recall": recall, "F1 Score": f1}
    return indicadores

## _k_ Nearest Neighbour

### Distancias Euclideas

In [413]:
X = sample[['pH', 'alcohol', 'fixed.acidity', 'free sulfur dioxide']]
y = sample['quality']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

In [414]:
param_grid = {'n_neighbors': list(range(1, 50))}

knnEucli = KNeighborsClassifier()
grid_searchEucli = GridSearchCV(knnEucli, param_grid, cv=5)

In [415]:
grid_searchEucli.fit(X_train, y_train)

In [416]:
best_modelEucli = grid_searchEucli.best_estimator_

predictionsEucli = best_modelEucli.predict(X_test)
indicadoresEucli = calculo_indicadores(predictionsEucli, y_test)

print("Best k:", best_modelEucli.get_params()['n_neighbors'])
print("Matriz de Confusion:\n", indicadoresEucli["Matriz de confusion"])
print("Accuracy:", indicadoresEucli["Accuracy"])
print("Precision:", indicadoresEucli["Precision"])
print("Recall:", indicadoresEucli["Recall"])
print("F1 Score:", indicadoresEucli["F1 Score"])

Best k: 16
Matriz de Confusion:
 [[20  9  1]
 [ 5 19  1]
 [ 0  3  2]]
Accuracy: 0.6833333333333333
Precision: 0.6970430107526882
Recall: 0.6833333333333333
F1 Score: 0.6834114959114959


### Distancia de Mahattan

In [417]:
knnCityBlock = KNeighborsClassifier(metric='manhattan')

grid_searchCityBlock = GridSearchCV(knnCityBlock, param_grid, cv=5)

grid_searchCityBlock.fit(X_train, y_train)

best_modelCityBlock = grid_searchCityBlock.best_estimator_

predictionsCityBlock = best_modelCityBlock.predict(X_test)
indicadoresCityBlock = calculo_indicadores(predictionsCityBlock, y_test)

accuracyCityBlock = accuracy_score(y_test, predictionsCityBlock)
precisionCityBlock = precision_score(y_test, predictionsCityBlock, average='weighted')
recallCityBlock = recall_score(y_test, predictionsCityBlock, average='weighted')
f1CityBlock = f1_score(y_test, predictionsCityBlock, average='weighted')

print("Best k:", best_modelCityBlock.get_params()['n_neighbors'])
print("Matriz de Confusion:\n", indicadoresCityBlock["Matriz de confusion"])
print("Accuracy:", accuracyCityBlock)
print("Precision:", precisionCityBlock)
print("Recall:", recallCityBlock)
print("F1 Score:", f1CityBlock)

Best k: 33
Matriz de Confusion:
 [[21  9  0]
 [ 7 18  0]
 [ 0  5  0]]
Accuracy: 0.65
Precision: 0.609375
Recall: 0.65
F1 Score: 0.6252268602540834


  _warn_prf(average, modifier, msg_start, len(result))


### Distancia de Chebychev

In [418]:
knn_chebyshev = KNeighborsClassifier(metric='chebyshev')

grid_search_chebyshev = GridSearchCV(knn_chebyshev, param_grid, cv=5)

grid_search_chebyshev.fit(X_train, y_train)

best_model_chebyshev = grid_search_chebyshev.best_estimator_

predictions_chebyshev = best_model_chebyshev.predict(X_test)
indicadores_chebyshev = calculo_indicadores(predictions_chebyshev, y_test)

accuracy_chebyshev = accuracy_score(y_test, predictions_chebyshev)
precision_chebyshev = precision_score(y_test, predictions_chebyshev, average='weighted')
recall_chebyshev = recall_score(y_test, predictions_chebyshev, average='weighted')
f1_chebyshev = f1_score(y_test, predictions_chebyshev, average='weighted')

print("Best k:", best_model_chebyshev.get_params()['n_neighbors'])
print("Matriz de Confusion:\n", indicadores_chebyshev["Matriz de confusion"])
print("Accuracy:", accuracy_chebyshev)
print("Precision:", precision_chebyshev)
print("Recall:", recall_chebyshev)
print("F1 Score:", f1_chebyshev)

Best k: 6
Matriz de Confusion:
 [[18 11  1]
 [ 6 16  3]
 [ 1  3  1]]
Accuracy: 0.5833333333333334
Precision: 0.5988888888888888
Recall: 0.5833333333333334
F1 Score: 0.5863636363636364


### Distancias de Canberra

In [419]:
canberra_distance = pairwise_distances(X_train, X_train, metric='canberra')

knn_canberra = KNeighborsClassifier(metric='precomputed')

grid_search_canberra = GridSearchCV(knn_canberra, param_grid, cv=5)

grid_search_canberra.fit(canberra_distance, y_train)

best_model_canberra = grid_search_canberra.best_estimator_

canberra_distance_test = pairwise_distances(X_test, X_train, metric='canberra')

predictions_canberra = best_model_canberra.predict(canberra_distance_test)
indicadores_canberra = calculo_indicadores(predictions_chebyshev, y_test)

accuracy_canberra = accuracy_score(y_test, predictions_canberra)
precision_canberra = precision_score(y_test, predictions_canberra, average='weighted')
recall_canberra = recall_score(y_test, predictions_canberra, average='weighted')
f1_canberra = f1_score(y_test, predictions_canberra, average='weighted')

print("Best k:", best_model_canberra.get_params()['n_neighbors'])
print("Matriz de Confusion:\n", indicadores_canberra["Matriz de confusion"])
print("Accuracy:", accuracy_canberra)
print("Precision:", precision_canberra)
print("Recall:", recall_canberra)
print("F1 Score:", f1_canberra)

Best k: 12
Matriz de Confusion:
 [[18 11  1]
 [ 6 16  3]
 [ 1  3  1]]
Accuracy: 0.6833333333333333
Precision: 0.7196180555555555
Recall: 0.6833333333333333
F1 Score: 0.6739766081871343


# Algoritmo de elaboración propia

In [420]:
def calculate_group_probabilitiesEucli(X_train, y_train, X_test, k_neighbors):
    probabilities = []

    for i in range(len(X_test)):
        distances = [(j, distance.euclidean(X_test.iloc[i], X_train.iloc[j])) for j in range(len(X_train))]
        distances.sort(key=lambda x: x[1])
        neighbors = distances[:k_neighbors]

        group_probabilities = defaultdict(float)
        total_distance = 0

        for idx, dist in neighbors:
            group = y_train.iloc[idx]
            weight = 1 / (dist + 1e-6)  
            group_probabilities[group] += weight
            total_distance += weight

        normalized_probabilities = {group: prob / total_distance for group, prob in group_probabilities.items()}
        probabilities.append(normalized_probabilities)

    return probabilities

k_neighborsEucli = best_modelEucli.get_params()['n_neighbors']
probabilitiesEucli = calculate_group_probabilitiesEucli(X_train, y_train, X_test, k_neighborsEucli)

threshold = 0.8

predictions_binaryEucli = []
for prob_dict in probabilitiesEucli:
    predicted_classEucli = max(prob_dict.items(), key=lambda x: x[1])[0]
    predictions_binaryEucli.append(predicted_classEucli)

y_true = y_test.astype(str)

confusion_matrix_binaryEucli = confusion_matrix(y_true, predictions_binaryEucli)
accuracy_binaryEucli = accuracy_score(y_true, predictions_binaryEucli)
precision_binaryEucli = precision_score(y_true, predictions_binaryEucli, average='weighted')
recall_binaryEucli = recall_score(y_true, predictions_binaryEucli, average='weighted')
f1_binaryEucli = f1_score(y_true, predictions_binaryEucli, average='weighted')

print("Matriz de Confusion:\n", confusion_matrix_binaryEucli)
print("Accuracy:", accuracy_binaryEucli)
print("Precision:", precision_binaryEucli)
print("Recall:", recall_binaryEucli)
print("F1 Score:", f1_binaryEucli)

Matriz de Confusion:
 [[19 11  0]
 [ 4 21  0]
 [ 1  2  2]]
Accuracy: 0.7
Precision: 0.7365196078431372
Recall: 0.7
F1 Score: 0.696081068962425


### Distancias de Manhattan

In [421]:
def calculate_group_probabilitiesManhattan(X_train, y_train, X_test, k_neighbors):
    probabilities = []

    for i in range(len(X_test)):
        manhattan_distances = [(j, distance.cityblock(X_test.iloc[i], X_train.iloc[j])) for j in range(len(X_train))]
        manhattan_distances.sort(key=lambda x: x[1])
        neighbors = manhattan_distances[:k_neighbors]

        group_probabilities = defaultdict(float)
        total_distance = 0

        for idx, dist in neighbors:
            group = y_train.iloc[idx]
            weight = 1 / (dist + 1e-6)  
            group_probabilities[group] += weight
            total_distance += weight

        normalized_probabilities = {group: prob / total_distance for group, prob in group_probabilities.items()}
        probabilities.append(normalized_probabilities)

    return probabilities

k_neighborsManhattan = best_modelCityBlock.get_params()['n_neighbors']

probabilitiesManhattan = calculate_group_probabilitiesManhattan(X_train, y_train, X_test, k_neighborsManhattan)

threshold = 0.8

predictions_binaryManhattan = []
for prob_dict in probabilitiesManhattan:
    predicted_classManhattan = max(prob_dict.items(), key=lambda x: x[1])[0]
    predictions_binaryManhattan.append(predicted_classManhattan)

y_true = y_test.astype(str)
confusion_matrix_binaryManhattan = confusion_matrix(y_true, predictions_binaryManhattan)
accuracy_binaryManhattan = accuracy_score(y_true, predictions_binaryManhattan)
precision_binaryManhattan = precision_score(y_true, predictions_binaryManhattan, average='weighted')
recall_binaryManhattan = recall_score(y_true, predictions_binaryManhattan, average='weighted')
f1_binaryManhattan = f1_score(y_true, predictions_binaryManhattan, average='weighted')

print("Matriz de Confusión:\n", confusion_matrix_binaryManhattan)
print("Accuracy:", accuracy_binaryManhattan)
print("Precision:", precision_binaryManhattan)
print("Recall:", recall_binaryManhattan)
print("F1 Score:", f1_binaryManhattan)

Matriz de Confusión:
 [[21  9  0]
 [ 7 18  0]
 [ 1  4  0]]
Accuracy: 0.65
Precision: 0.6040044493882092
Recall: 0.65
F1 Score: 0.6237893462469732


  _warn_prf(average, modifier, msg_start, len(result))


### Distancias de Chevychev

In [422]:
import numpy as np
from scipy.spatial import distance
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from collections import defaultdict

def calculate_group_probabilitiesChebyshev(X_train, y_train, X_test, k_neighbors):
    probabilities = []

    for i in range(len(X_test)):
        chebyshev_distances = [(j, distance.chebyshev(X_test.iloc[i], X_train.iloc[j])) for j in range(len(X_train))]
        chebyshev_distances.sort(key=lambda x: x[1])
        neighbors = chebyshev_distances[:k_neighbors]

        group_probabilities = defaultdict(float)
        total_distance = 0

        for idx, dist in neighbors:
            group = y_train.iloc[idx]
            weight = 1 / (dist + 1e-6)  # Añadir pequeño valor para evitar división por cero
            group_probabilities[group] += weight
            total_distance += weight

        normalized_probabilities = {group: prob / total_distance for group, prob in group_probabilities.items()}
        probabilities.append(normalized_probabilities)

    return probabilities

k_neighborsChebyshev = best_model_chebyshev.get_params()['n_neighbors']

probabilitiesChebyshev = calculate_group_probabilitiesChebyshev(X_train, y_train, X_test, k_neighborsChebyshev)

threshold = 0.8

predictions_binaryChebyshev = []
for prob_dict in probabilitiesChebyshev:
    predicted_classChebyshev = max(prob_dict.items(), key=lambda x: x[1])[0]
    predictions_binaryChebyshev.append(predicted_classChebyshev)

y_true = y_test.astype(str)
confusion_matrix_binaryChebyshev = confusion_matrix(y_true, predictions_binaryChebyshev)
accuracy_binaryChebyshev = accuracy_score(y_true, predictions_binaryChebyshev)
precision_binaryChebyshev = precision_score(y_true, predictions_binaryChebyshev, average='weighted')
recall_binaryChebyshev = recall_score(y_true, predictions_binaryChebyshev, average='weighted')
f1_binaryChebyshev = f1_score(y_true, predictions_binaryChebyshev, average='weighted')

print("Matriz de Confusión:\n", confusion_matrix_binaryChebyshev)
print("Accuracy:", accuracy_binaryChebyshev)
print("Precision:", precision_binaryChebyshev)
print("Recall:", recall_binaryChebyshev)
print("F1 Score:", f1_binaryChebyshev)

Matriz de Confusión:
 [[15 14  1]
 [ 4 18  3]
 [ 1  3  1]]
Accuracy: 0.5666666666666667
Precision: 0.6059523809523809
Recall: 0.5666666666666667
F1 Score: 0.5666666666666667


### Distancia de Canberra

In [423]:
def calculate_group_probabilitiesCanberra(X_train, y_train, X_test, k_neighbors):
    probabilities = []

    for i in range(len(X_test)):
        canberra_distances = [(j, distance.canberra(X_test.iloc[i], X_train.iloc[j])) for j in range(len(X_train))]
        canberra_distances.sort(key=lambda x: x[1])
        neighbors = canberra_distances[:k_neighbors]

        group_probabilities = defaultdict(float)
        total_distance = 0

        for idx, dist in neighbors:
            group = y_train.iloc[idx]
            weight = 1 / (dist + 1e-6)  
            group_probabilities[group] += weight
            total_distance += weight

        normalized_probabilities = {group: prob / total_distance for group, prob in group_probabilities.items()}
        probabilities.append(normalized_probabilities)

    return probabilities

k_neighborsCanberra = best_model_canberra.get_params()['n_neighbors']

probabilitiesCanberra = calculate_group_probabilitiesCanberra(X_train, y_train, X_test, k_neighborsCanberra)

threshold = 0.8

predictions_binaryCanberra = []
for prob_dict in probabilitiesCanberra:
    predicted_classCanberra = max(prob_dict.items(), key=lambda x: x[1])[0]
    predictions_binaryCanberra.append(predicted_classCanberra)

y_true = y_test.astype(str)
confusion_matrix_binaryCanberra = confusion_matrix(y_true, predictions_binaryCanberra)
accuracy_binaryCanberra = accuracy_score(y_true, predictions_binaryCanberra)
precision_binaryCanberra = precision_score(y_true, predictions_binaryCanberra, average='weighted')
recall_binaryCanberra = recall_score(y_true, predictions_binaryCanberra, average='weighted')
f1_binaryCanberra = f1_score(y_true, predictions_binaryCanberra, average='weighted')

print("Matriz de Confusión:\n", confusion_matrix_binaryCanberra)
print("Accuracy:", accuracy_binaryCanberra)
print("Precision:", precision_binaryCanberra)
print("Recall:", recall_binaryCanberra)
print("F1 Score:", f1_binaryCanberra)

Matriz de Confusión:
 [[21  9  0]
 [ 4 20  1]
 [ 0  4  1]]
Accuracy: 0.7
Precision: 0.7141919191919192
Recall: 0.7
F1 Score: 0.692984027466786
