# Minimum-distance vs. 1-NN classifiers on the Iris, Wine and Breast Cancer datasets

In [16]:
import sklearn.datasets as datasets
import numpy as np
import matplotlib.pyplot as plt

In [17]:
# Load the three scikit-learn datasets used to compare the minimum-distance
# and 1-NN classifiers. For each dataset we keep the loader result itself plus
# its feature matrix, label vector, feature names and class names.

# 1. Iris dataset
iris = datasets.load_iris()
iris_data, iris_target = iris.data, iris.target
iris_feature_names, iris_target_names = iris.feature_names, iris.target_names

# 2. Wine dataset
wine = datasets.load_wine()
wine_data, wine_target = wine.data, wine.target
wine_feature_names, wine_target_names = wine.feature_names, wine.target_names

# 3. Breast cancer dataset
cancer = datasets.load_breast_cancer()
cancer_data, cancer_target = cancer.data, cancer.target
cancer_feature_names, cancer_target_names = cancer.feature_names, cancer.target_names


In [18]:
# Classifier building blocks: per-class centroids, the minimum-distance rule, and the 1-nearest-neighbour rule

def find_centroid(data, target):
    """Compute the mean feature vector of every class.

    Parameters
    ----------
    data : array with one sample per row.
    target : array holding the class label of each row of ``data``.

    Returns
    -------
    np.ndarray
        One row per distinct label, in the order given by ``np.unique(target)``.
    """
    class_means = [
        np.mean(data[target == label], axis=0)  # average over the rows of this class
        for label in np.unique(target)
    ]
    return np.array(class_means)

def find_min_distance(new_sample, centroid):
    """Minimum-distance rule: pick the centroid closest to ``new_sample``.

    Parameters
    ----------
    new_sample : feature vector to classify.
    centroid : 2-D array with one centroid per row.

    Returns
    -------
    Row index of the nearest centroid (Euclidean distance). Callers in this
    notebook compare that index directly with the class label.
    """
    # Euclidean distance from the sample to every centroid row.
    gaps = np.linalg.norm(centroid - new_sample, axis=1)
    nearest_row = np.argmin(gaps)
    return nearest_row

def find_1_nearest_neighbor(new_sample, data, target):
    """1-NN rule: return the label of the reference sample closest to ``new_sample``.

    Parameters
    ----------
    new_sample : feature vector to classify.
    data : 2-D array of reference samples, one per row.
    target : label of each row of ``data``.

    Returns
    -------
    The entry of ``target`` at the position of the smallest Euclidean distance
    (the first such position when several are tied).
    """
    # Euclidean distance from the sample to every reference row.
    gaps = np.linalg.norm(data - new_sample, axis=1)
    closest = np.argmin(gaps)
    return target[closest]



In [19]:
# Validate models using a stratified holdout split (70/30), 10-fold cross-validation, and leave-one-out cross-validation
def holdout_stratified(data, target, classifier, fit=None, random_state=None):
    """Estimate accuracy on a single stratified 70/30 train/test split.

    Parameters
    ----------
    data : 2-D array of samples, one per row.
    target : label of each row of ``data``.
    classifier : callable ``classifier(sample, model)`` returning a prediction
        that is compared with the true label using ``==``.
    fit : optional callable ``fit(X_train, y_train)`` that builds the ``model``
        handed to ``classifier``. Defaults to ``find_centroid``. Pass e.g.
        ``lambda X, y: (X, y)`` so that an instance-based classifier (1-NN)
        searches only the training split and never sees the test samples.
    random_state : optional seed forwarded to ``train_test_split``. The default
        ``None`` keeps the split random, so the score changes between runs.

    Returns
    -------
    float
        Fraction of the test split that was classified correctly.
    """
    from sklearn.model_selection import train_test_split

    if fit is None:
        fit = find_centroid

    # stratify=target keeps the class proportions equal in both parts.
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, test_size=0.3, stratify=target, random_state=random_state
    )

    # The model is built from the training part only.
    model = fit(X_train, y_train)

    hits = sum(
        1 for sample, label in zip(X_test, y_test) if classifier(sample, model) == label
    )
    return hits / len(X_test)

def k_fold_cross_validation(data, target, classifier, k, fit=None):
    """Estimate accuracy as the mean per-fold accuracy of k-fold cross-validation.

    Folds come from ``KFold(n_splits=k)`` with its default settings, i.e.
    contiguous, unshuffled and not stratified. If the rows are sorted by class
    (verify for your dataset) the test folds will be class-imbalanced.

    Parameters
    ----------
    data : 2-D array of samples, one per row.
    target : label of each row of ``data``.
    classifier : callable ``classifier(sample, model)`` returning a prediction
        that is compared with the true label using ``==``.
    k : number of folds.
    fit : optional callable ``fit(X_train, y_train)`` that builds the ``model``
        handed to ``classifier``. Defaults to ``find_centroid``. Pass e.g.
        ``lambda X, y: (X, y)`` so that an instance-based classifier (1-NN)
        searches only the training folds and never sees the test samples.

    Returns
    -------
    float
        Mean of the k per-fold accuracies.
    """
    from sklearn.model_selection import KFold

    if fit is None:
        fit = find_centroid

    fold_scores = []
    for train_index, test_index in KFold(n_splits=k).split(data):
        # The model is rebuilt from the training folds on every iteration.
        model = fit(data[train_index], target[train_index])
        hits = sum(1 for i in test_index if classifier(data[i], model) == target[i])
        fold_scores.append(hits / len(test_index))
    return sum(fold_scores) / k

def leave_one_out_cross_validation(data, target, classifier, fit=None):
    """Estimate accuracy with leave-one-out cross-validation.

    Every sample is held out once; the model is built from all remaining
    samples and used to classify the held-out one.

    Parameters
    ----------
    data : 2-D array of samples, one per row.
    target : label of each row of ``data``.
    classifier : callable ``classifier(sample, model)`` returning a prediction
        that is compared with the true label using ``==``.
    fit : optional callable ``fit(X_train, y_train)`` that builds the ``model``
        handed to ``classifier``. Defaults to ``find_centroid``. Pass e.g.
        ``lambda X, y: (X, y)`` so that an instance-based classifier (1-NN)
        searches only the remaining samples and never the held-out one.

    Returns
    -------
    float
        Fraction of samples classified correctly when held out.
    """
    from sklearn.model_selection import LeaveOneOut

    if fit is None:
        fit = find_centroid

    hits = 0
    for train_index, test_index in LeaveOneOut().split(data):
        # test_index is a one-element array; take the scalar so the classifier
        # receives a 1-D sample (as in the other validators) and the label
        # comparison yields a plain boolean rather than a one-element array.
        held_out = test_index[0]
        model = fit(data[train_index], target[train_index])
        if classifier(data[held_out], model) == target[held_out]:
            hits += 1
    return hits / len(data)

In [20]:
# Performance measures (Accuracy, Confusion matrix)

def confusion_matrix(data, target, classifier):
    """Build the confusion matrix of ``classifier`` scored on the data it was fit on.

    The centroids are computed from the full ``data``/``target`` and the same
    samples are then classified, so this is a resubstitution result rather than
    a held-out estimate.

    Parameters
    ----------
    data : 2-D array of samples, one per row.
    target : label of each row of ``data``. Labels are used directly as row
        indices, so they must be the integers ``0 .. n_classes - 1``.
    classifier : callable ``classifier(sample, centroid)`` whose return value
        is used directly as the column index.

    Returns
    -------
    np.ndarray
        Float array of shape ``(n_classes, n_classes)``; entry ``[t, p]`` counts
        samples with true label ``t`` that were predicted as ``p``.
    """
    n_classes = len(np.unique(target))
    counts = np.zeros((n_classes, n_classes))

    # The centroids do not depend on the sample being classified, so compute
    # them once instead of once per sample.
    centroid = find_centroid(data, target)

    for sample, true_label in zip(data, target):
        counts[true_label, classifier(sample, centroid)] += 1
    return counts
    
def accuracy(data, target, classifier):
    """Return the share of samples on the diagonal of the confusion matrix.

    The matrix is obtained from ``confusion_matrix(data, target, classifier)``;
    its trace (correct predictions) is divided by the number of samples.
    """
    matrix = confusion_matrix(data, target, classifier)
    correct = np.trace(matrix)
    return correct / len(data)
    

In [21]:
# Results with different datasets and classifiers using different validation methods

# Iris dataset: run both classifiers through every validation scheme
print("------------------Iris dataset------------------")


def iris_1nn(sample, _model):
    """1-NN over the complete Iris set; the model built per split is ignored."""
    return find_1_nearest_neighbor(sample, iris_data, iris_target)


# NOTE(review): iris_1nn searches the full dataset, which always contains the
# sample being tested (distance 0 to itself). The 1NN scores printed below are
# therefore resubstitution results, not estimates of generalization.
iris_runs = (
    ("-----------------Minimum distance-----------------", find_min_distance),
    ("-----------------1NN-----------------", iris_1nn),
)

for banner, clf in iris_runs:
    print(banner)
    print("Holdout stratified:", holdout_stratified(iris_data, iris_target, clf))
    print("10-fold cross validation:", k_fold_cross_validation(iris_data, iris_target, clf, 10))
    print("Leave-one-out cross validation:", leave_one_out_cross_validation(iris_data, iris_target, clf))
    print("Confusion matrix:")
    print(confusion_matrix(iris_data, iris_target, clf))
    print("Accuracy:", accuracy(iris_data, iris_target, clf))



------------------Iris dataset------------------
-----------------Minimum distance-----------------
Holdout stratified: 0.8888888888888888
10-fold cross validation: 0.9266666666666667
Leave-one-out cross validation: 0.92
Confusion matrix:
[[50.  0.  0.]
 [ 0. 46.  4.]
 [ 0.  7. 43.]]
Accuracy: 0.9266666666666666
-----------------1NN-----------------
Holdout stratified: 1.0
10-fold cross validation: 1.0
Leave-one-out cross validation: 1.0
Confusion matrix:
[[50.  0.  0.]
 [ 0. 50.  0.]
 [ 0.  0. 50.]]
Accuracy: 1.0


In [22]:
# Wine dataset: run both classifiers through every validation scheme
print("------------------Wine dataset------------------")


def wine_1nn(sample, _model):
    """1-NN over the complete Wine set; the model built per split is ignored."""
    return find_1_nearest_neighbor(sample, wine_data, wine_target)


# NOTE(review): wine_1nn searches the full dataset, which always contains the
# sample being tested (distance 0 to itself). The 1NN scores printed below are
# therefore resubstitution results, not estimates of generalization.
wine_runs = (
    (
        (
            "-----------------Minimum distance classifier-----------------",
            "Minimum distance classifier",
        ),
        find_min_distance,
    ),
    (("-----------------1NN-----------------",), wine_1nn),
)

for header_lines, clf in wine_runs:
    for header in header_lines:
        print(header)
    print("Holdout stratified:", holdout_stratified(wine_data, wine_target, clf))
    print("10-fold cross validation:", k_fold_cross_validation(wine_data, wine_target, clf, 10))
    print("Leave-one-out cross validation:", leave_one_out_cross_validation(wine_data, wine_target, clf))
    print("Confusion matrix:")
    print(confusion_matrix(wine_data, wine_target, clf))
    print("Accuracy:", accuracy(wine_data, wine_target, clf))

------------------Wine dataset------------------
-----------------Minimum distance classifier-----------------
Minimum distance classifier
Holdout stratified: 0.7037037037037037
10-fold cross validation: 0.7183006535947711
Leave-one-out cross validation: 0.7247191011235955
Confusion matrix:
[[50.  0.  9.]
 [ 3. 49. 19.]
 [ 1. 17. 30.]]
Accuracy: 0.7247191011235955
-----------------1NN-----------------
Holdout stratified: 1.0
10-fold cross validation: 1.0
Leave-one-out cross validation: 1.0
Confusion matrix:
[[59.  0.  0.]
 [ 0. 71.  0.]
 [ 0.  0. 48.]]
Accuracy: 1.0


In [23]:
# Breast cancer dataset: run both classifiers through every validation scheme
print("------------------Breast cancer dataset------------------")


def cancer_1nn(sample, _model):
    """1-NN over the complete breast cancer set; the model built per split is ignored."""
    return find_1_nearest_neighbor(sample, cancer_data, cancer_target)


# NOTE(review): cancer_1nn searches the full dataset, which always contains the
# sample being tested (distance 0 to itself). The 1NN scores printed below are
# therefore resubstitution results, not estimates of generalization.
cancer_runs = (
    ("-----------------Minimum distance classifier-----------------", find_min_distance),
    ("-----------------1NN-----------------", cancer_1nn),
)

for banner, clf in cancer_runs:
    print(banner)
    print("Holdout stratified:", holdout_stratified(cancer_data, cancer_target, clf))
    print("10-fold cross validation:", k_fold_cross_validation(cancer_data, cancer_target, clf, 10))
    print("Leave-one-out cross validation:", leave_one_out_cross_validation(cancer_data, cancer_target, clf))
    print("Confusion matrix:")
    print(confusion_matrix(cancer_data, cancer_target, clf))
    print("Accuracy:", accuracy(cancer_data, cancer_target, clf))

------------------Breast cancer dataset------------------
-----------------Minimum distance classifier-----------------
Holdout stratified: 0.8947368421052632
10-fold cross validation: 0.8876253132832082
Leave-one-out cross validation: 0.8910369068541301
Confusion matrix:
[[154.  58.]
 [  4. 353.]]
Accuracy: 0.8910369068541301
-----------------1NN-----------------
Holdout stratified: 1.0
10-fold cross validation: 1.0
Leave-one-out cross validation: 1.0
Confusion matrix:
[[212.   0.]
 [  0. 357.]]
Accuracy: 1.0
