In [None]:
import pandas as pd

# Dataset locations (update these paths according to your file locations)
file_path1 = '/content/hayes-roth.csv'
file_path2 = '/content/car.csv'
file_path3 = '/content/breast-cancer.csv'

# Column names for each dataset (the CSV files themselves carry no header row)
hayes_roth_columns = ['name', 'hobby', 'age','education level', 'marital status','class']  # Update with actual column names
car_evaluation_columns = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
breast_cancer_columns = ['class', 'age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps', 'deg-malig', 'breast', 'breast-quad', 'irradiat']

# Read the datasets.
# header=None is passed for every file: without it pandas consumes the first
# data row as the header, and that record is silently lost once the column
# names are assigned.
breast_cancer = pd.read_csv(file_path3, header=None, names=breast_cancer_columns)
car_evaluation = pd.read_csv(file_path2, header=None, names=car_evaluation_columns)
hayes_roth = pd.read_csv(file_path1, header=None, names=hayes_roth_columns)


In [None]:
# Preview the first five rows (head() defaults to n=5)
car_evaluation.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc


In [None]:
# Preview the first five rows (head() defaults to n=5)
breast_cancer.head()

Unnamed: 0,class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
2,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
3,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no
4,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,left,left_low,no


In [None]:
# Preview the first five rows (head() defaults to n=5)
hayes_roth.head()

Unnamed: 0,name,hobby,age,education level,marital status,class
0,92,2,1,1,2,1
1,10,2,1,3,2,2
2,83,3,1,4,1,3
3,61,2,4,2,2,3
4,107,1,1,3,4,3


In [None]:
# DataFrame.info() prints its report itself and returns None, so it must not
# be nested inside print(): doing so appends a stray "None" after the shape.
for frame in (breast_cancer, car_evaluation, hayes_roth):
    frame.info()
    print(frame.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285 entries, 0 to 284
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   class        285 non-null    object
 1   age          285 non-null    object
 2   menopause    285 non-null    object
 3   tumor-size   285 non-null    object
 4   inv-nodes    285 non-null    object
 5   node-caps    285 non-null    object
 6   deg-malig    285 non-null    int64 
 7   breast       285 non-null    object
 8   breast-quad  285 non-null    object
 9   irradiat     285 non-null    object
dtypes: int64(1), object(9)
memory usage: 22.4+ KB
(285, 10) None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1727 entries, 0 to 1726
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1727 non-null   object
 1   maint     1727 non-null   object
 2   doors     1727 non-null   object
 3   persons   1727 non-null   object
 4 

In [None]:
# Frequency of every class label, per dataset
BC_classes = breast_cancer['class'].value_counts()
CE_classes = car_evaluation['class'].value_counts()
HR_classes = hayes_roth['class'].value_counts()

# Print one "label: count" line per class under a heading for each dataset
class_reports = [
    ("Breast Cancer classes:", BC_classes),
    ("\nCar Evaluation classes:", CE_classes),
    ("\nHayes Roth classes:", HR_classes),
]
for heading, counts in class_reports:
    print(heading)
    for class_name, count in counts.items():
        print(f"{class_name}: {count}")

Breast Cancer classes:
no-recurrence-events: 200
recurrence-events: 85

Car Evaluation classes:
unacc: 1209
acc: 384
good: 69
vgood: 65

Hayes Roth classes:
1: 51
2: 51
3: 30


In [None]:
from sklearn.preprocessing import LabelEncoder
# Encode the string class labels of the Car Evaluation and Breast Cancer
# datasets as integers. The single encoder is re-fitted for each frame, so
# after this cell it holds the Breast Cancer mapping.
label_encoder = LabelEncoder()
for frame in (car_evaluation, breast_cancer):
    frame['class'] = label_encoder.fit_transform(frame['class'])

In [None]:
def null_values(df, dataset_name):
    """Report missing values in ``df`` and drop the affected rows in place.

    Every ``'?'`` entry is treated as a missing-value marker and replaced by
    ``pd.NA``. The per-column null counts are printed, then all rows that
    contain at least one null are removed from ``df`` itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is modified in place.
    dataset_name : str
        Label used in the printed heading.

    Returns
    -------
    None
    """
    df.replace('?', pd.NA, inplace=True)
    null_counts = df.isnull().sum()
    print(f"Null values in {dataset_name}:")
    print(null_counts)
    # Drop in place: rebinding the local name (df = df.dropna()) would leave
    # the caller's frame untouched, so no row would ever be removed.
    df.dropna(inplace=True)
# Run the missing-value report on each dataset in turn
for frame, label in (
    (breast_cancer, "Breast Cancer"),
    (car_evaluation, "Car Evaluation"),
    (hayes_roth, "Hayes Roth"),
):
    null_values(frame, label)

Null values in Breast Cancer:
class          0
age            0
menopause      0
tumor-size     0
inv-nodes      0
node-caps      8
deg-malig      0
breast         0
breast-quad    1
irradiat       0
dtype: int64
Null values in Car Evaluation:
buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
class       0
dtype: int64
Null values in Hayes Roth:
name               0
hobby              0
age                0
education level    0
marital status     0
class              0
dtype: int64


In [None]:
def label_encoding(df):
    """Integer-encode every object-dtype column of ``df`` in place.

    Rows with a null in any object column are dropped first; each remaining
    object column is then replaced by the codes of a freshly fitted
    ``LabelEncoder``. Non-object columns are left untouched.

    Returns the same (mutated) DataFrame that was passed in.
    """
    object_columns = df.select_dtypes(include=['object']).columns
    df.dropna(subset=object_columns, inplace=True)
    for column in df.columns:
        if df[column].dtype != 'object':
            continue
        # One encoder per column, so each column gets its own 0..n-1 coding
        df[column] = LabelEncoder().fit_transform(df[column])
    return df

# Label-encode all three datasets (evaluated in the order listed)
breast_cancer, car_evaluation, hayes_roth = (
    label_encoding(frame)
    for frame in (breast_cancer, car_evaluation, hayes_roth)
)
print(breast_cancer.head())


   class  age  menopause  tumor-size  inv-nodes  node-caps  deg-malig  breast  \
0      0    2          2           3          0          0          2       1   
1      0    2          2           3          0          0          2       0   
2      0    4          0           2          0          0          2       1   
3      0    2          2           0          0          0          2       1   
4      0    4          0           2          0          0          2       0   

   breast-quad  irradiat  
0            4         0  
1            1         0  
2            2         0  
3            3         0  
4            1         0  


In [None]:
# Split every encoded frame into a label vector (`*_class`) and a feature
# matrix (`*_data`, all columns except 'class'), both as NumPy arrays.
# NOTE(review): the Hayes-Roth 'name' column looks like a per-instance ID and
# is kept as a feature here - confirm that this is intended.
breast_cancer_class = breast_cancer['class'].values
breast_cancer_data = breast_cancer.drop(columns='class').values

car_evaluation_class = car_evaluation['class'].values
car_evaluation_data = car_evaluation.drop(columns='class').values

hayes_roth_class = hayes_roth['class'].values
hayes_roth_data = hayes_roth.drop(columns='class').values

In [None]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from scipy import stats

In [None]:
def euclidean_distance(a, b):
    """Return the Euclidean (L2) distance between two equal-length vectors.

    Inputs are coerced with ``np.asarray`` so plain lists and tuples are
    accepted as well as arrays, matching ``cosine_similarity``.
    """
    diff = np.asarray(a) - np.asarray(b)
    return np.sqrt(np.sum(diff ** 2))

def manhattan_distance(a, b):
    """Return the Manhattan (L1) distance between two equal-length vectors.

    Inputs are coerced with ``np.asarray`` so plain lists and tuples are
    accepted as well as arrays, matching ``cosine_similarity``.
    """
    diff = np.asarray(a) - np.asarray(b)
    return np.sum(np.abs(diff))


def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Both inputs are converted to NumPy arrays. If either vector has zero
    length the similarity is undefined and 0.0 is returned instead.
    """
    vec_a, vec_b = np.array(a), np.array(b)

    # Guard clauses for zero vectors (avoids a division by zero below)
    norm_a = np.linalg.norm(vec_a)
    if norm_a == 0:
        return 0.0  # or np.nan if you prefer to signify an undefined similarity
    norm_b = np.linalg.norm(vec_b)
    if norm_b == 0:
        return 0.0

    return np.dot(vec_a, vec_b) / (norm_a * norm_b)


def cosine_distance(a, b):
    """Return the cosine distance, i.e. one minus ``cosine_similarity(a, b)``."""
    similarity = cosine_similarity(a, b)
    return 1 - similarity

def mahalanobis_distance(a, b, covariance_matrix):
    """Return the Mahalanobis distance between ``a`` and ``b``.

    Parameters
    ----------
    a, b : array-like
        Vectors of equal length.
    covariance_matrix : array-like
        Covariance of the feature space. A 0-d value (what ``np.cov`` returns
        for a single feature) is promoted to a 1x1 matrix.

    Notes
    -----
    The Moore-Penrose pseudo-inverse is used instead of ``np.linalg.inv`` so
    a singular covariance (for example a constant feature) does not raise
    ``LinAlgError``. For an invertible matrix the two coincide.
    """
    diff = np.asarray(a) - np.asarray(b)
    inverse_covariance = np.linalg.pinv(np.atleast_2d(covariance_matrix))
    squared = np.dot(np.dot(diff, inverse_covariance), diff.T)
    # Rounding can leave a tiny negative value here; clamp it so sqrt stays real.
    return np.sqrt(np.maximum(squared, 0.0))


In [None]:
def discernibility_vector(P, T):
    """Compute a discernibility score for every training pattern.

    The score of pattern ``P[i]`` is ``1 / c``, where ``c`` is the number of
    distinct labels carried by the training patterns whose feature values are
    identical to ``P[i]``. A pattern whose duplicates all share one class
    scores 1.0; conflicting duplicates score lower.

    Parameters
    ----------
    P : sequence of array-like
        Training patterns, one per row.
    T : sequence
        Class label of each pattern, aligned with ``P``.

    Returns
    -------
    numpy.ndarray
        Float array with one score per pattern.
    """
    # Group labels by feature tuple in a single pass rather than comparing
    # every pattern against every other one.
    labels_by_pattern = {}
    keys = []
    for pattern, label in zip(P, T):
        key = tuple(np.ravel(pattern).tolist())
        keys.append(key)
        labels_by_pattern.setdefault(key, set()).add(label)

    # A basic rule: discernibility is inverse to class variety
    return np.array([1.0 / len(labels_by_pattern[key]) for key in keys], dtype=float)


In [None]:
class CustomKNN:
    """Brute-force k-nearest-neighbours classifier with a selectable metric.

    Parameters
    ----------
    k : int
        Number of neighbours that vote on each prediction.
    distance_metric : str
        One of ``'euclidean'``, ``'manhattan'``, ``'cosine'`` or
        ``'mahalanobis'``.
    """

    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X_train, y_train):
        """Store the training data (and its covariance for Mahalanobis)."""
        self.X_train = X_train
        self.y_train = y_train
        if self.distance_metric == 'mahalanobis':
            self.covariance_matrix = np.cov(X_train.T)

    def _distance(self, x, x_train):
        """Distance between ``x`` and ``x_train`` under the configured metric.

        Raises
        ------
        ValueError
            If ``distance_metric`` is not one of the supported names.
        """
        if self.distance_metric == 'euclidean':
            return euclidean_distance(x, x_train)
        elif self.distance_metric == 'manhattan':
            return manhattan_distance(x, x_train)
        elif self.distance_metric == 'cosine':
            return cosine_distance(x, x_train)
        elif self.distance_metric == 'mahalanobis':
            return mahalanobis_distance(x, x_train, self.covariance_matrix)
        else:
            # Previously this fell through and returned None, which only
            # surfaced later as an obscure sorting error. DkNN already raises.
            raise ValueError("Unsupported distance metric.")

    def predict(self, X_test):
        """Return an array with the predicted label of every row in ``X_test``."""
        predictions = [self._predict(x) for x in X_test]
        return np.array(predictions)

    def _predict(self, x):
        """Predict one sample by majority vote among its k nearest neighbours."""
        distances = [self._distance(x, x_train) for x_train in self.X_train]
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = [self.y_train[i] for i in k_indices]
        most_common = Counter(k_nearest_labels).most_common(1)
        return most_common[0][0]

In [None]:
class DkNN:
    """Discernibility-weighted k-nearest-neighbours classifier.

    Every training pattern gets a discernibility score from
    ``discernibility_vector``. At prediction time each of the k nearest
    neighbours votes with ``score / distance``, and the class whose
    neighbours have the highest mean vote wins.

    Parameters
    ----------
    k : int
        Number of neighbours considered per query.
    distance_metric : str
        One of ``'euclidean'``, ``'manhattan'``, ``'cosine'`` or
        ``'mahalanobis'``.
    """

    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, P, T):
        """Store training patterns ``P``, labels ``T`` and their discernibility scores."""
        self.P = P
        self.T = T
        self.z = discernibility_vector(P, T)
        # Forget any covariance cached by an earlier fit; _distance rebuilds
        # it lazily from the current P, so a re-fit never reuses stale data.
        self.__dict__.pop('covariance_matrix', None)

    def _distance(self, a, b):
        """Distance between ``a`` and ``b`` under the configured metric.

        Raises
        ------
        ValueError
            If ``distance_metric`` is not one of the supported names.
        """
        if self.distance_metric == 'euclidean':
            return euclidean_distance(a, b)
        elif self.distance_metric == 'manhattan':
            return manhattan_distance(a, b)
        elif self.distance_metric == 'cosine':
            return cosine_distance(a, b)
        elif self.distance_metric == 'mahalanobis':
            if not hasattr(self, 'covariance_matrix'):
                self.covariance_matrix = np.cov(self.P.T)
            return mahalanobis_distance(a, b, self.covariance_matrix)
        else:
            raise ValueError("Unsupported distance metric.")

    def D_predict(self, PT):
        """Predict the class of every query pattern in ``PT``.

        Returns
        -------
        numpy.ndarray
            One predicted label per query.

        Raises
        ------
        ValueError
            If no neighbour is available (empty training set or ``k < 1``).
        """
        epsilon = 1e-10  # Prevent division by zero
        labels = np.asarray(self.T)
        weights = np.asarray(self.z)

        predictions = []
        for query in PT:
            D = np.array([self._distance(query, p) for p in self.P])
            # Slicing caps the neighbourhood at the number of training
            # patterns, so k larger than the training set no longer fails.
            neighbors_indices = np.argsort(D)[:self.k]
            if neighbors_indices.size == 0:
                raise ValueError("No neighbours available: empty training set or k < 1.")

            # Vote of each neighbour: discernibility divided by distance
            v = weights[neighbors_indices] / (D[neighbors_indices] + epsilon)
            neighbor_labels = labels[neighbors_indices]

            # Mean vote per class present among the neighbours. np.unique
            # yields the classes in sorted order, which fixes tie-breaking.
            scores = {}
            for c in np.unique(neighbor_labels):
                scores[c] = np.mean(v[neighbor_labels == c])

            predictions.append(max(scores, key=scores.get))

        return np.array(predictions)


In [None]:
def _knn_metric_kwargs(distance_metric, X_train):
    """Translate the notebook's metric name into KNeighborsClassifier keyword arguments."""
    if distance_metric == 'mahalanobis':
        # scikit-learn needs the inverse covariance ('VI'). The pseudo-inverse
        # keeps this usable when the fold's covariance is singular.
        inverse_covariance = np.linalg.pinv(np.atleast_2d(np.cov(X_train.T)))
        return {
            'metric': 'mahalanobis',
            'metric_params': {'VI': inverse_covariance},
            'algorithm': 'brute',
        }
    return {'metric': distance_metric}


def cross_validate_knn(X, y, k=3, distance_metric='euclidean'):
    """Compare Custom KNN, scikit-learn KNN and D-KNN with 10-fold cross-validation.

    Prints a per-fold accuracy table, the mean accuracies, and the paired
    t-test reports produced by ``perform_hypothesis_test``.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, one sample per row.
    y : numpy.ndarray
        Class labels aligned with ``X``.
    k : int
        Number of neighbours used by all three classifiers.
    distance_metric : str
        Metric name passed to all three classifiers, so the scikit-learn
        baseline is evaluated under the same metric as the custom ones.
    """
    kf = KFold(n_splits=10, shuffle=True, random_state=42)
    fold_results = {'Fold': [], 'Custom KNN': [], 'Sklearn KNN': [], 'D-KNN': []}

    custom_accuracies = []
    sklearn_accuracies = []
    D_accuracies = []

    for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Custom KNN
        custom_knn = CustomKNN(k=k, distance_metric=distance_metric)
        custom_knn.fit(X_train, y_train)
        y_pred_custom = custom_knn.predict(X_test)
        custom_acc = accuracy_score(y_test, y_pred_custom)
        custom_accuracies.append(custom_acc)

        # Sklearn KNN - configured with the same metric; it used to run with
        # the library default regardless of the metric that was selected.
        sklearn_knn = KNeighborsClassifier(
            n_neighbors=k, **_knn_metric_kwargs(distance_metric, X_train)
        )
        sklearn_knn.fit(X_train, y_train)
        y_pred_sklearn = sklearn_knn.predict(X_test)
        sklearn_acc = accuracy_score(y_test, y_pred_sklearn)
        sklearn_accuracies.append(sklearn_acc)

        # D-KNN
        dknn = DkNN(k=k, distance_metric=distance_metric)
        dknn.fit(X_train, y_train)
        predictions = dknn.D_predict(X_test)
        D_accuracy = accuracy_score(y_test, predictions)
        D_accuracies.append(D_accuracy)

        fold_results['Fold'].append(fold)
        fold_results['Custom KNN'].append(round(custom_acc, 4))
        fold_results['Sklearn KNN'].append(round(sklearn_acc, 4))
        fold_results['D-KNN'].append(round(D_accuracy, 4))

    custom_mean = round(np.mean(custom_accuracies), 4)
    sklearn_mean = round(np.mean(sklearn_accuracies), 4)
    D_mean = round(np.mean(D_accuracies), 4)

    # Per-fold table with the means appended as a final row
    results_df = pd.DataFrame(fold_results)
    mean_row = {'Fold': 'Mean', 'Custom KNN': custom_mean, 'Sklearn KNN': sklearn_mean, 'D-KNN': D_mean}
    results_df = pd.concat([results_df, pd.DataFrame([mean_row])], ignore_index=True)

    print(results_df.to_string(index=False))

    # Print the mean accuracies
    print(f"\nMean Accuracy of Custom KNN: {custom_mean}")
    print(f"Mean Accuracy of Sklearn KNN: {sklearn_mean}")
    print(f"Mean Accuracy of D-KNN: {D_mean}")

    # Perform hypothesis tests with descriptive names
    perform_hypothesis_test(custom_accuracies, sklearn_accuracies, "Custom KNN", "Sklearn KNN")
    perform_hypothesis_test(custom_accuracies, D_accuracies, "Custom KNN", "D-KNN")
    perform_hypothesis_test(sklearn_accuracies, D_accuracies, "Sklearn KNN", "D-KNN")


In [None]:
def perform_hypothesis_test(algo1, algo2, algo1_name, algo2_name):
    """Perform a paired t-test between two algorithms and display results.

    Parameters
    ----------
    algo1, algo2 : sequence of float
        Per-fold accuracies of the two algorithms, paired by position.
    algo1_name, algo2_name : str
        Labels used in the printed report.

    Raises
    ------
    ValueError
        If the two score sequences differ in length (they cannot be paired).

    Notes
    -----
    A paired t-test works on the per-fold differences, so the variability
    check is made on those differences. Two identical score lists would
    otherwise produce ``nan`` statistics, while a constant score list paired
    with a varying one is perfectly testable.
    """
    scores1 = np.asarray(algo1, dtype=float)
    scores2 = np.asarray(algo2, dtype=float)

    # Check if there are enough data points
    if scores1.size < 2 or scores2.size < 2:
        print(f"Not enough data points to perform t-test between {algo1_name} and {algo2_name}.")
        return

    if scores1.shape != scores2.shape:
        raise ValueError(
            f"Paired t-test needs equally many scores for {algo1_name} and {algo2_name}."
        )

    # Constant differences have zero variance, leaving the statistic undefined
    differences = scores1 - scores2
    if np.allclose(differences, differences[0]):
        print(f"Insufficient variability in accuracy scores; cannot perform t-test between {algo1_name} and {algo2_name}.")
        return

    # Perform paired t-test
    t_statistic, p_value = stats.ttest_rel(scores1, scores2)

    print(f"\nPaired t-test results between {algo1_name} and {algo2_name}:")
    print(f"T-statistic: {t_statistic:.4f}, P-value: {p_value:.4f}")

    # Determine significance
    alpha = 0.05
    if p_value < alpha:
        print(f"Significant difference in accuracy between {algo1_name} and {algo2_name}.")
    else:
        print(f"No significant difference in accuracy between {algo1_name} and {algo2_name}.")


In [None]:
def main():
    """Prompt for a distance metric and k, then cross-validate on every dataset.

    An unrecognised metric choice falls back to Euclidean distance, and a
    non-numeric or non-positive k falls back to 3, so a typo at either prompt
    does not abort the run.
    """
    datasets = [
        (breast_cancer_data, breast_cancer_class),  # Breast Cancer dataset (features, labels)
        (car_evaluation_data, car_evaluation_class),  # Car Evaluation dataset (features, labels)
        (hayes_roth_data, hayes_roth_class)  # Hayes-Roth dataset (features, labels)
    ]

    dataset_names = ["Breast Cancer", "Car Evaluation", "Hayes-Roth"]

    # Get user input for distance metric
    print("Select a distance metric:")
    print("1. Euclidean")
    print("2. Manhattan")
    print("3. Cosine")
    print("4. Mahalanobis")

    metric_by_choice = {
        '1': 'euclidean',
        '2': 'manhattan',
        '3': 'cosine',
        '4': 'mahalanobis',
    }
    choice = input("Enter the number corresponding to your choice (1-4): ")
    distance_metric = metric_by_choice.get(choice.strip())
    if distance_metric is None:
        print("Invalid choice! Defaulting to Euclidean distance.")
        distance_metric = 'euclidean'

    # Read k; int() raises ValueError on non-numeric text, and k < 1 would
    # leave the classifiers without any neighbour to vote.
    raw_k = input("Enter the value of k (number of neighbors): ")
    try:
        k = int(raw_k)
    except ValueError:
        k = 0
    if k < 1:
        print("Invalid k! Defaulting to k=3.")
        k = 3

    # Run KNN on all datasets with the specified distance metric
    for dataset, name in zip(datasets, dataset_names):
        X, y = dataset
        print(f"\nRunning kNN on {name} dataset:")
        cross_validate_knn(X, y, k=k, distance_metric=distance_metric)

if __name__ == "__main__":
    main()

Select a distance metric:
1. Euclidean
2. Manhattan
3. Cosine
4. Mahalanobis
Enter the number corresponding to your choice (1-4): 3
Enter the value of k (number of neighbors): 3

Running kNN on Breast Cancer dataset:
Fold  Custom KNN  Sklearn KNN  D-KNN
   1      0.5357       0.6429 0.5714
   2      0.5714       0.7857 0.6786
   3      0.7143       0.7857 0.6786
   4      0.7857       0.7143 0.8571
   5      0.7143       0.7143 0.6429
   6      0.7857       0.8214 0.7857
   7      0.6667       0.6296 0.6296
   8      0.7778       0.8148 0.6296
   9      0.6296       0.7778 0.6667
  10      0.6296       0.6667 0.7037
Mean      0.6811       0.7353 0.6844

Mean Accuracy of Custom KNN: 0.6811
Mean Accuracy of Sklearn KNN: 0.7353
Mean Accuracy of D-KNN: 0.6844

Paired t-test results between Custom KNN and Sklearn KNN:
T-statistic: -2.0099, P-value: 0.0753
No significant difference in accuracy between Custom KNN and Sklearn KNN.

Paired t-test results between Custom KNN and D-KNN:
T-statisti