In [22]:
from importlib import reload
import tools.preprocess as preprocess
import tools.knn as knn

# Re-import the local helper modules so that edits made to them on disk are
# picked up without restarting the kernel.
for module in (preprocess, knn):
    reload(module)

from tools.preprocess import load_datasets, preprocess_mushrooms_datasets
import pandas as pd


In [3]:
# Read the mushroom training and test folds into two collections of dataframes.
# The `*` in each path is presumably a wildcard matching every fold file --
# confirm against `load_datasets` in tools.preprocess.
train_dfs, test_dfs = (
    load_datasets(path_pattern)
    for path_pattern in (
        'datasetsCBR/mushroom/mushroom.fold.*.train.arff',
        'datasetsCBR/mushroom/mushroom.fold.*.test.arff',
    )
)

# Preview the first rows of the first training fold
train_dfs[0].head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,class
0,b'x',b's',b'w',b't',b'l',b'f',b'c',b'b',b'k',b'e',b'c',b's',b's',b'w',b'w',b'p',b'w',b'o',b'p',b'k',b'n',b'm',b'e'
1,b'k',b'y',b'e',b'f',b's',b'f',b'c',b'n',b'b',b't',b'?',b'k',b's',b'p',b'w',b'p',b'w',b'o',b'e',b'w',b'v',b'd',b'p'
2,b'x',b'y',b'y',b't',b'a',b'f',b'c',b'b',b'n',b'e',b'c',b's',b's',b'w',b'w',b'p',b'w',b'o',b'p',b'k',b's',b'm',b'e'
3,b'x',b'y',b'w',b't',b'p',b'f',b'c',b'n',b'k',b'e',b'e',b's',b's',b'w',b'w',b'p',b'w',b'o',b'p',b'k',b'v',b'u',b'p'
4,b'x',b'y',b'n',b'f',b'f',b'f',b'c',b'n',b'b',b't',b'?',b's',b'k',b'w',b'p',b'p',b'w',b'o',b'e',b'w',b'v',b'l',b'p'


In [4]:
# Count null entries per column across every test and training fold.
# NOTE(review): this only counts NaN/None cells. The raw preview of the first
# training fold shows b'?' in `stalk-root`, which looks like a missing-value
# placeholder and is not counted here -- confirm how it should be treated.
all_folds = pd.concat(test_dfs + train_dfs)
all_folds.isnull().sum()

cap-shape                   0
cap-surface                 0
cap-color                   0
bruises?                    0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
class                       0
dtype: int64

In [5]:
# Run the mushroom preprocessing step on every training and test fold.
# The return value is discarded, so the helper is expected to modify each
# dataframe in place (the frame displayed below holds integer codes afterwards).
# Only a categorical -> numerical conversion (Label Encoding) is applied: the
# null check found no NaN values and all attributes are categorical, so no
# normalisation is performed.
# NOTE(review): `stalk-root` contains b'?' in the raw data; Label Encoding would
# treat it as just another category -- confirm that this is intended.
for fold_df in [*train_dfs, *test_dfs]:
    preprocess_mushrooms_datasets(fold_df)

# Show the first training fold after encoding
train_dfs[0]

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,class
0,5,2,8,1,3,1,0,0,4,0,2,2,2,7,7,0,2,1,4,2,2,3,0
1,3,3,2,0,7,1,0,1,0,1,0,1,2,6,7,0,2,1,0,7,4,0,1
2,5,3,9,1,0,1,0,0,5,0,2,2,2,7,7,0,2,1,4,2,3,3,0
3,5,3,8,1,6,1,0,1,4,0,3,2,2,7,7,0,2,1,4,2,4,5,1
4,5,3,4,0,2,1,0,1,0,1,0,2,1,7,6,0,2,1,0,7,4,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7308,5,3,4,1,3,1,0,0,10,0,4,2,3,7,7,0,2,1,4,2,3,4,0
7309,5,3,9,0,2,1,0,0,7,0,1,1,1,0,4,0,2,1,2,1,4,1,1
7310,5,2,8,0,1,1,1,1,5,0,1,2,2,7,7,0,2,1,4,2,4,0,1
7311,2,3,4,0,2,1,0,1,0,1,0,1,2,7,7,0,2,1,0,7,4,4,1


In [25]:
from itertools import product

import numpy as np
from tools.knn import KNNClassifier
from tools.distance import EuclideanDistance, ManhattanDistance, MahalanobisDistance
from tools.voting import InverseDistanceWeightedVote, MajorityClassVote, ShepardsWorkVote
from sklearn.model_selection import cross_val_score

# Exhaustive search over KNN hyper-parameters (k, distance metric, voting scheme,
# feature weighting). Every combination is scored with 5-fold cross-validated
# accuracy on the first training fold, and the best-scoring combination is
# reported at the end. Ties keep the combination that was evaluated first.

# Feature matrix and target vector of the first training fold
X_train = train_dfs[0].drop(columns=['class']).to_numpy()  # Drop the class column (target variable)
y_train = train_dfs[0]['class'].to_numpy()  # The class column is the target variable

# Matching split of the first test fold (not used by the search below)
X_test = test_dfs[0].drop(columns=['class']).to_numpy()
y_test = test_dfs[0]['class'].to_numpy()

# Test different k values
k_values = [1, 3, 5, 7]

# Covariance matrix for Mahalanobis Distance (rowvar=False: columns are the variables)
# NOTE(review): `veil-type` is constant in the displayed rows; a constant feature
# makes the covariance matrix singular -- confirm MahalanobisDistance copes with
# that (e.g. by using a pseudo-inverse).
covariance_matrix = np.cov(X_train, rowvar=False)

# Test different distance metrics
distance_functions = [
    ManhattanDistance(),
    EuclideanDistance(),
    MahalanobisDistance(covariance_matrix)
]

# Test different voting schemes
voting_schemes = [
    MajorityClassVote(),
    InverseDistanceWeightedVote(),
    ShepardsWorkVote()
]

# TODO: Implement different weightings
weights_list = [
    None,  # Equal weight
]

# Equal weighting (one weight per feature); used whenever an entry of
# `weights_list` is None
sample_weighting = np.ones(X_train.shape[1])

# Track best score and corresponding params that achieved that score
best_score = 0
best_params = {}

# Loop through combinations and test each combination. `product` iterates in the
# same order as nested loops would: k outermost, weights innermost.
for k, distance_func, voting_func, weights in product(
        k_values, distance_functions, voting_schemes, weights_list):
    # Fall back to the equal weighting when no custom weights are given
    feature_weights = sample_weighting if weights is None else weights

    # Named `classifier` rather than `knn` so that the `tools.knn` module alias
    # imported at the top of the notebook is not overwritten.
    classifier = KNNClassifier(k=k,
                               distance_func=distance_func,
                               voting_func=voting_func,
                               weights=feature_weights)

    # `cv` is the keyword for the number of folds (`v` is not a valid argument)
    scores = cross_val_score(classifier,
                             X_train,
                             y_train,
                             cv=5,
                             scoring='accuracy')

    avg_score = np.mean(scores)
    print(f"Avg score: {avg_score}")

    if avg_score > best_score:
        best_score = avg_score
        best_params = {
            'k': k,
            'distance_func': distance_func.__class__.__name__,
            'voting_func': voting_func.__class__.__name__,
            'weights': 'Equal' if weights is None else 'Custom'
        }

# Print the best combination of parameters
print(f"Best Score: {best_score}")
print(f"Best Params: {best_params}")

Avg score: 1.0
Avg score: 1.0
Avg score: 1.0
Avg score: 0.9994530848948013
Avg score: 0.9994530848948013
Avg score: 0.9994530848948013
Avg score: 1.0
Avg score: 1.0
Avg score: 1.0
Avg score: 0.9989062632953483
Avg score: 0.9979492319905596
Avg score: 0.999316379494938
Avg score: 1.0
Avg score: 0.9984959600842676
Avg score: 1.0
Avg score: 0.9984960535900129
Avg score: 0.9923427209984916
Avg score: 0.999316379494938
Avg score: 1.0
Avg score: 0.9976755406735967
Avg score: 1.0
Avg score: 0.9979491384848143
Avg score: 0.9855053003731815
Avg score: 0.999316379494938
Best Score: 1.0
Best Params: {'k': 1, 'distance_func': 'ManhattanDistance', 'voting_func': 'MajorityClassVote', 'weights': 'Equal'}
