In [1]:
from importlib import reload
import tools.KacperWork.preprocess as preprocess
import tools.KacperWork.knn as knn

reload(preprocess)
reload(knn)

from tools.KacperWork.preprocess import load_datasets, preprocess_mushrooms_datasets
import pandas as pd


In [2]:
# Load mushroom train and test dataframes

train_dfs = load_datasets('../data/raw/mushroom/mushroom.fold.*.train.arff')
test_dfs = load_datasets('../data/raw/mushroom/mushroom.fold.*.test.arff')

train_dfs[0].head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,class
0,b'f',b'y',b'y',b'f',b'f',b'f',b'c',b'b',b'h',b'e',...,b'b',b'p',b'p',b'w',b'o',b'l',b'h',b'v',b'g',b'p'
1,b'f',b's',b'b',b't',b'f',b'f',b'c',b'b',b'p',b't',...,b'w',b'w',b'p',b'w',b'o',b'p',b'h',b's',b'g',b'p'
2,b'x',b'y',b'g',b't',b'n',b'f',b'c',b'b',b'w',b'e',...,b'w',b'w',b'p',b'w',b't',b'p',b'w',b'y',b'p',b'e'
3,b'x',b'y',b'e',b't',b'n',b'f',b'c',b'b',b'n',b't',...,b'g',b'g',b'p',b'w',b'o',b'p',b'k',b'y',b'd',b'e'
4,b'x',b's',b'g',b'f',b'c',b'f',b'w',b'n',b'p',b'e',...,b'w',b'w',b'p',b'w',b'o',b'p',b'k',b's',b'd',b'p'


In [3]:
# Check if there are any missing values in the dfs
pd.concat(test_dfs + train_dfs).isnull().sum()

cap-shape                   0
cap-surface                 0
cap-color                   0
bruises?                    0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
class                       0
dtype: int64

In [4]:
# Preprocess the mushrooms train and test dataframes
# For this dataset, we only need to convert the data to numerical values (via Label Encoding) as 
# there are no missing values or different ranges (hence no normalisation required)
for df in train_dfs + test_dfs:
    preprocess_mushrooms_datasets(df)

train_dfs[0]

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,class
0,2,3,9,0,2,1,0,0,3,0,...,0,6,0,2,1,2,1,4,1,1
1,2,2,0,1,2,1,0,0,7,1,...,7,7,0,2,1,4,1,3,1,1
2,5,3,3,1,5,1,0,0,10,0,...,7,7,0,2,2,4,7,5,4,0
3,5,3,2,1,5,1,0,0,5,1,...,3,3,0,2,1,4,2,5,0,0
4,5,2,3,0,1,1,1,1,7,0,...,7,7,0,2,1,4,2,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7307,5,3,4,1,3,1,0,0,10,0,...,7,7,0,2,1,4,2,3,4,0
7308,5,3,9,0,2,1,0,0,7,0,...,0,4,0,2,1,2,1,4,1,1
7309,5,2,8,0,1,1,1,1,5,0,...,7,7,0,2,1,4,2,4,0,1
7310,2,3,4,0,2,1,0,1,0,1,...,7,7,0,2,1,0,7,4,4,1


In [5]:
from tools.KacperWork.knn_utils import prepare_data, calculate_feature_weights, find_best_params, plot_precision_recall_curve, train_and_evaluate_final_model
from tools.KacperWork.distance import EuclideanDistance, ManhattanDistance, MahalanobisDistance
from tools.KacperWork.voting import InverseDistanceWeightedVote, MajorityClassVote, ShepardsWorkVote
from tools.KacperWork.knn import KNNClassifier
import numpy as np

k_values = [1, 3, 5, 7]

# Covariance matrix for Mahalanobis Distance
covariance_matrix = np.cov(train_dfs[0].drop(columns=['class']).to_numpy(), rowvar=False)

X_train, y_train, X_test, y_test = prepare_data(train_dfs, test_dfs)

normalized_info_gain_weights, normalized_relief_weights = calculate_feature_weights(X_train, y_train)

weights_list = [
    None,  # Equal weight
    normalized_info_gain_weights,  # Information Gain (mutual information)
    normalized_relief_weights  # ReliefF
]

distance_functions = [
    ManhattanDistance(),  
    EuclideanDistance(),
    MahalanobisDistance(covariance_matrix)
]

voting_schemes = [
    MajorityClassVote(),
    InverseDistanceWeightedVote(),
    ShepardsWorkVote()
]

# Track best score and corresponding params that achieved that score
best_score, best_params = find_best_params(X_train, y_train, k_values, distance_functions, voting_schemes, weights_list)

# Print the best combination of parameters
print(f"Best Score: {best_score}")
print(f"Best Params: {best_params}")

knn_final = KNNClassifier(k=best_params['k'],
                          distance_func=distance_functions[0] if best_params['distance_func'] == 'ManhattanDistance' else 
                          distance_functions[1] if best_params['distance_func'] == 'EuclideanDistance' else
                          distance_functions[2],
                          voting_func=voting_schemes[0] if best_params['voting_func'] == 'MajorityClassVote' else 
                          voting_schemes[1] if best_params['voting_func'] == 'InverseDistanceWeightedVote' else
                          voting_schemes[2],
                          weights=weights_list[0] if best_params['weights'] == 'Equal' else weights_list[1] if best_params['weights'] == 'Custom' else weights_list[2])

plot_precision_recall_curve(knn_final, X_train, y_train)

train_and_evaluate_final_model(best_params, distance_functions, voting_schemes, weights_list, X_train, y_train, X_test, y_test)

ModuleNotFoundError: No module named 'sklearn_relief'