In [1]:
from importlib import reload
import tools.KacperWork.preprocess as preprocess
import tools.KacperWork.knn as knn

# Re-execute the project modules so that edits made to them on disk are picked up
# without restarting the kernel.
reload(preprocess)
reload(knn)

# Imported after the reload so these names are bound to the reloaded module's objects.
from tools.KacperWork.preprocess import load_datasets, preprocess_hepatitis_datasets
import pandas as pd


In [2]:
# Load the hepatitis train and test folds.
# Each glob pattern is handed to load_datasets; the results are used below as
# lists of DataFrames (indexed with [0] and joined with `+`).
train_dfs = load_datasets('../data/raw/hepatitis/hepatitis.fold.*.train.arff')
test_dfs = load_datasets('../data/raw/hepatitis/hepatitis.fold.*.test.arff')

# Fail early with a readable message instead of an IndexError on train_dfs[0]
# when the relative path is wrong or the data has not been downloaded.
assert len(train_dfs) > 0 and len(test_dfs) > 0, "No hepatitis fold files matched the glob patterns"
# Every train fold is expected to have a matching test fold.
assert len(train_dfs) == len(test_dfs), "Number of train folds differs from number of test folds"

# Preview the first training fold
train_dfs[0].head()

Unnamed: 0,AGE,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER_BIG,LIVER_FIRM,SPLEEN_PALPABLE,SPIDERS,ASCITES,VARICES,BILIRUBIN,ALK_PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY,Class
0,50.0,b'female',b'no',b'no',b'yes',b'no',b'no',b'no',b'no',b'no',b'no',b'no',b'no',0.9,135.0,42.0,3.5,,b'no',b'LIVE'
1,45.0,b'male',b'no',b'no',b'yes',b'yes',b'no',b'yes',b'no',b'yes',b'no',b'no',b'no',1.0,85.0,75.0,,,b'no',b'LIVE'
2,54.0,b'female',b'no',b'no',b'yes',b'yes',b'no',b'?',b'?',b'yes',b'no',b'yes',b'no',3.9,120.0,28.0,3.5,43.0,b'yes',b'DIE'
3,35.0,b'female',b'no',b'no',b'yes',b'no',b'no',b'?',b'?',b'yes',b'yes',b'yes',b'no',1.5,138.0,58.0,2.6,,b'yes',b'DIE'
4,24.0,b'female',b'no',b'no',b'yes',b'no',b'no',b'yes',b'no',b'no',b'no',b'no',b'no',1.0,,34.0,4.1,,b'yes',b'LIVE'


In [3]:
import numpy as np

# Pool every test and train fold, treat the b'?' placeholder as missing, and
# count the missing entries per column.
# Missing values are present, so the affected rows must either be dropped or
# have their values imputed.
# NOTE(review): every count displayed for this cell is a multiple of 10, which
# suggests each record is repeated across the folds — confirm before quoting
# these numbers as dataset totals.
(
    pd.concat(test_dfs + train_dfs)
    .replace(b'?', np.nan)
    .isna()
    .sum()
)

AGE                  0
SEX                  0
STEROID             10
ANTIVIRALS           0
FATIGUE             10
MALAISE             10
ANOREXIA            10
LIVER_BIG          100
LIVER_FIRM         110
SPLEEN_PALPABLE     50
SPIDERS             50
ASCITES             50
VARICES             50
BILIRUBIN           60
ALK_PHOSPHATE      290
SGOT                40
ALBUMIN            160
PROTIME            670
HISTOLOGY            0
Class                0
dtype: int64

In [4]:
# Run the hepatitis preprocessing over every train and test fold.
# As described for this dataset, the step is meant to:
#   - label-encode the categorical columns so that every column is numerical,
#   - min-max scale the numerical columns,
#   - fill in missing entries with a simple imputer.
# The return value is ignored; the frames held in train_dfs / test_dfs are the
# ones displayed afterwards, so the call is relied on to modify them in place.
# NOTE(review): re-running this cell applies the preprocessing to frames that
# were already processed — confirm that is harmless.
for df in [*train_dfs, *test_dfs]:
    preprocess_hepatitis_datasets(df)

# Show the first training fold after preprocessing
train_dfs[0]

Unnamed: 0,AGE,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER_BIG,LIVER_FIRM,SPLEEN_PALPABLE,SPIDERS,ASCITES,VARICES,BILIRUBIN,ALK_PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY,Class
0,0.605634,0,0,0,1,0,0,0,0,0,0,0,0,0.069444,0.405204,0.044164,0.325581,0.61,0,1
1,0.535211,1,0,0,1,1,0,1,0,1,0,0,0,0.083333,0.219331,0.096215,0.402101,0.61,0,1
2,0.661972,0,0,0,1,1,0,1,0,1,0,1,0,0.486111,0.349442,0.022082,0.325581,0.43,1,0
3,0.394366,0,0,0,1,0,0,1,0,1,1,1,0,0.152778,0.416357,0.069401,0.116279,0.61,1,0
4,0.239437,0,0,0,1,0,0,1,0,0,0,0,0,0.083333,0.294634,0.031546,0.465116,0.61,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,0.366197,0,0,0,1,1,0,1,0,0,0,1,0,0.041667,0.137546,0.104101,0.209302,0.31,1,0
134,0.338028,0,0,1,0,0,0,1,0,0,0,0,0,0.041667,0.074349,0.059937,0.441860,0.80,0,1
135,1.000000,0,1,0,1,0,0,1,0,0,0,0,0,0.041667,0.260223,0.028391,0.441860,0.61,0,1
136,0.380282,0,1,0,0,0,0,1,0,0,0,0,0,0.069444,0.256506,0.022082,0.441860,0.75,0,1


In [5]:
# Re-count the missing entries per column across all folds after preprocessing.
# Every column reports zero, so the imputation step left no gaps.
(
    pd.concat(test_dfs + train_dfs)
    .isna()
    .sum()
)

AGE                0
SEX                0
STEROID            0
ANTIVIRALS         0
FATIGUE            0
MALAISE            0
ANOREXIA           0
LIVER_BIG          0
LIVER_FIRM         0
SPLEEN_PALPABLE    0
SPIDERS            0
ASCITES            0
VARICES            0
BILIRUBIN          0
ALK_PHOSPHATE      0
SGOT               0
ALBUMIN            0
PROTIME            0
HISTOLOGY          0
Class              0
dtype: int64

In [6]:
from tools.KacperWork.knn_utils import prepare_data, calculate_feature_weights, find_best_params, plot_precision_recall_curve, train_and_evaluate_final_model
from tools.KacperWork.distance import EuclideanDistance, ManhattanDistance, MahalanobisDistance
from tools.KacperWork.voting import InverseDistanceWeightedVote, MajorityClassVote, ShepardsWorkVote
from tools.KacperWork.knn import KNNClassifier
import numpy as np

# NOTE(review): the output saved with this cell is
# "ModuleNotFoundError: No module named 'sklearn_relief'", i.e. the recorded run
# stopped on a missing package — make sure it is installed before re-running.

# Neighbourhood sizes offered to the parameter search
k_values = [1, 3, 5, 7]

# Feature covariance for the Mahalanobis distance: the 'Class' label column is
# dropped and rowvar=False treats each remaining column as one variable.
# NOTE(review): only the first training fold (train_dfs[0]) is used here —
# confirm that this is intended.
covariance_matrix = np.cov(
    train_dfs[0].drop(columns=['Class']).to_numpy(),
    rowvar=False,
)

X_train, y_train, X_test, y_test = prepare_data(train_dfs, test_dfs)

normalized_info_gain_weights, normalized_relief_weights = calculate_feature_weights(X_train, y_train)

# Feature-weighting options, in order:
# equal weight, information gain (mutual information), ReliefF
weights_list = [None, normalized_info_gain_weights, normalized_relief_weights]

# Candidate distance functions, in order: Manhattan, Euclidean, Mahalanobis
distance_functions = [ManhattanDistance(), EuclideanDistance(), MahalanobisDistance(covariance_matrix)]

# Candidate voting schemes, in order: majority class, inverse-distance weighted, Shepard's work
voting_schemes = [MajorityClassVote(), InverseDistanceWeightedVote(), ShepardsWorkVote()]

# Search the candidate combinations; keep the best score and the parameters that produced it
best_score, best_params = find_best_params(
    X_train, y_train, k_values, distance_functions, voting_schemes, weights_list
)

# Report the winning combination
print(f"Best Score: {best_score}")
print(f"Best Params: {best_params}")

# Translate the names stored in best_params back into the candidate objects.
# In each lookup, any name other than the two spelled out falls through to the
# last candidate of the corresponding list.
final_k = best_params['k']

if best_params['distance_func'] == 'ManhattanDistance':
    final_distance_func = distance_functions[0]
elif best_params['distance_func'] == 'EuclideanDistance':
    final_distance_func = distance_functions[1]
else:
    final_distance_func = distance_functions[2]

if best_params['voting_func'] == 'MajorityClassVote':
    final_voting_func = voting_schemes[0]
elif best_params['voting_func'] == 'InverseDistanceWeightedVote':
    final_voting_func = voting_schemes[1]
else:
    final_voting_func = voting_schemes[2]

# NOTE(review): 'Custom' selects the information-gain weights and every other
# non-'Equal' label selects ReliefF — confirm these labels match what
# find_best_params reports.
if best_params['weights'] == 'Equal':
    final_weights = weights_list[0]
elif best_params['weights'] == 'Custom':
    final_weights = weights_list[1]
else:
    final_weights = weights_list[2]

knn_final = KNNClassifier(
    k=final_k,
    distance_func=final_distance_func,
    voting_func=final_voting_func,
    weights=final_weights,
)

plot_precision_recall_curve(knn_final, X_train, y_train)

train_and_evaluate_final_model(best_params, distance_functions, voting_schemes, weights_list, X_train, y_train, X_test, y_test)

ModuleNotFoundError: No module named 'sklearn_relief'