In [11]:
import numpy as np
import pandas as pd

from sklearn.metrics import make_scorer, cohen_kappa_score, classification_report
from sklearn.model_selection import  GridSearchCV, StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SequentialFeatureSelector


In [2]:
# Load the texture-feature tables for the two classes (DME = 1, NORMAL = 0).
df_dme = pd.read_csv('Texture_DME.csv')
df_normal = pd.read_csv('Texture_NORMAL.csv')

# Row counts are taken before any column is removed; they size the label vectors.
n_dme = len(df_dme)
n_normal = len(df_normal)

# One constant 'Label' Series per class, indexed 0..n-1 like the freshly read frames.
label_one = pd.Series([1] * n_dme, name='Label')
label_zero = pd.Series([0] * n_normal, name='Label')

# Discard the first column of each table.
# NOTE(review): presumably an exported index / identifier column — confirm against the CSVs.
df_dme = df_dme.drop(columns=df_dme.columns[0])
df_normal = df_normal.drop(columns=df_normal.columns[0])

# Put the label in front of the features for each class ...
pandas_normal = pd.concat([label_zero, df_normal], axis='columns')
pandas_dme = pd.concat([label_one, df_dme], axis='columns')

# ... then stack NORMAL rows on top of DME rows with a fresh 0..N-1 index.
Data = pd.concat([pandas_normal, pandas_dme], axis='index', ignore_index=True)

# Target vector and feature matrix used by the rest of the notebook.
y = Data['Label']
X = Data.drop(columns='Label')


In [3]:
# Disabled experiment: train on a random subset of 10 feature columns instead of all of them.
# The commented lines draw 10 distinct column positions from 0..143 (NumPy global seed 123)
# and run the same stratified 90/10 split on just those columns of X.
# NOTE(review): the hard-coded 144 assumes X has 144 feature columns — confirm before re-enabling.
# columns_n = np.arange(0, 144, 1)
# np.random.seed(123)
# selected_numbers = np.random.choice(columns_n, size=10, replace=False)
# print(selected_numbers)
# X_train, X_test, y_train, y_test = train_test_split(X.iloc[:, selected_numbers ], y, test_size=0.1, random_state=100, stratify=y)

In [4]:
# Cross-validation configuration: number of folds, plus a stratified splitter that
# shuffles with a fixed seed. Stratification keeps the class proportions of the
# target the same within every fold.
k_fold = 4
kfold = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=100)

# Hold out 10% of the rows as a test set; stratifying on y preserves the class
# balance in both partitions, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=100,
)


In [5]:

# Cohen's kappa drives both the feature selector (ranking candidate features)
# and the grid search (ranking hyper-parameter candidates); build the scorer once.
kappa_scorer = make_scorer(cohen_kappa_score)

# Forward sequential feature selection wrapped around an "inner" KNN.
# With n_features_to_select='auto' and no `tol`, scikit-learn keeps half of the
# input features.
# cv=kfold: use the seeded, shuffled StratifiedKFold defined earlier. Passing the
# bare integer k_fold would make scikit-learn fall back to an unshuffled default
# splitter and leave `kfold` unused.
knn_SFS = SequentialFeatureSelector(
    estimator=KNeighborsClassifier(),
    n_features_to_select='auto',
    direction='forward',
    scoring=kappa_scorer,
    cv=kfold,
    n_jobs=-1,
)

# Scale -> select features with the inner KNN -> classify with the outer KNN.
# Keeping all three steps in one pipeline means scaling and selection are
# re-fitted on the training part of every grid-search fold.
pipe = Pipeline(steps=[
    ('scale', RobustScaler()),
    ('sfs', knn_SFS),                  # inner knn
    ('knn2', KNeighborsClassifier()),  # outer knn
])

# Candidate neighbourhood sizes 3, 6, 9 for each KNN -> 3 x 3 = 9 combinations.
param_grid = {
    'sfs__estimator__n_neighbors': np.arange(3, 10, 3),  # inner knn
    'knn2__n_neighbors': np.arange(3, 10, 3),            # outer knn
}

# Exhaustive search over the grid, scored by kappa on the same stratified folds;
# refit=True retrains the best pipeline on the whole training set afterwards.
knn_model = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring=kappa_scorer,
    n_jobs=-1,
    cv=kfold,
    refit=True,
    verbose=1,
)

knn_model.fit(X_train, y_train)

Fitting 4 folds for each of 9 candidates, totalling 36 fits


In [6]:
# Positional indices of the columns kept by the feature selector of the refitted best pipeline.
selected = knn_model.best_estimator_.named_steps["sfs"].get_support(indices=True)

# Restrict both partitions to the selected columns.
X_train_new = X_train.iloc[:, selected]
X_test_new = X_test.iloc[:, selected]

# Multiplying by 1 produces new label Series rather than aliases of y_train / y_test.
y_train_new = y_train * 1
y_test_new = y_test * 1


### Final model

In [7]:
# Carry the outer-KNN neighbourhood size found by the grid search over to the
# final pipeline (whose classifier step is named 'knn' rather than 'knn2').
best_paramsKNN = {'knn__n_neighbors': knn_model.best_params_['knn2__n_neighbors']}

# Final model: robust scaling followed by KNN, trained only on the selected columns.
# set_params() and fit() both return the pipeline itself, so the calls can be chained.
pipe_final_knn = (
    Pipeline(steps=[('scale', RobustScaler()), ('knn', KNeighborsClassifier())])
    .set_params(**best_paramsKNN)
    .fit(X_train_new, y_train_new)
)

# Predictions on the data the model was trained on and on the held-out test set.
predictions_train = pipe_final_knn.predict(X_train_new)
predictions_test = pipe_final_knn.predict(X_test_new)

# Cohen's kappa between predictions and labels for each partition.
metrics_knn = {
    'Model_train': cohen_kappa_score(predictions_train, y_train_new),
    'Model_test': cohen_kappa_score(predictions_test, y_test_new),
}


In [12]:
# Per-class precision / recall / F1: training partition first, then the held-out
# test partition, each report followed by an empty line.
train_report = classification_report(y_train, predictions_train)
test_report = classification_report(y_test, predictions_test)

print(train_report, test_report, sep='\n\n', end='\n\n')

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        45
           1       1.00      1.00      1.00        45

    accuracy                           1.00        90
   macro avg       1.00      1.00      1.00        90
weighted avg       1.00      1.00      1.00        90

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



### Save the results to an Excel workbook

In [13]:
# Keep the cv_results_ entries from 'params' onwards (in the dict's own key order);
# everything stored before 'params' is left out of the export.
cv_result_keys = list(knn_model.cv_results_.keys())
selected_keys = cv_result_keys[cv_result_keys.index('params'):]

# One row per grid candidate; the 'params' dicts are expanded into one column
# per hyper-parameter and placed to the right of the remaining columns.
df = pd.DataFrame({key: knn_model.cv_results_[key] for key in selected_keys})
df_params = pd.json_normalize(df['params'])
df_combined = pd.concat([df.drop(columns=['params']), df_params], axis=1)

# Sheet1: per-candidate CV table. Sheet2: summary of the winning configuration.
summary = {
    'best_params': knn_model.best_params_,
    'best_score': knn_model.best_score_,
    'best_params_final': best_paramsKNN,
    'selected_columns': np.array(X_train.columns)[selected],
    'model_train': metrics_knn,
}
data_dict = {'Sheet1': df_combined, 'Sheet2': summary}

with pd.ExcelWriter('Result_Knn_CV_Final.xlsx') as writer:
    for sheet_name, data in data_dict.items():
        if not isinstance(data, pd.DataFrame):
            # Non-tabular payload: written as a single row, without index or header.
            pd.DataFrame([data]).to_excel(writer, sheet_name=sheet_name, index=False, header=False)
            continue
        data.to_excel(writer, sheet_name=sheet_name)


In [9]:
# Disabled ad-hoc inspection of the refitted best pipeline (knn_model.best_estimator_).
# First group: fitted attributes of the outer KNN step ('knn2').
# print(knn_model.best_estimator_.named_steps['knn2'].n_neighbors)
# print(knn_model.best_estimator_.named_steps['knn2'].effective_metric_params_)
# print(knn_model.best_estimator_.named_steps['knn2'].effective_metric_)
# print(knn_model.best_estimator_.named_steps['knn2'].n_features_in_)
# print(knn_model.best_estimator_.named_steps['knn2'].n_samples_fit_)

# Second group: configuration and selected-feature information of the 'sfs' step.
# print(knn_model.best_estimator_.named_steps['sfs'].scoring)
# print(knn_model.best_estimator_.named_steps['sfs'].get_params() )
# print(knn_model.best_estimator_.named_steps['sfs'].get_feature_names_out() )
# print(knn_model.best_estimator_.named_steps['sfs'].support_)
# ## print(knn_model.best_estimator_.named_steps['sfs'].feature_names_in_)
