In [1]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline



# Problem Overview
Cardiomyopathy is a disease that weakens the heart muscle. This makes it harder for your heart to pump blood.
In this classification problem we`ll try to detect this disease using one of machine learning methods - K-Nearest-Neighbours
(if you need further information about this method: https://scikit-learn.org/stable/modules/neighbors.html).
In the directory there is file called task_data which contains data of different hearts.
Numbers in second columns state whether heart is diseased or not:
**1 (diseased heart)**,
**0 (healthy heart)**  
  
K-Nearest-Neighbours method will be used to learn from data and predict outcomes for new cases.



### K-Nearest Neighbours

In [2]:
'''
Loading data into pandas DataFrame
'''
data = pd.read_csv('task_data.csv')   


'''
Let's see that certain columns have ',' as a separator instead of '.' so we need to exchange it
'''
def dot_for_comma(data):
    if not isinstance(data, pd.DataFrame):
        return data

    data['Inscribed circle radius'] = data['Inscribed circle radius'].str.replace(',', '.')
    data['Heart perimeter'] = data['Heart perimeter'].str.replace(',', '.')
    data['CTR - Cardiothoracic Ratio'] = data['CTR - Cardiothoracic Ratio'].str.replace(',', '.')
    return data


data = dot_for_comma(data)
    


'''
We create 2 Dataframes: first with our features, and second with outcomes
'''
X = data.drop('Cardiomegaly', axis = 1)    
Y = data['Cardiomegaly'].copy()

'''
Let`s split our dataset into training and tested data
parameter named 'random state' makes the data divided into permanent groups
'''

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.28, random_state=42) 






### GridSearchCV
GridSearchCV is one of better tools to be used in machine learning. "It is particularly useful for hyperparameter tuning, where the goal is to find the best combination of parameters that result in the highest model performance."

In [3]:
'''
class sklearn.model_selection.GridSearchCV(estimator, param_grid, *, scoring=None, n_jobs=None, refit=True, cv=None,
verbose=0, pre_dispatch='2*n_jobs', error_score=nan, return_train_score=False)

How to choose scoring parameter?
First of all we need to define if we have classification or regression problem. In our case we're checking wheter a person
is diseased or not. Therefore it wil be classification problem. In short:
- accuracy tells us how many tests were right (in %)

- precision : (people who were actually deseased/people who model marked as deseased) 
  (e.g. people who model mark as deseased: 30, people who were actually deseased: 25, people marked by model as healthy: 5 
  -> precision scoring: 25/30 = 83% )

- recall :  (people who were actually deseased and marked as deseased) / (all of people that were deseased)
(e.g. people who model mark as deseased and who were actually deseased: 25 (as above). 
However the exact number of deseased people were higher: 35, so: recall scoring = 25/35 = 71%)

- f1 is between precision and recall
'''


param_grid = {
    "model__n_neighbors": [3, 5],  # Number of neighbors to consider
    "model__weights": ["uniform", "distance"],   # How neighbors contribute to the prediction (distance means that points who are
                                                 # closer have bigger impact)
    "model__metric": ["minkowski", "manhattan", "euclidean", "chebyshev"],  # Distance metrics to test
                                                                            # Different ways to measure distance between points
}


'''
Pipeline is similar as before
'''
pipe_knn = ImbPipeline(steps=[
    ("scaler", StandardScaler()),  
    ("smote", SMOTE(random_state=42, k_neighbors=2)), # creates artificial samples for minority class
    ("model", KNeighborsClassifier())       
])



cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) # shuffling here makes huge difference
# because the data is mixed before splitting

'''
It's time to initialize Grid Search for our model, which will check every possible 
combination given in our param_grid dictionary.
'''


grid_search = GridSearchCV(
    estimator = pipe_knn,
    param_grid = param_grid,
    scoring = 'f1',     
    verbose=1,              # Display progress in the console (values can be from 0 to 3 - it depends on how much info you want to see)
    cv = cv_strategy,
    n_jobs=-1               # it engages all your processes to make the processing faster (other values: -2,1,2)
)

***

### GridSearch learning on raw data

In [4]:
grid_search.fit(x_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best accuracy (averaged CV): {(grid_search.best_score_*100):.1f} %")


Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best parameters: {'model__metric': 'manhattan', 'model__n_neighbors': 5, 'model__weights': 'distance'}
Best accuracy (averaged CV): 83.2 %


### Checking results for best parameters

In [5]:
'''
We can use pipeline from imblearn.pipeline to speed up all the processes. It's better than classic pipeline because 
it has SMOTE method which create artificial samples for minority class if we do not have enough data 
'''

pipe_knn = ImbPipeline(steps=[    
    ("scaler", StandardScaler()),     # Standardizing features is crucial in machine learning to get outcome based on all of them (features)
                                      # (if there were a huge difference between values, the result may depend on only one feature)       
    
    ("model", KNeighborsClassifier(   
        n_neighbors = 5,              # Number of neighbors used to predict value (less is better for little amount of data)
        
        weights='distance',           # Weight controls how much influence each neighbor has when making a prediction.                                      
                                      # closer neighbors of a query point will have a greater influence than neighbors which are further away.        
       
        metric='manhattan'            
    ))
])

'''
Avoid this line below before using cross_val_score (it can lead to data leakage and scores can be higher)
You need to use it after cross_val_score
'''
#pipe_knn.fit(x_train, y_train)


'''
First param: estimator: model to fit the data
cv: how many divisions (folds) are to be made
It's important that fitting data is inside cross_val_score()
cross_val_score() returns array of scores of the estimator for each run of the cross validation.
'''
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)  # Not only does it divide but also shuffles
cv_scores = cross_val_score(pipe_knn , x_train, y_train, cv=cv_strategy, scoring='f1')
print(f"CV scores: {np.round(cv_scores, 2)}")
print(f"Mean: {np.mean(cv_scores):.3f}")
print(f"Standard deviation: {np.std(cv_scores):.3f}")



CV scores: [0.8  0.75 0.86 1.   0.89]
Mean: 0.859
Standard deviation: 0.085


### Final outcome on tested data

In [6]:
pipe_knn.fit(x_train, y_train)  
outcome = pipe_knn.predict(x_test)
print(f"{(accuracy_score(y_test, outcome)*100):.1f} % of result accuracy")

72.7 % of result accuracy



| type of model          |   KNN    |
|------------------------|----------|
| accuracy of best model |  84.2%   |
| result accuracy        |  72.7%   |
