In [78]:
# from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold, cross_val_predict, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.neighbors import KNeighborsClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.svm import SVC
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier, VotingClassifier
# from sklearn.gaussian_process import GaussianProcessClassifier
# from sklearn.gaussian_process.kernels import RBF
# from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np

# Problem Overview
Cardiomyopathy is a disease that weakens the heart muscle. This makes it harder for your heart to pump blood.
In this classification problem we'll try to detect this disease using one of the machine learning methods: K-Nearest-Neighbours
(for further information about this method, see https://scikit-learn.org/stable/modules/neighbors.html).
In the directory there is a file called `task_data.csv`, which contains measurements of different hearts.
The numbers in the second column (`Cardiomegaly`) state whether the heart is diseased or not:
**1 (diseased heart)**,
**0 (healthy heart)**  
  
The K-Nearest-Neighbours method will be used to learn from the data and predict outcomes for new cases.



In [79]:
# Read the raw heart measurements from the CSV file into a pandas DataFrame.
data = pd.read_csv("task_data.csv")


def dot_for_comma(data, columns=(
        'Inscribed circle radius',
        'Heart perimeter',
        'CTR - Cardiothoracic Ratio',
)):
    '''
    Convert columns written with a decimal comma ("1,5") into floats (1.5).

    Some columns of the dataset use ',' instead of '.' as the decimal
    separator, so pandas reads them as text. This function replaces the comma
    with a dot and parses the result as a number.

    Parameters
    ----------
    data : pandas.DataFrame or any other object
        Frame holding the columns to convert. Anything that is not a
        DataFrame is returned unchanged.
    columns : iterable of str, optional
        Names of the columns to convert. Defaults to the three decimal-comma
        columns of the heart dataset.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` in which the listed columns are numeric. The input
        frame itself is left untouched.

    Raises
    ------
    KeyError
        If one of ``columns`` is missing from ``data``.
    ValueError
        If a value still cannot be parsed as a number after the replacement.
    '''
    if not isinstance(data, pd.DataFrame):
        return data

    # Work on a copy so the caller's frame is not modified as a side effect.
    converted = data.copy()
    for name in columns:
        column = converted[name]
        # Already numeric (e.g. the file was exported with dots): nothing to
        # do, and the .str accessor would raise on such a column.
        if pd.api.types.is_numeric_dtype(column):
            continue
        as_text = column.astype(str).str.replace(',', '.', regex=False)
        # Restore missing values before parsing so they stay NaN instead of
        # becoming unparsable text such as 'None'.
        converted[name] = pd.to_numeric(as_text.where(column.notna()))
    return converted


# Replace the decimal commas in the affected columns of the loaded frame.
data = dot_for_comma(data)

# Separate the frame into the feature columns (X) and the outcome column (Y).
X = data.drop(columns='Cardiomegaly')
Y = data['Cardiomegaly'].copy()

# Hold out 20% of the rows as test data. Fixing random_state makes the shuffle,
# and therefore the resulting train/test groups, identical on every run.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=42,
    test_size=0.2,
)






In [80]:
# A Pipeline chains the preprocessing step and the classifier into a single
# estimator, so the scaler is always fitted on the same data as the model.
pipe_knn = Pipeline([
    # KNN is distance based: without standardisation a feature with a much
    # larger value range than the others would dominate the result.
    ("scaler", StandardScaler()),
    ("model", KNeighborsClassifier(
        # Distance measure used to find the neighbours.
        metric='manhattan',
        # Closer neighbours of a query point get more influence on the
        # prediction than neighbours that are further away.
        weights='distance',
        # Number of neighbours consulted per prediction (kept small here
        # because the dataset is small).
        n_neighbors=3,
    )),
])


# No explicit pipe_knn.fit(x_train, y_train) is needed before cross-validating:
# cross_val_score fits a fresh clone of the estimator on each training fold,
# so a previous fit neither helps nor influences the scores.

# cross_val_score arguments:
#   estimator (first argument): the model that is fitted on each fold
#   cv: how many divisions (folds) are made
# It returns an array with the estimator's score for each run of the
# cross-validation. Keep the scores at full precision here: rounding them
# before aggregating would distort the mean and the standard deviation.
cv_score = cross_val_score(pipe_knn, x_train, y_train, cv=5)

print("Scores of training data cross-validation (each fold):")
print(np.round(cv_score, 2))   # rounded to 2 decimals for display only
print(f"\nCross-validation mean score: {np.mean(cv_score):.3f}")
print(f"Standard deviation of CV score: {np.std(cv_score):.3f}")


# Because we do not have many samples, each fold is scored on only a few rows,
# so an individual fold can sometimes reach 100% accuracy.



Scores of training data cross-validation (each fold):
[1.   0.83 0.67 0.83 1.  ]

Cross-validation mean score: 0.866
Standard deviation of CV score: 0.124


***