In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('../data/heart.csv')
df.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
dt = {}
for column in df.columns:
    count = len(df[column].unique())
    dt[column] = [count]

pd.DataFrame(dt).transpose()

Unnamed: 0,0
age,41
sex,2
cp,4
trtbps,49
chol,152
fbs,2
restecg,3
thalachh,91
exng,2
oldpeak,40


In [5]:
len(df['caa'].unique())

5

In [4]:
tar_col = ['output']
cat_col = ['sex','cp','fbs','restecg','slp','thall','exng','caa']
con_col = [col for col in df.columns if col not in cat_col and col not in tar_col ]


In [5]:
def cal_robust_stats(data,lower_percetile=25,upper_percentile=75):
    median = np.median(data)
    lower_quartile = np.percentile(data,lower_percetile)
    upper_quartile = np.percentile(data,upper_percentile)
    iqr = upper_quartile-lower_quartile
    v_col = data[(data <= lower_quartile - 1.5 * iqr) | (data >= upper_quartile + 1.5 * iqr)]
    perc = np.shape(v_col)[0] * 100.0 / np.shape(df)[0]
    return v_col, perc

In [6]:
for column_name,data in df[con_col].items():
    v_col, perc = cal_robust_stats(data)
    print(f"Column {column_name} outlier = {len(v_col)} => {round(perc,3)}%")

Column age outlier = 0 => 0.0%
Column trtbps outlier = 13 => 4.29%
Column chol outlier = 5 => 1.65%
Column thalachh outlier = 1 => 0.33%
Column oldpeak outlier = 8 => 2.64%


In [7]:
def winsorize(data, lower_percentile=5, upper_percentile=95):
    lower_limit = np.percentile(data, lower_percentile)
    upper_limit = np.percentile(data, upper_percentile)
    winsorized_data = np.clip(data, lower_limit, upper_limit)
    return winsorized_data

In [8]:
from sklearn.base import BaseEstimator, TransformerMixin
class WinsorizerTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None, lower_percentile=5, upper_percentile=95):
        self.columns = columns
        self.lower_percentile = lower_percentile
        self.upper_percentile = upper_percentile
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        for column in self.columns:
            X[column] = winsorize(X[column], self.lower_percentile, self.upper_percentile)
        return X

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import FunctionTransformer

pipe = ColumnTransformer([
                                    ('one_hot_encoder', OneHotEncoder(), cat_col),
                                    ('scaler', MinMaxScaler(), cat_col+con_col)
                            ])

preprocessor = Pipeline([
                                ('winsorize', WinsorizerTransformer(columns=cat_col+con_col)),
                                ('encoder_scaler', pipe)
                    ])
            

In [12]:
preprocessor

In [13]:

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

df1 = df.copy()
X = df1.drop(columns=['output'])
y = df1['output']



X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0,stratify=y)
sm = SMOTE(sampling_strategy='minority',random_state=7)

X_train, y_train = sm.fit_resample(X_train,y_train)

In [14]:
processed_X_train = preprocessor.fit_transform(X_train)
processed_X_test = preprocessor.transform(X_test)

In [22]:
arr = np.c_[processed_X_train,np.array(y_train)]
arr[:,:-1]

array([[0.        , 1.        , 1.        , ..., 0.74124914, 0.        ,
        0.50235479],
       [0.        , 1.        , 1.        , ..., 0.28140014, 0.33885942,
        0.69073783],
       [1.        , 0.        , 0.        , ..., 0.30199039, 0.6969496 ,
        0.50235479],
       ...,
       [0.        , 1.        , 0.        , ..., 0.06863418, 0.6571618 ,
        0.91284465],
       [0.        , 1.        , 0.        , ..., 0.08922443, 0.6704244 ,
        0.81787467],
       [1.        , 0.        , 1.        , ..., 0.48043926, 0.49801061,
        0.12546439]])

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from xgboost import XGBClassifier
from sklearn.model_selection import KFold,cross_val_score

from sklearn.metrics import accuracy_score,classification_report

In [30]:
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'SVC': SVC(),
    'RandomForestClassifier': RandomForestClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'XGBClassifier': XGBClassifier(),
    'KNeighborsClassifier': KNeighborsClassifier()
}

k = 5

kfold = KFold(n_splits=k, shuffle=True, random_state=42)

for model_name, model in models.items():
    model.fit(processed_X_train,y_train)
    y_pred = model.predict(processed_X_test)
    scores = cross_val_score(model, processed_X_train, y_train, cv=kfold)
    print(f"-----------------------------------{model_name}----------------------------------------")
    print(f"accuracy {accuracy_score(y_test,y_pred)}\n")
    print(f'Mean cross val score {scores.mean()}')
    print(classification_report(y_test,y_pred))
    

-----------------------------------LogisticRegression----------------------------------------
accuracy 0.8524590163934426

Mean cross val score 0.8179970972423802
              precision    recall  f1-score   support

           0       0.85      0.82      0.84        28
           1       0.85      0.88      0.87        33

    accuracy                           0.85        61
   macro avg       0.85      0.85      0.85        61
weighted avg       0.85      0.85      0.85        61

-----------------------------------SVC----------------------------------------
accuracy 0.819672131147541

Mean cross val score 0.8255442670537011
              precision    recall  f1-score   support

           0       0.79      0.82      0.81        28
           1       0.84      0.82      0.83        33

    accuracy                           0.82        61
   macro avg       0.82      0.82      0.82        61
weighted avg       0.82      0.82      0.82        61

-----------------------------------R

In [40]:
from sklearn.model_selection import GridSearchCV

logistic_regression_params = {
    'penalty': ['l2'],
    'C': [0.01, 0.1],
    'solver': ['newton-cg', 'lbfgs', 'liblinear','saga'],
    'fit_intercept': [True, False],
    'class_weight': [None, 'balanced'],
    'max_iter': [1000,1400],
    'warm_start': [True, False]
}

svc_params = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto'],
    'degree': [2, 3, 4],
    'class_weight': [None, 'balanced']
}

random_forest_params = {
    'n_estimators': [300,500],
    'max_depth': [None,10],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [4,5]
}

gradient_boosting_params = {
    'n_estimators': [100, 200],
    'learning_rate': [0.1, 0.05],
    'max_depth': [5,6],
    'subsample': [0.6,0.8],
    'max_features': ['sqrt', 'log2']
}

xgboost_params = {
    'n_estimators': [200, 300],
    'learning_rate': [0.05, 0.01],
    'max_depth': [4, 5],
    'subsample': [0.6,0.8],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 0.1, 0.2]
}

knn_params = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [20, 30, 40],
    'p': [1, 2]
}

param_grid = [
    (LogisticRegression(), logistic_regression_params),
    (SVC(),svc_params),
    (RandomForestClassifier(), random_forest_params),
    (GradientBoostingClassifier(), gradient_boosting_params),
    (XGBClassifier(), xgboost_params),
    (KNeighborsClassifier(), knn_params)
]

param_dict = {}

for model, params in param_grid:
    grid_search = GridSearchCV(model, params, cv=kfold)
    grid_search.fit(processed_X_train, y_train)
    param_dict[ model.__class__.__name__] = grid_search.best_params_
    print("Best parameters for", model.__class__.__name__, ":")
    print(grid_search.best_params_)
    print("Best score:", grid_search.best_score_)
    print()

Best parameters for LogisticRegression :
{'C': 0.01, 'class_weight': None, 'fit_intercept': True, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'newton-cg', 'warm_start': True}
Best score: 0.8293178519593614

Best parameters for SVC :
{'C': 0.1, 'class_weight': None, 'degree': 3, 'gamma': 'scale', 'kernel': 'poly'}
Best score: 0.8368650217706822

Best parameters for RandomForestClassifier :
{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 300}
Best score: 0.8521770682148041

Best parameters for GradientBoostingClassifier :
{'learning_rate': 0.1, 'max_depth': 6, 'max_features': 'log2', 'n_estimators': 200, 'subsample': 0.6}
Best score: 0.8560232220609579

Best parameters for XGBClassifier :
{'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.6}
Best score: 0.8483309143686503

Best parameters for KNeighborsClassifier :
{'algorithm': 'auto', 'leaf_size': 20, 'n_neighbors': 7, 'p': 1, 'weights': '

In [36]:
for model_name, param in param_dict.items():
    model = models[model_name]
    model.set_params(**param)
    model.fit(processed_X_train,y_train)
    X_pred = model.predict(processed_X_train)
    y_pred = model.predict(processed_X_test)
    scores = cross_val_score(model, processed_X_train, y_train, cv=kfold)
    print(f"-----------------------------------{model_name}----------------------------------------")
    print(f"train accuracy score {accuracy_score(y_train,X_pred)}\n")
    print(f"test accuracy score {accuracy_score(y_test,y_pred)}\n")
    print(f'Mean cross val score {scores.mean()}')
    print(param)
    print(classification_report(y_test,y_pred))

-----------------------------------LogisticRegression----------------------------------------
train accuracy score 0.8484848484848485

test accuracy score 0.8360655737704918

Mean cross val score 0.8293178519593614
{'C': 0.01, 'class_weight': None, 'fit_intercept': True, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'newton-cg', 'warm_start': True}
              precision    recall  f1-score   support

           0       0.85      0.79      0.81        28
           1       0.83      0.88      0.85        33

    accuracy                           0.84        61
   macro avg       0.84      0.83      0.83        61
weighted avg       0.84      0.84      0.84        61

-----------------------------------RandomForestClassifier----------------------------------------
train accuracy score 0.928030303030303

test accuracy score 0.8360655737704918

Mean cross val score 0.8484760522496371
{'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 300}
              pre

In [37]:
ar = [('Cruyff', 104), ('Eusebio', 120), ('Messi', 125), ('Ronaldo', 132), ('Pele', 150)]
dict(ar)

{'Cruyff': 104, 'Eusebio': 120, 'Messi': 125, 'Ronaldo': 132, 'Pele': 150}