<a href="https://colab.research.google.com/github/sekhar0146/Machine-learning-projects/blob/master/cardio_analysis_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [67]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

In [77]:
df = pd.read_csv("drive/My Drive/cardio/cardio_train.csv", delimiter=";")
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [39]:
# check for missing data
df.isna().sum()

id             0
age            0
gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
dtype: int64

In [40]:
# check for non-numeric data
df.dtypes

id               int64
age              int64
gender           int64
height           int64
weight         float64
ap_hi            int64
ap_lo            int64
cholesterol      int64
gluc             int64
smoke            int64
alco             int64
active           int64
cardio           int64
dtype: object

In [78]:
# Split into X and y (on train set)
X = df.drop("cardio", axis=1)
y = df["cardio"]

X.shape, y.shape

((70000, 12), (70000,))

In [79]:
# split data into train and test sets
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=40)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((56000, 12), (14000, 12), (56000,), (14000,))

In [71]:
# ===========================================
# Modele processing 
# ===========================================
models={"naive_bayes": GaussianNB(),
        "GradientBoostingClassifier": GradientBoostingClassifier(),
        "RandomForestClassifier": RandomForestClassifier(),
        "KNeighborsClassifier": KNeighborsClassifier(),
        "LogisticRegression": LogisticRegression(),
        "XGBClassifier": XGBClassifier()
}

# create an empty disctionary to save the model score 
result = {}

# Run a loop to see all models score
for model_name, model in models.items():
    model.fit(X_train, y_train)
    result[model_name] = model.score(X_train, y_train)

# Print model score results:
result

{'GradientBoostingClassifier': 0.7394285714285714,
 'KNeighborsClassifier': 0.7131071428571428,
 'LogisticRegression': 0.698,
 'RandomForestClassifier': 1.0,
 'XGBClassifier': 0.7379285714285714,
 'naive_bayes': 0.5743035714285715}

In [45]:
from sklearn.model_selection import RandomizedSearchCV
# RandomForestClassifier is giving most accurate score without hyperparameter tuning 
# Now Lets tune hyperparameters of GradientBoostingClassifier and  XGBClassifier
# ------------------------------------------------------------
# GradientBoostingClassifier Hyperparameter tuning with RandomizedSearchCV
# ------------------------------------------------------------
print("=== Hyperparameter tuning with GradientBoostingClassifier ===")
# different GradientBoostingClassifier hyperparameters
rf_gb_grid = {"n_estimators": np.arange(200, 2000, 10),
           "max_depth": [None, 3 , 5, 10, 20, 30],
           "min_samples_split":np.arange(2, 20, 2),
           "min_samples_leaf": np.arange(1, 20, 2),
           "max_features": [0.5, 1, "sqrt", "auto"],
           "learning_rate": [0.1, 0.05, 0.02, 0.01]
           }

# Instantiate GradientBoostingClassifier model
rs_gb_model = RandomizedSearchCV(GradientBoostingClassifier(random_state=40),
                              param_distributions=rf_gb_grid,
                              n_iter=2,
                              cv=5,
                              verbose=True)
# Fit the model
rs_gb_model.fit(X_train, y_train)

print("Best parameters for GradientBoostingClassifier ==>")
print(rs_gb_model.best_params_)
print("GradientBoostingClassifier score : ", rs_gb_model.score(X_train, y_train))

=== Hyperparameter tuning with GradientBoostingClassifier ===
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 13.9min finished


Best parameters for GradientBoostingClassifier ==>
{'n_estimators': 1040, 'min_samples_split': 8, 'min_samples_leaf': 11, 'max_features': 'sqrt', 'max_depth': 10, 'learning_rate': 0.01}
GradientBoostingClassifier score :  0.7915


In [46]:
from sklearn.model_selection import RandomizedSearchCV
# ------------------------------------------------------------
# XGBClassifier Hyperparameter tuning with RandomizedSearchCV
# ------------------------------------------------------------
print("=== Hyperparameter tuning with XGBClassifier ===")
# different XGBClassifier hyperparameters
rs_xb_grid = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

# Instantiate XGBClassifier modl
rs_xb_model = RandomizedSearchCV(XGBClassifier(random_state=40),
                              param_distributions=rs_xb_grid,
                              n_iter=2,
                              cv=5,
                              verbose=True)
# Fit the model
rs_xb_model.fit(X_train, y_train)

print("Best parameters for XGBClassifier ==>")
print(rs_xb_model.best_params_)
print("XGBClassifier score : ", rs_xb_model.score(X_train, y_train))

=== Hyperparameter tuning with XGBClassifier ===
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   28.9s finished


Best parameters for XGBClassifier ==>
{'subsample': 1.0, 'min_child_weight': 1, 'max_depth': 3, 'gamma': 1.5, 'colsample_bytree': 1.0}
XGBClassifier score :  0.7383035714285714


In [80]:
# -------------------------------------------------------------------------------------------------------------------
# Even after tuning XGBClassifier and GradientBoostingClassifier there is no much score than RandomForestClassifier
# So, we are proceedng with RandomForestClassifier to predict the cardio 
# -------------------------------------------------------------------------------------------------------------------
# Instantiate the model RandomForestClassifier
rf_model = RandomForestClassifier(random_state=40)
rf_model.fit(X_train, y_train)
print(rf_model.score(X_train, y_train))

1.0


In [81]:
# ---------------------------------------------------------------
# Make predictions on test data set on RandomForestClassifier
# ---------------------------------------------------------------
print("========= Predict cardio ============ ")
y_preds = rf_model.predict(X_test)
print(y_preds)
print(y_preds.shape)
print("")

pd.set_option('display.max_columns', None)
cardio_preds = rf_model.predict(df.drop("cardio", axis=1))
print(cardio_preds.shape)
df["cardio_predict"] = cardio_preds
print(df.head(50))

[1 0 0 ... 1 1 1]
(14000,)

(70000,)
    id    age  gender  height  weight  ap_hi  ap_lo  cholesterol  gluc  smoke  \
0    0  18393       2     168    62.0    110     80            1     1      0   
1    1  20228       1     156    85.0    140     90            3     1      0   
2    2  18857       1     165    64.0    130     70            3     1      0   
3    3  17623       2     169    82.0    150    100            1     1      0   
4    4  17474       1     156    56.0    100     60            1     1      0   
5    8  21914       1     151    67.0    120     80            2     2      0   
6    9  22113       1     157    93.0    130     80            3     1      0   
7   12  22584       2     178    95.0    130     90            3     3      0   
8   13  17668       1     158    71.0    110     70            1     1      0   
9   14  19834       1     164    68.0    110     60            1     1      0   
10  15  22530       1     169    80.0    120     80            1     1  

In [82]:
# Evaluate the model
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, recall_score, f1_score

In [83]:
# Create a confusion matrix using the confusion_matrix function
print(confusion_matrix(y_test, y_preds))

[[5230 1821]
 [2033 4916]]


In [84]:
# Create a classification report using the classification_report function
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.72      0.74      0.73      7051
           1       0.73      0.71      0.72      6949

    accuracy                           0.72     14000
   macro avg       0.72      0.72      0.72     14000
weighted avg       0.72      0.72      0.72     14000

