# Importing the libraries

In [54]:
!pip install xgboost




In [56]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import joblib

# Prerequisite steps

In [59]:

# Name of the label column in the dataset.
target = "cardio"

# Read the cleaned dataset from a CSV file in the working directory.
df = pd.read_csv("new_cleaned_cardio_data.csv")

In [61]:
# Build the feature matrix and label vector.
# The label column is referenced through `target` (defined in the loading cell)
# instead of repeating the literal name, so both stay in sync.
# 'id', 'age' and 'height' are excluded from the features as well.
# NOTE(review): presumably 'age'/'height' are dropped because derived columns
# replace them in the cleaned file — confirm against the data-cleaning step.
X = df.drop(columns=['id', target, 'age', 'height'])
y = df[target]

In [63]:
# Hold out 10% of the rows for testing; stratifying on y keeps the label
# proportions the same in both splits, and the fixed seed makes the split
# repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
    stratify=y,
)


In [65]:

# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits so no test information leaks into it.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Persist the fitted scaler next to the notebook so the identical scaling can
# be reapplied to new inputs later.
joblib.dump(scaler, "standard_scaler.joblib")

['standard_scaler.joblib']

# SMOTE balancing

In [68]:
# Oversample the scaled training split with SMOTE. Only the training data is
# resampled; the test split is left untouched.
smote = SMOTE(random_state=42)

# Resample once. The original cell called fit_resample twice on the same
# inputs with the same integer seed, which repeats the work and keeps two
# copies of the same resampled data in memory.
X_train_smote, y_train_smote = smote.fit_resample(x_train_scaled, y_train)

# Later cells use both naming conventions, so expose the same resampled data
# under the second pair of names as well.
x_train_res, y_train_res = X_train_smote, y_train_smote

print(f"Original training shape: {x_train_scaled.shape}")
print(f"SMOTE training shape: {X_train_smote.shape}")

Original training shape: (56251, 14)
SMOTE training shape: (56936, 14)


# MODELS

# -> Decision Tree

In [78]:
# Train a decision tree (default hyperparameters apart from the seed) on the
# SMOTE-resampled training data; fit() returns the estimator itself.
dt = DecisionTreeClassifier(random_state=42).fit(X_train_smote, y_train_smote)

# Evaluate on the scaled hold-out split.
dt_pred = dt.predict(x_test_scaled)
acc_dt = accuracy_score(y_test, dt_pred)
# The recorded outputs show MSE and accuracy summing to 1, i.e. on these
# labels the MSE is simply the misclassification rate.
mse = mean_squared_error(y_test, dt_pred)

print("Decision Tree Model Performance:")
print("Accuracy:", acc_dt)
print("MSE:", mse)

Decision Tree Model Performance:
Accuracy: 0.6168613021916494
R² score: -0.5327873669484651
MSE: 0.38313869780835064


# -> Linear Regression

In [80]:
# Fit an ordinary linear regression on the resampled training data and use it
# as a classifier by thresholding its continuous output.
lr = LinearRegression().fit(X_train_smote, y_train_smote)
lr_pred_raw = lr.predict(x_test_scaled)

# Scores of 0.5 or more become class 1, everything else class 0.
lr_pred = [int(score >= 0.5) for score in lr_pred_raw]

acc_lr = accuracy_score(y_test, lr_pred)
# MSE is computed on the thresholded labels, not on the raw regression output.
mse = mean_squared_error(y_test, lr_pred)

print("Linear Regression Model Performance:")
print("Accuracy:", acc_lr)
print("MSE:", mse)

Linear Regression Model Performance:
Accuracy: 0.7221244600863862
MSE: 0.2778755399136138


# -> Logistic Regression

In [82]:
# Train logistic regression on the resampled training data. max_iter is set
# to 5000 rather than left at the library default.
logreg = LogisticRegression(max_iter=5000, random_state=42).fit(
    x_train_res, y_train_res
)

# Predict on the scaled hold-out split. The variable name `logref_pred` is
# kept exactly as in the original cell, since other cells may refer to it.
logref_pred = logreg.predict(x_test_scaled)
acc_logreg = accuracy_score(y_test, logref_pred)
mse = mean_squared_error(y_test, logref_pred)

print("Logistic Regression Model Performance:")
print("Accuracy:", acc_logreg)
print("MSE:", mse)

Logistic Regression Model Performance:
Accuracy: 0.7194048952167653 -0.12255074806998234
MSE: 0.2805951047832347


# -> Random Forest

In [84]:

# Train a random forest of 300 trees, each limited to depth 12 and requiring
# at least 4 samples to split a node, on the resampled training data.
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=12,
    min_samples_split=4,
    random_state=42,
).fit(x_train_res, y_train_res)

# Evaluate on the scaled hold-out split.
rf_pred = rf.predict(x_test_scaled)
acc_rf = accuracy_score(y_test, rf_pred)
mse = mean_squared_error(y_test, rf_pred)

print("Random Forest Model Performance:")
print("Accuracy:", acc_rf)
print("MSE:", mse)

Random Forest Model Performance:
Accuracy: 0.7277235642297233
MSE: 0.27227643577027677


# -> XGBoost

In [86]:

# Hyperparameters for the gradient-boosted tree classifier, gathered in one
# place so they are easy to read and tweak.
xgb_params = {
    "n_estimators": 300,
    "max_depth": 8,
    "learning_rate": 0.05,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "objective": "binary:logistic",
    "eval_metric": "logloss",
}

# Train on the resampled training data.
xgb = XGBClassifier(**xgb_params)
xgb.fit(x_train_res, y_train_res)

# Evaluate on the scaled hold-out split.
xgb_pred = xgb.predict(x_test_scaled)
mse = mean_squared_error(y_test, xgb_pred)
acc_xgb = accuracy_score(y_test, xgb_pred)

print("XGBoost Model Performance:")
print("Accuracy:", acc_xgb)
print("MSE:", mse)

XGBoost Model Performance:
Accuracy: 0.725323948168293
MSE: 0.2746760518317069


# -> KNN

In [88]:
# Train a 10-nearest-neighbours classifier on the resampled training data.
# The inputs were standardised earlier, so all features share a common scale
# when distances are computed.
knn = KNeighborsClassifier(n_neighbors=10).fit(x_train_res, y_train_res)

# Evaluate on the scaled hold-out split.
knn_pred = knn.predict(x_test_scaled)
acc_knn = accuracy_score(y_test, knn_pred)
mse = mean_squared_error(y_test, knn_pred)

print("KNN Model Performance:")
print("Accuracy:", acc_knn)
print("MSE:", mse)

KNN Model Performance:
Accuracy: 0.7120460726283795
MSE: 0.28795392737162057


# -> SVM

In [90]:
# Train a support vector classifier with an RBF kernel on the resampled
# training data.
svc = SVC(kernel='rbf', random_state=42).fit(x_train_res, y_train_res)

# Evaluate on the scaled hold-out split.
svc_pred = svc.predict(x_test_scaled)
acc_svc = accuracy_score(y_test, svc_pred)
mse = mean_squared_error(y_test, svc_pred)

print("SVM Model Performance:")
print("Accuracy:", acc_svc)
print("MSE:", mse)

SVM Model Performance:
Accuracy: 0.7250039993601024
MSE: 0.2749960006398976


# Export joblib files

In [92]:
import os

# Persist every fitted classifier so it can be reloaded without retraining.
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists first; otherwise this cell fails on a fresh checkout.
os.makedirs("models", exist_ok=True)

# NOTE(review): the LinearRegression model `lr` is not exported here —
# confirm that this omission is intentional.
joblib.dump(logreg, "models/logistic_regression_model.joblib")
joblib.dump(rf, "models/random_forest_model.joblib")
joblib.dump(dt, "models/decision_tree_model.joblib")
joblib.dump(xgb, "models/XGBClassifier_model.joblib")
joblib.dump(knn, "models/knn_model.joblib")
joblib.dump(svc, "models/svc_model.joblib")

print("New models (KNN & SVM) have been saved successfully.")
print("Models have been trained and saved successfully.")



New models (KNN & SVM) have been saved successfully.
Models have been trained and saved successfully.
