Dataset: https://www.kaggle.com/datasets/mahatiratusher/heart-disease-risk-prediction-dataset/data


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Location of the heart-disease risk CSV inside the Kaggle input directory.
DATASET_PATH = "/kaggle/input/heart-disease-risk-prediction-dataset/heart_disease_risk_dataset_earlymed.csv"

# Load the whole dataset into a DataFrame used by every later cell.
data = pd.read_csv(DATASET_PATH)

In [None]:
# Peek at one randomly chosen row as a quick sanity check of the loaded data.
data.sample(n=1)

In [None]:
# Number of missing values in each column.
print(data.isnull().sum())

# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
data.info()

In [None]:
# Name of the label column; every other column is used as a feature.
TARGET_COLUMN = 'Heart_Risk'

y = data[TARGET_COLUMN]
X = data.drop(columns=TARGET_COLUMN)

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

In [None]:
# Dump each split for a visual check; all but the last are followed by a
# blank separator line.
for split in (X_train, y_train, X_test):
    print(split, "\n")
print(y_test)

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transformation to both splits so the test data does not
# influence the scaler.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Summary statistics for the dataset's columns.
summary_stats = data.describe()
print(summary_stats)

In [None]:
# Pairwise correlation between all columns, drawn as a colour-coded grid.
corr_matrix = data.corr()

# NOTE: `fmt` only matters when annotations are enabled; it is kept so that
# switching `annot` to True shows two-decimal values.
heatmap_ax = sns.heatmap(
    data=corr_matrix,
    cmap='coolwarm',
    annot=False,
    fmt=".2f",
)
heatmap_ax.set_title('Feature Correlation Heatmap')
plt.show()

In [None]:
# Bar chart of how many rows fall into each Heart_Risk class.
risk_ax = sns.countplot(data=data, x='Heart_Risk')
risk_ax.set_title('Distribution of Heart Disease Risk')
plt.show()

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score


# Shared seed so every candidate model is reproducible.
SEED = 42

# Candidate classifiers keyed by the display name used in the evaluation output.
models = {
    "Logistic Regression": LogisticRegression(random_state=SEED, max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=SEED),
    "Gradient Boosting": GradientBoostingClassifier(random_state=SEED),
    "Decision Tree": DecisionTreeClassifier(random_state=SEED),
}


# Train every candidate on the scaled training split and report its accuracy,
# per-class metrics and confusion matrix on the scaled test split.
separator = '-' * 50

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    test_accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    matrix = confusion_matrix(y_test, y_pred)

    print(f"{name} Accuracy: {test_accuracy:.2f}")
    print(report)
    print(matrix)
    print(separator)

In [None]:
# Train a standalone gradient boosting classifier on the scaled training
# split; this is the model that is evaluated in detail and saved below.
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X=X_train_scaled, y=y_train)


In [None]:
# Predicted labels for the held-out (scaled) test split.
y_pred = gb_model.predict(X=X_test_scaled)

In [None]:
# Overall accuracy of the gradient boosting model on the test split.
accuracy = accuracy_score(y_test, y_pred)
print(f"Gradient Boosting Accuracy: {accuracy:.4f}")

# Print each detailed report under its own heading.
for heading, metric_fn in (
    ("Classification Report:", classification_report),
    ("Confusion Matrix:", confusion_matrix),
):
    print(heading)
    print(metric_fn(y_test, y_pred))

In [None]:

# Importance score of each input column according to the fitted model.
feature_importances = gb_model.feature_importances_

# Pair every feature name with its score and rank from most to least important.
feature_importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

print("Feature Importances:")
print(feature_importance_df)

In [None]:
import joblib

# The model was fitted on StandardScaler-transformed features, so the fitted
# scaler is persisted as well; without it, code that loads the model cannot
# reproduce the preprocessing the model expects.
joblib.dump(scaler, 'heart_disease_scaler.pkl')

# Persist the trained gradient boosting model (same file name as before).
joblib.dump(gb_model, 'heart_disease_model.pkl')