In [2]:
# =====================================================
# LifeScanAI – Heart Disease Prediction (Large Dataset)
# =====================================================

# 1. Imports
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# 2. Load the dataset from CSV and preview it
df = pd.read_csv("heart_large.csv")

print("Dataset Shape (Before Cleaning):", df.shape)
display(df.head(5))

# 3. Basic cleaning
# Exact duplicate rows are removed first, then every row that still
# contains at least one missing value.
df = df.drop_duplicates().dropna()

print("Dataset Shape (After Cleaning):", df.shape)

# 4. Encoding target variable (HeartDisease)
# Yes → 1, No → 0
target_codes = df["HeartDisease"].map({"Yes": 1, "No": 0})

# Series.map() silently turns any label missing from the dict into NaN.
# Fail here with a clear message instead of letting NaN targets reach the
# split or the model fit.
if target_codes.isna().any():
    unexpected_labels = df.loc[target_codes.isna(), "HeartDisease"].unique().tolist()
    raise ValueError(
        f"Unexpected HeartDisease labels (expected 'Yes'/'No'): {unexpected_labels}"
    )
df["HeartDisease"] = target_codes

# 5. Encoding categorical features
categorical_cols = [
    "Smoking", "AlcoholDrinking", "Stroke", "DiffWalking",
    "Sex", "AgeCategory", "Race", "Diabetic",
    "PhysicalActivity", "GenHealth", "Asthma",
    "KidneyDisease", "SkinCancer"
]

# One fitted encoder per column. Refitting a single shared LabelEncoder
# overwrites its classes_ on every iteration, so only the last column's
# category → integer mapping would survive and raw inputs could not be
# encoded consistently at prediction time.
label_encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder

# Persist the mappings so inference code can reproduce exactly the same
# integer codes the model is trained on. The context manager closes the file
# even if pickling fails.
with open("label_encoders.pkl", "wb") as encoders_file:
    pickle.dump(label_encoders, encoders_file)

# 6. Splitting features and target
# Every column except the label is used as a model input.
y = df["HeartDisease"]
X = df.drop(columns=["HeartDisease"])

print("Feature columns:")
print(list(X.columns))

print("\nTarget distribution:")
print(y.value_counts())

# 7. Train–Test Split (80% Train, 20% Test)
# stratify=y keeps the class proportions the same in both splits;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 8. Feature scaling
# The scaler is fitted on the training split only; the same fitted
# statistics are then applied to both splits so no test-set information
# leaks into preprocessing.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 9. Training model
# fit() returns the estimator itself, so construction and training chain.
model = LogisticRegression(max_iter=1000, n_jobs=-1).fit(X_train_scaled, y_train)

# 10. Evaluating model on the held-out test split
# NOTE(review): the recorded run of this cell shows recall of only 0.09 for
# class 1 despite ~0.91 accuracy — accuracy is dominated by the majority
# class. Consider class_weight="balanced" or threshold tuning; confirm with
# the model owner before changing the trained artifact.
y_pred = model.predict(X_test_scaled)

print("\nModel Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", cm)

# Rows of the matrix are true classes, columns are predicted classes,
# so for the binary case the rows unpack as (tn, fp) and (fn, tp).
(tn, fp), (fn, tp) = cm
print(
    f"\nTrue Negatives: {tn}",
    f"False Positives: {fp}",
    f"False Negatives: {fn}",
    f"True Positives: {tp}",
    sep="\n",
)

# 11. Save trained artifacts
# Each file is opened in a context manager so the handle is flushed and
# closed deterministically, even if pickling raises. A bare open() passed
# straight to pickle.dump() is never explicitly closed.
with open("heart_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

with open("scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

print("\nTraining completed successfully.")
print("Saved files: heart_model.pkl, scaler.pkl")

Dataset Shape (Before Cleaning): (319795, 18)


Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


Dataset Shape (After Cleaning): (301717, 18)
Feature columns:
['BMI', 'Smoking', 'AlcoholDrinking', 'Stroke', 'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory', 'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'SleepTime', 'Asthma', 'KidneyDisease', 'SkinCancer']

Target distribution:
HeartDisease
0    274456
1     27261
Name: count, dtype: int64





Model Accuracy: 0.9102976269388837

Classification Report:

              precision    recall  f1-score   support

           0       0.92      0.99      0.95     54892
           1       0.52      0.09      0.16      5452

    accuracy                           0.91     60344
   macro avg       0.72      0.54      0.56     60344
weighted avg       0.88      0.91      0.88     60344


Confusion Matrix:
 [[54414   478]
 [ 4935   517]]

True Negatives: 54414
False Positives: 478
False Negatives: 4935
True Positives: 517

Training completed successfully.
Saved files: heart_model.pkl, scaler.pkl
