In [16]:
import pandas as pd
import numpy as np
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier  # or any other model
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [17]:
# Load your preprocessed medical dataset
df = pd.read_csv(r"C:\Users\bhara\Desktop\HEART_DISEASE_PREDICTOR\medical_data_4000.csv")
print(df.head())
print(df.isnull().sum())
print(df.shape)
df.drop('education', axis=1, inplace=True)
print(df['TenYearCHD'].value_counts())

   gender  age  education  currentSmoker  cigsPerDay  BPMeds  prevalentStroke  \
0       0   48          2              0           5       1                0   
1       0   61          2              0          36       0                1   
2       1   78          4              1          12       0                1   
3       1   70          2              0          13       0                0   
4       0   47          3              0          25       0                1   

   prevalentHyp  diabetes  totChol  sysBP  diaBP    BMI  heartRate  glucose  \
0             1         1      274    128     85  20.46        100       68   
1             0         0      340    120    110  28.93        103       83   
2             1         1      295    167    103  15.72         72      158   
3             1         0      234    158     86  42.64         92      113   
4             1         1      268     96     69  42.05         74      175   

   TenYearCHD  
0           0  
1     

In [18]:
df_majority = df[df.TenYearCHD == 0]
df_minority = df[df.TenYearCHD == 1]    
# Upsample minority class
df_minority_upsampled = resample(df_minority,
                                    replace=True,     # sample with replacement
                                    n_samples=2000,    # to match majority class
                                    random_state=42) # reproducible results
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
# Display new class counts
print(df_upsampled['TenYearCHD'].value_counts())

TenYearCHD
0    2031
1    2000
Name: count, dtype: int64


In [19]:
# Separate features (X) and target label (y)
X = df_upsampled.drop('TenYearCHD', axis=1)    
y = df_upsampled['TenYearCHD']  # target variable

In [20]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [21]:
# Initialize and train model
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [22]:
# Predict and evaluate
y_pred = rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.7534076827757125
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.78      0.77       429
           1       0.74      0.72      0.73       378

    accuracy                           0.75       807
   macro avg       0.75      0.75      0.75       807
weighted avg       0.75      0.75      0.75       807

Confusion Matrix:
 [[335  94]
 [105 273]]


In [23]:
# # Save the trained model for later deployment
import pickle
pickle.dump(rf, open(r"C:\Users\bhara\Desktop\HEART_DISEASE_PREDICTOR\models\disease_model.pkl", 'wb'))
pickle.dump(scaler, open(r"C:\Users\bhara\Desktop\HEART_DISEASE_PREDICTOR\models\scaler.pkl", 'wb'))

In [24]:
classifiers = {
    "Random Forest": RandomForestClassifier(),
    "Logistic Regression": LogisticRegression(),
    "Support Vector Machine": SVC(),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(),
    "K-Nearest Neighbors": KNeighborsClassifier()
}
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(f"{name} Accuracy: {accuracy_score(y_test, y_pred)}")
    print(f"{name} Classification Report:\n", classification_report(y_test, y_pred))
    print(f"{name} Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Random Forest Accuracy: 0.734820322180917
Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.77      0.76       429
           1       0.73      0.69      0.71       378

    accuracy                           0.73       807
   macro avg       0.73      0.73      0.73       807
weighted avg       0.73      0.73      0.73       807

Random Forest Confusion Matrix:
 [[332  97]
 [117 261]]
Logistic Regression Accuracy: 0.4857496902106567
Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.52      0.47      0.49       429
           1       0.46      0.51      0.48       378

    accuracy                           0.49       807
   macro avg       0.49      0.49      0.49       807
weighted avg       0.49      0.49      0.49       807

Logistic Regression Confusion Matrix:
 [[201 228]
 [187 191]]
Support Vector Machine Accuracy: 0.5551425030978935
Sup

In [25]:
# append result to dataframe
results = []
for name, clf in classifiers.items():
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results.append((name, acc))
results_df = pd.DataFrame(results, columns=['Model', 'Accuracy'])
print(results_df)


                    Model  Accuracy
0           Random Forest  0.734820
1     Logistic Regression  0.485750
2  Support Vector Machine  0.555143
3             Naive Bayes  0.493185
4           Decision Tree  0.687732
5     K-Nearest Neighbors  0.542751


In [26]:
print(rf.predict(X_test[:5]))  # Example prediction on first 5 test samples
X_test = X_test[10].reshape(-1, 1)  # Reshape a single test sample
print(rf.predict(X_test.T))  # Predict on the reshaped sample
#  TEST 1
print("predicted class :" , rf.predict(X_test.reshape(1, -1))[0])
print("actual class :" , y_test.iloc[10])

[0 0 0 1 1]
[0]
predicted class : 0
actual class : 0


In [27]:
# load random classifier model
with open(r"C:\Users\bhara\Desktop\HEART_DISEASE_PREDICTOR\models\disease_model.pkl", 'rb') as f:
    loaded_model = pickle.load(f)

# load scaler model
with open(r"C:\Users\bhara\Desktop\HEART_DISEASE_PREDICTOR\models\scaler.pkl", 'rb') as f:
    loaded_scaler = pickle.load(f)

In [28]:
# Predict using the loaded model
def predict_disease(gender, age, currentSmoker, cigsPerDay, BPMeds, prevalentStroke, prevalentHyp, diabetes, totChol, sysBP, diaBP, BMI, heartRate, glucose):
    gender_encoded = 1 if gender.lower() == "male" else 0
    currentSmoker_encoded = 1 if currentSmoker.lower() == "yes" else 0
    BPMeds_encoded = 1 if BPMeds.lower() == "yes" else 0
    prevalentStroke_encoded = 1 if prevalentStroke.lower() == "yes" else 0
    prevalentHyp_encoded = 1 if prevalentHyp.lower() == "yes" else 0
    diabetes_encoded = 1 if diabetes.lower() == "yes" else 0
    feature = np.array([[gender_encoded, age, currentSmoker_encoded, cigsPerDay, BPMeds_encoded, prevalentStroke_encoded,
                         prevalentHyp_encoded, diabetes_encoded, totChol, sysBP, diaBP, BMI, heartRate, glucose]])
    feature_scaled = loaded_scaler.transform(feature)

    predicted_class = loaded_model.predict(feature_scaled)

    message = "High risk of heart disease" if predicted_class[0] == 1 else "Low risk of heart disease"
    return message, predicted_class[0]
    

In [29]:

# Example usage
result = predict_disease("male", 55, "no", 20, "no", "no", "no", "no", 240, 140, 90, 28.5, 80, 100)
message, prediction = result
print(f"Prediction: {message} (class: {prediction})")

Prediction: Low risk of heart disease (class: 0)


