In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix


In [None]:
# Load dataset
df = pd.read_csv("heart_disease.csv")

# View first 5 rows
df.head()


Unnamed: 0,Age,Gender,Blood Pressure,Cholesterol Level,Exercise Habits,Smoking,Family Heart Disease,Diabetes,BMI,High Blood Pressure,...,High LDL Cholesterol,Alcohol Consumption,Stress Level,Sleep Hours,Sugar Consumption,Triglyceride Level,Fasting Blood Sugar,CRP Level,Homocysteine Level,Heart Disease Status
0,56.0,Male,153.0,155.0,High,Yes,Yes,No,24.991591,Yes,...,No,High,Medium,7.633228,Medium,342.0,,12.969246,12.38725,No
1,69.0,Female,146.0,286.0,High,No,Yes,Yes,25.221799,No,...,No,Medium,High,8.744034,Medium,133.0,157.0,9.355389,19.298875,No
2,46.0,Male,126.0,216.0,Low,No,No,No,29.855447,No,...,Yes,Low,Low,4.44044,Low,393.0,92.0,12.709873,11.230926,No
3,32.0,Female,122.0,293.0,High,Yes,Yes,No,24.130477,Yes,...,Yes,Low,High,5.249405,High,293.0,94.0,12.509046,5.961958,No
4,60.0,Male,166.0,242.0,Low,Yes,Yes,Yes,20.486289,Yes,...,No,Low,High,7.030971,High,263.0,154.0,10.381259,8.153887,No


In [None]:
df.info()
df.isnull().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Age                   9971 non-null   float64
 1   Gender                9981 non-null   object 
 2   Blood Pressure        9981 non-null   float64
 3   Cholesterol Level     9970 non-null   float64
 4   Exercise Habits       9975 non-null   object 
 5   Smoking               9975 non-null   object 
 6   Family Heart Disease  9979 non-null   object 
 7   Diabetes              9970 non-null   object 
 8   BMI                   9978 non-null   float64
 9   High Blood Pressure   9974 non-null   object 
 10  Low HDL Cholesterol   9975 non-null   object 
 11  High LDL Cholesterol  9974 non-null   object 
 12  Alcohol Consumption   7414 non-null   object 
 13  Stress Level          9978 non-null   object 
 14  Sleep Hours           9975 non-null   float64
 15  Sugar Consumption   

Unnamed: 0,0
Age,29
Gender,19
Blood Pressure,19
Cholesterol Level,30
Exercise Habits,25
Smoking,25
Family Heart Disease,21
Diabetes,30
BMI,22
High Blood Pressure,26


In [None]:
X = df.drop("Heart Disease Status", axis=1)  # features
y = df["Heart Disease Status"]               # target

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)


In [None]:
# Identify categorical columns (excluding the target column, which is already handled in y)
categorical_cols = X_train.select_dtypes(include='object').columns

# Apply one-hot encoding to the categorical columns
X_train_encoded = pd.get_dummies(X_train, columns=categorical_cols, drop_first=True)
X_test_encoded = pd.get_dummies(X_test, columns=categorical_cols, drop_first=True)

# Ensure both train and test sets have the same columns after encoding
X_train_encoded, X_test_encoded = X_train_encoded.align(X_test_encoded, join='left', axis=1, fill_value=0)

# Impute missing numerical values using the mean from the training set
for col in X_train_encoded.columns:
    if X_train_encoded[col].isnull().any():
        mean_val = X_train_encoded[col].mean()
        X_train_encoded[col] = X_train_encoded[col].fillna(mean_val)
        X_test_encoded[col] = X_test_encoded[col].fillna(mean_val)

model = LogisticRegression(max_iter=1000)
model.fit(X_train_encoded, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
y_pred = model.predict(X_test_encoded)

In [None]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label='Yes')
recall = recall_score(y_test, y_pred, pos_label='Yes')
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.8065
Precision: 0.0
Recall: 0.0
Confusion Matrix:
 [[1613    0]
 [ 387    0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# [[TN FP]
#  [FN TP]]

In [None]:
class_distribution = y_train.value_counts()
print("Class distribution in training set:\n", class_distribution)

Class distribution in training set:
 Heart Disease Status
No     6387
Yes    1613
Name: count, dtype: int64


In [None]:
import sys
!{sys.executable} -m pip install imblearn
print("imblearn installed successfully.")

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: imblearn
Successfully installed imblearn-0.0
imblearn installed successfully.


In [None]:
from imblearn.over_sampling import SMOTE

# Instantiate SMOTE with a random state for reproducibility
smote = SMOTE(random_state=42)

# Apply SMOTE to the training data
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_encoded, y_train)

print("Original training set shape:", X_train_encoded.shape, y_train.shape)
print("Resampled training set shape:", X_train_resampled.shape, y_train_resampled.shape)
print("Class distribution after SMOTE:", y_train_resampled.value_counts())

Original training set shape: (8000, 24) (8000,)
Resampled training set shape: (12774, 24) (12774,)
Class distribution after SMOTE: Heart Disease Status
Yes    6387
No     6387
Name: count, dtype: int64


In [None]:
from sklearn.linear_model import LogisticRegression

# Instantiate a new Logistic Regression model with increased max_iter
model_resampled = LogisticRegression(max_iter=1000)

# Fit the model to the resampled training data
model_resampled.fit(X_train_resampled, y_train_resampled)
print("Logistic Regression model trained on resampled data.")

Logistic Regression model trained on resampled data.


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
from sklearn.linear_model import LogisticRegression

# Instantiate a new Logistic Regression model with increased max_iter
model_resampled = LogisticRegression(max_iter=5000)

# Fit the model to the resampled training data
model_resampled.fit(X_train_resampled, y_train_resampled)
print("Logistic Regression model trained on resampled data.")

Logistic Regression model trained on resampled data.


In [None]:
y_pred_resampled = model_resampled.predict(X_test_encoded)
print("Predictions made on the test set using the resampled model.")

Predictions made on the test set using the resampled model.


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

accuracy_resampled = accuracy_score(y_test, y_pred_resampled)
precision_resampled = precision_score(y_test, y_pred_resampled, pos_label='Yes')
recall_resampled = recall_score(y_test, y_pred_resampled, pos_label='Yes')
conf_matrix_resampled = confusion_matrix(y_test, y_pred_resampled)

print("Accuracy (resampled model):", accuracy_resampled)
print("Precision (resampled model):", precision_resampled)
print("Recall (resampled model):", recall_resampled)
print("Confusion Matrix (resampled model):\n", conf_matrix_resampled)

Accuracy (resampled model): 0.6605
Precision (resampled model): 0.19327731092436976
Recall (resampled model): 0.23772609819121446
Confusion Matrix (resampled model):
 [[1229  384]
 [ 295   92]]


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Instantiate a RandomForestClassifier model
rf_model = RandomForestClassifier(random_state=42)

# Fit the model to the resampled training data
rf_model.fit(X_train_resampled, y_train_resampled)

print("RandomForestClassifier model trained successfully on resampled data.")

RandomForestClassifier model trained successfully on resampled data.


In [None]:
y_pred_rf = rf_model.predict(X_test_encoded)
print("Predictions made on the test set using the RandomForestClassifier model.")

Predictions made on the test set using the RandomForestClassifier model.


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf, pos_label='Yes')
recall_rf = recall_score(y_test, y_pred_rf, pos_label='Yes')
conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)

print("Accuracy (RandomForestClassifier):"), accuracy_rf
print("Precision (RandomForestClassifier):"), precision_rf
print("Recall (RandomForestClassifier):"), recall_rf
print("Confusion Matrix (RandomForestClassifier):\n"), conf_matrix_rf

Accuracy (RandomForestClassifier):
Precision (RandomForestClassifier):
Recall (RandomForestClassifier):
Confusion Matrix (RandomForestClassifier):



(None,
 array([[1462,  151],
        [ 354,   33]]))

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf, pos_label='Yes')
recall_rf = recall_score(y_test, y_pred_rf, pos_label='Yes')
conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)

print(f"Accuracy (RandomForestClassifier): {accuracy_rf}")
print(f"Precision (RandomForestClassifier): {precision_rf}")
print(f"Recall (RandomForestClassifier): {recall_rf}")
print(f"Confusion Matrix (RandomForestClassifier):\n{conf_matrix_rf}")

Accuracy (RandomForestClassifier): 0.7475
Precision (RandomForestClassifier): 0.1793478260869565
Recall (RandomForestClassifier): 0.08527131782945736
Confusion Matrix (RandomForestClassifier):
[[1462  151]
 [ 354   33]]


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, precision_score

# Define the parameter grid for RandomForestClassifier
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'class_weight': ['balanced', None]
}

# Create a custom scorer for precision of the 'Yes' class
precision_scorer = make_scorer(precision_score, pos_label='Yes')

# Instantiate GridSearchCV
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring=precision_scorer,
    cv=5,
    n_jobs=-1,
    verbose=2
)

# Fit GridSearchCV to the resampled training data
grid_search.fit(X_train_resampled, y_train_resampled)

# Print the best parameters and best score
print("Best parameters found: ", grid_search.best_params_)
print("Best precision score found: ", grid_search.best_score_)


Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best parameters found:  {'class_weight': None, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best precision score found:  0.8638195860743879


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

# Get the best model from GridSearchCV
best_rf_model = grid_search.best_estimator_

# Make predictions on the test set using the best model
y_pred_best_rf = best_rf_model.predict(X_test_encoded)

# Evaluate the best model's performance
accuracy_best_rf = accuracy_score(y_test, y_pred_best_rf)
precision_best_rf = precision_score(y_test, y_pred_best_rf, pos_label='Yes')
recall_best_rf = recall_score(y_test, y_pred_best_rf, pos_label='Yes')
conf_matrix_best_rf = confusion_matrix(y_test, y_pred_best_rf)

print(f"Accuracy (Best RandomForestClassifier): {accuracy_best_rf}")
print(f"Precision (Best RandomForestClassifier): {precision_best_rf}")
print(f"Recall (Best RandomForestClassifier): {recall_best_rf}")
print(f"Confusion Matrix (Best RandomForestClassifier):\n{conf_matrix_best_rf}")

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

# Ensure GridSearchCV has completed fitting in case of previous interruptions
# (param_grid, precision_scorer, X_train_resampled, y_train_resampled are assumed to be defined from previous cells)

# Re-fit GridSearchCV if it was interrupted or not fully fitted
# (This line is added for robustness, assuming grid_search object is defined but not necessarily fitted)
grid_search.fit(X_train_resampled, y_train_resampled)

# Get the best model from GridSearchCV
best_rf_model = grid_search.best_estimator_

# Make predictions on the test set using the best model
y_pred_best_rf = best_rf_model.predict(X_test_encoded)

# Evaluate the best model's performance
accuracy_best_rf = accuracy_score(y_test, y_pred_best_rf)
precision_best_rf = precision_score(y_test, y_pred_best_rf, pos_label='Yes')
recall_best_rf = recall_score(y_test, y_pred_best_rf, pos_label='Yes')
conf_matrix_best_rf = confusion_matrix(y_test, y_pred_best_rf)

print(f"Accuracy (Best RandomForestClassifier): {accuracy_best_rf}")
print(f"Precision (Best RandomForestClassifier): {precision_best_rf}")
print(f"Recall (Best RandomForestClassifier): {recall_best_rf}")
print(f"Confusion Matrix (Best RandomForestClassifier):\n{conf_matrix_best_rf}")

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Accuracy (Best RandomForestClassifier): 0.752
Precision (Best RandomForestClassifier): 0.18128654970760233
Recall (Best RandomForestClassifier): 0.08010335917312661
Confusion Matrix (Best RandomForestClassifier):
[[1473  140]
 [ 356   31]]


In [40]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

# Load dataset
df = pd.read_csv("heart_disease.csv")

# Features & target
X = df.drop("Heart Disease Status", axis=1)
y = df["Heart Disease Status"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Identify categorical columns
categorical_cols = X_train.select_dtypes(include='object').columns

# Apply one-hot encoding to the categorical columns
X_train_encoded = pd.get_dummies(X_train, columns=categorical_cols, drop_first=True)
X_test_encoded = pd.get_dummies(X_test, columns=categorical_cols, drop_first=True)

# Ensure both train and test sets have the same columns after encoding
X_train_encoded, X_test_encoded = X_train_encoded.align(X_test_encoded, join='left', axis=1, fill_value=0)

# Impute missing numerical values using the mean from the training set
for col in X_train_encoded.columns:
    if X_train_encoded[col].isnull().any():
        mean_val = X_train_encoded[col].mean()
        X_train_encoded[col] = X_train_encoded[col].fillna(mean_val)
        X_test_encoded[col] = X_test_encoded[col].fillna(mean_val)

# Scaling (now applied to the fully preprocessed data)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)

# Logistic Regression (balanced)
model = LogisticRegression(
    max_iter=1000,
    class_weight='balanced'
)

# Train model
model.fit(X_train_scaled, y_train)

# Predictions
y_pred = model.predict(X_test_scaled)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, pos_label='Yes'))
print("Recall:", recall_score(y_test, y_pred, pos_label='Yes'))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.501
Precision: 0.18143899895724713
Recall: 0.4496124031007752
Confusion Matrix:
 [[828 785]
 [213 174]]
