Connected to Python 3.12.0

#### Import dependencies

In [None]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
# import all the models needed
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
# import all the metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report # used for discrete classes
from sklearn.metrics import precision_score, recall_score, f1_score # more in depth metrics
# import visualization dependencies
import matplotlib.pyplot as plt
# data scaler
from sklearn.preprocessing import StandardScaler


##### Getting the data

In [None]:
# Load the dataset from the working directory. When the CSV is missing,
# report where we looked and continue with an empty frame.
file_path = 'heart_disease_data.csv'
if not os.path.exists(file_path):
    print(f"File not found: {file_path}. Current working directory: {os.getcwd()}")
    df = pd.DataFrame()
else:
    df = pd.read_csv(file_path)

df  # last expression of the cell: displays the frame in the notebook

##### Explore the data

In [None]:
# (number of rows, number of columns) of the loaded frame
df.shape

In [None]:
# Count of missing values in each column
df.isnull().sum()

In [None]:
# Print a summary of the frame (columns, non-null counts, dtypes)
df.info()

##### Setting up features and targets and the train/test split

In [None]:
# Separate the label column from the predictors.
X = df.drop(columns=['target'])  # features: every column except the label
y = df['target']  # target: the label to predict

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Training the models

In [None]:
# Test accuracy of each model, appended in the order the models are evaluated
accuracies = []

##### Logistic regression

In [None]:
# Logistic regression is trained on standardised features. The scaler is
# fitted on the training split only and then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Fit logistic regression on the standardised training features
model = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)

# Class predictions for both splits, used for the accuracy scores
y_train_pred, y_test_pred = (
    model.predict(X_train_scaled),
    model.predict(X_test_scaled),
)

In [None]:
# Score the logistic-regression predictions on both splits
train_acc = accuracy_score(y_train, y_train_pred)
logi_accuracy = accuracy_score(y_test, y_test_pred)

# Keep the test score (3 decimals, as text) for the model comparison
accuracies.append(f"{logi_accuracy:.3f}")

print(f"Training Accuracy: {train_acc:.3f}")
print(f"Testing Accuracy:  {logi_accuracy:.3f}")

##### Random forest Classifier

In [None]:
# Fit a random forest on the unscaled features.
# random_state is pinned (same seed as the train/test split) so repeated runs
# build the same forest and report the same accuracy; without it the score
# changes from run to run and the model comparison is not reproducible.
model1 = RandomForestClassifier(random_state=42)
model1.fit(X_train, y_train)

# Class predictions for both splits, used for the accuracy scores
y_train_pred1 = model1.predict(X_train)
y_test_pred1 = model1.predict(X_test)

In [None]:
# Score the random-forest predictions on both splits
train_acc1 = accuracy_score(y_train, y_train_pred1)
forest_accuracy = accuracy_score(y_test, y_test_pred1)

# Keep the test score (3 decimals, as text) for the model comparison
accuracies.append(f"{forest_accuracy:.3f}")

print(f"Training Accuracy: {train_acc1:.3f}")
print(f"Testing Accuracy:  {forest_accuracy:.3f}")

##### XGBoost

In [None]:
# XGBoost classifier with 500 estimators, trained on the unscaled features
model2 = xgb.XGBClassifier(n_estimators=500)
model2.fit(X_train, y_train)

# Class predictions for both splits, used for the accuracy scores
y_train_pred2, y_test_pred2 = model2.predict(X_train), model2.predict(X_test)

In [None]:
# Score the XGBoost predictions on both splits
train_acc2 = accuracy_score(y_train, y_train_pred2)
xgb_accuracy = accuracy_score(y_test, y_test_pred2)

# Keep the test score (3 decimals, as text) for the model comparison
accuracies.append(f"{xgb_accuracy:.3f}")

print(f"Training Accuracy: {train_acc2:.3f}")
print(f"Testing Accuracy:  {xgb_accuracy:.3f}")

In [None]:
# Display names and bar colours, one entry per model, in evaluation order
models = ['LogisticRegression', 'RandomForestClassifier', 'XGBoost']
colors = ['blue', 'red', 'green']

# The scores were stored as formatted text; turn them into numbers for plotting
accuracies = list(map(float, accuracies))

In [None]:
# Bar chart comparing the test accuracy of the models
plt.figure(figsize=(10, 6))
bars = plt.bar(models, accuracies, color=colors)
plt.title('Comparison of all the models')
plt.xlabel('Machine learning models')
plt.ylabel('Accuracy scores')
plt.xticks(rotation=45)
plt.tight_layout()

# Write each score slightly above the top of its bar, centred horizontally
for bar, score in zip(bars, accuracies):
    label_x = bar.get_x() + bar.get_width() / 2
    label_y = bar.get_height() + 0.01
    plt.text(label_x, label_y, score, ha='center', va='center_baseline', fontsize=8)

In [None]:
# Numbered listing of every model's test accuracy as a percentage.
# The loop variable must not be called `model`: at notebook top level that
# would rebind the fitted LogisticRegression stored in `model` to a plain
# string and break the later `model.predict(...)` call on user input.
# The `%` format spec also avoids float artefacts such as 56.99999999999999%.
for rank, (model_name, accuracy) in enumerate(zip(models, accuracies), start=1):
    print(f'{rank}.{model_name}: {accuracy:.1%}')

#### Let's test the model:

##### Get user data

In [None]:
def _read_number(prompt, cast):
    """Prompt with *prompt* until the reply can be converted by *cast*."""
    while True:
        reply = input(prompt)
        try:
            return cast(reply)
        except ValueError:
            # Re-ask instead of aborting, so one typo does not discard the
            # answers that were already entered.
            print(f'Invalid value {reply!r}: expected {cast.__name__}. Please try again.')


def get_user_input():
    """Interactively read one patient record from standard input.

    Each field is requested in turn; a reply that cannot be parsed is
    rejected with a message and the same question is asked again.

    Returns:
        A one-row nested list
        ``[[age, sex, cp, trestbps, chol, fbs, restecg, thalach, exang,
        oldpeak, slope, ca, thal]]``; ``oldpeak`` is a float, every other
        value an int.
    """
    # NOTE(review): the returned order presumably mirrors the feature columns
    # of the training data - verify against the CSV header.
    # NOTE(review): the 'sex' prompt does not state the expected coding -
    # confirm against the dataset.
    fields = (
        ('age', 'Enter the age: ', int),
        ('sex', 'Enter the sex: ', int),
        ('cp', 'Enter chest pain type (0-3) (0 for typical angina, 1 for atypical angina, 2 for non-anginal pain, 3 for asymptomatic): ', int),
        ('trestbps', 'Enter resting blood pressure: ', int),
        ('chol', 'Enter serum cholesterol level: ', int),
        ('fbs', 'Is fasting blood sugar > 120 mg/dl? (1 = yes, 0 = no): ', int),
        ('restecg', 'Enter resting electrocardiographic results (0 = normal, 1 = ST-T abnormality, 2 = left ventricular hypertrophy): ', int),
        ('thalach', 'Enter maximum heart rate achieved: ', int),
        ('exang', 'Exercise induced angina? (1 = yes, 0 = no): ', int),
        ('oldpeak', 'Enter ST depression induced by exercise relative to rest (oldpeak): ', float),
        ('slope', 'Enter the slope of the peak exercise ST segment (0 = upsloping, 1 = flat, 2 = downsloping): ', int),
        # The original prompt contained a mis-encoded dash ("0â€“4").
        ('ca', 'Enter number of major vessels (0-4) colored by fluoroscopy: ', int),
        ('thal', 'Enter thalassemia type (1 = normal, 2 = fixed defect, 3 = reversible defect): ', int),
    )

    # Single row wrapped in an outer list, as returned by the original code
    return [[_read_number(prompt, cast) for _name, prompt, cast in fields]]

##### Predict and evaluate

In [None]:
# Collect one patient record and score it.
# NOTE(review): `model` and `scaler` are expected to be the fitted
# LogisticRegression and StandardScaler from the training cells; make sure no
# earlier cell has rebound either name before running this one.
user_data = get_user_input()

# Apply the training-time scaling once and reuse it for both model calls
user_data_scaled = scaler.transform(user_data)
user_prediction = model.predict(user_data_scaled)
user_prediction_proba = model.predict_proba(user_data_scaled)

print('Prediction:', 'Heart attack' if user_prediction[0] == 1 else 'No heart attack')
# Column 1 of predict_proba is read as the probability of class 1
# (assumes the classes are ordered [0, 1] - confirm via model.classes_)
print(f'Prediction probability: {user_prediction_proba[0][1] * 100:.2f}% for heart attack')