# Load the Heart Disease Dataset from the UCI Data Repository

In [1]:
# ! mkdir heart-disease && cd heart-disease && wget https://archive.ics.uci.edu/static/public/45/heart+disease.zip

In [2]:
# ! cd heart-disease && unzip heart+disease.zip -d heart-disease

In [1]:
import numpy as np
import pandas as pd

import mlflow

mlflow.set_tracking_uri("http://0.0.0.0:5000")
mlflow.set_experiment("nyc-taxi-experiment")

ModuleNotFoundError: No module named 'mlflow'

In [3]:
cleveland = pd.read_csv("heart-disease/processed.cleveland.data", names=["age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach", "exang", "oldpeak", "slope", "ca", "thal", "num"])
cleveland.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    float64
 1   sex       303 non-null    float64
 2   cp        303 non-null    float64
 3   trestbps  303 non-null    float64
 4   chol      303 non-null    float64
 5   fbs       303 non-null    float64
 6   restecg   303 non-null    float64
 7   thalach   303 non-null    float64
 8   exang     303 non-null    float64
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    float64
 11  ca        303 non-null    object 
 12  thal      303 non-null    object 
 13  num       303 non-null    int64  
dtypes: float64(11), int64(1), object(2)
memory usage: 33.3+ KB


In [5]:
hungary = pd.read_csv("heart-disease/processed.hungarian.data", names=["age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach", "exang", "oldpeak", "slope", "ca", "thal", "num"])
hungary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       294 non-null    int64  
 1   sex       294 non-null    int64  
 2   cp        294 non-null    int64  
 3   trestbps  294 non-null    object 
 4   chol      294 non-null    object 
 5   fbs       294 non-null    object 
 6   restecg   294 non-null    object 
 7   thalach   294 non-null    object 
 8   exang     294 non-null    object 
 9   oldpeak   294 non-null    float64
 10  slope     294 non-null    object 
 11  ca        294 non-null    object 
 12  thal      294 non-null    object 
 13  num       294 non-null    int64  
dtypes: float64(1), int64(4), object(9)
memory usage: 32.3+ KB


In [6]:
switzerland = pd.read_csv("heart-disease/processed.switzerland.data", names=["age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach", "exang", "oldpeak", "slope", "ca", "thal", "num"])
switzerland.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123 entries, 0 to 122
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   age       123 non-null    int64 
 1   sex       123 non-null    int64 
 2   cp        123 non-null    int64 
 3   trestbps  123 non-null    object
 4   chol      123 non-null    int64 
 5   fbs       123 non-null    object
 6   restecg   123 non-null    object
 7   thalach   123 non-null    object
 8   exang     123 non-null    object
 9   oldpeak   123 non-null    object
 10  slope     123 non-null    object
 11  ca        123 non-null    object
 12  thal      123 non-null    object
 13  num       123 non-null    int64 
dtypes: int64(5), object(9)
memory usage: 13.6+ KB


In [7]:
veterans = pd.read_csv("heart-disease/processed.va.data", names=["age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach", "exang", "oldpeak", "slope", "ca", "thal", "num"])
veterans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   age       200 non-null    int64 
 1   sex       200 non-null    int64 
 2   cp        200 non-null    int64 
 3   trestbps  200 non-null    object
 4   chol      200 non-null    object
 5   fbs       200 non-null    object
 6   restecg   200 non-null    int64 
 7   thalach   200 non-null    object
 8   exang     200 non-null    object
 9   oldpeak   200 non-null    object
 10  slope     200 non-null    object
 11  ca        200 non-null    object
 12  thal      200 non-null    object
 13  num       200 non-null    int64 
dtypes: int64(5), object(9)
memory usage: 22.0+ KB


In [11]:
heart_disease_df = pd.concat([cleveland, hungary, switzerland, veterans])
heart_disease_df.shape

(920, 14)

# Target (Dependent) Variables

In [12]:
# num: diagnosis of heart disease (angiographic disease status)
#         -- Value 0: < 50% diameter narrowing
#         -- Value 1: > 50% diameter narrowing
heart_disease_df.num.value_counts()

num
0    411
1    265
2    109
3    107
4     28
Name: count, dtype: int64

In [13]:
# ca: number of major vessels (0-3) colored by flourosopy
heart_disease_df.ca.value_counts()

ca
?      611
0.0    176
1.0     65
2.0     38
3.0     20
0        5
2        3
1        2
Name: count, dtype: int64

In [14]:
# thal: 3 = normal; 6 = fixed defect; 7 = reversable defect
heart_disease_df.thal.value_counts()

thal
?      486
3.0    166
7.0    117
7       75
3       30
6       28
6.0     18
Name: count, dtype: int64

# Train Model

In [15]:
# select columns
heart_disease_df = heart_disease_df[["age", "sex", "trestbps", "chol", "fbs", "num"]]
heart_disease_df.shape

(920, 6)

In [19]:
heart_disease_df.head(3)

Unnamed: 0,age,sex,trestbps,chol,fbs,num
0,63.0,1.0,145.0,233.0,1.0,0
1,67.0,1.0,160.0,286.0,0.0,2
2,67.0,1.0,120.0,229.0,0.0,1


In [37]:
(heart_disease_df=="?").sum()

age         0
sex         0
trestbps    0
chol        0
fbs         0
num         0
dtype: int64

In [40]:
heart_disease_df.trestbps.replace("?", np.nan, inplace=True)
heart_disease_df.trestbps.fillna(heart_disease_df.trestbps.astype(float).mean().round(0), inplace=True)
heart_disease_df.chol.replace("?", np.nan, inplace=True)
heart_disease_df.chol.fillna(heart_disease_df.chol.astype(float).mean().round(0), inplace=True)
heart_disease_df.fbs.replace("?", np.nan, inplace=True)
heart_disease_df.fbs.fillna(heart_disease_df.fbs.astype(float).mean().round(0), inplace=True)
(heart_disease_df=="?").sum()

age         0
sex         0
trestbps    0
chol        0
fbs         0
num         0
dtype: int64

In [70]:
# heart_disease_df.sketch.howto("create a machine learning model to predict ca")
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Split the data into training and testing sets
X = heart_disease_df.drop('num', axis=1)
y = heart_disease_df['num']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

mlflow.set_tracking_uri("mysql+pymysql://mlflow_user:mlflow@127.0.0.1:3306/mlflow_database")
# ! mlflow db upgrade mysql://mlflow_user:mlflow@127.0.0.1:3306/mlflow_database
mlflow.set_experiment("heart-disease")

with mlflow.start_run():

    params = {"C": 0.1, "random_state": 42}
    mlflow.log_params(params)

    # Create a logistic regression model and fit it to the training data
    logreg = LogisticRegression(**params)
    logreg.fit(X_train, y_train)

    # Make predictions on the test set and calculate accuracy score
    y_pred = logreg.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    mlflow.log_metric("accuracy", accuracy)
    mlflow.sklearn.log_model(logreg, artifact_path="models")

    print('Accuracy: {:.2f}'.format(accuracy))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.42


In [69]:
mlflow.end_run()

In [71]:
# import pickle

# pickle.dump(logreg, open("heart-disease-logreg-model.pkl", 'wb'))

In [72]:
mlflow.log_artifact(local_path="heart-disease-logreg-model.pkl", artifact_path="models_pickle")