# Load the Heart Disease Dataset from the UCI Data Repository

In [1]:
# ! mkdir heart-disease && cd heart-disease && wget https://archive.ics.uci.edu/static/public/45/heart+disease.zip

In [2]:
# ! cd heart-disease && unzip heart+disease.zip -d heart-disease

In [3]:
! pip install sketch --quiet
import sketch
import pandas as pd

In [4]:
heart_disease_df = pd.read_csv("heart-disease/processed.cleveland.data", names=["age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach", "exang", "oldpeak", "slope", "ca", "thal", "num"])
heart_disease_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    float64
 1   sex       303 non-null    float64
 2   cp        303 non-null    float64
 3   trestbps  303 non-null    float64
 4   chol      303 non-null    float64
 5   fbs       303 non-null    float64
 6   restecg   303 non-null    float64
 7   thalach   303 non-null    float64
 8   exang     303 non-null    float64
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    float64
 11  ca        303 non-null    object 
 12  thal      303 non-null    object 
 13  num       303 non-null    int64  
dtypes: float64(11), int64(1), object(2)
memory usage: 33.3+ KB


# Target (Dependent) Variables

In [5]:
# num: diagnosis of heart disease (angiographic disease status)
#         -- Value 0: < 50% diameter narrowing
#         -- Value 1: > 50% diameter narrowing
heart_disease_df.num.value_counts()

num
0    164
1     55
2     36
3     35
4     13
Name: count, dtype: int64

In [6]:
# ca: number of major vessels (0-3) colored by flourosopy
heart_disease_df.ca.value_counts()

ca
0.0    176
1.0     65
2.0     38
3.0     20
?        4
Name: count, dtype: int64

In [7]:
# thal: 3 = normal; 6 = fixed defect; 7 = reversable defect
heart_disease_df.thal.value_counts()

thal
3.0    166
7.0    117
6.0     18
?        2
Name: count, dtype: int64

# Train Model

In [8]:
# select columns
heart_disease_df = heart_disease_df[["age", "sex", "trestbps", "chol", "fbs", "ca"]]
heart_disease_df = heart_disease_df[heart_disease_df.ca!="?"]
heart_disease_df.shape

(299, 6)

In [12]:
heart_disease_df.head(3)

Unnamed: 0,age,sex,trestbps,chol,fbs,ca
0,63.0,1.0,145.0,233.0,1.0,0.0
1,67.0,1.0,160.0,286.0,0.0,3.0
2,67.0,1.0,120.0,229.0,0.0,2.0


In [9]:
# heart_disease_df.sketch.howto("create a machine learning model to predict ca")
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Split the data into training and testing sets
X = heart_disease_df.drop('ca', axis=1)
y = heart_disease_df['ca']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a logistic regression model and fit it to the training data
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Make predictions on the test set and calculate accuracy score
y_pred = logreg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}'.format(accuracy))

Accuracy: 0.62


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
# import pickle

# pickle.dump(logreg, open("heart-disease-logreg-model.pkl", 'wb'))