# Setup

In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "project1"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# Get data

In [2]:
import os
import urllib.request

dataset_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/"
dataset_fname = "processed.cleveland.data"
dataset_path = os.path.join("datasets", "heart")

if not os.path.isdir(dataset_path):
        os.makedirs(dataset_path)
                
urllib.request.urlretrieve(dataset_url+dataset_fname, os.path.join(dataset_path, dataset_fname))

('datasets\\heart\\processed.cleveland.data',
 <http.client.HTTPMessage at 0x1e845d6a8e0>)

In [3]:
import pandas as pd

cols = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num']

heart_data = pd.read_csv(os.path.join(dataset_path, dataset_fname), names=cols, header=None)

In [4]:
heart_data.tail()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,2
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,3
301,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0,3.0,1
302,38.0,1.0,3.0,138.0,175.0,0.0,0.0,173.0,0.0,0.0,1.0,?,3.0,0


In [5]:
heart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    float64
 1   sex       303 non-null    float64
 2   cp        303 non-null    float64
 3   trestbps  303 non-null    float64
 4   chol      303 non-null    float64
 5   fbs       303 non-null    float64
 6   restecg   303 non-null    float64
 7   thalach   303 non-null    float64
 8   exang     303 non-null    float64
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    float64
 11  ca        303 non-null    object 
 12  thal      303 non-null    object 
 13  num       303 non-null    int64  
dtypes: float64(11), int64(1), object(2)
memory usage: 33.3+ KB


In [6]:
heart_data = heart_data.replace('?', np.nan) #replaces symbol indicating it is not a numbe

In [7]:
heart_data["ca"] = pd.to_numeric(heart_data["ca"])
heart_data["thal"] = pd.to_numeric(heart_data["thal"])

In [8]:
heart_data.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,299.0,301.0,303.0
mean,54.438944,0.679868,3.158416,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,0.672241,4.734219,0.937294
std,9.038662,0.467299,0.960126,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,0.937438,1.939706,1.228536
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,3.0,0.0
25%,48.0,0.0,3.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,3.0,0.0
50%,56.0,1.0,3.0,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,0.0,3.0,0.0
75%,61.0,1.0,4.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0,7.0,2.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,7.0,4.0


In [10]:
heart_data['cp'].value_counts()

4.0    144
3.0     86
2.0     50
1.0     23
Name: cp, dtype: int64

# Set aside a test set

In [None]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(heart_data, test_size=0.2, random_state=42) #what proportion of the original data is analyzedfra

In [None]:
train_set.info()

In [None]:
heart_data.info()

In [None]:
test_set.head()

In [None]:
test_set['sex'].value_counts()

In [None]:
train_set['sex'].value_counts()

In [None]:
heart_data['sex'].value_counts()

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(heart_data, heart_data['sex']):
    strat_train_set = heart_data.loc[train_index]
    strat_test_set = heart_data.loc[test_index]

In [None]:
strat_train_set['sex'].value_counts()

In [None]:
strat_test_set['sex'].value_counts() / len(strat_test_set)

In [None]:
heart_data['sex'].value_counts() / len(heart_data)

# Explore the data


In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
heart_data.hist(bins=50, figsize=(20,15))
save_fig("attribute_histogram_plots")
plt.show()

In [None]:
heart_data.plot(kind="scatter",x="age", y="num", alpha=0.1)
save_fig("better_visualization_plot")

# Preparing the Data

In [None]:
train_set.info()

In [None]:
clean_set = train_set.dropna()

In [None]:
clean_set.head()

In [None]:
from sklearn.compose import ColumnTransformer
passthrough = ['exang', 'chol','thalach']
pipeline = ColumnTransformer([
    ('passthrough','passthrough',passthrough)
])

X = pipeline.fit_transform(clean_set)
y = clean_set['num'] > 0

In [None]:
y

# Train Models

In [None]:
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(random_state=42)

In [None]:
np.count_nonzero(sgd_clf.predict(X)==y)

In [None]:
sgd_clf.predict(X)==y

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y,sgd_clf.predict(X))

In [None]:
from sklearn.model_selection import cross_val_score

cross_val_score(sgd_clf, X, y, cv=4,scoring='accuracy')

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

y_predict = cross_val_predict(sgd_clf, X, y, cv=4)
confusion_matrix(y,y_predict)

In [None]:
confusion_matrix(y,y_predict)/np.size(y)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

passthrough = ['exang','ca',]
numerical_att = ['thalach',]
pipeline = ColumnTransformer([
    ('std_scalar', StandardScaler(), numerical_att),
    ('one_hot', OneHotEncoder(),['cp']),
    ('passthrough','passthrough')
])


# Random Forest


In [None]:
from sklearn.ensemble import RandomForestClassifier
rant_clf = RandomForestClassifier(n_estimators=10)


In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

y_predict = cross_val_predict(rant_clf, X, y, cv=4)
confusion_matrix(y,y_predict)

In [None]:
confusion_matrix(y,y_predict)/np.size(y)

# Support Vector Machine

In [None]:
from sklearn.svm import SVC
svm_clf = SVC(random_state=42)

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

y_predict = cross_val_predict(rant_clf, X, y, cv=4)
confusion_matrix(y,y_predict)

# K-nearest Neighbors

# Multi-layer Perception