In [101]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

Note: Data is from the UCI Machine Learning Repository:

Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.

In [102]:
# Source of the data set: https://archive.ics.uci.edu/ml/datasets/heart+disease
# Read the processed Cleveland CSV into the working DataFrame `heart`.
heart = pd.read_csv(filepath_or_buffer='processed.cleveland.data.csv')

In [103]:
# Preview the first rows; the data describes patients who visited a heart clinic.
heart.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,heart_disease
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


- age: age in years
- sex: 1=male, 0=female
- cp: chest pain type
 - Value 1: typical angina
 - Value 2: atypical angina
 - Value 3: non-anginal pain
 - Value 4: asymptomatic
- trestbps: resting blood pressure (in mm Hg on admission to the hospital)
- chol: serum cholesterol in mg/dl
- fbs: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
- restecg: resting electrocardiographic results
 - Value 0: normal
 - Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV) 
 - Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
- thalach: maximum heart rate achieved in an exercise test
- exang: exercise induced angina (1 = yes; 0 = no)
- oldpeak: ST depression induced by exercise relative to rest
- slope: the slope of the peak exercise ST segment
 - Value 1: upsloping
 - Value 2: flat
 - Value 3: downsloping
- ca: number of major vessels (0-3) colored by fluoroscopy
- thal: 
 - Value 3: normal
 - Value 6: fixed defect
 - Value 7: reversable defect
- heart_disease: diagnosis of heart disease (angiographic disease status)
 - Value 0: < 50% diameter narrowing
 - Value 1: > 50% diameter narrowing
"\[This field\] refers to the presence of heart disease in the patient. It is integer valued from 0 (no presence) to 4. Experiments with the Cleveland database have concentrated on simply attempting to distinguish presence (values 1,2,3,4) from absence (value 0)."


In [104]:
# Start with summary statistics to get a feel for what the dataset represents.
#
# Overall plan:
#   STEP 1: Clean up the dataset.
#   STEP 2: Condense the dataset down to a smaller set of numbers
#           or visualisations.

# include='all' shows the summary stats for every column, not only the
# numeric ones (this is where the unique/top/freq rows come from).
heart.describe(include='all')



Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,heart_disease
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
unique,,,,,,,,,,,,5.0,4.0,
top,,,,,,,,,,,,0.0,3.0,
freq,,,,,,,,,,,,176.0,166.0,
mean,54.438944,0.679868,3.158416,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,,,0.937294
std,9.038662,0.467299,0.960126,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,,,1.228536
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,,,0.0
25%,48.0,0.0,3.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,,,0.0
50%,56.0,1.0,3.0,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,,,0.0
75%,61.0,1.0,4.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,,,2.0


In [105]:
# Inspect the dtypes and non-null counts of every column.
# Columns reported as 'object' although they hold numeric codes are
# candidates for cleaning.

heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   age            303 non-null    float64
 1   sex            303 non-null    float64
 2   cp             303 non-null    float64
 3   trestbps       303 non-null    float64
 4   chol           303 non-null    float64
 5   fbs            303 non-null    float64
 6   restecg        303 non-null    float64
 7   thalach        303 non-null    float64
 8   exang          303 non-null    float64
 9   oldpeak        303 non-null    float64
 10  slope          303 non-null    float64
 11  ca             303 non-null    object 
 12  thal           303 non-null    object 
 13  heart_disease  303 non-null    int64  
dtypes: float64(11), int64(1), object(2)
memory usage: 33.3+ KB


Columns that need cleaning/manipulating
1. **thal** - has 4 unique values instead of 3; should set numerical categories as word categories for easier manipulation
2. **ca** - has 5 unique values instead of 4
3. **sex** - should be set to just male and female
4. **cp** - should be set to text categories
5. **restecg** - should be set to just normal or abnormal for the purposes of our analysis
6. **slope** - should be set to text categories; then we can assign a numerical code for ML purposes
7. **heart_disease** - should be set to just presence or absence

In [106]:
# Some columns come back with odd dtypes: 'ca' and 'thal' contain numerical
# data but are stored as strings (object dtype).
#
# List the distinct values in 'ca' to find out why.
# 'ca' is the number of major vessels (0-3), so 0 through 3 are valid values;
# the '?' placeholder is the entry that does not belong.

heart['ca'].unique()


array(['0.0', '3.0', '2.0', '1.0', '?'], dtype=object)

### Fix 'ca' and 'thal'

In [107]:
# List the distinct values in 'thal'.
# A '?' placeholder appears here as well. Instead of a '?' string, the data
# has to be marked explicitly as missing so that pandas treats it that way.

heart['thal'].unique()

array(['6.0', '3.0', '7.0', '?'], dtype=object)

In [108]:
# Swap every '?' placeholder for a real missing value (NaN).
heart = heart.replace(to_replace='?', value=np.nan)

# Open question: what to do with the missing data?
#   - delete the rows with missing data, or
#   - impute the data, i.e. guess what the value should be

heart

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,heart_disease
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,2
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,3
301,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0,3.0,1


In [109]:
# Convert 'ca' from strings to a numeric dtype.
# float (not int) is used because the column now contains NaN for the
# former '?' entries, and a plain integer dtype cannot hold NaN.
heart = heart.astype({'ca': float})

In [129]:
# Replace the numerical codes in 'thal' with text categories:
#   3 = normal; 6 = fixed defect; 7 = reversable defect
# The codes are still stored as strings here, hence the quoted keys.
#
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on heart.thal: an inplace method on a selected
# column is chained assignment, which pandas warns about and which no longer
# updates `heart` once Copy-on-Write is enabled.
heart['thal'] = heart['thal'].replace({'3.0': 'normal',
                                       '6.0': 'fixed defect',
                                       '7.0': 'reversable defect'})

In [110]:
# Re-check the dtypes to confirm the conversion worked: 'ca' should now be
# float64, and the former '?' entries should show up as missing values
# (non-null count below 303 for 'ca' and 'thal').

heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   age            303 non-null    float64
 1   sex            303 non-null    float64
 2   cp             303 non-null    float64
 3   trestbps       303 non-null    float64
 4   chol           303 non-null    float64
 5   fbs            303 non-null    float64
 6   restecg        303 non-null    float64
 7   thalach        303 non-null    float64
 8   exang          303 non-null    float64
 9   oldpeak        303 non-null    float64
 10  slope          303 non-null    float64
 11  ca             299 non-null    float64
 12  thal           301 non-null    object 
 13  heart_disease  303 non-null    int64  
dtypes: float64(12), int64(1), object(1)
memory usage: 33.3+ KB


### Fix 'cp'

In [111]:
# 'cp' is categorical data that happens to be stored as floats and is not
# ordered in any way. Such data is usually best stored as 'category', which
# also acts as a marker that numerical operations make no sense on the column.
# (This cell only swaps the codes for labels; it does not change the dtype
# to 'category'.)
#
# cp: chest pain type
#   Value 1: typical angina
#   Value 2: atypical angina
#   Value 3: non-anginal pain
#   Value 4: asymptomatic

# Assign the result back instead of using replace(..., inplace=True) on
# heart['cp']: an inplace method on a selected column is chained assignment
# and does not update `heart` under pandas Copy-on-Write.
heart['cp'] = heart['cp'].replace({1.0: 'typical angina',
                                   2.0: 'atypical angina',
                                   3.0: 'non-anginal pain',
                                   4.0: 'asymptomatic'})


In [112]:
# Check the dtypes again: 'cp' now holds text labels, so it should be
# reported as object rather than float64.
heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   age            303 non-null    float64
 1   sex            303 non-null    float64
 2   cp             303 non-null    object 
 3   trestbps       303 non-null    float64
 4   chol           303 non-null    float64
 5   fbs            303 non-null    float64
 6   restecg        303 non-null    float64
 7   thalach        303 non-null    float64
 8   exang          303 non-null    float64
 9   oldpeak        303 non-null    float64
 10  slope          303 non-null    float64
 11  ca             299 non-null    float64
 12  thal           301 non-null    object 
 13  heart_disease  303 non-null    int64  
dtypes: float64(11), int64(1), object(2)
memory usage: 33.3+ KB


### Fix 'slope'

In [113]:
# 'slope' is ordinal data; text labels make it clear what each category means.
#
# slope: the slope of the peak exercise ST segment
#   Value 1: upsloping
#   Value 2: flat
#   Value 3: downsloping

# Assign the result back instead of using replace(..., inplace=True) on
# heart['slope']: an inplace method on a selected column is chained
# assignment and does not update `heart` under pandas Copy-on-Write.
heart['slope'] = heart['slope'].replace({1.0: 'upsloping',
                                         2.0: 'flat',
                                         3.0: 'downsloping'})

In [114]:
# Turn 'slope' into an ordered categorical column.
# ordered=True gives the categories the order in which they are listed in
# the square brackets: upsloping < flat < downsloping.
heart['slope'] = heart['slope'].astype(
    pd.CategoricalDtype(categories=['upsloping', 'flat', 'downsloping'],
                        ordered=True)
)


In [115]:
# Preview the first rows to see the text labels now stored in 'cp' and 'slope'.
heart.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,heart_disease
0,63.0,1.0,typical angina,145.0,233.0,1.0,2.0,150.0,0.0,2.3,downsloping,0.0,6.0,0
1,67.0,1.0,asymptomatic,160.0,286.0,0.0,2.0,108.0,1.0,1.5,flat,3.0,3.0,2
2,67.0,1.0,asymptomatic,120.0,229.0,0.0,2.0,129.0,1.0,2.6,flat,2.0,7.0,1
3,37.0,1.0,non-anginal pain,130.0,250.0,0.0,0.0,187.0,0.0,3.5,downsloping,0.0,3.0,0
4,41.0,0.0,atypical angina,130.0,204.0,0.0,2.0,172.0,0.0,1.4,upsloping,0.0,3.0,0


In [116]:
# Look at how the ordered 'slope' categories are coded:
#   upsloping -> 0, flat -> 1, downsloping -> 2
# This helps plotting functions: instead of ordering the data alphabetically,
# they order it numerically according to this code.
#
# This is one form of categorical coding. It is needed when fitting a
# predictive model, because certain models can only operate on numbers.
#
# An alternative is one-hot encoding: break one categorical variable up into
# separate columns, where each column is 1 or 0 depending on whether the row
# belongs to that category. (Noted as useful for the heart_disease column.)

heart['slope'].cat.codes

0      2
1      1
2      1
3      2
4      0
      ..
298    1
299    1
300    1
301    1
302    0
Length: 303, dtype: int8

In [117]:
# Summarise all columns again now that 'cp' and 'slope' hold text categories.
heart.describe(include='all')

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,heart_disease
count,303.0,303.0,303,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303,299.0,301.0,303.0
unique,,,4,,,,,,,,3,,3.0,
top,,,asymptomatic,,,,,,,,upsloping,,3.0,
freq,,,144,,,,,,,,142,,166.0,
mean,54.438944,0.679868,,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,,0.672241,,0.937294
std,9.038662,0.467299,,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,,0.937438,,1.228536
min,29.0,0.0,,94.0,126.0,0.0,0.0,71.0,0.0,0.0,,0.0,,0.0
25%,48.0,0.0,,120.0,211.0,0.0,0.0,133.5,0.0,0.0,,0.0,,0.0
50%,56.0,1.0,,130.0,241.0,0.0,1.0,153.0,0.0,0.8,,0.0,,0.0
75%,61.0,1.0,,140.0,275.0,0.0,2.0,166.0,1.0,1.6,,1.0,,2.0


### Fix 'sex'

In [118]:
# Replace the 1s and 0s in 'sex' with readable labels.
# sex: 1 = male, 0 = female
#
# The bare `heart['sex']` expression that used to sit in the middle of this
# cell was dropped: only the last expression of a cell is displayed, so it
# had no effect.
#
# Assign the result back instead of using replace(..., inplace=True) on
# heart['sex']: an inplace method on a selected column is chained assignment
# and does not update `heart` under pandas Copy-on-Write.
heart['sex'] = heart['sex'].replace({1.0: 'male',
                                     0.0: 'female'})

In [119]:
# List the dtype of every column; 'sex' should now be object (text labels).
heart.dtypes

age               float64
sex                object
cp                 object
trestbps          float64
chol              float64
fbs               float64
restecg           float64
thalach           float64
exang             float64
oldpeak           float64
slope            category
ca                float64
thal               object
heart_disease       int64
dtype: object

### Fix 'restecg'

In [120]:
# Reduce the resting ECG result to a single normal / not-normal indicator.
#
# restecg: resting electrocardiographic results
#   Value 0: normal
#   Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
#   Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
#
# Approach (from the tutorial): expand the column into dummy/indicator
# variables with pd.get_dummies and keep only the 'normal' one.

# Step 1: convert the numerical codes to text categories.
# The result is assigned back rather than using replace(..., inplace=True)
# on heart['restecg'], which is chained assignment and does not update
# `heart` under pandas Copy-on-Write.
heart['restecg'] = heart['restecg'].replace({0.0: 'normal',
                                             1.0: 'ST-T wave abnormality',
                                             2.0: 'left ventricular hypertrophy'})

# Step 2: expand 'restecg' into one indicator column per category.
# dtype is pinned to uint8 so the indicators are 0/1 on every pandas version
# (newer versions default to bool, which would display True/False instead).
heart = pd.get_dummies(data=heart, columns=['restecg'], dtype='uint8')

# Step 3: drop the ST-T wave and left ventricular hypertrophy indicators,
# leaving only 'restecg_normal'.
heart = heart.drop(columns=['restecg_ST-T wave abnormality',
                            'restecg_left ventricular hypertrophy'])



In [121]:
# Display the frame to check that the cleaning steps above worked:
# 'restecg' should be gone and a 'restecg_normal' indicator column present.
heart

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,thalach,exang,oldpeak,slope,ca,thal,heart_disease,restecg_normal
0,63.0,male,typical angina,145.0,233.0,1.0,150.0,0.0,2.3,downsloping,0.0,6.0,0,0
1,67.0,male,asymptomatic,160.0,286.0,0.0,108.0,1.0,1.5,flat,3.0,3.0,2,0
2,67.0,male,asymptomatic,120.0,229.0,0.0,129.0,1.0,2.6,flat,2.0,7.0,1,0
3,37.0,male,non-anginal pain,130.0,250.0,0.0,187.0,0.0,3.5,downsloping,0.0,3.0,0,1
4,41.0,female,atypical angina,130.0,204.0,0.0,172.0,0.0,1.4,upsloping,0.0,3.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45.0,male,typical angina,110.0,264.0,0.0,132.0,0.0,1.2,flat,0.0,7.0,1,1
299,68.0,male,asymptomatic,144.0,193.0,1.0,141.0,0.0,3.4,flat,2.0,7.0,2,1
300,57.0,male,asymptomatic,130.0,131.0,0.0,115.0,1.0,1.2,flat,1.0,7.0,3,1
301,57.0,female,atypical angina,130.0,236.0,0.0,174.0,0.0,0.0,flat,1.0,3.0,1,0


### Fix 'heart_disease'

In [124]:
# Collapse 'heart_disease' into two classes: 'absence' and 'presence'.
#
# heart_disease: diagnosis of heart disease (angiographic disease status)
#   Value 0: < 50% diameter narrowing
#   Value 1: > 50% diameter narrowing
# "[This field] refers to the presence of heart disease in the patient."
# Every non-zero value is treated as 'presence'.

# METHOD 1 (alternative, not used): boolean masks with .loc
# heart.loc[heart['heart_disease'] > 0, 'heart_disease'] = 'presence'
# heart.loc[heart['heart_disease'] == 0, 'heart_disease'] = 'absence'

# METHOD 2 (used, as in the tutorial): np.where()
# The guard makes the cell safe to re-run. Once the column holds text labels,
# `== 0` is False for every row, so without the guard a second run would
# silently turn every patient into 'presence'.
if pd.api.types.is_numeric_dtype(heart['heart_disease']):
    heart['heart_disease'] = np.where(heart['heart_disease'] == 0, 'absence', 'presence')

In [126]:
# Spot-check the rows at positions 1 through 9 to confirm the assignment worked.
heart['heart_disease'].iloc[1:10]

1    presence
2    presence
3     absence
4     absence
5     absence
6    presence
7     absence
8    presence
9    presence
Name: heart_disease, dtype: object

In [131]:
# Check the dtypes and non-null counts after the cleaning steps above.
heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   age             303 non-null    float64 
 1   sex             303 non-null    object  
 2   cp              303 non-null    object  
 3   trestbps        303 non-null    float64 
 4   chol            303 non-null    float64 
 5   fbs             303 non-null    float64 
 6   thalach         303 non-null    float64 
 7   exang           303 non-null    float64 
 8   oldpeak         303 non-null    float64 
 9   slope           303 non-null    category
 10  ca              299 non-null    float64 
 11  thal            301 non-null    object  
 12  heart_disease   303 non-null    object  
 13  restecg_normal  303 non-null    uint8   
dtypes: category(1), float64(8), object(4), uint8(1)
memory usage: 29.3+ KB


In [130]:
# Display the cleaned frame.
# Notice that 'slope' was set to the category dtype to keep it ordered and to
# assign numerical codes to it.
heart

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,thalach,exang,oldpeak,slope,ca,thal,heart_disease,restecg_normal
0,63.0,male,typical angina,145.0,233.0,1.0,150.0,0.0,2.3,downsloping,0.0,fixed defect,absence,0
1,67.0,male,asymptomatic,160.0,286.0,0.0,108.0,1.0,1.5,flat,3.0,normal,presence,0
2,67.0,male,asymptomatic,120.0,229.0,0.0,129.0,1.0,2.6,flat,2.0,reversable defect,presence,0
3,37.0,male,non-anginal pain,130.0,250.0,0.0,187.0,0.0,3.5,downsloping,0.0,normal,absence,1
4,41.0,female,atypical angina,130.0,204.0,0.0,172.0,0.0,1.4,upsloping,0.0,normal,absence,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45.0,male,typical angina,110.0,264.0,0.0,132.0,0.0,1.2,flat,0.0,reversable defect,presence,1
299,68.0,male,asymptomatic,144.0,193.0,1.0,141.0,0.0,3.4,flat,2.0,reversable defect,presence,1
300,57.0,male,asymptomatic,130.0,131.0,0.0,115.0,1.0,1.2,flat,1.0,reversable defect,presence,1
301,57.0,female,atypical angina,130.0,236.0,0.0,174.0,0.0,0.0,flat,1.0,normal,presence,0


In [132]:
# Summary statistics for every column of the cleaned frame, text columns included.
heart.describe(include='all')

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,thalach,exang,oldpeak,slope,ca,thal,heart_disease,restecg_normal
count,303.0,303,303,303.0,303.0,303.0,303.0,303.0,303.0,303,299.0,301,303,303.0
unique,,2,4,,,,,,,3,,3,2,
top,,male,asymptomatic,,,,,,,upsloping,,normal,absence,
freq,,206,144,,,,,,,142,,166,164,
mean,54.438944,,,131.689769,246.693069,0.148515,149.607261,0.326733,1.039604,,0.672241,,,0.49835
std,9.038662,,,17.599748,51.776918,0.356198,22.875003,0.469794,1.161075,,0.937438,,,0.500824
min,29.0,,,94.0,126.0,0.0,71.0,0.0,0.0,,0.0,,,0.0
25%,48.0,,,120.0,211.0,0.0,133.5,0.0,0.0,,0.0,,,0.0
50%,56.0,,,130.0,241.0,0.0,153.0,0.0,0.8,,0.0,,,0.0
75%,61.0,,,140.0,275.0,0.0,166.0,1.0,1.6,,1.0,,,1.0
