In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

N: Column: Description

1: age: age in years
2: sex: sex (1 = male; 0 = female)
3: cp: chest pain type
    -- Value 1: typical angina
    -- Value 2: atypical angina
    -- Value 3: non-anginal pain
    -- Value 4: asymptomatic
4: trestbps: resting blood pressure (in mm Hg on admission to the hospital)
5: chol: serum cholestoral in mg/dl
6: fbs: (fasting blood sugar > 120 mg/dl)  (1 = true; 0 = false)
7: restecg: resting electrocardiographic results
    -- Value 0: normal
    -- Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
    -- Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
8: thalach: maximum heart rate achieved
9: exang: exercise induced angina (1 = yes; 0 = no)
10: oldpeak = ST depression induced by exercise relative to rest
11: slope: the slope of the peak exercise ST segment
    -- Value 1: upsloping
    -- Value 2: flat
    -- Value 3: downsloping
12: ca: number of major vessels (0-3) colored by flourosopy
13: thal: 3 = normal; 6 = fixed defect; 7 = reversable defect
14: num: diagnosis of heart disease (angiographic disease status)
    -- Value 0: < 50% diameter narrowing
    -- Value 1,2,3,4: > 50% diameter narrowing
    presence (values 1,2,3,4) absence (value 0)

In [25]:
col_names = ["AGE", "GENDER", "CP", "TRESTBPS", "CHOL", "FBS", "RESTECG", "THALACH", "EXANG", "OLDPEAK", "SLOPE", "CA", "THAL", "NUM"]
col_types = {"AGE": np.int_, "GENDER": np.int_ , "CP": np.int_, "TRESTBPS": np.float_, "CHOL": np.float_,
             "FBS": np.int_, "RESTECG": np.int_, "THALACH": np.float_, "EXANG": np.int_, "OLDPEAK": np.float_,
             "SLOPE": np.int_, "CA": np.int_, "THAL": np.int_, "NUM": np.int_}
health_data = pd.read_csv("Data/processed.cleveland.data", names = col_names)
print(health_data.values)

[[63.0 1.0 1.0 ..., '0.0' '6.0' 0]
 [67.0 1.0 4.0 ..., '3.0' '3.0' 2]
 [67.0 1.0 4.0 ..., '2.0' '7.0' 1]
 ..., 
 [57.0 1.0 4.0 ..., '1.0' '7.0' 3]
 [57.0 0.0 2.0 ..., '1.0' '3.0' 1]
 [38.0 1.0 3.0 ..., '?' '3.0' 0]]


In [26]:
for name in col_names:
    data_type = col_types[name]
    print(name+":", health_data[name].dtype)
    

AGE: float64
GENDER: float64
CP: float64
TRESTBPS: float64
CHOL: float64
FBS: float64
RESTECG: float64
THALACH: float64
EXANG: float64
OLDPEAK: float64
SLOPE: float64
CA: object
THAL: object
NUM: int64


In [27]:
print("CA")
print(health_data.CA.value_counts())
print("THAL")
print(health_data.THAL.value_counts())

CA
0.0    176
1.0     65
2.0     38
3.0     20
?        4
Name: CA, dtype: int64
THAL
3.0    166
7.0    117
6.0     18
?        2
Name: THAL, dtype: int64


In [29]:
health_data.loc[health_data.CA == "?", "CA"] = -1

In [33]:
health_data.loc[health_data.THAL == "?", "THAL"] = -1

In [34]:
print("CA")
print(health_data.CA.value_counts())
print("THAL")
print(health_data.THAL.value_counts())

CA
0.0    176
1.0     65
2.0     38
3.0     20
-1       4
Name: CA, dtype: int64
THAL
3.0    166
7.0    117
6.0     18
-1       2
Name: THAL, dtype: int64


In [36]:
health_data.columns

Index(['AGE', 'GENDER', 'CP', 'TRESTBPS', 'CHOL', 'FBS', 'RESTECG', 'THALACH',
       'EXANG', 'OLDPEAK', 'SLOPE', 'CA', 'THAL', 'NUM'],
      dtype='object')

In [None]:
dir(pd.DataFrame)

In [38]:
health_data.CA.values

array(['0.0', '3.0', '2.0', '0.0', '0.0', '0.0', '2.0', '0.0', '1.0',
       '0.0', '0.0', '0.0', '1.0', '0.0', '0.0', '0.0', '0.0', '0.0',
       '0.0', '0.0', '0.0', '0.0', '0.0', '2.0', '2.0', '0.0', '0.0',
       '0.0', '0.0', '0.0', '2.0', '2.0', '0.0', '0.0', '0.0', '0.0',
       '0.0', '1.0', '1.0', '0.0', '3.0', '0.0', '2.0', '0.0', '0.0',
       '1.0', '0.0', '0.0', '1.0', '0.0', '1.0', '0.0', '1.0', '0.0',
       '1.0', '1.0', '1.0', '0.0', '1.0', '1.0', '0.0', '0.0', '3.0',
       '0.0', '1.0', '2.0', '0.0', '0.0', '0.0', '0.0', '0.0', '2.0',
       '2.0', '2.0', '1.0', '0.0', '1.0', '1.0', '0.0', '0.0', '0.0',
       '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0',
       '0.0', '3.0', '3.0', '0.0', '0.0', '1.0', '1.0', '2.0', '1.0',
       '0.0', '0.0', '0.0', '1.0', '1.0', '3.0', '0.0', '1.0', '1.0',
       '1.0', '0.0', '0.0', '1.0', '0.0', '0.0', '1.0', '0.0', '0.0',
       '0.0', '3.0', '1.0', '2.0', '3.0', '0.0', '0.0', '1.0', '0.0',
       '2.0', '1.0',