In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split


In [18]:
# creates dataframe from csv file
DATA_PATH = "testHeart.csv"
data = pd.read_csv(DATA_PATH)


In [11]:
#918 data points, 11 features, and 1 target
data.shape


(918, 1)

In [12]:
#checks for column (feature) names
data.columns


Index(['Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease'], dtype='object')

In [13]:
#for feature map and normalisation, all data-types must be float64
data.info()

#since there are no non-null instances, we do not need to fill in missing datapoints

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 1 columns):
 #   Column                                                                                                               Non-Null Count  Dtype 
---  ------                                                                                                               --------------  ----- 
 0   Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease  918 non-null    object
dtypes: object(1)
memory usage: 7.3+ KB


In [14]:
data.describe(include="all")


Unnamed: 0,"Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease"
count,918
unique,918
top,"56,M,ASY,130,203,1,Normal,98,N,1.5,Flat,1"
freq,1


In [15]:
#finds correlation between individual features and the target (closer to abs(1) means they're more related)
#but not all features are type float64, sex, chestpain type, restingecg, exerciseAngina, and st_slope are non float64
data.corr()

In [19]:
#find the unique categories for each feature
cols = data.columns

num_cols = data._get_numeric_data().columns

categorical_cols = list(set(cols)-set(num_cols))

for col in categorical_cols:
    unique = data[col].unique()
    print("Column: {} has {} unique values\n".format(col, unique))


Column: ChestPainType has ['ATA' 'NAP' 'ASY' 'TA'] unique values

Column: Sex has ['M' 'F'] unique values

Column: RestingECG has ['Normal' 'ST' 'LVH'] unique values

Column: ST_Slope has ['Up' 'Flat' 'Down'] unique values

Column: ExerciseAngina has ['N' 'Y'] unique values



In [25]:
#since sex and ExerciseAngina are binary features, we should read them as float64 and see if they are key features
data['Sex'] = pd.get_dummies(data['Sex'], drop_first = False)
data['ExerciseAngina'] = pd.get_dummies(data['ExerciseAngina'], drop_first = False)

data.head(10)
# Male = 0, Female =1
# No = 1, Yes = 0


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,0,ATA,140,289,0,Normal,172,1,0.0,Up,0
1,49,1,NAP,160,180,0,Normal,156,1,1.0,Flat,1
2,37,0,ATA,130,283,0,ST,98,1,0.0,Up,0
3,48,1,ASY,138,214,0,Normal,108,0,1.5,Flat,1
4,54,0,NAP,150,195,0,Normal,122,1,0.0,Up,0
5,39,0,NAP,120,339,0,Normal,170,1,0.0,Up,0
6,45,1,ATA,130,237,0,Normal,170,1,0.0,Up,0
7,54,0,ATA,110,208,0,Normal,142,1,0.0,Up,0
8,37,0,ASY,140,207,0,Normal,130,0,1.5,Flat,1
9,48,1,ATA,120,284,0,Normal,120,1,0.0,Up,0


In [27]:
#finds correlation between features and target (closer abs value to 1 is better): ExerciseAngina, Oldpeak, MaxHR, Sex are the 4 most influential features
data.corr()

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,HeartDisease
Age,1.0,-0.05575,0.254399,-0.095282,0.198039,-0.382045,-0.215793,0.258612,0.282039
Sex,-0.05575,1.0,-0.005133,0.200092,-0.120076,0.189186,0.190664,-0.105734,-0.305445
RestingBP,0.254399,-0.005133,1.0,0.100893,0.070193,-0.112135,-0.155101,0.164803,0.107589
Cholesterol,-0.095282,0.200092,0.100893,1.0,-0.260974,0.235792,0.034166,0.050148,-0.232741
FastingBS,0.198039,-0.120076,0.070193,-0.260974,1.0,-0.131438,-0.060451,0.052698,0.267291
MaxHR,-0.382045,0.189186,-0.112135,0.235792,-0.131438,1.0,0.370425,-0.160691,-0.400421
ExerciseAngina,-0.215793,0.190664,-0.155101,0.034166,-0.060451,0.370425,1.0,-0.408752,-0.494282
Oldpeak,0.258612,-0.105734,0.164803,0.050148,0.052698,-0.160691,-0.408752,1.0,0.403951
HeartDisease,0.282039,-0.305445,0.107589,-0.232741,0.267291,-0.400421,-0.494282,0.403951,1.0


In [28]:
#shuffle features just in case data is organized in relationship to a certain column
data = shuffle(data, random_state = 42)

#only include the necessary features in the final dataframe, and seperate features from target
features = data[['ExerciseAngina', 'Oldpeak', 'MaxHR', 'Sex']]
target = data[['HeartDisease']]

#normalize data between -2pi and 2pi to be compatible with Bloch Sphere
scaler = MinMaxScaler(feature_range = (-2 * np.pi, 2 * np.pi))
features = scaler.fit_transform(features)

#Split the 918 datapoints into test and training sets for variational circuit
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.3, random_state=42)

print(features)

KeyError: "['OldPeak'] not in index"