## **Final Project: Matters of the Heart**
**Chloé Blanchard | chb2132 | 5210 Python | Group 1**


*A Data-Driven Investigation into Heart Failure and Patient Privacy Protection through Hashing Encryption Methods*



In [None]:
import pandas as pd

In [None]:
# NOTE(review): absolute, machine-specific working directory -- the notebook only runs where
# this exact path exists. Consider a configurable, relative data directory instead.
%cd /Users/user/Desktop/python/

/Users/user/Desktop/python


In [None]:
# Load the heart-failure clinical records CSV; the relative file name is resolved against
# the current working directory.
heart_data = pd.read_csv(
    filepath_or_buffer='heart_failure_clinical_records_dataset.csv',
)

In [None]:
# Preliminary overview of the heart dataset we've just loaded: row count, column names,
# non-null counts and dtypes. The output suggests 299 entries with no null values in any
# column; this is verified explicitly with isnull() further down.

heart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       299 non-null    float64
 1   anaemia                   299 non-null    int64  
 2   creatinine_phosphokinase  299 non-null    int64  
 3   diabetes                  299 non-null    int64  
 4   ejection_fraction         299 non-null    int64  
 5   high_blood_pressure       299 non-null    int64  
 6   platelets                 299 non-null    float64
 7   serum_creatinine          299 non-null    float64
 8   serum_sodium              299 non-null    int64  
 9   sex                       299 non-null    int64  
 10  smoking                   299 non-null    int64  
 11  time                      299 non-null    int64  
 12  DEATH_EVENT               299 non-null    int64  
dtypes: float64(3), int64(10)
memory usage: 30.5 KB


In [None]:
# Peek at the first five rows to get a feel for the dataset and to catch read/load errors.

heart_data.head(n=5)

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [None]:
# Check the entire dataset for missing values: isna() flags every null entry and sum()
# totals those flags per column, for all columns in the heart dataset.

heart_data.isna().sum()

# Every column reports 0 nulls, so no data imputation is required here. This matches the
# initial observation from info(): 299 rows, each column with 299 non-null entries.

age                         0
anaemia                     0
creatinine_phosphokinase    0
diabetes                    0
ejection_fraction           0
high_blood_pressure         0
platelets                   0
serum_creatinine            0
serum_sodium                0
sex                         0
smoking                     0
time                        0
DEATH_EVENT                 0
dtype: int64

In [None]:
# Summary statistics for heart_data before any data cleaning: no values have been imputed or
# replaced yet (the outlier-cleaning cells follow below), so this is the baseline to compare
# the cleaned columns against.

heart_data.describe()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
count,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0
mean,60.327821,0.431438,581.839465,0.41806,36.779041,0.351171,263358.029264,1.39388,136.625418,0.648829,0.32107,130.26087,0.32107
std,8.977789,0.496107,970.287881,0.494067,6.674272,0.478136,97804.236869,1.03451,4.412477,0.478136,0.46767,77.614208,0.46767
min,43.0,0.0,23.0,0.0,25.0,0.0,25100.0,0.5,113.0,0.0,0.0,4.0,0.0
25%,53.0,0.0,116.5,0.0,35.0,0.0,212500.0,0.9,134.0,0.0,0.0,73.0,0.0
50%,60.833893,0.0,250.0,0.0,38.0,0.0,262000.0,1.1,137.0,1.0,0.0,115.0,0.0
75%,65.0,1.0,582.0,1.0,40.0,1.0,303500.0,1.4,140.0,1.0,1.0,203.0,1.0
max,81.0,1.0,7861.0,1.0,55.0,1.0,850000.0,9.4,148.0,1.0,1.0,285.0,1.0


In [None]:
# Cleaning outliers from the age column, since it is a non-boolean, numeric datatype (float64).
#
# Extreme/unusual values could skew the analysis (means, ranges) and cloud dataset trends, so
# every age at or beyond the 5th / 95th percentile is replaced with the column mean.

def filter_outlier_age(x, lower=None, upper=None, replacement=None):
    """Return ``replacement`` when ``x`` is an outlying age, otherwise ``x`` unchanged.

    Parameters
    ----------
    x : scalar
        A single value from the age column.
    lower, upper : scalar, optional
        Outlier bounds; ``x`` counts as an outlier when ``x <= lower`` or ``x >= upper``.
        When omitted they are the 5th / 95th percentile of ``heart_data['age']``.
    replacement : scalar, optional
        Value returned for an outlier. When omitted it is the mean of ``heart_data['age']``.

    Calling ``filter_outlier_age(x)`` with no extra arguments still works, but it
    recomputes the percentiles on every call; pass them in when applying to a whole column.
    """
    if lower is None:
        lower = heart_data['age'].quantile(0.05)
    if upper is None:
        upper = heart_data['age'].quantile(0.95)
    if x >= upper or x <= lower:
        return heart_data['age'].mean() if replacement is None else replacement
    return x

# The bounds and the mean depend only on the column as it is before replacement, so they are
# evaluated once here rather than once per row inside the applied function.
# NOTE: the column is overwritten in place, so re-running this cell filters the already
# cleaned values again.
heart_data['age'] = heart_data['age'].apply(
    filter_outlier_age,
    lower=heart_data['age'].quantile(0.05),
    upper=heart_data['age'].quantile(0.95),
    replacement=heart_data['age'].mean(),
)
heart_data['age'].describe()

count    299.000000
mean      60.107352
std        6.258857
min       46.000000
25%       55.500000
50%       60.327821
75%       64.000000
max       73.000000
Name: age, dtype: float64

In [None]:
# Cleaning outliers from the creatinine_phosphokinase column, since it is a non-boolean,
# numeric datatype (int64).
#
# Extreme/unusual values could skew the analysis (means, ranges) and cloud dataset trends, so
# every value at or beyond the 5th / 95th percentile is replaced with the column mean.

def filter_outlier_creatinine_phosphokinase(x, lower=None, upper=None, replacement=None):
    """Return ``replacement`` when ``x`` is an outlying CPK value, otherwise ``x`` unchanged.

    Parameters
    ----------
    x : scalar
        A single value from the creatinine_phosphokinase column.
    lower, upper : scalar, optional
        Outlier bounds; ``x`` counts as an outlier when ``x <= lower`` or ``x >= upper``.
        When omitted they are the 5th / 95th percentile of
        ``heart_data['creatinine_phosphokinase']``.
    replacement : scalar, optional
        Value returned for an outlier. When omitted it is the mean of
        ``heart_data['creatinine_phosphokinase']``.

    Calling the function with only ``x`` still works, but it recomputes the percentiles on
    every call; pass them in when applying to a whole column.
    """
    if lower is None:
        lower = heart_data['creatinine_phosphokinase'].quantile(0.05)
    if upper is None:
        upper = heart_data['creatinine_phosphokinase'].quantile(0.95)
    if x >= upper or x <= lower:
        if replacement is None:
            return heart_data['creatinine_phosphokinase'].mean()
        return replacement
    return x

# The bounds and the mean depend only on the column as it is before replacement, so they are
# evaluated once here rather than once per row inside the applied function.
# NOTE: the column is overwritten in place, so re-running this cell filters the already
# cleaned values again.
heart_data['creatinine_phosphokinase'] = heart_data['creatinine_phosphokinase'].apply(
    filter_outlier_creatinine_phosphokinase,
    lower=heart_data['creatinine_phosphokinase'].quantile(0.05),
    upper=heart_data['creatinine_phosphokinase'].quantile(0.95),
    replacement=heart_data['creatinine_phosphokinase'].mean(),
)
heart_data['creatinine_phosphokinase'].describe()

count     299.000000
mean      441.250914
std       402.135902
min        60.000000
25%       131.500000
50%       320.000000
75%       582.000000
max      2261.000000
Name: creatinine_phosphokinase, dtype: float64

In [None]:
# Cleaning outliers from the ejection_fraction column, since it is a non-boolean, numeric
# datatype (int64).
#
# Extreme/unusual values could skew the analysis (means, ranges) and cloud dataset trends, so
# every value at or beyond the 5th / 95th percentile is replaced with the column mean.

def filter_outlier_ejection_fraction(x, lower=None, upper=None, replacement=None):
    """Return ``replacement`` when ``x`` is an outlying ejection fraction, otherwise ``x``.

    Parameters
    ----------
    x : scalar
        A single value from the ejection_fraction column.
    lower, upper : scalar, optional
        Outlier bounds; ``x`` counts as an outlier when ``x <= lower`` or ``x >= upper``.
        When omitted they are the 5th / 95th percentile of
        ``heart_data['ejection_fraction']``.
    replacement : scalar, optional
        Value returned for an outlier. When omitted it is the mean of
        ``heart_data['ejection_fraction']``.

    Calling the function with only ``x`` still works, but it recomputes the percentiles on
    every call; pass them in when applying to a whole column.
    """
    if lower is None:
        lower = heart_data['ejection_fraction'].quantile(0.05)
    if upper is None:
        upper = heart_data['ejection_fraction'].quantile(0.95)
    if x >= upper or x <= lower:
        if replacement is None:
            return heart_data['ejection_fraction'].mean()
        return replacement
    return x

# The bounds and the mean depend only on the column as it is before replacement, so they are
# evaluated once here rather than once per row inside the applied function.
# NOTE: the column is overwritten in place, so re-running this cell filters the already
# cleaned values again.
heart_data['ejection_fraction'] = heart_data['ejection_fraction'].apply(
    filter_outlier_ejection_fraction,
    lower=heart_data['ejection_fraction'].quantile(0.05),
    upper=heart_data['ejection_fraction'].quantile(0.95),
    replacement=heart_data['ejection_fraction'].mean(),
)
heart_data['ejection_fraction'].describe()

count    299.000000
mean      37.085871
std        3.468460
min       30.000000
25%       35.000000
50%       38.000000
75%       38.083612
max       45.000000
Name: ejection_fraction, dtype: float64

In [None]:
# Cleaning outliers from the platelets column, since it is a non-boolean, numeric datatype
# (float64).
#
# Extreme/unusual values could skew the analysis (means, ranges) and cloud dataset trends, so
# every value at or beyond the 5th / 95th percentile is replaced with the column mean.

def filter_outlier_platelets(x, lower=None, upper=None, replacement=None):
    """Return ``replacement`` when ``x`` is an outlying platelet count, otherwise ``x``.

    Parameters
    ----------
    x : scalar
        A single value from the platelets column.
    lower, upper : scalar, optional
        Outlier bounds; ``x`` counts as an outlier when ``x <= lower`` or ``x >= upper``.
        When omitted they are the 5th / 95th percentile of ``heart_data['platelets']``.
    replacement : scalar, optional
        Value returned for an outlier. When omitted it is the mean of
        ``heart_data['platelets']``.

    Calling the function with only ``x`` still works, but it recomputes the percentiles on
    every call; pass them in when applying to a whole column.
    """
    if lower is None:
        lower = heart_data['platelets'].quantile(0.05)
    if upper is None:
        upper = heart_data['platelets'].quantile(0.95)
    if x >= upper or x <= lower:
        return heart_data['platelets'].mean() if replacement is None else replacement
    return x

# Operates on heart_data, the frame loaded by this notebook, so the cleaned column is the one
# that later cells (describe, hashing) actually read.
# The bounds and the mean depend only on the column as it is before replacement, so they are
# evaluated once here rather than once per row inside the applied function.
# NOTE: the column is overwritten in place, so re-running this cell filters the already
# cleaned values again.
heart_data['platelets'] = heart_data['platelets'].apply(
    filter_outlier_platelets,
    lower=heart_data['platelets'].quantile(0.05),
    upper=heart_data['platelets'].quantile(0.95),
    replacement=heart_data['platelets'].mean(),
)
heart_data['platelets'].describe()

count       299.000000
mean     258624.386715
std       60665.337855
min      132000.000000
25%      221000.000000
50%      263358.029264
75%      285000.000000
max      422000.000000
Name: platelets, dtype: float64

In [None]:
# Cleaning outliers from the serum_creatinine column, since it is a non-boolean, numeric
# datatype (float64).
#
# Extreme/unusual values could skew the analysis (means, ranges) and cloud dataset trends, so
# every value at or beyond the 5th / 95th percentile is replaced with the column mean.

def filter_outlier_serum_creatinine(x, lower=None, upper=None, replacement=None):
    """Return ``replacement`` when ``x`` is an outlying serum creatinine value, otherwise ``x``.

    Parameters
    ----------
    x : scalar
        A single value from the serum_creatinine column.
    lower, upper : scalar, optional
        Outlier bounds; ``x`` counts as an outlier when ``x <= lower`` or ``x >= upper``.
        When omitted they are the 5th / 95th percentile of
        ``heart_data['serum_creatinine']``.
    replacement : scalar, optional
        Value returned for an outlier. When omitted it is the mean of
        ``heart_data['serum_creatinine']``.

    Calling the function with only ``x`` still works, but it recomputes the percentiles on
    every call; pass them in when applying to a whole column.
    """
    if lower is None:
        lower = heart_data['serum_creatinine'].quantile(0.05)
    if upper is None:
        upper = heart_data['serum_creatinine'].quantile(0.95)
    if x >= upper or x <= lower:
        if replacement is None:
            return heart_data['serum_creatinine'].mean()
        return replacement
    return x

# Operates on heart_data, the frame loaded by this notebook, so the cleaned column is the one
# that later cells (describe, hashing) actually read.
# The bounds and the mean depend only on the column as it is before replacement, so they are
# evaluated once here rather than once per row inside the applied function.
# NOTE: the column is overwritten in place, so re-running this cell filters the already
# cleaned values again.
heart_data['serum_creatinine'] = heart_data['serum_creatinine'].apply(
    filter_outlier_serum_creatinine,
    lower=heart_data['serum_creatinine'].quantile(0.05),
    upper=heart_data['serum_creatinine'].quantile(0.95),
    replacement=heart_data['serum_creatinine'].mean(),
)
heart_data['serum_creatinine'].describe()

count    299.000000
mean       1.266639
std        0.399739
min        0.750000
25%        1.000000
50%        1.180000
75%        1.393880
max        2.900000
Name: serum_creatinine, dtype: float64

In [None]:
# Cleaning outliers from the time column, since it is a non-boolean, numeric datatype (int64).
#
# Extreme/unusual values could skew the analysis (means, ranges) and cloud dataset trends, so
# every value at or beyond the 5th / 95th percentile is replaced with the column mean.

def filter_outlier_time(x, lower=None, upper=None, replacement=None):
    """Return ``replacement`` when ``x`` is an outlying time value, otherwise ``x`` unchanged.

    Parameters
    ----------
    x : scalar
        A single value from the time column.
    lower, upper : scalar, optional
        Outlier bounds; ``x`` counts as an outlier when ``x <= lower`` or ``x >= upper``.
        When omitted they are the 5th / 95th percentile of ``heart_data['time']``.
    replacement : scalar, optional
        Value returned for an outlier. When omitted it is the mean of ``heart_data['time']``.

    Calling ``filter_outlier_time(x)`` with no extra arguments still works, but it
    recomputes the percentiles on every call; pass them in when applying to a whole column.
    """
    if lower is None:
        lower = heart_data['time'].quantile(0.05)
    if upper is None:
        upper = heart_data['time'].quantile(0.95)
    if x >= upper or x <= lower:
        return heart_data['time'].mean() if replacement is None else replacement
    return x

# Operates on heart_data, the frame loaded by this notebook, so the cleaned column is the one
# that later cells (describe, hashing) actually read.
# The bounds and the mean depend only on the column as it is before replacement, so they are
# evaluated once here rather than once per row inside the applied function.
# NOTE: the column is overwritten in place, so re-running this cell filters the already
# cleaned values again.
heart_data['time'] = heart_data['time'].apply(
    filter_outlier_time,
    lower=heart_data['time'].quantile(0.05),
    upper=heart_data['time'].quantile(0.95),
    replacement=heart_data['time'].mean(),
)
heart_data['time'].describe()

count    299.000000
mean     128.507052
std       65.169238
min       13.000000
25%       79.500000
50%      129.000000
75%      187.000000
max      247.000000
Name: time, dtype: float64

In [None]:
# Another look at the data using describe, this time after the outlier-cleaning cells above,
# for comparison with the pre-cleaning summary.
# NOTE(review): in the recorded output below, platelets, serum_creatinine and time still show
# their pre-cleaning statistics (e.g. platelets max 850000.0, time max 285.0) -- confirm those
# columns were actually cleaned on heart_data, then re-run this cell.
heart_data.describe()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
count,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0
mean,60.107352,0.431438,441.250914,0.41806,37.085871,0.351171,263358.029264,1.39388,136.625418,0.648829,0.32107,130.26087,0.32107
std,6.258857,0.496107,402.135902,0.494067,3.46846,0.478136,97804.236869,1.03451,4.412477,0.478136,0.46767,77.614208,0.46767
min,46.0,0.0,60.0,0.0,30.0,0.0,25100.0,0.5,113.0,0.0,0.0,4.0,0.0
25%,55.5,0.0,131.5,0.0,35.0,0.0,212500.0,0.9,134.0,0.0,0.0,73.0,0.0
50%,60.327821,0.0,320.0,0.0,38.0,0.0,262000.0,1.1,137.0,1.0,0.0,115.0,0.0
75%,64.0,1.0,582.0,1.0,38.083612,1.0,303500.0,1.4,140.0,1.0,1.0,203.0,1.0
max,73.0,1.0,2261.0,1.0,45.0,1.0,850000.0,9.4,148.0,1.0,1.0,285.0,1.0


In [None]:
# Hash heart_data with the pandas hashing utility (it accepts an Index, Series or DataFrame).
#
# Settings: index = True and categorize = True are the defaults; encoding = 'utf8' is also the
# default (other encodings may be tried next); hash_key is our own key.
#
# hash_key must encode to exactly 16 bytes: pandas raises ValueError for any other length as
# soon as an object (string) column has to be hashed. Every column in heart_data is numeric,
# so the key is not consulted and the hashes are unaffected by its value; a 16-character key
# simply keeps this call valid if a text column is ever added.
# NOTE(review): this is a deterministic, non-cryptographic row hash, not encryption -- confirm
# it is sufficient for the intended patient-privacy protection.

encr_heart_data = pd.util.hash_pandas_object(
    heart_data, index = True, encoding = 'utf8', hash_key = '4700210995693861', categorize = True)

In [None]:
# Display the hashed result: a Series of dtype uint64 with the same length as the original
# object (length = 299).
# NOTE(review): the name says "encr", but these values are hashes -- presumably there is no
# way to decrypt them back into the original rows; confirm that is the intent.
encr_heart_data

0        805128537297532431
1      17140030433791436049
2      11226973166410540159
3      15028596663500338484
4       6820087250034108829
               ...         
294    16527880117216591146
295     4102492704584823915
296     7083541088165125451
297    17210002913170551347
298     8520318956149101207
Length: 299, dtype: uint64