# Imports

In [1]:
import pandas             as pd
import numpy              as np
import regex              as re
import seaborn            as sns
import matplotlib.pyplot  as plt
from IPython.core.display import display, HTML
sns.set(style = "white", palette = "dark")
display(HTML("<style>.container { width:95% !important; }</style>"))
%matplotlib inline

# Data Cleaning

## Reading In The Data & Overview

In [2]:
# Reading in the MRI data file

mri = pd.read_csv("../Data/stress_cardiac_mri.csv")

In [3]:
# Looking at the first two rows

mri.head(2)

Unnamed: 0,Name,"Sex (1=male,0=female)",Age,"Hypertension (1=yes,0=no)",Hyperlipidemia,Diabetes,History of smoking,LVEDV,LVESV,LV Wall Thickness,...,MA_Ischemia,MAS_Ischemia,MIS_Ischemia,MI_Ischemia,MIL_Ischemia,MAL_Ischemia,AA_Ischemia,AS_Ischemia,AI_Ischemia,AL_Ischemia
0,CloudCMR-35423,1,37,0,0,No,Never,613.0,501.0,Normal,...,0,0,0,0,0,0,0,0,0,0
1,CloudCMR-24662,1,74,0,1,No,Never,569.0,492.0,MILD HYPERTROPHY,...,0,0,0,0,0,0,0,1,0,1


In [4]:
# Checking the shape of the data

print(f'The shape of the mri set is: {mri.shape}')

The shape of the mri set is: (6494, 48)


In [5]:
# Getting the data types

mri.dtypes

Name                                                                                                                    object
Sex (1=male,0=female)                                                                                                    int64
Age                                                                                                                      int64
Hypertension (1=yes,0=no)                                                                                                int64
Hyperlipidemia                                                                                                           int64
Diabetes                                                                                                                object
History of smoking                                                                                                      object
LVEDV                                                                                                          

In [6]:
# Sum of the data types

mri.dtypes.value_counts()

int64      37
object      6
float64     5
dtype: int64

### Columns

In [7]:
# Some of the columns have very long names, so I will rename them

mri = mri.rename({"Sex (1=male,0=female)": "Sex", 
                  "Hypertension (1=yes,0=no)": "Hypertension", 
                  "Name": "ID",
                  "History of smoking": "Smoker Status", 
                  "Tricuspid Regurgitation": "Tricusp Reg"}, axis = 1)
mri = mri.rename({"Aortic Regurgitation (0=none, 0.5 = trivial, 1=mild, 1.5=mild-moderate, 2=moderate, 2.5=moderate-severe, 3= severe)":
                  "Aortic Reg"}, axis = 1)
mri = mri.rename({"Mitral Regurgitation (0=none, 0.5 = trivial, 1=mild, 1.5=mild-moderate, 2=moderate, 2.5=moderate-severe, 3= severe)":
                  "Mitral Reg"}, axis = 1)

In [8]:
# Replacing spaces with underscores

mri.columns = mri.columns.str.replace(" ", "_")

# Making sure that all columns are lower case

mri.columns = mri.columns.str.lower()

### Ordinal & Nominal Values

In [9]:
# Making Yes/No values binary

mri["diabetes"] = mri["diabetes"].apply(lambda x: 1 if x == "Yes" else 0)

In [10]:
# Creating dictionaries for mapping ordinal data to numeric values

lv_wall_map         = {"Normal": 0, 
                       "MILD HYPERTROPHY": 1, 
                       "MODERATE HYPERTROPHY": 2, 
                       "SEVERE HYPERTROPHY": 3}

aortic_stenosis_map = {"None": 0, 
                       "Indeterminant": 0, 
                       "Trivial": 1, 
                       "Mild": 2, 
                       "MODERATE": 3, 
                       "SEVERE": 4}

tricuspid_map       = {"None": 0, 
                       "Trivial": 1, 
                       "MILD": 2, 
                       "MILD-MODERATE": 3, 
                       "MODERATE": 4, 
                       "MODERATE-SEVERE": 5, 
                       "SEVERE": 6}

aortic_mitral_map   = {0: 0, 
                       0.5: 1, 
                       1: 2, 
                       1.5: 2, 
                       2: 3, 
                       2.5: 3, 
                       3: 4}

In [11]:
# Mapping the ordinal data using the dicitionaries from above
# I am ignoring NaNs here, but they will be dealt with later

mri["lv_wall_thickness"] = mri["lv_wall_thickness"].map(lv_wall_map, na_action = "ignore")
mri["aortic_stenosis"]   = mri["aortic_stenosis"].map(aortic_stenosis_map, na_action = "ignore")
mri["tricusp_reg"]       = mri["tricusp_reg"].map(tricuspid_map)
mri["aortic_reg"]        = mri["aortic_reg"].map(aortic_mitral_map)
mri["mitral_reg"]        = mri["mitral_reg"].map(aortic_mitral_map)

In [12]:
# The doctors who gave me this data told me to treat the `Unknown` value as `Never`

mri["smoker_status"] = mri["smoker_status"].replace({"Unknown": "Never"})

In [13]:
# Since the `smoker_status` column is nominal, they have to be made into dummies

mri = pd.get_dummies(mri, columns = ["smoker_status"], drop_first = True )

In [14]:
# Making sure again that all columns are lower case

mri.columns = mri.columns.str.lower()

In [15]:
mri.columns

Index(['id', 'sex', 'age', 'hypertension', 'hyperlipidemia', 'diabetes',
       'lvedv', 'lvesv', 'lv_wall_thickness', 'lvef', 'aortic_stenosis',
       'aortic_reg', 'mitral_reg', 'tricusp_reg', 'ba_he', 'bas_he', 'bis_he',
       'bi_he', 'bil_he', 'bal_he', 'ma_he', 'mas_he', 'mis_he', 'mi_he',
       'mil_he', 'mal_he', 'aa_he', 'as_he', 'ai_he', 'al_he', 'apex_he',
       'ba_ischemia', 'bas_ischemia', 'bis_ischemia', 'bi_ischemia',
       'bil_ischemia', 'bal_ischemia', 'ma_ischemia', 'mas_ischemia',
       'mis_ischemia', 'mi_ischemia', 'mil_ischemia', 'mal_ischemia',
       'aa_ischemia', 'as_ischemia', 'ai_ischemia', 'al_ischemia',
       'smoker_status_former (>1yr)', 'smoker_status_never'],
      dtype='object')

In [16]:
# Renaming the `smoker_status_former` column

mri = mri.rename({"smoker_status_former (>1yr)": "smoker_status_former"}, axis = 1)

In [17]:
mri.head()

Unnamed: 0,id,sex,age,hypertension,hyperlipidemia,diabetes,lvedv,lvesv,lv_wall_thickness,lvef,...,mis_ischemia,mi_ischemia,mil_ischemia,mal_ischemia,aa_ischemia,as_ischemia,ai_ischemia,al_ischemia,smoker_status_former,smoker_status_never
0,CloudCMR-35423,1,37,0,0,0,613.0,501.0,0.0,18.0,...,0,0,0,0,0,0,0,0,0,1
1,CloudCMR-24662,1,74,0,1,0,569.0,492.0,1.0,14.0,...,0,0,0,0,0,1,0,1,0,1
2,CloudCMR-25620,1,78,1,1,1,564.0,500.0,0.0,11.0,...,1,1,0,0,1,1,0,0,0,1
3,CloudCMR-21728,1,61,0,1,0,531.0,435.0,1.0,18.0,...,0,0,0,0,0,0,0,0,0,1
4,CloudCMR-49141,1,53,0,0,0,522.2,408.6,0.0,21.75,...,0,0,0,0,0,0,0,0,0,0


### Continuous Values

In [18]:
male = mri[mri["sex"] == 1]

male[["age", "lvedv", "lvesv", "lvef"]].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,3312.0,60.276268,13.989795,18.0,52.0,62.0,70.0,106.0
lvedv,3312.0,162.907506,67.695168,30.4,120.0,148.0,186.0,613.0
lvesv,3308.0,81.771173,62.552793,7.7,45.0,61.0,91.0,501.0
lvef,3275.0,53.814061,14.826902,9.0,46.505,57.0,64.0,147.0


In [19]:
female = mri[mri["sex"] == 0]

female[["age", "lvedv", "lvesv", "lvef"]].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,3182.0,60.073853,14.507157,18.0,51.0,61.0,71.0,98.0
lvedv,3182.0,123.12319,44.718785,27.0,95.925,116.0,139.455,435.0
lvesv,3178.0,53.126963,38.758219,3.0,32.925,43.0,58.0,370.0
lvef,3144.0,59.590623,12.780668,8.7,55.0,62.0,67.7875,90.0


### Imputation of `NaN` Values

In [20]:
# Checking for columns with missing/NaN data

(mri.isnull().mean()*100).sort_values(ascending = False).head(7)

tricusp_reg          17.801047
aortic_stenosis      16.338158
aortic_reg           12.750231
mitral_reg            7.776409
lv_wall_thickness     3.849707
lvef                  1.154912
lvesv                 0.123191
dtype: float64

There is quite a bit of missing data, I cannot drop `NaN` values because that will remove a lot of otherwise good data.  Furthermore, not all of the columns are the same type: `lvef` and `lvesv` are numeric while the others are ordinal.

I will be making use of a technique called X.

### Target Definition

Because this data is biometric in nature, the definition of left ventricular dilation is different for men than in women.  The following values are clinical definitions:

<img src = "../Images/Source_snip.PNG" alt = "Article Snip" width = "750" >

* **For men**: >214 mL

* **For women**: >178 mL

The values are further dependant upon age (<20 and >80), but for the sake of simplicity I will not specify that much.

-----

I will be defining a third category: at-risk.

* **For men**: >186 mL

* **For women**: >139 mL

These values are not formally supported by formal clinical definitions, but the cardiologist who sent me this data said that they consider values with a z-score of >1.5 to be abnormal.  However, in my data set the "high" value is within one $\sigma$ of the mean: I opted to define borderline as any values greater than the 75th percentile.

Below is the 75th percentile (orange) and the high value (red):

|Male At-risk And High |Female At-risk And High |
|:--------------------:|:----------------------:|
| <img src = "../Images/male lvedv.PNG" alt = "Male LVEDV" width = "300"> | <img src = "../Images/female lvedv.PNG" alt = "Female LVEDV" width = "300"> |

In [21]:
m_high       = mri[(mri["sex"] == 1) & (mri["lvedv"] > 214)]
m_atrisk     = mri[(mri["sex"] == 1) & (mri["lvedv"] > 186)]
f_high       = mri[(mri["sex"] == 0) & (mri["lvedv"] > 178)]
f_atrisk     = mri[(mri["sex"] == 0) & (mri["lvedv"] > 139)]
atrisk_cases = len(m_atrisk) + len(f_atrisk)
high_cases   = len(m_high) + len(f_high)
total        = atrisk_cases + high_cases

In [22]:
print(f"There are {high_cases} cases of left ventricular dilation")
print(f"The cases of dilation make up {round((high_cases/len(mri))*100,2)}% of the data")

There are 811 cases of left ventricular dilation
The cases of dilation make up 12.49% of the data


In [23]:
print(f"There are {atrisk_cases} cases of left ventricular dilation")
print(f"The cases of dilation make up {round((atrisk_cases/len(mri))*100,2)}% of the data")

There are 1621 cases of left ventricular dilation
The cases of dilation make up 24.96% of the data


In [24]:
print(f"There are {total} cases of left ventricular dilation")
print(f"The high and at-risk cases of dilation make up {round((total/len(mri))*100,2)}% of the data")

There are 2432 cases of left ventricular dilation
The high and at-risk cases of dilation make up 37.45% of the data


# Initial Visualizations