# Imports

In [None]:
import pandas             as pd
import numpy              as np
import regex              as re
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated
from IPython.display import display, HTML
# Widening the notebook container so wide DataFrames are easier to read
display(HTML("<style>.container { width:95% !important; }</style>"))
%matplotlib inline

# Data Cleaning

## Reading In The Data & Overview

In [None]:
# Loading the stress cardiac MRI CSV into the `mri` DataFrame

mri = pd.read_csv(filepath_or_buffer = "../Data/stress_cardiac_mri.csv")

In [None]:
# Previewing the first two rows of the data

mri.head(n = 2)

In [None]:
# Checking the shape of the data: `mri.shape` is a (rows, columns) tuple

print(f'The shape of the mri set is: {mri.shape}')

In [None]:
# Getting the data type of every column

mri.dtypes

In [None]:
# Counting how many columns there are of each data type

mri.dtypes.value_counts()

In [None]:
# Summary statistics for the continuous numeric columns, one row per column

mri.loc[:, ["age", "lvedv", "lvesv", "lvef"]].describe().transpose()

### Columns

In [None]:
# Several column names are very long (two of them embed their whole value legend),
# so they are all given short names in a single rename

mri = mri.rename(columns = {
    "Sex (1=male,0=female)":     "Sex",
    "Hypertension (1=yes,0=no)": "Hypertension",
    "Name":                      "ID",
    "History of smoking":        "Smoker Status",
    "Tricuspid Regurgitation":   "Tricusp Reg",
    "Aortic Regurgitation (0=none, 0.5 = trivial, 1=mild, 1.5=mild-moderate, 2=moderate, 2.5=moderate-severe, 3= severe)":
        "Aortic Reg",
    "Mitral Regurgitation (0=none, 0.5 = trivial, 1=mild, 1.5=mild-moderate, 2=moderate, 2.5=moderate-severe, 3= severe)":
        "Mitral Reg",
})

In [None]:
# Normalising the column names in one step:
# spaces become underscores, then everything is lower-cased

mri.columns = mri.columns.str.replace(" ", "_").str.lower()

### Ordinal & Nominal Values

In [None]:
# Encoding `diabetes` as binary: 1 where the value is "Yes", 0 for anything else
# NOTE(review): every value other than "Yes" becomes 0, missing values included -- confirm this is intended

mri["diabetes"] = (mri["diabetes"] == "Yes").astype("int64")

In [None]:
# Lookup tables that turn the ordinal labels into increasing integer ranks

# Left-ventricular wall thickness: labels listed from least to most severe
lv_wall_map = {label: rank for rank, label in enumerate(
    ["Normal", "MILD HYPERTROPHY", "MODERATE HYPERTROPHY", "SEVERE HYPERTROPHY"])}

# Aortic stenosis: "Indeterminant" shares rank 0 with "None"
aortic_stenosis_map = {
    "None":          0,
    "Indeterminant": 0,
    "Trivial":       1,
    "Mild":          2,
    "MODERATE":      3,
    "SEVERE":        4,
}

# Tricuspid regurgitation: labels listed from least to most severe
tricuspid_map = {label: rank for rank, label in enumerate(
    ["None", "Trivial", "MILD", "MILD-MODERATE", "MODERATE", "MODERATE-SEVERE", "SEVERE"])}

# Aortic / mitral regurgitation arrive as numeric grades in steps of 0.5
# NOTE(review): 1 and 1.5 share a rank, as do 2 and 2.5, whereas `tricuspid_map`
# keeps its in-between grades separate -- confirm the collapse is intended
aortic_mitral_map = {
    0:   0,
    0.5: 1,
    1:   2,
    1.5: 2,
    2:   3,
    2.5: 3,
    3:   4,
}

In [None]:
# Applying the ordinal dictionaries defined above, one column at a time
# NaNs are not handled here; they will be dealt with later

for column, mapping, na_action in [
    ("lv_wall_thickness", lv_wall_map,         "ignore"),
    ("aortic_stenosis",   aortic_stenosis_map, "ignore"),
    ("tricusp_reg",       tricuspid_map,       None),
    ("aortic_reg",        aortic_mitral_map,   None),
    ("mitral_reg",        aortic_mitral_map,   None),
]:
    mri[column] = mri[column].map(mapping, na_action = na_action)

In [None]:
# Per the doctors who supplied the data, an `Unknown` smoking history is treated as `Never`

mri["smoker_status"] = mri["smoker_status"].replace(to_replace = "Unknown", value = "Never")

In [None]:
# `smoker_status` is nominal, so it is expanded into dummy columns
# (`drop_first = True` leaves out the first level)

mri = pd.get_dummies(data = mri, columns = ["smoker_status"], drop_first = True)

In [None]:
# Lower-casing the column names once more, since `get_dummies` added new columns

mri.columns = mri.columns.map(str.lower)

In [None]:
# Listing the current column names
mri.columns

In [None]:
# Giving the `smoker_status_former (>1yr)` dummy column a simpler name

mri = mri.rename(columns = {"smoker_status_former (>1yr)": "smoker_status_former"})

# Checking that the changes worked

mri.columns

### Imputation of `NaN` Values

In [None]:
# Percentage of missing (NaN) values per column, showing the seven highest

mri.isna().mean().mul(100).sort_values(ascending = False).head(n = 7)

There is quite a bit of missing data, so I cannot simply drop the `NaN` values: doing so would remove a lot of otherwise good data.  Furthermore, not all of the columns are the same type: `lvef` and `lvesv` are numeric while the others are ordinal.

I will be making use of a technique called X. (**TODO:** replace the placeholder "X" with the name of the imputation technique.)

### Target Definition

Because this data is biometric in nature, the definition of left ventricular dilation (measured here by the `lvedv` column) differs between men and women:

* For men: >214 mL

* For women: >178 mL

-----

I will be defining a third category: at-risk.

For a subject to be considered at-risk of dilation, their volume has to be within 15% of the dilation threshold for their sex:

* For men: >182 mL

* For women: >151 mL

-----

The purpose of adding a third category is to indicate to a physician that a subject needs to be watched carefully since dilation can contribute to significant cardiac conditions.

In [None]:
# Splitting the subjects into dilated ("high") and at-risk groups by sex and `lvedv`
# Sex is coded 1 = male, 0 = female
# Thresholds: men > 214 dilated, > 182 at-risk; women > 178 dilated, > 151 at-risk
# The at-risk bands are capped at the dilation threshold so that no subject falls in
# both groups; without the cap every dilated subject would also be counted as at-risk
# and `total` would count them twice

m_high       = mri[(mri["sex"] == 1) & (mri["lvedv"] > 214)]
m_atrisk     = mri[(mri["sex"] == 1) & (mri["lvedv"] > 182) & (mri["lvedv"] <= 214)]
f_high       = mri[(mri["sex"] == 0) & (mri["lvedv"] > 178)]
f_atrisk     = mri[(mri["sex"] == 0) & (mri["lvedv"] > 151) & (mri["lvedv"] <= 178)]
atrisk_cases = len(m_atrisk) + len(f_atrisk)
high_cases   = len(m_high) + len(f_high)
total        = atrisk_cases + high_cases

In [None]:
# Reporting the number of dilated ("high") cases and their share of all rows,
# as a percentage rounded to two decimals
print(f"There are {high_cases} cases of left ventricular dilation")
print(f"The cases of dilation make up {round((high_cases/len(mri))*100,2)}% of the data")

In [None]:
# Reporting the number of at-risk cases and their share of all rows,
# as a percentage rounded to two decimals
# The messages name the at-risk group explicitly so this output cannot be mistaken
# for the dilation counts printed by the previous cell
print(f"There are {atrisk_cases} cases at risk of left ventricular dilation")
print(f"The at-risk cases make up {round((atrisk_cases/len(mri))*100,2)}% of the data")

In [None]:
# Reporting the combined count of dilated ("high") and at-risk cases and their share
# of all rows, as a percentage rounded to two decimals
# `total` is the sum of both groups, so the first message says so as well
print(f"There are {total} high and at-risk cases of left ventricular dilation")
print(f"The high and at-risk cases of dilation make up {round((total/len(mri))*100,2)}% of the data")

In [None]:
# Histogram of the natural log of `lvedv`, using 35 bins
np.log(mri["lvedv"]).plot.hist(bins = 35);

# Initial Visualizations