# Imports

In [None]:
import pandas               as pd
import numpy                as np
import seaborn              as sns
import matplotlib.pyplot    as plt
from sklearn.experimental   import enable_iterative_imputer
from sklearn.impute         import IterativeImputer
from sklearn.ensemble       import ExtraTreesRegressor
from autoimpute.imputations import MultipleImputer
from IPython.core.display   import display, HTML
# Apply seaborn's "white" axes style and "dark" colour palette to every plot below
sns.set(style = "white", palette = "dark")
# Inject CSS so the notebook container takes up 95% of the browser width
display(HTML("<style>.container { width:95% !important; }</style>"))
# Render matplotlib figures inline, directly beneath the cell that creates them
%matplotlib inline

# Table Of Contents

-----

1. [Data Cleaning](#Data-Cleaning)

    - [Reading In The Data](#Reading-In-The-Data)
    - [Overview](#Overview)
    - [Columns](#Columns)
    - [Ordinal & Nominal Values](#Ordinal-&-Nominal-Values)
    - [Imputation Of Missing Data](#Imputation-Of-Missing-Data)
    
-----

2. [Exploratory Visualizations](#Exploratory-Visualizations)
    - [Numeric Data](#Numeric-Data)
        - [Functions](#Functions)
        - [Histograms](#Histograms)
        - [Scatter Plots](#Scatter-Plots)
    - [Ordinal Data](#Ordinal-Data)
        - [Functions](#Functions)
        - [Bar Plots](#Bar-Plots)
        
-----

3. [Heat Maps](#Heat-Maps)

# Data Cleaning

## Reading In The Data

In [None]:
# Load the stress cardiac MRI CSV file into a DataFrame

mri = pd.read_csv(filepath_or_buffer = "../Data/stress_cardiac_mri.csv")

## Overview

In [None]:
# Previewing the top two rows of the raw frame

mri.head(n = 2)

In [None]:
# Reporting the (rows, columns) shape of the frame

mri_shape = mri.shape
print(f'The shape of the mri set is: {mri_shape}')

In [None]:
# Getting the data types
# (one entry per column; the cell output is the column -> dtype Series)

mri.dtypes

In [None]:
# Counting how many columns there are of each data type

column_dtypes = mri.dtypes
column_dtypes.value_counts()

### Columns

In [None]:
# Several source headers are very long (some embed their whole value legend),
# so they are all shortened here in a single rename pass

short_names = {
    "Sex (1=male,0=female)": "Sex",
    "Hypertension (1=yes,0=no)": "Hypertension",
    "Name": "ID",
    "History of smoking": "Smoker Status",
    "Tricuspid Regurgitation": "Tricusp Reg",
    "Aortic Regurgitation (0=none, 0.5 = trivial, 1=mild, 1.5=mild-moderate, 2=moderate, 2.5=moderate-severe, 3= severe)": "Aortic Reg",
    "Mitral Regurgitation (0=none, 0.5 = trivial, 1=mild, 1.5=mild-moderate, 2=moderate, 2.5=moderate-severe, 3= severe)": "Mitral Reg",
}

mri = mri.rename(columns = short_names)

In [None]:
# Normalising the column labels in one chain:
# spaces become underscores, then every label is lower-cased

mri.columns = mri.columns.str.replace(" ", "_").str.lower()

### Ordinal & Nominal Values

In [None]:
# Encoding `diabetes` as binary: 1 where the value is exactly "Yes",
# 0 for everything else (missing values included)

mri["diabetes"] = (mri["diabetes"] == "Yes").astype("int64")

In [None]:
# Lookup tables that translate each ordinal label into an integer rank
# The keys reproduce the labels exactly, including their mixed capitalisation

# Labels listed from least to most severe; the rank is the list position
lv_wall_map         = {label: rank for rank, label in enumerate(["Normal",
                                                                 "MILD HYPERTROPHY",
                                                                 "MODERATE HYPERTROPHY",
                                                                 "SEVERE HYPERTROPHY"])}

# "Indeterminant" shares rank 0 with "None"
aortic_stenosis_map = {"None": 0, "Indeterminant": 0,
                       "Trivial": 1, "Mild": 2,
                       "MODERATE": 3, "SEVERE": 4}

# Seven levels, ranked by list position
tricuspid_map       = {label: rank for rank, label in enumerate(["None",
                                                                 "Trivial",
                                                                 "MILD",
                                                                 "MILD-MODERATE",
                                                                 "MODERATE",
                                                                 "MODERATE-SEVERE",
                                                                 "SEVERE"])}

# Numeric grades; 1 and 1.5 share rank 2, and 2 and 2.5 share rank 3
aortic_mitral_map   = {0: 0, 0.5: 1,
                       1: 2, 1.5: 2,
                       2: 3, 2.5: 3,
                       3: 4}

In [None]:
# Mapping the ordinal columns to integer ranks using the dictionaries from above
# NaNs are left untouched here; they will be dealt with later
# Note: `Series.map` also turns any label that is missing from its dictionary into NaN

for ordinal_column, lookup, map_options in [
    ("lv_wall_thickness", lv_wall_map,         {"na_action": "ignore"}),
    ("aortic_stenosis",   aortic_stenosis_map, {"na_action": "ignore"}),
    ("tricusp_reg",       tricuspid_map,       {}),
    ("aortic_reg",        aortic_mitral_map,   {}),
    ("mitral_reg",        aortic_mitral_map,   {}),
]:
    mri[ordinal_column] = mri[ordinal_column].map(lookup, **map_options)

In [None]:
# Per the doctors who provided the data, `Unknown` is to be treated as `Never`

mri["smoker_status"] = mri["smoker_status"].replace("Unknown", "Never")

In [None]:
# `smoker_status` is nominal, so it is expanded into dummy columns
# (`drop_first = True` drops the first level)

mri = pd.get_dummies(data = mri, columns = ["smoker_status"], drop_first = True)

In [None]:
# The dummy columns take their suffix from the category labels,
# so the column names are lower-cased once more

lowered_names = mri.columns.str.lower()
mri.columns   = lowered_names

In [None]:
# Dropping the " (>1yr)" suffix from the `smoker_status_former` dummy column

mri = mri.rename(columns = {"smoker_status_former (>1yr)": "smoker_status_former"})

In [None]:
# Previewing the first five rows after the encoding steps
mri.head(n = 5)

### Imputation Of Missing Data

In [None]:
# Percentage of missing values per column, showing the six highest

missing_pct = mri.isna().mean().mul(100)
missing_pct.sort_values(ascending = False).head(6)

Although these columns are missing data, the doctors who provided the data said that the `NaN`s in `lv_wall_thickness` can be treated as "normal", which is encoded as `0`.

In [None]:
# Missing wall-thickness values are filled with 0, the rank used for "Normal"
wall_thickness           = mri["lv_wall_thickness"]
mri["lv_wall_thickness"] = wall_thickness.fillna(value = 0)

Two numeric columns, `lvesv` and `lvef`, have a small number of missing values that still need to be filled in.

To impute the values, I will use `sklearn`'s `IterativeImputer` algorithm.

In [None]:
# Building the imputer: a small, seeded ExtraTrees ensemble serves as its estimator

tree_estimator = ExtraTreesRegressor(n_estimators = 10, random_state = 42)
imp            = IterativeImputer(estimator    = tree_estimator,
                                  max_iter     = 25,
                                  random_state = 42)

# Fitting on the two columns and writing the completed values back into the frame

imputed_cols      = ["lvef", "lvesv"]
mri[imputed_cols] = imp.fit_transform(mri[imputed_cols])

There are still four more columns that need to be imputed, and they are all ordinal.

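**TODO:** describe the method that will be used to impute them (this sentence was left as the placeholder "To do that I will X").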

In [None]:
# Re-checking the percentage of missing values per column, showing the four highest
remaining_missing_pct = mri.isna().mean().mul(100)
remaining_missing_pct.sort_values(ascending = False).head(4)

# Exploratory Visualizations

## Numeric Data

### Functions

In [None]:
def continuous_data_histograms(list_of_columns, data = None):
    """Plot one histogram per column, with a red vertical line at the column mean.

    Parameters
    ----------
    list_of_columns : iterable of str
        Names of the columns to plot; one subplot is drawn per column.
    data : pandas.DataFrame, optional
        Frame the columns are read from. Defaults to the notebook-level `mri` frame.

    Returns
    -------
    None
        The figure is rendered with `plt.show()`.
    """
    frame   = mri if data is None else data
    columns = list(list_of_columns)
    # Two panels per row. Keeping at least two rows preserves the 2x2 layout and
    # 12x6 figure for up to four columns; longer lists add rows instead of
    # raising on the fifth subplot.
    n_rows  = max(2, (len(columns) + 1) // 2)
    fig     = plt.figure(figsize = (12, 3 * n_rows))
    for position, column in enumerate(columns, start = 1):
        ax    = fig.add_subplot(n_rows, 2, position)
        title = column.upper()
        # `histplot` is the supported replacement for the deprecated `distplot(kde = False)`
        sns.histplot(frame[column], color = "black", ax = ax)
        ax.axvline(frame[column].mean(), color = "red")
        ax.set_title(f"Distribution Of {title}", size = 18)
        ax.set_xlabel(f"{title}", size = 16)
        ax.set_ylabel("Frequency", size = 16)
        ax.tick_params(axis = "both", labelsize = 14)
    plt.tight_layout()
    plt.show()

In [None]:
def continuous_data_scatterplots(list_of_columns):
    """Scatter each given column against `lvedv`, with a red regression line.

    Parameters
    ----------
    list_of_columns : iterable of str
        Columns of the notebook-level `mri` frame to put on the x axis.
        The panels are placed on a 2x2 grid.

    Returns
    -------
    None
        The figure is rendered with `plt.show()`.
    """
    fig = plt.figure(figsize = (12,6))
    for position, column in enumerate(list_of_columns, start = 1):
        ax    = fig.add_subplot(2, 2, position)
        label = column.upper()
        ax.set_title(f"{label} Vs LVEDV", size = 18)
        # Black "+" markers, no confidence band, red fitted line
        sns.regplot(data     = mri,
                    x        = column,
                    y        = "lvedv",
                    fit_reg  = True,
                    ci       = None,
                    marker   = "+",
                    color    = "black",
                    line_kws = {"color": "red"},
                    ax       = ax)
        ax.set_xlabel(f"{label}", size = 16)
        ax.set_ylabel("LVEDV Volume", size = 16)
        # `ax` is still the current axes here, so pyplot's tick helpers act on it
        plt.xticks(size = 14)
        plt.yticks(size = 14)
    plt.tight_layout()
    plt.show()

### Histograms

In [None]:
# Distributions of the four continuous columns
histogram_columns = ["age", "lvedv", "lvesv", "lvef"]
continuous_data_histograms(histogram_columns)

### Scatter Plots

In [None]:
# Each of these columns is plotted against `lvedv`
scatter_columns = ["age", "lvesv", "lvef"]
continuous_data_scatterplots(scatter_columns)

## Ordinal Data

### Functions

In [None]:
# Defining groups to visualize
# Rather than plotting all ordinal columns at once,
# I'm breaking them down by similarity
# NOTE(review): the groups are selected by column position, so they depend on the
# column order left by the cleaning steps above; re-check if that order changes

segments_scr = mri.columns[14:31]
segments_isc = mri.columns[37:46]
structure    = mri.columns[[8,10,11,12,13]]
lifestyle    = mri.columns[[1,3,4,5,47,48]]

In [None]:
def plot_barcharts(list_of_columns):
    """Draw one count plot per column of the notebook-level `mri` frame.

    Parameters
    ----------
    list_of_columns : iterable of str
        Names of the columns to plot; one subplot is drawn per column,
        three subplots per row.

    Returns
    -------
    None
        The figure is rendered with `plt.show()`.
    """
    columns = list(list_of_columns)
    # Keeping at least eight rows preserves the 8x3 grid and 20x30 figure for up
    # to 24 columns; longer lists add rows (and height) instead of raising.
    n_rows  = max(8, (len(columns) + 2) // 3)
    fig     = plt.figure(figsize = (20, 30 * n_rows / 8))
    for position, column in enumerate(columns, start = 1):
        ax    = fig.add_subplot(n_rows, 3, position)
        title = column.upper()
        # The Series must be passed as `x =`: newer seaborn releases read a
        # positional argument as `data`, which no longer counts the categories.
        sns.countplot(x = mri[column], ax = ax)
        ax.set_title(f"{title} Counts", size = 16)
        ax.set_xlabel("Category", size = 14)
        ax.set_ylabel("Count", size = 14)
        ax.tick_params(axis = "both", labelsize = 12)
    plt.tight_layout()
    plt.show()

### Bar Plots

In [None]:
# Count plots for the `segments_scr` column group
plot_barcharts(list_of_columns = segments_scr)

In [None]:
# Count plots for the `segments_isc` column group
plot_barcharts(list_of_columns = segments_isc)

In [None]:
# Count plots for the `structure` column group
plot_barcharts(list_of_columns = structure)

In [None]:
# Count plots for the `lifestyle` column group
plot_barcharts(list_of_columns = lifestyle)

# Heat Maps

In [None]:
# Listing the column labels, in order, before columns are picked by position
mri.columns

In [None]:
# Selecting the columns for the heat map by position
# NOTE(review): positions 2, 6, 7 and 9 depend on the current column order; confirm they are the intended numeric columns
numeric_cols = mri.columns[[2,6,7,9]]

In [None]:
# Displaying the selected labels to confirm the positional picks
numeric_cols

In [None]:
# Pairwise correlations between the selected numeric columns
corr = mri[numeric_cols].corr()

# Boolean mask hiding the upper triangle (diagonal included), so each pair of
# columns is shown only once
mask = np.triu(np.ones_like(corr, dtype = bool))

# The figure is created inside the style context so the "white" axes style
# applies to the axes it draws on
with sns.axes_style("white"):
    fig, ax = plt.subplots(figsize = (5,5))
    sns.heatmap(corr, cmap = "RdBu", mask = mask,
                vmin = -1, vmax = 1, annot = True, ax = ax)
    ax.set_title("Correlations Among Numeric Columns", size = 18)

#### E N D