# Imports

In [None]:
import pandas                as pd
import numpy                 as np
import matplotlib.pyplot     as plt
import seaborn               as sns
import missingno             as msno
from sklearn.linear_model    import LinearRegression
from sklearn.linear_model    import LassoCV
from sklearn.linear_model    import RidgeCV
from sklearn.metrics         import r2_score
from sklearn.metrics         import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.pipeline        import Pipeline
from IPython.core.display    import display, HTML
# Apply seaborn's "white" style and the "husl" color palette to all plots.
sns.set(style = "white", palette = "husl")
# Inject a CSS rule so the notebook's `.container` element is 95% wide.
# NOTE(review): newer IPython releases expose `display`/`HTML` from
# `IPython.display` -- confirm whether the import above should be updated.
display(HTML("<style>.container { width:95% !important; }</style>"))
%matplotlib inline

# Table Of Contents

-----

1. [Reading In The Data](#Reading-In-The-Data)
    - [Overview](#Overview)

-----

2. [Feature Engineering](#Feature-Engineering)
    - [Transforming Numeric Data](#Transforming-Numeric-Data)
    - [Creating Segmental Features](#Creating-Segmental-Features)
    
-----

3. [Modeling](#Modeling)
    - [Functions](#Functions)
        - [Modeling Functions](#Modeling-Functions)
        - [Evaluation Functions](#Evaluation-Functions)
    - [Linear Regression](#Linear-Regression)
    - [Ridge](#Ridge)
    - [LASSO](#LASSO)

# Reading In The Data

In [None]:
# Load the cleaned MRI data from the project's Data directory
mri_csv_path = "../Data/mri_cleaned.csv"
mri = pd.read_csv(mri_csv_path)

## Overview

In [None]:
# Preview the first five rows of the data
mri.head(n = 5)

The `smoker_status` column is still in the data, but we will not need it for the models because we already turned it into a pair of dummy columns, so we will drop the column here.

In [None]:
# Remove `smoker_status`; its information is carried by the dummy columns

mri = mri.drop(columns = ["smoker_status"])

In [None]:
# Checking the shape of the data: `mri.shape` is printed as a
# (rows, columns) tuple, which lets us confirm the column was dropped

print(f"The shape of the dataset is: {mri.shape}")

In [None]:
# Count how many columns there are of each data type

column_dtypes = mri.dtypes
column_dtypes.value_counts()

In [None]:
# Percentage of missing/NaN values in each column;
# display the four columns with the highest percentage

pct_missing = mri.isnull().mean() * 100
pct_missing.sort_values(ascending = False).head(4)

# Feature Engineering

## Transforming Numeric Data

There are only four numeric columns in the data set: `age`, `lvesv`, `lvedv`, `lvef`.  Of the four, only `lvef` does not have any kind of a normal distribution: `age` is close to normally distributed, while `lvesv` and `lvedv` are log-normally distributed.

We cannot do anything to `lvedv` because that is our target variable, but we can take the log of `lvesv` (in this case the natural log).  We also tried squaring `age`, but that did not affect the distribution in the way we hoped it would.

In [None]:
# Take the natural log of `lvesv`.
# We chose to make it its own column rather
# than overwrite the original column.
# `np.log` is applied to the whole column at once (vectorized)
# instead of being called once per row through `apply`.

mri["lvesv_log"] = np.log(mri["lvesv"])

In [None]:
# Re-check the shape now that the `lvesv_log` column has been added
print(f"The shape of the dataset is: {mri.shape}")

[Top](#Table-Of-Contents)

## Creating Segmental Features

Because the model attempts to predict the end diastolic volume, we want it to be as accurate as it can be.  As part of that, we will try to use many combinations of features in an attempt to achieve high accuracy.

The data have 34 columns that we wish to engineer: for each cardiac segment, a column measuring scarification and a column measuring ischemia.  Because there are so many of them, we felt the need to experiment with how they are passed into the model.  We are unable to create interaction columns, because there are zeros.  Instead, we elected to create segmental columns by summing similar columns together: we will compare the model's performance with the originals and with the segmental columns.

We used this image to guide our creation of segmental columns:

<img src = "../Images/cardiac-segmentation-for-cardiac-perfusion-defects.jpg" alt = "Cardiac Segmentation" height = 750 width = 750>

Case courtesy of Dr Hamid Chalian, <a href="https://radiopaedia.org/">Radiopaedia.org</a>. From the case <a href="https://radiopaedia.org/cases/47102">rID: 47102</a>

In [None]:
# Build one summed "segmental" column per heart region (basal, mid, apical)
# for each measurement type (`_he` scar tissue and `_ischemia`).
# The regions follow the red, blue, green, and yellow groupings
# (presumably the colors in the cardiac segmentation image above).
# The apex is folded into the apical region because there is only
# one region in the apex.
# NOTE(review): `apex_he` is part of the apical scar-tissue total, but no
# apex column is part of the apical ischemia total -- confirm whether the
# data has an apex ischemia column that should be included.

segment_sources = {
    # New columns for scar tissue
    "basal_he":  ["ba_he", "bas_he", "bis_he", "bi_he", "bil_he", "bal_he"],
    "mid_he":    ["ma_he", "mas_he", "mis_he", "mi_he", "mil_he", "mal_he"],
    "apical_he": ["aa_he", "as_he", "ai_he", "al_he", "apex_he"],
    # New columns for ischemia
    "basal_ischemia":  ["ba_ischemia", "bas_ischemia", "bis_ischemia",
                        "bi_ischemia", "bil_ischemia", "bal_ischemia"],
    "mid_ischemia":    ["ma_ischemia", "mas_ischemia", "mis_ischemia",
                        "mi_ischemia", "mil_ischemia", "mal_ischemia"],
    "apical_ischemia": ["aa_ischemia", "as_ischemia", "ai_ischemia",
                        "al_ischemia"],
}

for segment_name, source_names in segment_sources.items():
    # Add the source columns left to right, just as a chained `+` would
    running_total = mri[source_names[0]]
    for source_name in source_names[1:]:
        running_total = running_total + mri[source_name]
    mri[segment_name] = running_total

print(f"The shape of the dataset is: {mri.shape}")

[Top](#Table-Of-Contents)

It will be easier when modeling to have two data sets: one with the original features and one with the features we engineered.


In [None]:
# Split the data into two dataframes for modeling: `mri_og` keeps the
# original features and `mri_new` keeps the engineered features.

# Defining the original dataframe by dropping every engineered column
# (the log transform and the six segmental sums)

mri_og  = mri.drop(labels = ["lvesv_log", "basal_he", "mid_he",
                             "apical_he", "basal_ischemia",
                             "mid_ischemia", "apical_ischemia"],
                   axis = 1)

# Defining the dataframe with only new features by dropping the raw
# columns that the engineered columns were built from

mri_new = mri.drop(labels = ['lvesv', 'ba_he', 'bas_he', 'bis_he','bi_he', 
                             'bil_he', 'bal_he', 'ma_he', 'mas_he', 'mis_he', 
                             'mi_he', 'mil_he','mal_he', 'aa_he', 'as_he', 
                             'ai_he', 'al_he', 'apex_he', 'ba_ischemia',
                             'bas_ischemia', 'bis_ischemia', 'bi_ischemia', 
                             'bil_ischemia','bal_ischemia', 'ma_ischemia', 
                             'mas_ischemia', 'mis_ischemia','mi_ischemia', 
                             'mil_ischemia', 'mal_ischemia', 'aa_ischemia', 
                             'as_ischemia', 'ai_ischemia', 'al_ischemia'],
                   axis = 1)

# Checking to make sure the two have different numbers of columns.
# The dataframes are named `mri_og` and `mri_new`, so those are the
# names whose shapes are reported here.

print(f"The shape of the dataframe with original features is: {mri_og.shape}")
print(f"The shape of the dataframe with new features is     : {mri_new.shape}")

[Top](#Table-Of-Contents)

# Modeling

## Functions

### Modeling Functions

### Evaluation Functions

## Linear Regression

## Ridge

## LASSO