# 0.0. Imports

In [None]:
import json
import math
# import pylab 
import random
import pickle
import requests
import datetime
import warnings
warnings.filterwarnings( 'ignore')
import inflection
import numpy as np
import pandas as pd 
import seaborn as sns
import xgboost as xgb
                

from scipy                 import stats  as ss
from sklearn.metrics       import mean_absolute_error, mean_squared_error
from sklearn.ensemble      import RandomForestRegressor
from sklearn.linear_model  import LinearRegression
from sklearn.linear_model  import Lasso
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, RobustScaler


from flask                         import Flask, request, Response
from boruta                        import BorutaPy
from matplotlib                    import pyplot as plt
from matplotlib                    import gridspec
from IPython.display               import Image
from IPython.core.display          import HTML
from IPython.core.interactiveshell import InteractiveShell

# Render matplotlib figures inline in the notebook.
# NOTE(review): `%pylab inline` star-imports numpy/matplotlib names into the
# namespace and may shadow the `random` / `datetime` modules imported above;
# `%matplotlib inline` alone is probably sufficient - confirm before removing.
%pylab inline
%matplotlib inline


# Global matplotlib look: 'bmh' style sheet, large default figure and font size.
plt.style.use( 'bmh' )
plt.rcParams['figure.figsize'] = [25, 12]
plt.rcParams['font.size'] = 24

# Inject CSS so the notebook container uses the full page width.
display( HTML( '<style>.container { width:100% !important; }</style>') )
# Remove the column/row display limits when a DataFrame is shown.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
# Do not wrap wide DataFrame representations across several lines.
pd.set_option( 'display.expand_frame_repr', False )

# Apply seaborn's default theme.
# NOTE(review): this runs after the rcParams assignments above and may override
# some of them (e.g. the font size) - verify the intended order.
sns.set();

## 0.1 Helper Functions

## 0.2. Loading Data

In [None]:
# Read the raw heart-failure clinical records CSV into a DataFrame.
df_raw = pd.read_csv( filepath_or_buffer='data/heart_failure_clinical_records_dataset.csv' )

## Attribute Information:

Thirteen (13) clinical features:

- **age**: age of the patient (years)
- **anaemia**: decrease of red blood cells or hemoglobin (boolean)
- **high blood pressure**: if the patient has hypertension (boolean)
- **creatinine phosphokinase (CPK)**: level of the CPK enzyme in the blood (mcg/L)
- **diabetes**: if the patient has diabetes (boolean)
- **ejection fraction**: percentage of blood leaving the heart at each contraction (percentage)
- **platelets**: platelets in the blood (kiloplatelets/mL)
- **sex**: woman or man (binary)
- **serum creatinine**: level of serum creatinine in the blood (mg/dL)
- **serum sodium**: level of serum sodium in the blood (mEq/L)
- **smoking**: if the patient smokes or not (boolean)
- **time**: follow-up period (days)
- **[target] death event**: if the patient died during the follow-up period (boolean)

In [None]:
# Peek at five randomly drawn rows of the raw data.
df_raw.sample( n=5 )

# 1.0. STEP 01 - DESCRIPTION OF DATA

In [None]:
# Work on an independent copy so this step never mutates df_raw.
df1 = df_raw.copy( deep=True )

## 1.1. Rename Columns

In [None]:
# Normalise every column name to lowercase snake_case.
cols_old = [
    'age', 'anaemia', 'creatinine_phosphokinase', 'diabetes',
    'ejection_fraction', 'high_blood_pressure', 'platelets',
    'serum_creatinine', 'serum_sodium', 'sex', 'smoking', 'time',
    'DEATH_EVENT',
]


def snakecase(x):
    """Return `x` converted by `inflection.underscore`."""
    return inflection.underscore(x)


cols_new = [snakecase(name) for name in cols_old]

# Positional assignment: the list order is applied to the existing columns as-is.
df1.columns = cols_new

In [None]:
# Inspect five random rows to check the renamed columns.
df1.sample( n=5 )

## 1.2. Data Dimensions

In [None]:
# Report how many rows and columns the dataset has.
n_rows, n_cols = df1.shape
print(f'Number of Rows : {n_rows}')
print(f'Number of Cols : {n_cols}')

## 1.3. Data Types

In [None]:
# Show the dtype of each column.
df1.dtypes

## 1.4. Check NA

In [None]:
# Count the missing values in each column.
df1.isna().sum()

## 1.5. Fill Out NA

## 1.6. Change Data Types

## 1.7. Descriptive Statistics

In [None]:
# Keep only the int64 / float64 columns for the numeric summary.
numeric_dtypes = ['int64', 'float64']
num_attributes = df1.select_dtypes(include=numeric_dtypes)

### 1.7.1 Numerical Attributes

In [None]:
# Summary statistics for every column in num_attributes, one row per attribute.

# Central Tendency - mean, median
ct1 = pd.DataFrame( num_attributes.apply( np.mean ) ).T
ct2 = pd.DataFrame( num_attributes.apply( np.median ) ).T

# Dispersion - std, min, max, range, skew, kurtosis
d1 = pd.DataFrame( num_attributes.apply( np.std ) ).T
d2 = pd.DataFrame( num_attributes.apply( min ) ).T
d3 = pd.DataFrame( num_attributes.apply( max ) ).T
# Range is max - min; the earlier `max - max - min` collapsed to `-min`.
d4 = pd.DataFrame( num_attributes.apply( lambda x: x.max() - x.min() ) ).T
d5 = pd.DataFrame( num_attributes.apply( lambda x: x.skew() ) ).T
d6 = pd.DataFrame( num_attributes.apply( lambda x: x.kurtosis() ) ).T

# concatenate: stack the one-row frames, then transpose so each attribute is a
# row; the labels below must follow the same order as the concat list.
m = pd.concat( [d2, d3, d4, ct1, ct2, d1, d5, d6] ).T.reset_index()
m.columns = ( ['attributes','min','max','range','mean','median','std','skew','kurtosis'] )
m


In [None]:
# Plot the distribution of the serum_sodium column.
# NOTE(review): `distplot` is deprecated in recent seaborn releases and the
# warning is silenced by the global warnings filter - consider
# `histplot`/`displot` once the installed seaborn version is confirmed.
sns.distplot(df1['serum_sodium']);

### 1.7.2. Categorical Attributes

# 2.0. STEP 02 - FEATURE ENGINEERING

In [None]:
# Start the feature-engineering step from an independent copy of df1.
df2 = df1.copy( deep=True )

## 2.1. Hypothesis Mind Map

In [None]:
# Display the hypothesis mind-map image from the project's img folder.
Image('img/MindMapHypothesis.png')

## 2.2. Creation of Hypotheses

### 2.2.1. Age Hypothesis

**1.** Men die more than women from heart attack

### 2.2.2. Sex Hypothesis

**1.** Men are more likely to die from heart disease than women.

### 2.2.3. Smoking Hypothesis

**1.** Men who smoke die more from heart attack than women.

### 2.2.4. Diabetes Hypothesis

**1.** People with Diabetes die more from heart attack than people without diabetes.

### 2.2.5. High Blood Pressure Hypothesis

**1.** Women with high blood pressure are more likely to die of a heart attack than men.

### 2.2.6. Anaemia Hypothesis

**1.** People with anaemia die more than people without anaemia.

## 2.3. Feature Engineering

# 3.0. STEP 03 - VARIABLES FILTERING

In [None]:
# Start the variable-filtering step from an independent copy of df2.
df3 = df2.copy( deep=True )

# 4.0. STEP 04 - EXPLORATORY DATA ANALYSIS (EDA)

In [None]:
# Start the exploratory analysis from an independent copy of df3.
df4 = df3.copy( deep=True )

In [None]:
# Show the last five rows of the analysis frame.
df4.tail( 5 )

### 4.1.1. Response Variable

In [None]:
# Distribution of the response variable: death_event is the dataset's target
# flag (the cell previously plotted `sex`, which is a feature, not the target).
sns.distplot(df4['death_event'], kde=False);

### 4.1.2. Numerical Variable

In [None]:
# Draw one 25-bin histogram per column of num_attributes.
num_attributes.hist( bins=25);

## 4.2. Bivariate Analysis

### **H1.** Men die more than women from heart attack

In [None]:
# H1 check: number of patients per sex, split by outcome.
# Rows are counted rather than summing death_event, because the sum of the
# flag over the death_event == 0 subset is always 0 and produced an empty chart.
aux1 = df4[(df4['death_event'] == 1)]   # patients who died
aux2 = aux1[['sex', 'death_event']].groupby('sex').count().reset_index()
aux3 = df4[(df4['death_event'] == 0)]   # patients who survived
aux4 = aux3[['sex', 'death_event']].groupby('sex').count().reset_index()

# Left panel: deaths per sex (the y column now holds row counts).
plt.subplot(1,2,1)
sns.barplot(x='sex', y='death_event', data=aux2)
plt.title('Patients with death_event = 1, by sex')

# Right panel: survivors per sex.
plt.subplot(1,2,2)
sns.barplot(x='sex', y='death_event', data=aux4)
plt.title('Patients with death_event = 0, by sex');

In [None]:
# Inspect the first five rows of the death_event == 1 subset.
aux1.head( 5 )

In [None]:
# Number of rows (one per sex group) in the aux4 aggregate.
print( aux4.shape[0] )


In [None]:
# Display the per-sex aggregate built from the death_event == 0 rows.
aux4