<a href="https://colab.research.google.com/github/asyaf/public_speaking/blob/main/Data_exploration_lecture_heart_failure.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data exploration example

## Install packages

In [3]:
import numpy as np
import matplotlib.pyplot as plt
from  matplotlib.ticker import FuncFormatter
import pandas as pd
from scipy import stats
import seaborn as sns

%matplotlib inline

In [2]:
!pip install -UIv pandas-profiling==3.4.0

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  Skipping link: none of the wheel's tags (cp38-cp38-macosx_10_9_x86_64) are compatible (run pip debug --verbose to show compatible tags): https://files.pythonhosted.org/packages/6b/b4/2caa7b2ae0e745e77b0cdd9459969dc63efeede90a4436cb34b2b6edbf42/MarkupSafe-2.0.0a1-cp38-cp38-macosx_10_9_x86_64.whl#sha256=68e0fd039b68d2945b4beb947d4023ca7f8e95b708031c345762efba214ea761 (from https://pypi.org/simple/markupsafe/) (requires-python:>=3.6)
  Skipping link: none of the wheel's tags (cp38-cp38-manylinux1_i686) are compatible (run pip debug --verbose to show compatible tags): https://files.pythonhosted.org/packages/17/c3/75f9b279df59386370b006a9c3065c85af2d77ed791f0a487dcdf4ffd8f2/MarkupSafe-2.0.0a1-cp38-cp38-manylinux1_i686.whl#sha256=4eb07faad54bb07427d848f31030a65a49ebb0cec0b30674f91cf1ddd456bfe4 (from https://pypi.org/simple/markupsafe/) (requires-python:>=3.6)
  Skipping link: none of the wheel's tags (cp38-cp38-manylinux1_x86

## Load data

In [1]:
# Raw-content URL of the dataset CSV. Browsable GitHub page:
# https://github.com/asyaf/public_speaking/blob/main/heart_failure_clinical_records_dataset.csv
RAW_DATA_CSV_PATH = (
    'https://raw.githubusercontent.com/asyaf/public_speaking/main/'
    'heart_failure_clinical_records_dataset.csv'
)

In [5]:
# Read the CSV at RAW_DATA_CSV_PATH into a DataFrame; the bare expression on the
# last line displays the frame as the cell output.
data = pd.read_csv(filepath_or_buffer=RAW_DATA_CSV_PATH)
data

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.00,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.00,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.00,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.00,2.7,116,0,0,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,62.0,0,61,1,38,1,155000.00,1.1,143,1,1,270,0
295,55.0,0,1820,0,38,0,270000.00,1.2,139,0,0,271,0
296,45.0,0,2060,1,60,0,742000.00,0.8,138,0,0,278,0
297,45.0,0,2413,0,38,0,140000.00,1.4,140,1,1,280,0


In [6]:
# Print column names, non-null counts and dtypes to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       299 non-null    float64
 1   anaemia                   299 non-null    int64  
 2   creatinine_phosphokinase  299 non-null    int64  
 3   diabetes                  299 non-null    int64  
 4   ejection_fraction         299 non-null    int64  
 5   high_blood_pressure       299 non-null    int64  
 6   platelets                 299 non-null    float64
 7   serum_creatinine          299 non-null    float64
 8   serum_sodium              299 non-null    int64  
 9   sex                       299 non-null    int64  
 10  smoking                   299 non-null    int64  
 11  time                      299 non-null    int64  
 12  DEATH_EVENT               299 non-null    int64  
dtypes: float64(3), int64(10)
memory usage: 30.5 KB


In [7]:
# Tick labels used when plotting the 0/1-coded columns. The label order is
# assumed to follow the coding (first label for 0, second for 1) — confirm
# against the dataset description.
DEATH_LABELS = [
    'alive',
    'dead',
]
SEX_LABELS = [
    'women',
    'men',
]

# Columns holding 0/1 flags; they are excluded from the pair plot further down.
BINARY_FEATURES = [
    'anaemia',
    'diabetes',
    'high_blood_pressure',
    'sex',
    'smoking',
]

# Profile report

In [8]:
from pandas_profiling import ProfileReport

In [9]:
# Build the pandas-profiling report for the loaded frame; the bare `profile`
# expression on the last line makes the notebook render it as the cell output.
profile = ProfileReport(data, title='Profiling Report')
profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [None]:
# Write the profiling report to an HTML file and, when running in Colab, offer it
# as a browser download.
output_file = 'pandas_profiling.html'
profile.to_file(output_file)

# `files` is Colab's download helper. It was never imported, so this cell raised
# NameError on a fresh kernel. Import it here and fall back to just reporting
# the saved path when the notebook runs outside Colab.
try:
    from google.colab import files
except ImportError:
    print('Report saved to {}'.format(output_file))
else:
    files.download(output_file)

# Simple statistics

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
data.describe()

# Visualizations and analysis

In [None]:
# Plot the `time` column against the row index to see how it changes across records.
fig, ax = plt.subplots()
ax.plot(data['time'])
ax.set_title('Time value change')
ax.set_ylabel('time');

In [None]:
# Bar chart of how many records fall into each outcome.
# value_counts() orders by frequency, not by class value, so on its own the bars
# only line up with DEATH_LABELS when class 0 happens to be the larger one.
# Reindexing to [0, 1] pins the order to the labels (and yields 0 for an absent class).
death_counts = data['DEATH_EVENT'].value_counts().reindex([0, 1], fill_value=0)
plt.bar(DEATH_LABELS, death_counts)
plt.ylabel('count')
plt.title('Death outcome distribution');

In [None]:
# Overlay the age distribution of the deceased patients on top of the whole cohort.
dead_data = data.loc[data['DEATH_EVENT'].eq(1)]

# Draw order matters: the full cohort goes first so the "Dead" bars sit on top.
sns.histplot(data=data['age'], kde=True, label='All', color='green')
sns.histplot(dead_data['age'], kde=True, label='Dead', color='blue')
plt.legend();

# Does the data confirm prior knowledge?
- Women suffer more from anaemia V
- Men smoke more V
- Older people suffer more from diabetes and high blood pressure X
- Diabetes may cause high blood pressure X

In [None]:
# Record counts per sex, split by the anaemia flag.
ax = sns.catplot(
    data=data,
    kind='count',
    x='sex',
    hue='anaemia',
    palette='pastel',
)
ax.set_xticklabels(SEX_LABELS);

In [None]:
# Record counts per sex, split by the smoking flag.
ax = sns.catplot(
    data=data,
    kind='count',
    x='sex',
    hue='smoking',
    palette='pastel',
)
ax.set_xticklabels(SEX_LABELS);

In [None]:
# Smoker / non-smoker counts within each outcome group.
(
    data
    .groupby('DEATH_EVENT')['smoking']
    .value_counts()
)

**Fact:** Smokers are three times as likely to die from heart disease.

In [None]:
# Distribution of patient ages, truncated to whole years.
ages = data.age.astype(int)
# sns.distplot is deprecated; histplot with a KDE overlay and density
# normalisation draws the same kind of figure and is already used in this notebook.
ax = sns.histplot(ages, kde=True, stat='density')

In [None]:
# Record counts per whole-year age, split by the high blood pressure flag.
plt.figure(figsize=(16, 6))
# Pass the age vector as `x=`: positionally it lands in countplot's first
# parameter, which on recent seaborn is `data` and collides with `data=data`.
ax = sns.countplot(x=data.age.astype(int), hue='high_blood_pressure', data=data)
ax.set_title('High blood pressure by age');

In [None]:
# Record counts per whole-year age, split by the diabetes flag.
plt.figure(figsize=(16, 6))
# Pass the age vector as `x=`: positionally it lands in countplot's first
# parameter, which on recent seaborn is `data` and collides with `data=data`.
ax = sns.countplot(x=data.age.astype(int), hue='diabetes', data=data)
ax.set_title('Diabetes by age');

In [None]:
from sklearn.metrics import matthews_corrcoef

# Matthews correlation coefficient between the diabetes and
# high-blood-pressure flags; the value is shown as the cell output.
matthews_corrcoef(
    y_true=data['diabetes'],
    y_pred=data['high_blood_pressure'],
)

**Fact:** High blood pressure is twice as likely to strike a person with diabetes as a person without diabetes.

# Discovering feature relations

In [None]:
# Bar plot of ejection fraction for each whole-year age.
plt.figure(figsize=(16, 6))
ax = sns.barplot(
    data=data,
    x=data.age.astype(int),
    y='ejection_fraction',
    palette='Blues_r',
)

In [None]:
# Box plot of creatinine phosphokinase for each sex.
ax = sns.boxplot(
    data=data,
    x='sex',
    y='creatinine_phosphokinase',
    palette='Set3',
)
ax.set_xticklabels(SEX_LABELS);

In [None]:
# Box plot of serum sodium for each outcome.
ax = sns.boxplot(
    data=data,
    x='DEATH_EVENT',
    y='serum_sodium',
    palette='Set3',
)
ax.set_xticklabels(DEATH_LABELS);

In [None]:
# Violin plot of serum creatinine for each sex.
ax = sns.violinplot(x='sex', y='serum_creatinine', data=data, palette='Set3')
# Use the shared SEX_LABELS constant, like the other sex-axis plots, instead of
# a second hard-coded copy of the labels.
ax.set_xticklabels(SEX_LABELS);

In [None]:
# Violin plot of serum creatinine for each sex, with each violin split into
# halves by the smoking flag.
# `split` is a boolean flag; the previous string 'true' only worked because any
# non-empty string is truthy.
ax = sns.violinplot(x='sex', y='serum_creatinine', data=data, hue='smoking', palette='Set3', split=True)
# Use the shared SEX_LABELS constant rather than a hard-coded copy of the labels.
ax.set_xticklabels(SEX_LABELS);

In [None]:
# Point plot of ejection fraction for each outcome.
ax = sns.catplot(
    data=data,
    kind='point',
    x='DEATH_EVENT',
    y='ejection_fraction',
)
ax.set_xticklabels(DEATH_LABELS);

**Fact:** Low ejection fraction, sometimes called low EF, is the term we use to describe your ejection fraction if it falls below 55%. It means your heart isn't functioning as well as it could. 

In [None]:
# Point plot of ejection fraction for each outcome, with one line per smoking flag value.
ax = sns.catplot(
    data=data,
    kind='point',
    x='DEATH_EVENT',
    y='ejection_fraction',
    hue='smoking',
)
ax.set_xticklabels(DEATH_LABELS);

In [None]:
# Joint plot of serum creatinine (x) against serum sodium (y).
ax = sns.jointplot(
    data=data,
    x='serum_creatinine',
    y='serum_sodium',
)

**Fact:** A normal blood sodium level is between 135 and 145 milliequivalents per liter (mEq/L).

**Fact:** The normal range for creatinine in the blood may be 0.84 to 1.21 milligrams per deciliter (74.3 to 107 micromoles per liter), although this can vary from lab to lab, between men and women, and by age.

In [None]:
# Swarm plot of platelet counts for each outcome, coloured by the
# high blood pressure flag.
ax = sns.swarmplot(
    data=data,
    x='DEATH_EVENT',
    y='platelets',
    hue='high_blood_pressure',
)
ax.set_xticklabels(DEATH_LABELS);

In [None]:
# Pair plot of the remaining columns after dropping `time`, the outcome and the
# 0/1 flags listed in BINARY_FEATURES.
columns_to_remove = ['time', 'DEATH_EVENT', *BINARY_FEATURES]
g = sns.pairplot(data=data.drop(columns=columns_to_remove))

In [None]:
# Pearson correlation between ejection fraction and the outcome, with its p-value.
p_corr, p_pvalue = stats.pearsonr(
    data['ejection_fraction'],
    data['DEATH_EVENT'],
)
print(f'Pearson correlation: {p_corr}, pvalue: {p_pvalue}')

In [None]:
# Spearman rank correlation between ejection fraction and the outcome, with its p-value.
s_corr, s_pvalue = stats.spearmanr(
    data['ejection_fraction'],
    data['DEATH_EVENT'],
)
print(f'Spearman correlation: {s_corr}, pvalue: {s_pvalue}')

In [None]:
# Kendall tau correlation between ejection fraction and the outcome, with its p-value.
k_corr, k_pvalue = stats.kendalltau(
    data['ejection_fraction'],
    data['DEATH_EVENT'],
)
print(f'Kendall correlation: {k_corr}, pvalue: {k_pvalue}')

In [None]:
# Subset of columns used for the correlation heatmaps below.
data_for_correlation = data.loc[
    :,
    [
        'age',
        'creatinine_phosphokinase',
        'ejection_fraction',
        'platelets',
        'serum_creatinine',
        'serum_sodium',
    ],
]

In [None]:
# Pairwise Pearson correlation matrix of the selected columns, drawn as a heatmap.
p_corr_data = data_for_correlation.corr(method='pearson')
sns.heatmap(
    data=p_corr_data,
    cmap='Blues',
);

In [None]:
# Pairwise Spearman correlation matrix of the selected columns, drawn as a heatmap.
s_corr_data = data_for_correlation.corr(method='spearman')
sns.heatmap(
    data=s_corr_data,
    cmap='Blues',
);

In [None]:
# Pairwise Kendall correlation matrix of the selected columns, drawn as a heatmap.
k_corr_data = data_for_correlation.corr(method='kendall')
sns.heatmap(
    data=k_corr_data,
    cmap='Blues',
);