# Exploratory Data Analysis

In [None]:
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

## Read dataset

Next, we load the `tubes2_HeartDisease_train` and `tubes2_HeartDisease_test` datasets from the `../data` directory.

In [None]:
# Single container for everything related to the heart-disease dataset:
#   'columns_detail' -- human-readable descriptions, one per dataset column
#                       (Column1 .. Column14, in order)
#   'train' / 'test' -- the two CSV splits loaded as DataFrames
heart_disease = {
    'columns_detail': [
        'Age',
        'Sex',
        'Pain type',
        'Blood pressure',
        'Serum cholesterol',
        'Fasting blood sugar > 120mg/dl',
        'Resting ECG',
        'Max heart rate achieved',
        'exercise induced agina',
        'ST depression induced by exercise relative to rest',
        'Peak exercise ST segment',
        'Number of major vessels colored by flourosopy',
        'Thal',
        'Diagnosis',
    ],
}

# The splits are loaded after the dict exists, so the descriptions stay
# available even if one of the files cannot be read.
heart_disease['train'] = pd.read_csv('../data/tubes2_HeartDisease_train.csv')
heart_disease['test'] = pd.read_csv('../data/tubes2_HeartDisease_test.csv')

In [None]:
# Numeric code -> label for each categorical column handled by fix_data.
# The label strings are kept exactly as the rest of the notebook expects them.
_CATEGORY_LABELS = {
    'Column3': {
        1: 'typical_agina',
        2: 'atypical_agina',
        3: 'non_aginal_pain',
        4: 'asymtotic',
    },
    'Column7': {
        0: 'normal',
        1: 'having ST-T wave abnormality',
        2: 'left ventricular hyperthrophy',
    },
    'Column11': {
        1: 'upsloping',
        2: 'flat',
        3: 'downsloping',
    },
    'Column13': {
        3: 'normal',
        6: 'fixed_defect',
        7: 'reversable_defect',
    },
}

# Columns converted to numbers; anything unparseable (e.g. '?') becomes NaN.
_NUMERIC_COLUMNS = [
    'Column4', 'Column5', 'Column6', 'Column8',
    'Column9', 'Column10', 'Column12',
]


def _relabel_codes(series, labels):
    """Return an object-dtype copy of ``series`` with known codes replaced by labels.

    Codes are matched numerically, so ``1``, ``1.0`` and ``'1'`` are all
    treated as code 1 regardless of the dtype pandas inferred when reading the
    CSV. Values that are not a known code (``'?'``, NaN, labels written by a
    previous call) are left untouched.
    """
    codes = pd.to_numeric(series, errors='coerce')
    # Cast to object first: writing strings into an integer column would
    # otherwise rely on implicit upcasting, which pandas has deprecated.
    relabelled = series.astype(object)
    for code, label in labels.items():
        relabelled.loc[codes == code] = label
    return relabelled


def fix_data(data):
    """Convert dataframe to appropriate types.

    Categorical codes in Column3, Column7, Column11 and Column13 are replaced
    by descriptive labels, and Column4, Column5, Column6, Column8, Column9,
    Column10 and Column12 are converted to numeric values (unparseable entries
    become NaN).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns listed above. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, for convenient re-assignment. Calling the
        function again on an already converted frame changes nothing.
    """
    for column, labels in _CATEGORY_LABELS.items():
        data[column] = _relabel_codes(data[column], labels)

    for column in _NUMERIC_COLUMNS:
        data[column] = pd.to_numeric(data[column], errors='coerce')

    return data

In [None]:
# fix_data relabels/converts the frame it is given and returns that frame;
# store the result back under 'train' and display it.
heart_disease['train'] = fix_data(heart_disease['train'])
heart_disease['train']

Check for missing values (NaN entries, plus the `?` placeholder in the columns that are still text)

In [None]:
# Missing-value summary for the training frame: one column per dataset column,
# row 'Num' = number of missing entries, row '%' = share of all rows.
train_frame = heart_disease['train']

# Columns that remain text after fix_data; there a missing entry is still the
# literal '?' rather than NaN, so it has to be counted separately.
placeholder_cols = train_frame.columns.intersection(['Column7', 'Column11', 'Column13'])
placeholder_counts = (
    (train_frame[placeholder_cols].astype(str) == '?')
    .sum()
    # Re-expand to every column (0 where no placeholder is expected) so the
    # addition below keeps the original column order.
    .reindex(train_frame.columns, fill_value=0)
)

missing_counts = train_frame.isnull().sum() + placeholder_counts
missing_percent = missing_counts / train_frame.shape[0] * 100

# Built from labelled Series instead of positional lists, so it works for any
# number of columns; percentages keep their fractional part.
null_df = pd.DataFrame({'Num': missing_counts, '%': missing_percent}).T
null_df

### General Data Descriptions

Age

In [None]:
heart_disease['train']['Column1'].describe()

In [None]:
heart_disease['train']['Column1'].plot.box()

In [None]:
heart_disease['train']['Column1'].plot.hist()

Sex

In [None]:
heart_disease['train']['Column2'].describe()

In [None]:
heart_disease['train']['Column2'].plot.box()

In [None]:
heart_disease['train']['Column2'].value_counts().plot.bar()

Chest pain type

In [None]:
heart_disease['train']['Column3'].describe()

In [None]:
heart_disease['train']['Column3'].value_counts().plot.bar()

Resting blood pressure

In [None]:
heart_disease['train']['Column4'].describe()

In [None]:
heart_disease['train']['Column4'].plot.box();

In [None]:
heart_disease['train']['Column4'].plot.hist()

Serum cholesterol

In [None]:
heart_disease['train']['Column5'].describe()

In [None]:
heart_disease['train']['Column5'].median()

In [None]:
heart_disease['train']['Column5'].plot.box()

In [None]:
heart_disease['train']['Column5'].plot.hist()

Fasting blood sugar > 120mg/dl

In [None]:
heart_disease['train']['Column6'].describe()

In [None]:
heart_disease['train']['Column6'].median()

In [None]:
heart_disease['train']['Column6'].plot.box()

In [None]:
heart_disease['train']['Column6'].value_counts().plot.bar()

Resting ECG  

In [None]:
heart_disease['train']['Column7'].describe()

In [None]:
heart_disease['train']['Column7'].value_counts().plot.bar()

Max heart rate achieved

In [None]:
heart_disease['train']['Column8'].describe()

In [None]:
heart_disease['train']['Column8'].median()

In [None]:
heart_disease['train']['Column8'].plot.box()

In [None]:
heart_disease['train']['Column8'].plot.hist()

Exercise-induced angina

In [None]:
heart_disease['train']['Column9'].describe()

In [None]:
heart_disease['train']['Column9'].median()

In [None]:
heart_disease['train']['Column9'].mode()

In [None]:
heart_disease['train']['Column9'].plot.box()

In [None]:
heart_disease['train']['Column9'].value_counts().plot.bar()

ST depression induced by exercise relative to rest

In [None]:
heart_disease['train']['Column10'].describe()

In [None]:
heart_disease['train']['Column10'].median()

In [None]:
heart_disease['train']['Column10'].plot.box()

In [None]:
heart_disease['train']['Column10'].plot.hist()

Peak exercise ST segment

In [None]:
heart_disease['train']['Column11'].describe()

In [None]:
heart_disease['train']['Column11'].value_counts().plot.bar()

In [None]:
vc11 = heart_disease['train']['Column11'].value_counts()
vc11

In [None]:
# vc11 is indexed by category label, so entries are picked by position with
# .iloc; plain integer keys on a label index rely on a deprecated fallback.
# NOTE(review): positions 0, 2 and 3 skip position 1 -- presumably the '?'
# placeholder in the frequency-sorted counts; confirm, and consider selecting
# by label instead so the result does not depend on the ordering.
s = vc11.iloc[0] + vc11.iloc[2] + vc11.iloc[3]
prop_0 = vc11.iloc[0] / s
print(prop_0)
prop_2 = vc11.iloc[2] / s
print(prop_2)
prop_3 = vc11.iloc[3] / s
print(prop_3)


Number of major vessels colored by fluoroscopy

In [None]:
heart_disease['train']['Column12'].describe()

In [None]:
heart_disease['train']['Column12'].plot.box()

In [None]:
vc12 = heart_disease['train']['Column12'].value_counts()
vc12

In [None]:
# Share of each value 0..3 of Column12 among the rows holding one of those
# four values (s is their combined count).
s = sum(vc12[k] for k in (0, 1, 2, 3))
prop_0, prop_1, prop_2, prop_3 = (vc12[k] / s for k in (0, 1, 2, 3))
print(prop_0, prop_1, prop_2, prop_3, sep='\n')



Thal

In [None]:
heart_disease['train']['Column13'].describe()

In [None]:
# Frequency of each Column13 (Thal) value. Uses the Series method because the
# top-level pd.value_counts helper is deprecated.
heart_disease['train']['Column13'].value_counts()

In [None]:
vc13 = heart_disease['train']['Column13'].value_counts()

In [None]:
# Share of each labelled Thal category among the rows that carry one of the
# three labels (s is their combined count).
s = sum(vc13[label] for label in ('normal', 'reversable_defect', 'fixed_defect'))
prop_normal = vc13['normal'] / s
prop_rd = vc13['reversable_defect'] / s
prop_fd = vc13['fixed_defect'] / s
print(prop_normal, prop_rd, prop_fd, sep='\n')


Diagnosis

In [None]:
heart_disease['train']['Column14'].describe()

In [None]:
heart_disease['train']['Column14'].value_counts().plot.bar()