# Exploratory Data Analysis

In [None]:
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

## Read dataset

Now we're going to read the `tubes2_HeartDisease_train` and `tubes2_HeartDisease_test` datasets.

In [None]:
# Single container for everything loaded in this notebook:
#   'columns_detail' - human-readable descriptions (presumably one per
#                      ColumnN of the CSV files, in order - verify against data)
#   'train' / 'test' - the raw frames read from the CSV files
heart_disease = {
    'columns_detail': [
        'Age',
        'Sex',
        'Pain type',
        'Blood pressure',
        'Serum cholesterol',
        'Fasting blood sugar > 120mg/dl',
        'Resting ECG',
        'Max heart rate achieved',
        'exercise induced agina',
        'ST depression induced by exercise relative to rest',
        'Peak exercise ST segment',
        'Number of major vessels colored by flourosopy',
        'Thal',
        'Diagnosis',
    ],
    'train': pd.read_csv('../data/tubes2_HeartDisease_train.csv'),
    'test': pd.read_csv('../data/tubes2_HeartDisease_test.csv'),
}

In [None]:
def fix_data(data):
    """Convert dataframe to appropriate types.

    Replaces the category codes stored in ``Column3``, ``Column7``,
    ``Column11`` and ``Column13`` with descriptive labels, and converts
    ``Column4``, ``Column5``, ``Column6``, ``Column8``, ``Column9``,
    ``Column10`` and ``Column12`` to numeric dtype (entries that cannot be
    parsed become NaN).

    Codes are recognised whether they are stored as numbers (``1``, ``1.0``)
    or as numeric strings (``'1'``), so the outcome does not depend on the
    dtype that was inferred when the frame was loaded. Any value that is not
    a known code (``'?'``, NaN, an already-decoded label, ...) is left
    untouched, which also makes repeated calls harmless.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the ``ColumnN`` columns listed above.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, modified in place.
    """
    category_labels = {
        'Column3': {
            1: 'typical_agina',
            2: 'atypical_agina',
            3: 'non_aginal_pain',
            4: 'asymtotic',
        },
        'Column7': {
            0: 'normal',
            1: 'having ST-T wave abnormality',
            2: 'left ventricular hyperthrophy',
        },
        'Column11': {
            1: 'upsloping',
            2: 'flat',
            3: 'downsloping',
        },
        'Column13': {
            3: 'normal',
            6: 'fixed_defect',
            7: 'reversable_defect',
        },
    }
    numeric_columns = [
        'Column4', 'Column5', 'Column6', 'Column8',
        'Column9', 'Column10', 'Column12',
    ]

    def decode(value, labels):
        """Return the label for ``value`` if it is a known code, else ``value``."""
        try:
            # float() accepts 1, 1.0 and '1' alike; 1.0 hashes/compares equal
            # to the integer key 1, so a single lookup covers every spelling.
            code = float(value)
        except (TypeError, ValueError):
            # Not number-like ('?', an existing label, pd.NA, ...): keep as is.
            return value
        return labels.get(code, value)

    for column, labels in category_labels.items():
        # Replace the whole column instead of writing strings into a numeric
        # column cell by cell, which is an incompatible-dtype assignment.
        data[column] = data[column].map(
            lambda value, labels=labels: decode(value, labels)
        )

    for column in numeric_columns:
        data[column] = pd.to_numeric(data[column], errors='coerce')

    return data

In [None]:
# Decode the training frame and store the result back under the same key,
# then display it.
heart_disease.update(train=fix_data(heart_disease['train']))
heart_disease['train']

Check for NULL values

In [None]:
# Tally missing values per column of the training frame: real NaNs plus the
# literal '?' placeholder, which is only looked for in the columns below.
placeholder_columns = ['Column7', 'Column11', 'Column13']
train = heart_disease['train']

null_count = train.isnull().sum()
for col in train.columns.intersection(placeholder_columns):
    # Label-based update; positional access on a label-indexed Series is
    # deprecated in recent pandas.
    null_count[col] += (train[col].astype(str) == '?').sum()

# Share of affected rows, as a percentage of the number of training rows.
null_percent = null_count / train.shape[0] * 100

# One column per data column; rows are the absolute count and the percentage.
null_df = pd.DataFrame([null_count, null_percent], index=['Num', '%'])
null_df

### General Data Descriptions

Age

In [None]:
# Summary statistics of Column1 in the training frame.
heart_disease['train'].loc[:, 'Column1'].describe()

Sex

In [None]:
# Summary statistics of Column2 in the training frame.
heart_disease['train'].loc[:, 'Column2'].describe()

Chest pain type

In [None]:
# Summary statistics of Column3 in the training frame.
heart_disease['train'].loc[:, 'Column3'].describe()

Resting blood pressure

In [None]:
# Summary statistics of Column4 in the training frame.
heart_disease['train'].loc[:, 'Column4'].describe()

Serum cholesterol

In [None]:
# Summary statistics of Column5 in the training frame.
heart_disease['train'].loc[:, 'Column5'].describe()

Fasting blood sugar > 120mg/dl

In [None]:
# Summary statistics of Column6 in the training frame.
heart_disease['train'].loc[:, 'Column6'].describe()

In [None]:
# Most frequent value(s) of Column6 in the training frame.
heart_disease['train'].loc[:, 'Column6'].mode()

Resting ECG  

In [None]:
# Summary statistics of Column7 in the training frame.
heart_disease['train'].loc[:, 'Column7'].describe()

Max heart rate achieved

In [None]:
# Summary statistics of Column8 in the training frame.
heart_disease['train'].loc[:, 'Column8'].describe()

Exercise-induced angina

In [None]:
# Summary statistics of Column9 in the training frame.
heart_disease['train'].loc[:, 'Column9'].describe()

In [None]:
# Most frequent value(s) of Column9 in the training frame.
heart_disease['train'].loc[:, 'Column9'].mode()

ST depression induced by exercise relative to rest

In [None]:
# Summary statistics of Column10 in the training frame.
heart_disease['train'].loc[:, 'Column10'].describe()

Peak exercise ST segment

In [None]:
# Summary statistics of Column11 in the training frame.
heart_disease['train'].loc[:, 'Column11'].describe()

Number of major vessels colored by fluoroscopy

In [None]:
# Summary statistics of Column12 in the training frame.
heart_disease['train'].loc[:, 'Column12'].describe()

Thal

In [None]:
# Summary statistics of Column13 in the training frame.
heart_disease['train'].loc[:, 'Column13'].describe()

In [None]:
# Frequency of each distinct value of Column13 in the training frame.
# NOTE(review): the original line was syntactically broken and named only
# 'Column'; Column13 is assumed from the surrounding "Thal" section - confirm.
heart_disease['train']['Column13'].value_counts()

Diagnosis

In [None]:
# Summary statistics of Column14 in the training frame.
heart_disease['train'].loc[:, 'Column14'].describe()