In [56]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler

## This notebook is for data cleaning and preprocessing of the asthma disease data

In [57]:
# import nbimporter

# # Import the notebook as a module
#import data_exploration


In [58]:
#%run "data_exploration.ipynb"

In [59]:
asthma_data = pd.read_pickle('asthma_data.pkl')

In [60]:
asthma_data

Unnamed: 0,patient_id,age,gender,ethnicity,education_level,bmi,smoking,physical_activity,diet_quality,sleep_quality,...,lung_function_fev1,lung_function_fvc,wheezing,shortness_of_breath,chest_tightness,coughing,nighttime_symptoms,exercise_induced,diagnosis,doctor_in_charge
0,5034,63,0,1,0,15.848744,0,0.894448,5.488696,8.701003,...,1.369051,4.941206,0,0,1,0,0,1,0,Dr_Confid
1,5035,26,1,2,2,22.757042,0,5.897329,6.341014,5.153966,...,2.197767,1.702393,1,0,0,1,1,1,0,Dr_Confid
2,5036,57,0,2,1,18.395396,0,6.739367,9.196237,6.840647,...,1.698011,5.022553,1,1,1,0,1,1,0,Dr_Confid
3,5037,40,1,2,1,38.515278,0,1.404503,5.826532,4.253036,...,3.032037,2.300159,1,0,1,1,1,0,0,Dr_Confid
4,5038,61,0,0,3,19.283802,0,4.604493,3.127048,9.625799,...,3.470589,3.067944,1,1,1,0,0,1,0,Dr_Confid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2387,7421,43,1,0,2,29.059613,0,3.019854,6.119637,8.300960,...,3.125249,5.166032,0,1,0,0,0,1,1,Dr_Confid
2388,7422,18,1,0,1,20.740850,0,5.805180,4.386992,7.731192,...,1.132977,5.509502,0,0,0,1,1,0,1,Dr_Confid
2389,7423,54,0,3,2,37.079560,0,4.735169,8.214064,7.483521,...,1.685962,3.346877,1,0,1,1,0,1,1,Dr_Confid
2390,7424,46,1,0,2,23.444712,0,9.672637,7.362861,6.717272,...,3.481549,1.713274,0,1,1,0,1,1,0,Dr_Confid


As evident from data exploration file:
- there are no missing values in the dataframe
- there are no outliers
- categorical variables are already encoded

In [61]:
#Remove column, contains repeating string
asthma_data = asthma_data.drop(columns=['doctor_in_charge'])

In [64]:
# Standardize (Z-score normalization) age and continuous features 
#to ensure that the features contribute equally to the model
variables_to_standartize = ['bmi', 'physical_activity', 'diet_quality', 
                   'sleep_quality', 'pollution_exposure', 'pollen_exposure', 
                   'dust_exposure', 'pet_allergy', 'lung_function_fev1', 'lung_function_fvc']

scaler = StandardScaler()
asthma_data_scaled = asthma_data.copy()
asthma_data_scaled[variables_to_standartize] = scaler.fit_transform(asthma_data[variables_to_standartize])

asthma_data_scaled.head()

Unnamed: 0,patient_id,age,gender,ethnicity,education_level,bmi,smoking,physical_activity,diet_quality,sleep_quality,...,gastroesophageal_reflux,lung_function_fev1,lung_function_fvc,wheezing,shortness_of_breath,chest_tightness,coughing,nighttime_symptoms,exercise_induced,diagnosis
0,5034,63,0,1,0,-1.582769,0,-1.432099,0.160113,0.971063,...,0,-1.368934,0.920608,0,0,1,0,0,1,0
1,5035,26,1,2,2,-0.6233,0,0.291269,0.453069,-1.076746,...,0,-0.407132,-1.564256,1,0,0,1,1,1,0
2,5036,57,0,2,1,-1.229074,0,0.58133,1.434458,-0.102976,...,0,-0.987146,0.983019,1,1,1,0,1,1,0
3,5037,40,1,2,1,1.565307,0,-1.256398,0.276233,-1.59688,...,0,0.561114,-1.105641,1,0,1,1,1,0,0
4,5038,61,0,0,3,-1.105686,0,-0.154081,-0.651625,1.504976,...,0,1.070095,-0.516586,1,1,1,0,0,1,0


In [65]:
asthma_data_scaled.to_pickle('asthma_data_scaled.pkl')