# Feature Engineering
- Age Category
- BMI Category - Overweight, Normal Weight, Underweight
- Pollution Risk - Location and Air Pollution Level
- Smoking Status Encoding
- Interaction Features - Interaction between the various categories
- Location Encoding - Categories have to be passed as numbers, changing categories to numerical values

In [37]:
# Import the required libraries
import pandas as pd
import numpy as np

In [38]:
# Load the data
path = r"../Data/synthetic_COPD_data.csv"
df = pd.read_csv(path)
df.head()

Unnamed: 0,Age,Gender,Smoking_Status,Biomass_Fuel_Exposure,Occupational_Exposure,Family_History_COPD,BMI,Location,Air_Pollution_Level,Respiratory_Infections_Childhood,COPD_Diagnosis
0,31,Male,Former,1,1,1,27.56,Lalitpur,84,0,0
1,60,Male,Never,1,0,0,30.3,Pokhara,131,1,0
2,33,Male,Former,0,0,1,28.45,Pokhara,123,1,0
3,36,Female,Current,1,0,0,27.49,Kathmandu,253,0,1
4,58,Male,Never,0,0,0,25.49,Pokhara,117,1,0


In [39]:
# Age Categories
df['Age_Category'] = pd.cut(df['Age'], bins = [29, 39, 49, 59, 69, 79], labels = ['30 - 39', '40 - 49', '50 - 59', '60 - 69', '70 - 79'])
df['BMI_Category'] = pd.cut(df['BMI'], bins = [0, 18.5, 24.5, 29.9, 35], labels = ['Under-weight', 'Normal-weight', 'Over-weight', 'Obesity'])
df.head()

Unnamed: 0,Age,Gender,Smoking_Status,Biomass_Fuel_Exposure,Occupational_Exposure,Family_History_COPD,BMI,Location,Air_Pollution_Level,Respiratory_Infections_Childhood,COPD_Diagnosis,Age_Category,BMI_Category
0,31,Male,Former,1,1,1,27.56,Lalitpur,84,0,0,30 - 39,Over-weight
1,60,Male,Never,1,0,0,30.3,Pokhara,131,1,0,60 - 69,Obesity
2,33,Male,Former,0,0,1,28.45,Pokhara,123,1,0,30 - 39,Over-weight
3,36,Female,Current,1,0,0,27.49,Kathmandu,253,0,1,30 - 39,Over-weight
4,58,Male,Never,0,0,0,25.49,Pokhara,117,1,0,50 - 59,Over-weight


In [40]:
# Pollution Risk Score
df['Pollution_Risk_Score'] = np.where(df['Air_Pollution_Level'] > 150, 1, 0) #if greater than 150 then 1 else 0

#Encoding the smoking status
df['Smoking_Status_Encoded'] = df['Smoking_Status'].map({'Current': 1, 'Former': 0.5, 'Never': 0})

#Encoding the Gender
df['Gender_Encoded'] = df['Gender'].map({'Male': 1, 'Female': 0})

#Interaction features: Smoking Status and Pollution Level
df['Smoking_Pollution_Interaction'] = df['Smoking_Status_Encoded'] * df['Air_Pollution_Level']

df.head()

Unnamed: 0,Age,Gender,Smoking_Status,Biomass_Fuel_Exposure,Occupational_Exposure,Family_History_COPD,BMI,Location,Air_Pollution_Level,Respiratory_Infections_Childhood,COPD_Diagnosis,Age_Category,BMI_Category,Pollution_Risk_Score,Smoking_Status_Encoded,Gender_Encoded,Smoking_Pollution_Interaction
0,31,Male,Former,1,1,1,27.56,Lalitpur,84,0,0,30 - 39,Over-weight,0,0.5,1,42.0
1,60,Male,Never,1,0,0,30.3,Pokhara,131,1,0,60 - 69,Obesity,0,0.0,1,0.0
2,33,Male,Former,0,0,1,28.45,Pokhara,123,1,0,30 - 39,Over-weight,0,0.5,1,61.5
3,36,Female,Current,1,0,0,27.49,Kathmandu,253,0,1,30 - 39,Over-weight,1,1.0,0,253.0
4,58,Male,Never,0,0,0,25.49,Pokhara,117,1,0,50 - 59,Over-weight,0,0.0,1,0.0


In [41]:
# One Hot Encoding for the location
df = pd.get_dummies(df, columns=['Location'], drop_first=True) #Categories the location field with it's values by dropping the original columns
df.head()

Unnamed: 0,Age,Gender,Smoking_Status,Biomass_Fuel_Exposure,Occupational_Exposure,Family_History_COPD,BMI,Air_Pollution_Level,Respiratory_Infections_Childhood,COPD_Diagnosis,...,Smoking_Pollution_Interaction,Location_Biratnagar,Location_Butwal,Location_Chitwan,Location_Dharan,Location_Hetauda,Location_Kathmandu,Location_Lalitpur,Location_Nepalgunj,Location_Pokhara
0,31,Male,Former,1,1,1,27.56,84,0,0,...,42.0,False,False,False,False,False,False,True,False,False
1,60,Male,Never,1,0,0,30.3,131,1,0,...,0.0,False,False,False,False,False,False,False,False,True
2,33,Male,Former,0,0,1,28.45,123,1,0,...,61.5,False,False,False,False,False,False,False,False,True
3,36,Female,Current,1,0,0,27.49,253,0,1,...,253.0,False,False,False,False,False,True,False,False,False
4,58,Male,Never,0,0,0,25.49,117,1,0,...,0.0,False,False,False,False,False,False,False,False,True


# Machine Learning Data

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 25 columns):
 #   Column                            Non-Null Count  Dtype   
---  ------                            --------------  -----   
 0   Age                               1000 non-null   int64   
 1   Gender                            1000 non-null   object  
 2   Smoking_Status                    1000 non-null   object  
 3   Biomass_Fuel_Exposure             1000 non-null   int64   
 4   Occupational_Exposure             1000 non-null   int64   
 5   Family_History_COPD               1000 non-null   int64   
 6   BMI                               1000 non-null   float64 
 7   Air_Pollution_Level               1000 non-null   int64   
 8   Respiratory_Infections_Childhood  1000 non-null   int64   
 9   COPD_Diagnosis                    1000 non-null   int64   
 10  Age_Category                      980 non-null    category
 11  BMI_Category                      1000 non-null   categor

In [43]:
#Data should be either boolean or integer,float so we perform the following operations
#Dropping "Smoking_Status"
df = df.drop(columns=['Smoking_Status', 'Age_Category', 'BMI_Category', 'Gender'])

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Age                               1000 non-null   int64  
 1   Biomass_Fuel_Exposure             1000 non-null   int64  
 2   Occupational_Exposure             1000 non-null   int64  
 3   Family_History_COPD               1000 non-null   int64  
 4   BMI                               1000 non-null   float64
 5   Air_Pollution_Level               1000 non-null   int64  
 6   Respiratory_Infections_Childhood  1000 non-null   int64  
 7   COPD_Diagnosis                    1000 non-null   int64  
 8   Pollution_Risk_Score              1000 non-null   int32  
 9   Smoking_Status_Encoded            1000 non-null   float64
 10  Gender_Encoded                    1000 non-null   int64  
 11  Smoking_Pollution_Interaction     1000 non-null   float64
 12  Locatio

In [47]:
# Save new data after feature engineering
df.to_csv('engineered_COPD_data.csv', index=False)