# Feature Engineering (Categorical Encoding)

## Importing Dependencies

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer, OrdinalEncoder
import numpy as np
from sklearn.compose import ColumnTransformer


## Defining Feature Engineering Functions

In [3]:
def breakdown_dataset(df):
  """
  Breaks down the dataset and outputs each dimension in its own singular table and the first ten items for that dimension.

  Args:
    df: A Pandas DataFrame.

  Returns:
    None.
  """

  # Get the column names of the DataFrame.
  column_names = df.columns.tolist()

  # Iterate over the column names and print the first ten items for each column.
  for column_name in column_names:
    print(f"Column name: {column_name}")
    print(df[column_name].head(10))
    print("\n")

def save_csv(df, filename):
  """
  Saves the DataFrame to a CSV file.

  Args:
    df: A Pandas DataFrame.
    filename: The name of the CSV file to save.

  Returns:
    None.
  """

  df.to_csv(filename)

## Importing the Raw Dataset

In [42]:
# Read the CSV dataset into a Pandas DataFrame
df = pd.read_csv(".././data/raw/dataset.csv")

## Categorical Encoding of Target Variable

In [43]:
df['Heart_Disease'] = df['Heart_Disease'].map({'No':0,'Yes':1})
print('')
print(df['Heart_Disease'].value_counts())


Heart_Disease
0    283883
1     24971
Name: count, dtype: int64


## Splitting the Dataset 

In [6]:
# Split the dataset into train and test sets while maintaining the class imbalance
train, test = train_test_split(df, test_size=0.2, random_state=22, stratify=df['Heart_Disease'])

# Print the shapes of the train and test sets
print(f"Shape of the training set: {train.shape}")
print(f"Shape of the test set: {test.shape}")

Shape of the training set: (247083, 19)
Shape of the test set: (61771, 19)


## Ratio of Target Variable

In [7]:
# Calculate the ratio of people with heart disease to total for both train and test sets
yes_train = train['Heart_Disease'].value_counts()[0]/len(train['Heart_Disease'])*100
no_train = train['Heart_Disease'].value_counts()[1]/len(train['Heart_Disease'])*100
yes_test = test['Heart_Disease'].value_counts()[0]/len(test['Heart_Disease'])*100
no_test = test['Heart_Disease'].value_counts()[1]/len(test['Heart_Disease'])*100

# Print the ratios for both train and test sets
print('Train Set:')
print(f'Ratio of people with heart disease to total is {yes_train:.2f}%')
print(f'Ratio of people that don\'t have heart disease to total is {no_train:.2f}%')
print('\n')
print('Test Set:')
print(f'Ratio of people with heart disease to total is {yes_test:.2f}%')
print(f'Ratio of people that don\'t have heart disease to total is {no_test:.2f}%')

Train Set:
Ratio of people with heart disease to total is 91.91%
Ratio of people that don't have heart disease to total is 8.09%


Test Set:
Ratio of people with heart disease to total is 91.92%
Ratio of people that don't have heart disease to total is 8.08%


## Splitting X and y variable in the Train and Test Sets

In [45]:
## Splitting the X and y variables in the train set
X_train = train.drop("Heart_Disease", axis=1)
X_train = X_train.drop("BMI", axis=1)
y_train = train["Heart_Disease"].copy()
print(train["Heart_Disease"])

## Splitting the X and y variables in the test set
X_test = test.drop("Heart_Disease", axis=1)
X_test = X_test.drop("BMI", axis=1)
y_test = test["Heart_Disease"].copy()
print(y_test[0:5])

252191    0
93646     0
182562    1
288342    0
207357    0
         ..
301519    0
266566    0
192321    0
304943    0
286010    0
Name: Heart_Disease, Length: 247083, dtype: int64
156839    0
32739     0
164728    0
252925    0
140028    0
Name: Heart_Disease, dtype: int64
   Column Name
0            1
1            2
2            3
3            4
4            5


## Creating Numerical and Categorical Columns

In [9]:
## Creating numerical and categorical columns
numerical = X_train.select_dtypes(include=['float64']).columns.sort_values()
categorical = df.select_dtypes(include=['object']).columns.sort_values()

## Printing the length of numerical and categorical. The total length should have
## the same length as our dataframe
print(f'There are {len(categorical)} Categorical variables')
print(f'There are {len(numerical)} Numerical variables')

There are 11 Categorical variables
There are 6 Numerical variables


## Printing the Unique Values in Each Dimension

In [10]:
X_train.nunique()

General_Health                    5
Checkup                           5
Exercise                          2
Skin_Cancer                       2
Other_Cancer                      2
Depression                        2
Diabetes                          4
Arthritis                         2
Sex                               2
Age_Category                     13
Height_(cm)                      98
Weight_(kg)                     502
Smoking_History                   2
Alcohol_Consumption              31
Fruit_Consumption                73
Green_Vegetables_Consumption     73
FriedPotato_Consumption          67
dtype: int64

## Creating Pipelines

### Categorical Pipeline

For categorical pipeline, only OneHotEncoder will be implemented. Since this dataset has been cleaned and there are no missing values.

In [11]:
cat_pipeline = make_pipeline(OneHotEncoder(handle_unknown='ignore',drop='first'))

### Numerical Pipeline

For numerical pipeline, two methods are used:
1. Log Transform: From the EDA, most of the numerical functions are skewed right.
2. Standard Scaler: The numerical variable will be scaled to put them all on the same scale.

In [12]:
num_pipeline = make_pipeline(
                             FunctionTransformer(np.log1p,feature_names_out='one-to-one'),
                             StandardScaler()
                            )

### Ordinal Pipeline

For the ordinal variable, the variables are transformed based on their order. The values with in the lowest order will start with 0 and increases by 1.

In [13]:
## Age Category Pipeline
agecat_pipeline = make_pipeline(
        OrdinalEncoder()
)

## General Health Pipeline
genhealth_pipeline = make_pipeline(
        OrdinalEncoder(categories=[['Poor','Fair','Good','Very Good','Excellent']])
)

## Checkup Pipeline
checkup_pipeline = make_pipeline(
        OrdinalEncoder(categories=[['Within the past year','Within the past 2 years','Within the past 5 years','5 or more years ago','Never']])
)

## Creating the Pipeline Lists

In [14]:
## Setting each column to the pipeline where they will be used
num_pipe_col = numerical

cat_pipe_col = ['Arthritis', 'Depression', 'Diabetes',
       'Exercise', 'Other_Cancer', 'Sex',
       'Skin_Cancer', 'Smoking_History']

## Finalising the Feature Engineering Pipeline

In [15]:
## Combining all the pipelines and creating a main pipeline to enter all the data
preprocessing = ColumnTransformer([
    ('Categorical', cat_pipeline,   cat_pipe_col),
    ('Age_Category',agecat_pipeline,['Age_Category']),
    ('Checkup',checkup_pipeline,['Checkup']),
    ('Gen_health',genhealth_pipeline,['General_Health']),
    ('Numerical',   num_pipeline,  num_pipe_col),
],remainder='passthrough')
preprocessing

## Using the Feature Engineering Pipeline

In [46]:
## Using preprocessing pipeline
print('Shape before the preprocessing:')
print(X_train.shape)

X_train_feature_engineered = preprocessing.fit_transform(X_train)

print('Shape after the preprocessing:')
print(X_train_feature_engineered.shape)

Shape before the preprocessing:
(247083, 17)
Shape after the preprocessing:
(247083, 19)


## Viewing the Feature Engineered Data

In [47]:
X_train_feature_engineered_df = pd.DataFrame(X_train_feature_engineered)
breakdown_dataset(X_train_feature_engineered_df)

Column name: 0
0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
5    1.0
6    0.0
7    0.0
8    0.0
9    0.0
Name: 0, dtype: float64


Column name: 1
0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
5    0.0
6    0.0
7    0.0
8    0.0
9    0.0
Name: 1, dtype: float64


Column name: 2
0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
5    0.0
6    0.0
7    0.0
8    0.0
9    0.0
Name: 2, dtype: float64


Column name: 3
0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
5    0.0
6    0.0
7    0.0
8    0.0
9    1.0
Name: 3, dtype: float64


Column name: 4
0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
5    0.0
6    0.0
7    0.0
8    0.0
9    0.0
Name: 4, dtype: float64


Column name: 5
0    0.0
1    1.0
2    1.0
3    1.0
4    1.0
5    1.0
6    1.0
7    1.0
8    1.0
9    1.0
Name: 5, dtype: float64


Column name: 6
0    0.0
1    1.0
2    0.0
3    0.0
4    0.0
5    0.0
6    0.0
7    0.0
8    0.0
9    0.0
Name: 6, dtype: float64


Column name: 7
0    0.0
1    1.0
2    1.0
3    1.0
4    0.0
5    0.0
6    1.0
7    

## Saving the Feature Engineered Dataset to CSV File

In [48]:
save_csv(X_train_feature_engineered_df, "../data/feature_engineering/X_train_feature_engineered.csv")
save_csv(y_train, "../data/feature_engineering/y_train.csv")