# Introduction

# 0. Importing Libraries and Other Code

In [14]:
# Import libraries

from os import cpu_count
from math import  sqrt , floor

import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline

In [2]:
# Dataset location and column roles

# Folder (relative to this notebook) that holds the CSV files
URL_DATASET = "../Datasets/"

# Predictor columns
Features = [
    'age',
    'gender',
    'tot_bilirubin',
    'direct_bilirubin',
    'alkphos',
    'sgpt',
    'sgot',
    'tot_proteins',
    'albumin',
    'ag_ratio',
]

# Label column
Target = 'is_patient'

In [15]:
# Auxiliary variables

# Seed passed as ``random_state`` to estimators so runs are reproducible
RANDOM_STATE = 8013

# Worker count for parallel steps: floor of the square root of the CPU count.
# os.cpu_count() returns None when the count cannot be determined, so fall
# back to a single CPU instead of failing inside sqrt().
NUM_JOBS = floor(sqrt(cpu_count() or 1))

## 0.1 Defining Utility Functions

In [4]:
def SplitFeaturesUsingDatatype(Dataset:pd.DataFrame,Features:list[str]) -> tuple[list[str], list[str], list[str]]:
    """
    Group feature names by the data type of their column in ``Dataset``.

    Parameters
    ----------
    Dataset : pd.DataFrame
        Frame that contains every column named in ``Features``.
    Features : list[str]
        Column names to classify; their order is preserved in each group.

    Returns
    -------
    tuple[list[str], list[str], list[str]]
        ``(continuous, integer, categorical)``: names of the floating-point
        columns, of the integer columns, and of every remaining column.

    Raises
    ------
    KeyError
        If a name in ``Features`` is not a column of ``Dataset``.
    """
    continuous , integer , categorical = [] , [] , []

    for feature in Features:
        data_type = Dataset[feature].dtype
        # Use the pandas dtype predicates instead of comparing against
        # 'float' / 'int': those literals only match the platform-default
        # width, so float32, int32 or nullable columns would otherwise be
        # misclassified as categorical.
        if pd.api.types.is_float_dtype(data_type):
            continuous.append(feature)
        elif pd.api.types.is_integer_dtype(data_type):
            integer.append(feature)
        else:
            categorical.append(feature)

    return continuous , integer , categorical

# 1. Load Dataset

In [5]:
# Read the cleaned liver-patient CSV from the dataset folder
# NOTE(review): the preview shows an 'Unnamed: 0' column, which suggests the
# CSV carries a saved index that index_col=0 would absorb — confirm.

LiverDataset = pd.read_csv(
    filepath_or_buffer=URL_DATASET + 'IndianLiverPatientDataset_Clean01.csv',
    index_col=None,
)

# Preview the first five rows
LiverDataset.head(n=5)

Unnamed: 0,age,gender,tot_bilirubin,direct_bilirubin,alkphos,sgpt,sgot,tot_proteins,albumin,ag_ratio,is_patient
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,Yes
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,Yes
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,Yes
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,Yes
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,Yes


In [6]:
# Group the feature names by the data type of their columns

(
    ContinuousFeatures,
    IntegerFeatures,
    CategoricalFeatures,
) = SplitFeaturesUsingDatatype(Dataset=LiverDataset, Features=Features)

# 2. Preprocessing Pipeline

Based on the insights from the [Exploratory Data Analysis](../ExploratoryDataAnalysis/ExploratoryDataAnalysis.ipynb), some of the numerical attributes are transformed with the ``log10`` function to reshape their distributions and make their values more informative for the model. Since some of the models required by the [Technical Requirements](../TechnicalRequirements.pdf) are distance-based, standard scaling is applied so that all features are expressed on the same scale.

In [30]:
# Import libraries for preprocessing

from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import FunctionTransformer , OrdinalEncoder , StandardScaler
from sklearn.decomposition import PCA

In [None]:
# Defining preprocessing by feature

def InverseLog10(values):
    """Undo ``np.log10`` by raising 10 to the power of ``values``."""
    # np.power works on every NumPy release; the np.pow alias only exists
    # from NumPy 2.0, so using it breaks the inverse check on older versions.
    # A named function (unlike a lambda) also keeps the transformer picklable.
    return np.power(10.0, values)

PreprocessingFeatures = ColumnTransformer(
    transformers=[
        (
            'NumericalFeaturesTransformation',
            FunctionTransformer(func=np.log10, inverse_func=InverseLog10),
            # log10 is applied to the first two continuous features and to
            # every integer feature except the first one.
            [*ContinuousFeatures[:2], *IntegerFeatures[1:]],
        ),
        # Categorical columns are mapped to ordinal codes
        ('CategoricalFeaturesEncode', OrdinalEncoder(), CategoricalFeatures),
    ],
    # Columns not listed above are forwarded unchanged
    remainder='passthrough',
    n_jobs=NUM_JOBS,
)

In [27]:
# Augment the incoming feature matrix with PCA components

AdditionalFeatures = FeatureUnion(
    transformer_list=[
        # Five principal components computed from the incoming columns
        ('PCA', PCA(n_components=5, random_state=RANDOM_STATE)),
        # The incoming columns themselves, forwarded untouched
        ('WithoutChanges', 'passthrough'),
    ],
    n_jobs=NUM_JOBS,
)

In [32]:
# Chain the preprocessing stages into a single pipeline

PreprocessingPipeline = Pipeline(
    steps=[
        # 1. Per-feature transformations (log10 / ordinal encoding)
        ('PreprocessingFeatures', PreprocessingFeatures),
        # 2. Append PCA components to the transformed features
        ('GeneratingFeatures', AdditionalFeatures),
        # 3. Standardize every resulting column
        # NOTE(review): scaling runs after PCA, so PCA sees unscaled inputs — confirm this is intended.
        ('Scaling', StandardScaler()),
    ],
)

# Display the pipeline
PreprocessingPipeline