# Introduction

# 0. Importing Libraries and Other Code

In [14]:
# Import libraries

from os import cpu_count
from math import  sqrt , floor

import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline

In [2]:
# Dataset location and the columns used for modelling

URL_DATASET = "../Datasets/"

# Predictor columns
Features = [
    'age',
    'gender',
    'tot_bilirubin',
    'direct_bilirubin',
    'alkphos',
    'sgpt',
    'sgot',
    'tot_proteins',
    'albumin',
    'ag_ratio',
]

# Column holding the label to predict
Target = 'is_patient'

In [15]:
# Auxiliary variables

# Seed passed as ``random_state`` to every randomized step
RANDOM_STATE = 8013

# Number of parallel workers: floor(sqrt(number of CPUs)).
# os.cpu_count() returns None when the CPU count cannot be determined, so
# fall back to a single CPU instead of failing with a TypeError inside sqrt.
NUM_JOBS = floor(sqrt(cpu_count() or 1))

## 0.1 Defining Utility Functions

In [4]:
def SplitFeaturesUsingDatatype(Dataset:pd.DataFrame,Features:list[str]) -> tuple[list[str],list[str],list[str]]:
    """
    Group feature names by the kind of data stored in their column.

    Parameters
    ----------
    Dataset : pd.DataFrame
        Table that contains every column named in ``Features``.
    Features : list[str]
        Names of the columns to classify.

    Returns
    -------
    tuple[list[str], list[str], list[str]]
        ``(continuous, integer, categorical)``: names of the floating-point
        columns, names of the integer columns, and names of all remaining
        columns. Each list keeps the order of ``Features``.

    Raises
    ------
    KeyError
        If a name in ``Features`` is not a column of ``Dataset``.
    """
    continuous , integer , categorical = [] , [] , []

    for feature in Features:
        data_type = Dataset[feature].dtype

        # Use the pandas dtype predicates instead of comparing against the
        # strings 'float' / 'int': those comparisons only match the default
        # widths, so float32, int16, unsigned or nullable integer columns
        # would wrongly end up in the categorical group.
        if pd.api.types.is_float_dtype(data_type):
            continuous.append(feature)
        elif pd.api.types.is_integer_dtype(data_type):
            integer.append(feature)
        else:
            categorical.append(feature)

    return continuous , integer , categorical

# 1. Load Dataset

In [5]:
# Read the liver-patient CSV from the datasets folder and preview its first rows

LiverDataset = pd.read_csv(
    filepath_or_buffer=URL_DATASET + 'IndianLiverPatientDataset_Clean01.csv',
    index_col=None,
)

LiverDataset.head(n=5)

Unnamed: 0,age,gender,tot_bilirubin,direct_bilirubin,alkphos,sgpt,sgot,tot_proteins,albumin,ag_ratio,is_patient
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,Yes
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,Yes
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,Yes
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,Yes
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,Yes


In [6]:
# Group the feature names into continuous, integer and categorical lists
# according to the data type of their columns

(
    ContinuousFeatures,
    IntegerFeatures,
    CategoricalFeatures,
) = SplitFeaturesUsingDatatype(Dataset=LiverDataset, Features=Features)

# 2. Preprocessing Pipeline

Based on the insights from the [Exploratory Data Analysis](../ExploratoryDataAnalysis/ExploratoryDataAnalysis.ipynb), some of the numerical attributes are transformed with the ``log10`` function to reshape their distributions and make the information they carry more relevant to the model. Since some of the models required by the [Technical Requirements](../TechnicalRequirements.pdf) are distance-based, standard scaling is also applied so that all values are on the same scale.

In [30]:
# Import libraries for preprocessing

from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import FunctionTransformer , OrdinalEncoder , StandardScaler
from sklearn.decomposition import PCA

In [None]:
# Defining preprocessing by feature
#
# - log10 is applied to the first two continuous features and to every
#   integer feature except the first one (selected by position in the lists).
# - Categorical features are ordinal-encoded.
# - All remaining columns pass through unchanged.
#
# The inverse of log10 uses np.power: the np.pow alias only exists in
# NumPy 2.0 and later, and FunctionTransformer calls the inverse function
# while fitting (check_inverse defaults to True), so np.pow would fail on
# older NumPy versions.

PreprocessingFeatures = ColumnTransformer(
    [
        ('NumericalFeaturesTransformation',FunctionTransformer(np.log10,lambda value : np.power(10,value)),[*ContinuousFeatures[:2],*IntegerFeatures[1:]]),
        ('CategoricalFeaturesEncode',OrdinalEncoder(),CategoricalFeatures),
    ], remainder='passthrough', n_jobs=NUM_JOBS
)

In [27]:
# Augment the feature matrix: 5 principal components are concatenated
# with the unchanged input columns

AdditionalFeatures = FeatureUnion(
    transformer_list=[
        ('PCA', PCA(n_components=5, random_state=RANDOM_STATE)),
        ('WithoutChanges', 'passthrough'),
    ],
    n_jobs=NUM_JOBS,
)

In [32]:
# Full preprocessing chain, applied in order: per-feature transformations,
# generation of the additional PCA features, then standard scaling

PreprocessingPipeline = Pipeline(
    steps=[
        ('PreprocessingFeatures', PreprocessingFeatures),
        ('GeneratingFeatures', AdditionalFeatures),
        ('Scaling', StandardScaler()),
    ],
)

PreprocessingPipeline

# 3. Models Definition

Following the [Technical Requirements](../TechnicalRequirements.pdf), three models are defined: Logistic Regression, K Nearest Neighbors and Decision Tree. The best model of each type is selected by varying the regularization factor ``C``, ``n_neighbors`` and ``max_depth``, respectively. The other parameters are left at their default values because the interest lies in determining how the models' predictions change when their most relevant hyperparameters are varied.

In [None]:
# Import libraries

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [45]:
# Names of the models to fit

Models = [
    'LogisticRegression',
    'KNN',
    'DecisionTree',
]

In [76]:
# Logistic Regression model: preprocessing pipeline followed by the classifier

# Hyperparameter to explore and its candidate values: a grid of 21 evenly
# spaced points on [0, 2] with the leading 0 dropped
Parameter_LogisticRegression = 'Classifier__C'
ParameterRange_LogisticRegression = np.linspace(0, 2, 21)[1:]

Model_LogisticRegression = Pipeline(
    steps=[
        ('Preprocessing', PreprocessingPipeline),
        (
            'Classifier',
            LogisticRegression(random_state=RANDOM_STATE, n_jobs=NUM_JOBS),
        ),
    ],
)

Model_LogisticRegression

In [47]:
# K Nearest Neighbors model: preprocessing pipeline followed by the classifier

# Hyperparameter to explore and its candidate values (1 through 24)
Parameter_KNN = 'Classifier__n_neighbors'
ParameterRange_KNN = np.arange(1, 25)

Model_KNN = Pipeline(
    steps=[
        ('Preprocessing', PreprocessingPipeline),
        ('Classifier', KNeighborsClassifier(n_jobs=NUM_JOBS)),
    ],
)

Model_KNN

In [48]:
# Decision Tree model: preprocessing pipeline followed by the classifier

# Hyperparameter to explore and its candidate values (1 through 7)
Parameter_DecisionTree = 'Classifier__max_depth'
ParameterRange_DecisionTree = np.arange(1, 8)

Model_DecisionTree = Pipeline(
    steps=[
        ('Preprocessing', PreprocessingPipeline),
        ('Classifier', DecisionTreeClassifier(random_state=RANDOM_STATE)),
    ],
)

Model_DecisionTree

# 4. Models Fitting

Based on the [Technical Requirements](../TechnicalRequirements.pdf), hyperparameter tuning and model training use ``10`` folds, each with ``80%`` of the data for training and the remaining ``20%`` for validation. The original dataset is likewise divided in the same proportions for training and testing.

To determine which hyperparameter configuration is most appropriate for each model, the ``recall`` metric is chosen because the goal is to reduce the impact of False Negatives: telling a patient that he does not have a liver disease when he does is more serious than telling him that he has one and performing further studies. In addition, the dataset is not balanced, so a metric other than ``accuracy`` is appropriate.

In [91]:
# Import libraries

from sklearn.model_selection import train_test_split , ShuffleSplit , validation_curve
from sklearn.metrics import recall_score , make_scorer

In [56]:
# Hold out 20% of the rows for testing; the rest is used for training

train_X, test_X, train_y, test_y = train_test_split(
    LiverDataset[Features],
    LiverDataset[Target],
    test_size=0.2,
    random_state=RANDOM_STATE,
)

In [92]:
# Recall scorer that treats the 'Yes' label as the positive class

RecallScore = make_scorer(score_func=recall_score, pos_label='Yes')

In [97]:
# Handle to the global namespace: each model, its hyperparameter name and
# its hyperparameter range are looked up by variable name, and the results
# are stored back under a generated variable name
global_vars = globals()

# Fitting models

# 10 random train/validation splits, each holding out 20% for validation
folds_cross_validation = ShuffleSplit(
    n_splits=10,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

for model in Models:
    print(f'START FITTING :: {model}')

    # Resolve Model_<name>, Parameter_<name> and ParameterRange_<name>
    model_estimator , parameter_name , parameter_range = (
        global_vars[f'{prefix}_{model}']
        for prefix in ('Model','Parameter','ParameterRange')
    )

    # Score every candidate hyperparameter value on every split;
    # error_score='raise' makes a failing fit raise instead of being masked
    validation_curve_model = validation_curve(
        estimator=model_estimator,
        X=train_X,
        y=train_y,
        param_name=parameter_name,
        param_range=parameter_range,
        cv=folds_cross_validation,
        scoring=RecallScore,
        n_jobs=NUM_JOBS,
        error_score='raise',
    )

    # Keep the result available as ValidationCurve_<name>
    global_vars[f'ValidationCurve_{model}'] = validation_curve_model

    print(f'END FITTING :: {model}\n')

START FITTING :: LogisticRegression
END FITTING :: LogisticRegression

START FITTING :: KNN
END FITTING :: KNN

START FITTING :: DecisionTree
END FITTING :: DecisionTree

