In [1]:
# data cleaning and plots
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

# sklearn: data preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

# sklearn: train model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold
from sklearn.metrics import precision_recall_curve, precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import roc_curve, auc, roc_auc_score, confusion_matrix, classification_report

# sklearn classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

In [2]:
# Load the male and female subsets of the liver-patient data from two separate CSV files.
# Later cells train on the male frame and hold out the female frame for testing.
df_male = pd.read_csv('../../datasets/liver/indian_liver_patient_male.csv')
df_female = pd.read_csv('../../datasets/liver/indian_liver_patient_female.csv')

In [3]:
df_male.isnull().sum()

Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    2
Dataset                       0
dtype: int64

In [4]:
# Size of the male subset before cleaning
print(df_male.shape)

# Drop every row that has at least one missing value
df_male = df_male.dropna(axis=0, how='any')

# Size after cleaning: the 2 rows lacking Albumin_and_Globulin_Ratio are gone (441 -> 439 rows)
print(df_male.shape)

(441, 11)
(439, 11)


In [5]:
df_female.isnull().sum()

Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    2
Dataset                       0
dtype: int64

In [6]:
# Size of the female subset before cleaning
print(df_female.shape)

# Drop every row that has at least one missing value
df_female = df_female.dropna(axis=0, how='any')

# Size after cleaning: the 2 rows lacking Albumin_and_Globulin_Ratio are gone (142 -> 140 rows)
print(df_female.shape)

(142, 11)
(140, 11)


In [7]:
df_male.isnull().sum()

Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    0
Dataset                       0
dtype: int64

In [8]:
df_female.isnull().sum()

Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    0
Dataset                       0
dtype: int64

In [9]:
# Training split = male patients: features are every column except the target
X_train = df_male.drop(columns=['Dataset'])  # DataFrame of features
y_train = df_male['Dataset'].to_numpy()      # 1-D array of target labels

# Testing split = female patients, prepared the same way
X_test = df_female.drop(columns=['Dataset'])  # DataFrame of features
y_test = df_female['Dataset'].to_numpy()      # 1-D array of target labels

In [10]:
# Stack the cleaned male and female frames to recover the target labels of the whole data set
data_to_use = pd.concat([df_male, df_female])

y = data_to_use['Dataset'].to_numpy()  # array with the target labels of all rows

In [11]:
X_test['Gender'].value_counts()

Female    140
Name: Gender, dtype: int64

In [12]:
# Calculate Y ratio
def data_ratio(y):
    '''
    Describe the balance between the two classes of a binary target.

    Parameters
    ----------
    y : array-like
        Target labels. The first two distinct labels in sorted order are
        compared; any further labels are ignored.

    Returns
    -------
    str
        ``'<ratio>:1 (<n_first>/<n_second>)'`` where ``ratio`` is the count of
        the first label divided by the count of the second, rounded to one
        decimal place.

    Raises
    ------
    ValueError
        If ``y`` holds fewer than two distinct labels, because no ratio can
        be formed (previously this surfaced as an opaque IndexError).
    '''
    labels, count = np.unique(y, return_counts=True)
    if len(count) < 2:
        raise ValueError(
            f'data_ratio needs at least two distinct labels, '
            f'found {len(count)}: {labels.tolist()}'
        )
    ratio = round(count[0] / count[1], 1)
    return f'{ratio}:1 ({count[0]}/{count[1]})'

In [13]:
# Class balance of each split, reported by data_ratio() as
# "<ratio>:1 (<count of first label>/<count of second label>)"
print('The class ratio in training data: ', data_ratio(y_train))
print('The class ratio in testing data: ', data_ratio(y_test))

The class ratio in training data:  0.4:1 (116/323)
The class ratio in testing data:  0.5:1 (49/91)


In [14]:
# determine categorical and numerical columns from the dtypes of the training features:
# int64/float64 columns are standardized later, object/bool columns are one-hot encoded
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X_train.select_dtypes(include=['object', 'bool']).columns

In [15]:
numerical_cols

Index(['Age', 'Total_Bilirubin', 'Direct_Bilirubin', 'Alkaline_Phosphotase',
       'Alamine_Aminotransferase', 'Aspartate_Aminotransferase',
       'Total_Protiens', 'Albumin', 'Albumin_and_Globulin_Ratio'],
      dtype='object')

In [16]:
categorical_cols

Index(['Gender'], dtype='object')

In [17]:
# Encoder for the categorical column(s); categories not seen while fitting are ignored
# instead of raising (the training frame holds only male rows, the test frame only female rows)
ohe = OneHotEncoder(handle_unknown="ignore")

# (name, transformer, columns) triples: one-hot encode categoricals, standardize numericals
t = [
    ("ohe", ohe, categorical_cols),
    ("scale", StandardScaler(), numerical_cols),
]
col_trans = ColumnTransformer(t)

# learn the categories and the scaling statistics from the training data only
col_trans.fit(X_train)

ColumnTransformer(transformers=[('ohe', OneHotEncoder(handle_unknown='ignore'),
                                 Index(['Gender'], dtype='object')),
                                ('scale', StandardScaler(),
                                 Index(['Age', 'Total_Bilirubin', 'Direct_Bilirubin', 'Alkaline_Phosphotase',
       'Alamine_Aminotransferase', 'Aspartate_Aminotransferase',
       'Total_Protiens', 'Albumin', 'Albumin_and_Globulin_Ratio'],
      dtype='object'))])

In [18]:
# Run the transformer fitted on the training data over both splits (nothing is refitted on test data)
X_train_transform, X_test_transform = (
    col_trans.transform(split) for split in (X_train, X_test)
)

In [19]:
# look at the transformed training data
X_train_transform

array([[ 1.        ,  1.76556047, -0.39989564, ..., -0.02210596,
         0.39135708,  0.4668162 ],
       [ 1.        ,  1.70405586, -0.27801475, ...,  0.07211265,
        -0.12303142, -0.44521011],
       [ 1.        , -0.75612838, -0.44560097, ...,  0.54320567,
         0.77714846,  0.4668162 ],
       ...,
       [ 1.        ,  0.22794532, -0.46083608, ..., -1.34116643,
        -0.89461417, -0.14120134],
       [ 1.        ,  0.22794532, -0.38466053, ...,  0.54320567,
         1.54873122,  1.68285128],
       [ 1.        , -0.44860535, -0.39989564, ...,  0.82586148,
         1.67732834,  1.68285128]])

In [20]:
# Names of the one-hot columns produced by the fitted encoder.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2, so use its
# replacement when the installed version offers it and fall back to the legacy call otherwise.
# NOTE: the newer method may label the column with the input column name (e.g. `Gender_Male`)
# rather than the positional `x0_Male` prefix.
fitted_ohe = col_trans.named_transformers_['ohe']
(fitted_ohe.get_feature_names_out()
 if hasattr(fitted_ohe, 'get_feature_names_out')
 else fitted_ohe.get_feature_names())


array(['x0_Male'], dtype=object)

In [21]:
# Column labels for the transformed matrix: the single one-hot column first, then the scaled numericals
new_cols = ['Male', *numerical_cols]

# Preview the first rows of the transformed training data with readable headers
pd.DataFrame(data=X_train_transform, columns=new_cols).head()

Unnamed: 0,Male,Age,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio
0,1.0,1.76556,-0.399896,-0.463728,-0.506771,-0.293814,-0.282504,-0.022106,0.391357,0.466816
1,1.0,1.704056,-0.278015,-0.257887,-0.303757,-0.343167,-0.248505,0.072113,-0.123031,-0.44521
2,1.0,-0.756128,-0.445601,-0.532342,-0.538352,-0.269138,-0.279413,0.543206,0.777148,0.466816
3,1.0,-0.018073,-0.430366,-0.498035,-0.664672,-0.323426,-0.319593,-0.116325,0.134163,0.162807
4,1.0,-0.756128,-0.430366,-0.498035,-0.687229,-0.293814,-0.291776,0.731643,1.677328,1.682851


In [22]:
# Note that the distinct values/labels in `y` target are 1 and 2. 
pd.unique(y)

array([2, 1])

In [23]:
# Define a LabelEncoder() transformation method and fit on y_train.
# The target labels 1 and 2 become classes 0 and 1 respectively (see the class
# supports in the classification report of the evaluation cell).
target_trans = LabelEncoder()
target_trans.fit(y_train)

LabelEncoder()

In [24]:
# Encode both target arrays with the encoder that was fitted on y_train
y_train_transform, y_test_transform = map(target_trans.transform, (y_train, y_test))

In [25]:
# ===== Step 1: cross-validation ========
# Logistic Regression classifier to be evaluated
clf = LogisticRegression(solver='lbfgs', random_state=123)

# Stratified 5-fold splitter: shuffles with a fixed seed and yields train/validate indices
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)

# Metrics collected on every validation fold
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

# Run the 5-fold CV on the training data; only validation-fold scores are returned
cv_results = cross_validate(
    clf,
    X_train_transform,
    y_train_transform,
    scoring=scoring,
    cv=cv,
    return_train_score=False,  # do not compute scores on the training folds
)

In [26]:
cv_results

{'fit_time': array([0.01656604, 0.01255798, 0.0094614 , 0.00998688, 0.01032186]),
 'score_time': array([0.00427222, 0.00347829, 0.00497031, 0.00337267, 0.00360656]),
 'test_accuracy': array([0.76136364, 0.64772727, 0.73863636, 0.77272727, 0.75862069]),
 'test_precision': array([0.78947368, 0.72972973, 0.75609756, 0.78205128, 0.7721519 ]),
 'test_recall': array([0.92307692, 0.83076923, 0.95384615, 0.953125  , 0.953125  ]),
 'test_f1': array([0.85106383, 0.77697842, 0.84353741, 0.85915493, 0.85314685]),
 'test_roc_auc': array([0.78929766, 0.69364548, 0.76722408, 0.72981771, 0.84714674])}

In [27]:
cv_results['test_accuracy'].mean()

0.7358150470219437

In [28]:
# Average every timing/score array over the folds
for metric, fold_values in cv_results.items():
    print(metric, ": ", fold_values.mean())

fit_time :  0.011778831481933594
score_time :  0.003940010070800781
test_accuracy :  0.7358150470219437
test_precision :  0.7659008311402651
test_recall :  0.9227884615384616
test_f1 :  0.8367762889487451
test_roc_auc :  0.7654263343088072


In [29]:
# ======== Step 2: Evaluate the model using testing data =======

# Train the Logistic Regression model on the whole transformed training set
clf.fit(X_train_transform, y_train_transform)

# Predictions on the testing data: hard class labels and the probability of class 1
y_pred_class = clf.predict(X_test_transform)
y_pred_score = clf.predict_proba(X_test_transform)[:, 1]

# Test-set metrics: ROC AUC is computed from the scores; the confusion matrix,
# precision, recall and classification report are computed from the predicted labels
auc_ontest = roc_auc_score(y_test_transform, y_pred_score)
cm_ontest = confusion_matrix(y_test_transform, y_pred_class)
precision_ontest = precision_score(y_test_transform, y_pred_class)
recall_ontest = recall_score(y_test_transform, y_pred_class)
cls_report_ontest = classification_report(y_test_transform, y_pred_class)

# Report the results
print('The model scores {:1.5f} ROC AUC on the test set.'.format(auc_ontest))
print('The precision score on the test set: {:1.5f}'.format(precision_ontest))
print('The recall score on the test set: {:1.5f}'.format(recall_ontest))
print('Confusion Matrix:\n', cm_ontest)
print('Classification Report:\n', cls_report_ontest)

The model scores 0.66898 ROC AUC on the test set.
The precision score on the test set: 0.67826
The recall score on the test set: 0.85714
Confusion Matrix:
 [[12 37]
 [13 78]]
Classification Report:
               precision    recall  f1-score   support

           0       0.48      0.24      0.32        49
           1       0.68      0.86      0.76        91

    accuracy                           0.64       140
   macro avg       0.58      0.55      0.54       140
weighted avg       0.61      0.64      0.61       140

