### Indian Liver Patient Records and model training

In [16]:
import os

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Record the pandas version in the notebook output so the run can be reproduced later.
print('pandas version {}'.format(pd.__version__))

pandas version 0.24.1


In [3]:
# Location of the input CSV. The default is the path this notebook was
# originally run against; set the LIVER_DATA_CSV environment variable to point
# at the file on another machine instead of editing the notebook.
DATA_CSV = os.environ.get(
    'LIVER_DATA_CSV',
    '/Users/aakashvarshney/Downloads/indian_liver_patient.csv',
)
data = pd.read_csv(DATA_CSV)

In [4]:
# Encode the target and the Gender column as integers.
#
# Raw 'Dataset' labels: 1 = liver patient, 2 = not a liver patient.
# After this cell:      1 = liver patient, 0 = not a liver patient.
# replace() leaves values other than 2 untouched, so re-running is harmless.
data['Dataset'] = data['Dataset'].replace(2, 0)

# Gender: 'Male' -> 1, anything else -> 0.
# Only encode while the column still holds the raw strings. Without this guard
# a second run of the cell would compare the already-encoded 0/1 integers to
# 'Male' and silently reset every row to 0.
if not pd.api.types.is_numeric_dtype(data['Gender']):
    data['Gender'] = (data['Gender'] == 'Male').astype('int64')

In [5]:
# Frequency of each encoded Gender value (1 = 'Male', 0 = everything else).
gender_counts = data['Gender'].value_counts()
gender_counts

1    441
0    142
Name: Gender, dtype: int64

In [6]:
# Preview the first ten rows after encoding.
data.iloc[:10]

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,0,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,1,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,1,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,1,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,1,3.9,2.0,195,27,59,7.3,2.4,0.4,1
5,46,1,1.8,0.7,208,19,14,7.6,4.4,1.3,1
6,26,0,0.9,0.2,154,16,12,7.0,3.5,1.0,1
7,29,0,0.9,0.3,202,14,11,6.7,3.6,1.1,1
8,17,1,0.9,0.3,202,22,19,7.4,4.1,1.2,0
9,55,1,0.7,0.2,290,53,58,6.8,3.4,1.0,1


### Missing values

In [7]:
# Number of missing entries per column; any non-zero count must be handled before modelling.
null_counts = data.isna().sum()
print(null_counts)

Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    4
Dataset                       0
dtype: int64


In [8]:
# Drop every row that contains at least one missing value; `data` is modified in place.
data.dropna(axis=0, how='any', inplace=True)

In [17]:
# Separate the target vector (y) from the feature matrix (X).
target_col = 'Dataset'
X = data.drop(columns=[target_col])
y = data[target_col]

In [10]:
# Check column dtypes and non-null counts of the feature matrix before splitting.
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 579 entries, 0 to 582
Data columns (total 10 columns):
Age                           579 non-null int64
Gender                        579 non-null int64
Total_Bilirubin               579 non-null float64
Direct_Bilirubin              579 non-null float64
Alkaline_Phosphotase          579 non-null int64
Alamine_Aminotransferase      579 non-null int64
Aspartate_Aminotransferase    579 non-null int64
Total_Protiens                579 non-null float64
Albumin                       579 non-null float64
Albumin_and_Globulin_Ratio    579 non-null float64
dtypes: float64(5), int64(5)
memory usage: 49.8 KB


### Split into training and test datasets

In [11]:
# Hold out 30% of the rows as a test set.
# random_state pins the shuffle so the split (and every metric computed from
# it) is the same on each run; it matches the seed used for the models.
# stratify=y keeps the class proportions of the label the same in both
# partitions, which matters because the two classes are not balanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
print('Train Data set shape {} label shape {}'.format(X_train.shape, y_train.shape))
print('Test Data set shape {} label shape {}'.format(X_test.shape, y_test.shape))

Train Data set shape (405, 10) label shape (405,)
Test Data set shape (174, 10) label shape (174,)


In [12]:
def model_summary(model_obj, model_name):
    """Fit a classifier and print its training score and test-set metrics.

    The estimator is fitted on the notebook-level ``X_train`` / ``y_train``
    and evaluated on the notebook-level ``X_test`` / ``y_test``; those four
    names must already exist when this function is called.

    Parameters
    ----------
    model_obj : estimator
        Object exposing ``fit``, ``score`` and ``predict``. It is fitted in
        place, so the caller's instance is trained after the call.
    model_name : str
        Label printed in the report header.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    rule = '-' * 40
    model_obj.fit(X_train, y_train)

    # Report header.
    print(rule)
    print('Model : {}'.format(model_name))
    print(rule)

    train_score = model_obj.score(X_train, y_train)
    print('Traing dataset score: {}'.format(train_score))

    y_pred = model_obj.predict(X_test)
    print('Test Data Metrics')
    # (output template, metric function) pairs, printed in this order.
    report_rows = (
        ('Accuracy Score :\t{:.4}', accuracy_score),
        ('Recall Score :\t\t{:.4}', recall_score),
        ('Precision Score :\t{:.4}', precision_score),
    )
    for template, metric_fn in report_rows:
        print(template.format(metric_fn(y_test, y_pred)))

In [13]:
# Logistic regression using the liblinear solver, seeded so repeated fits agree.
lr = LogisticRegression(solver='liblinear', random_state=42)
model_summary(lr, 'Logistic Regression')

----------------------------------------
Model : Logistic Regression
----------------------------------------
Traing dataset score: 0.725925925925926
Test Data Metrics
Accuracy Score :	0.6667
Recall Score :		0.8952
Precision Score :	0.7115


In [14]:
# Decision tree with default hyper-parameters and a fixed seed.
# NOTE(review): the saved output shows a training score of 1.0 against a test
# accuracy of about 0.68, which suggests the unconstrained tree is overfitting;
# consider limiting its depth or minimum leaf size.
dct = DecisionTreeClassifier(random_state=42)
model_summary(dct,'Decision Tree Classifier')

----------------------------------------
Model : Decision Tree Classifier
----------------------------------------
Traing dataset score: 1.0
Test Data Metrics
Accuracy Score :	0.6782
Recall Score :		0.7742
Precision Score :	0.7742
