In [4]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score

# Loading a Data

In [5]:
india_liver_patient=pd.read_csv("indian_liver_patient.csv")

In [6]:
india_liver_patient.shape

(583, 11)

### 2. Exploring the data

In [7]:
india_liver_patient.columns[india_liver_patient.isnull().any()]

Index(['Albumin_and_Globulin_Ratio'], dtype='object')

In [8]:
india_liver_patient.Albumin_and_Globulin_Ratio.isnull().any()

True

In [9]:
india_liver_patient.Albumin_and_Globulin_Ratio.mean()

0.9470639032815201

In [10]:
india_liver_patient.Albumin_and_Globulin_Ratio.fillna(india_liver_patient['Albumin_and_Globulin_Ratio'].mean(),inplace=True)

In [11]:
india_liver_patient.columns

Index(['Age', 'Gender', 'Total_Bilirubin', 'Direct_Bilirubin',
       'Alkaline_Phosphotase', 'Alamine_Aminotransferase',
       'Aspartate_Aminotransferase', 'Total_Protiens', 'Albumin',
       'Albumin_and_Globulin_Ratio', 'Label'],
      dtype='object')

In [12]:
india_liver_patient.dtypes

Age                             int64
Gender                         object
Total_Bilirubin               float64
Direct_Bilirubin              float64
Alkaline_Phosphotase            int64
Alamine_Aminotransferase        int64
Aspartate_Aminotransferase      int64
Total_Protiens                float64
Albumin                       float64
Albumin_and_Globulin_Ratio    float64
Label                           int64
dtype: object

### 3. Data preparation

In [13]:
for i in india_liver_patient.columns:
    print(i,india_liver_patient[i].nunique())

Age 72
Gender 2
Total_Bilirubin 113
Direct_Bilirubin 80
Alkaline_Phosphotase 263
Alamine_Aminotransferase 152
Aspartate_Aminotransferase 177
Total_Protiens 58
Albumin 40
Albumin_and_Globulin_Ratio 70
Label 2


#### 3.2 LabelEncoder is used for converting categorical string columns to numeric.

In [14]:
categorical_col=['Gender']

In [15]:
le=LabelEncoder()
for col in categorical_col:
    india_liver_patient[col]=le.fit_transform(india_liver_patient[col])

In [16]:
india_liver_patient.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Label
0,65,0,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,1,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,1,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,1,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,1,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [17]:
india_liver_patient.shape

(583, 11)

#### 3.3 Split the data into train and test

In [18]:
india_liver_patient_train=india_liver_patient.sample(466, random_state=123)

In [19]:
india_liver_patient_test=india_liver_patient.drop(india_liver_patient_train.index)

In [20]:

(india_liver_patient.Label.value_counts()/india_liver_patient.Label.count())*100

1    71.35506
2    28.64494
Name: Label, dtype: float64

In [21]:
# Train data - Ration of patient with liver disease and no disease
(india_liver_patient_train.Label.value_counts()/india_liver_patient_train.Label.count())*100

1    71.67382
2    28.32618
Name: Label, dtype: float64

In [22]:
# Test data - Ration of patient with liver disease and no disease
(india_liver_patient_test.Label.value_counts()/india_liver_patient_test.Label.count())*100

1    70.08547
2    29.91453
Name: Label, dtype: float64

In [23]:
# Taking labels into seprate obj
train_labels=india_liver_patient_train.Label
test_labels=india_liver_patient_test.Label

### 4. Training the model (Decison Tree)

In [24]:
india_liver_patient_clf=DecisionTreeClassifier(criterion='entropy')

In [25]:
# Training/Build the model with train data
indian_liver_patient_pkl=india_liver_patient_clf.fit(india_liver_patient_train.iloc[:,:-1],train_labels)

In [26]:
# Make predictions on test data
predictions = india_liver_patient_clf.predict(india_liver_patient_test.iloc[:,:-1])

In [27]:
predictions

array([1, 2, 1, 2, 2, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1,
       1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2,
       2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1, 1,
       1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1,
       1, 1, 1, 1, 2, 2, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1,
       2, 1, 1, 1, 1, 1, 1])

# Make predictions on test data

In [28]:
confusion_matrix(test_labels,predictions)

array([[66, 16],
       [24, 11]])

#### Simple Accuracy

In [29]:
accuracy_score(test_labels,predictions)*100

65.8119658119658

#### Precision, Recall and F1-Score

In [30]:
from sklearn.metrics import precision_score,recall_score,f1_score,classification_report,roc_auc_score

In [31]:
#Precision
precision_score(test_labels,predictions)

0.7333333333333333

In [32]:
recall_score(test_labels,predictions)

0.8048780487804879

In [33]:
f1_score(test_labels,predictions)

0.7674418604651163

In [34]:
print(classification_report(test_labels,predictions))

             precision    recall  f1-score   support

          1       0.73      0.80      0.77        82
          2       0.41      0.31      0.35        35

avg / total       0.64      0.66      0.64       117



In [44]:
# saving the modim
import _pickle as cpickle

with open("hello.pkl",'wb') as fout:
    cpickle.dump(indian_liver_patient_pkl,fout)
    
    
    
reload = cpickle.load(open('hello.pkl'))

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte

### 6. Cross Validation

In [35]:
# simplify names
y_tr = train_labels

In [36]:
from sklearn.model_selection import cross_val_predict

In [37]:
y_pr=cross_val_predict(india_liver_patient_clf,india_liver_patient_train,train_labels,cv=5)

In [38]:
confusion_matrix(y_tr,y_pr)

array([[334,   0],
       [  0, 132]])

In [39]:
precision_score(y_tr,y_pr)

1.0

In [40]:
recall_score(y_tr,y_pr)

1.0

In [41]:
f1_score(y_tr,y_pr)

1.0