In [231]:
import numpy as np 
import pandas as pd 
from sklearn.metrics import confusion_matrix,accuracy_score
import warnings
warnings.filterwarnings('ignore')
import joblib

In [232]:
# Load the cirrhosis dataset into a DataFrame.
# NOTE(review): hard-coded absolute path (looks like a Google Colab
# sample_data location) - the notebook only runs where this exact file exists.
cirrhosis_data = pd.read_csv('/content/sample_data/cirrhosis.csv')

In [233]:
cirrhosis_data.head()

Unnamed: 0,Age,Sex,Drug,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,21464,F,D-penicillamine,Y,Y,Y,Y,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,12.2,4.0
1,20617,F,D-penicillamine,N,Y,Y,N,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,3.0
2,25594,M,D-penicillamine,N,N,N,S,1.4,176.0,3.48,210.0,516.0,96.1,55.0,151.0,12.0,4.0
3,19994,F,D-penicillamine,N,Y,Y,S,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,4.0
4,13918,F,Placebo,N,Y,Y,N,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,3.0


In [234]:
# Age appears to be recorded in days (e.g. 21464); divide by 365 and truncate
# the fraction to get whole years.
# Not idempotent: re-running this cell divides the converted values again.
cirrhosis_data["Age"] = cirrhosis_data["Age"].div(365).astype("int64")

In [235]:
cirrhosis_data["Age"]

0      58
1      56
2      70
3      54
4      38
       ..
413    67
414    39
415    57
416    58
417    53
Name: Age, Length: 418, dtype: int64

In [236]:
cirrhosis_data.head()

Unnamed: 0,Age,Sex,Drug,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,58,F,D-penicillamine,Y,Y,Y,Y,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,12.2,4.0
1,56,F,D-penicillamine,N,Y,Y,N,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,3.0
2,70,M,D-penicillamine,N,N,N,S,1.4,176.0,3.48,210.0,516.0,96.1,55.0,151.0,12.0,4.0
3,54,F,D-penicillamine,N,Y,Y,S,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,4.0
4,38,F,Placebo,N,Y,Y,N,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,3.0


In [237]:
# Columns stored with the generic "object" dtype are treated as categorical.
cat_features = [
    name
    for name, dtype in cirrhosis_data.dtypes.items()
    if dtype == "object"
]

cat_features

['Sex', 'Drug', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema']

In [238]:
# Collect the columns whose dtype is exactly int64.
# NOTE(review): float64 columns (the lab measurements) do not match this
# check, so they are not part of num_features - confirm that is intended.
num_features = [
    name
    for name, dtype in cirrhosis_data.dtypes.items()
    if dtype == "int64"
]

num_features

['Age']

In [239]:
def clean(cirrhosis_data):
    """Impute missing values in the numeric and categorical feature columns.

    Numeric columns (Cholesterol, Copper, Alk_Phos, SGOT, Tryglicerides,
    Platelets, Prothrombin) are filled with their column median.
    Categorical columns (Ascites, Hepatomegaly, Spiders) are filled with
    their most frequent value. Other columns are left untouched.

    Parameters
    ----------
    cirrhosis_data : pandas.DataFrame
        Frame containing every column named above; a missing column raises
        ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        The same DataFrame object that was passed in, with the gaps filled.
    """
    cols_num = ["Cholesterol", "Copper", "Alk_Phos", "SGOT", "Tryglicerides", "Platelets", "Prothrombin"]

    for col in cols_num:
        # Assign the filled Series back to the frame. Calling
        # fillna(inplace=True) on a column selection acts on an intermediate
        # object: it is deprecated and has no effect on the frame when pandas
        # Copy-on-Write is enabled.
        cirrhosis_data[col] = cirrhosis_data[col].fillna(cirrhosis_data[col].median())

    cols_cat = ["Ascites", "Hepatomegaly", "Spiders"]

    # Replace with most frequent values i.e. mode
    for col in cols_cat:
        modes = cirrhosis_data[col].mode()
        if modes.empty:
            # Column has no observed values at all, so there is nothing to
            # impute with; leave it as-is instead of raising IndexError.
            continue
        cirrhosis_data[col] = cirrhosis_data[col].fillna(modes.iloc[0])

    return cirrhosis_data
                                        
# Run the imputation defined above and rebind the name to the returned frame.
cirrhosis_data = clean(cirrhosis_data) 

In [240]:
cirrhosis_data.isnull().sum()

Age                0
Sex                0
Drug             106
Ascites            0
Hepatomegaly       0
Spiders            0
Edema              0
Bilirubin          0
Cholesterol        0
Albumin            0
Copper             0
Alk_Phos           0
SGOT               0
Tryglicerides      0
Platelets          0
Prothrombin        0
Stage              6
dtype: int64

In [241]:
# Stage is the prediction target; keep only the rows where it is present.
cirrhosis_data = cirrhosis_data[cirrhosis_data['Stage'].notna()]

In [242]:
# Remove the Drug column (it still contains missing values after imputation).
# Reassigning the result instead of using inplace=True avoids mutating a
# derived frame, and errors='ignore' lets this cell be re-run after the
# column is already gone without raising KeyError.
cirrhosis_data = cirrhosis_data.drop(columns='Drug', errors='ignore')

In [243]:
cirrhosis_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 412 entries, 0 to 417
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Age            412 non-null    int64  
 1   Sex            412 non-null    object 
 2   Ascites        412 non-null    object 
 3   Hepatomegaly   412 non-null    object 
 4   Spiders        412 non-null    object 
 5   Edema          412 non-null    object 
 6   Bilirubin      412 non-null    float64
 7   Cholesterol    412 non-null    float64
 8   Albumin        412 non-null    float64
 9   Copper         412 non-null    float64
 10  Alk_Phos       412 non-null    float64
 11  SGOT           412 non-null    float64
 12  Tryglicerides  412 non-null    float64
 13  Platelets      412 non-null    float64
 14  Prothrombin    412 non-null    float64
 15  Stage          412 non-null    float64
dtypes: float64(10), int64(1), object(5)
memory usage: 54.7+ KB


In [244]:
from sklearn import preprocessing

# One fitted encoder is kept per column so each string -> integer mapping
# stays available afterwards (inverse_transform, or encoding new samples the
# same way at prediction time). Refitting a single shared encoder would keep
# only the mapping of the last column.
label_encoders = {}

cols_cat = ['Sex','Ascites','Hepatomegaly','Spiders','Edema']
for col in cols_cat:
    le = preprocessing.LabelEncoder()
    cirrhosis_data[col] = le.fit_transform(cirrhosis_data[col])
    label_encoders[col] = le
    # Show which original labels map to 0, 1, ... for this column.
    print(le.classes_)

['F' 'M']
['N' 'Y']
['N' 'Y']
['N' 'Y']
['N' 'S' 'Y']


In [245]:
# Relabel stage 1 as class 0.
# Order-sensitive: this has to run before the 4 -> 1 replacement in the next
# cell; if run afterwards, the rows just relabelled 1 (former stage 4) would
# be turned into 0 as well.
cirrhosis_data['Stage'] = cirrhosis_data['Stage'].replace(1.0,0)

In [246]:
# Relabel stage 4 as class 1 (the positive class).
# Relies on the original stage-1 rows having already been rewritten to 0 by
# the preceding cell, so the new 1s cannot be confused with them.
cirrhosis_data['Stage'] = cirrhosis_data['Stage'].replace(4.0,1)

In [247]:
# Relabel stage 3 as class 0.
cirrhosis_data['Stage'] = cirrhosis_data['Stage'].replace(3.0,0)

In [248]:
# Relabel stage 2 as class 0.
# NOTE(review): presumably Stage now holds only 0.0 / 1.0 (stage 4 vs. the
# rest) - verify no other stage values occur in the data.
cirrhosis_data['Stage'] = cirrhosis_data['Stage'].replace(2.0,0)

In [249]:
cirrhosis_data

Unnamed: 0,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,58,0,1,1,1,2,14.5,261.0,2.60,156.0,1718.0,137.95,172.0,190.0,12.2,1.0
1,56,0,0,1,1,0,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,0.0
2,70,1,0,0,0,1,1.4,176.0,3.48,210.0,516.0,96.10,55.0,151.0,12.0,1.0
3,54,0,0,1,1,1,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,1.0
4,38,0,0,1,1,0,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,67,0,0,1,0,0,1.2,309.5,2.96,73.0,1259.0,114.70,108.0,174.0,10.9,0.0
414,39,0,0,1,0,0,0.9,309.5,3.83,73.0,1259.0,114.70,108.0,180.0,11.2,1.0
415,57,0,0,1,0,0,1.6,309.5,3.42,73.0,1259.0,114.70,108.0,143.0,9.9,0.0
416,58,0,0,1,0,0,0.8,309.5,3.75,73.0,1259.0,114.70,108.0,269.0,10.4,0.0


In [250]:
# Target vector: the binarised Stage column.
y = cirrhosis_data['Stage']
# Feature matrix: every remaining column.
x = cirrhosis_data.drop(columns=['Stage'])

In [251]:
cirrhosis_data.head()

Unnamed: 0,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,58,0,1,1,1,2,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,12.2,1.0
1,56,0,0,1,1,0,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,0.0
2,70,1,0,0,0,1,1.4,176.0,3.48,210.0,516.0,96.1,55.0,151.0,12.0,1.0
3,54,0,0,1,1,1,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,1.0
4,38,0,0,1,1,0,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,0.0


In [252]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
)


In [253]:
from xgboost import XGBClassifier

# Gradient-boosted tree classifier with fixed hyper-parameters and seed.
model = XGBClassifier(
    learning_rate=0.75,
    max_depth=3,
    random_state=1,
    gamma=0,
    eval_metric='error',
)
model.fit(x_train, y_train)

# Mean accuracy on the held-out test split.
xgb_acc = model.score(x_test, y_test)

In [254]:
xgb_acc

0.7951807228915663

In [255]:
# Serialize the fitted classifier with joblib to the file 'Liver-Cirrhosis'
# (relative path, no extension).
# NOTE(review): the label encoders applied to the categorical columns are not
# saved with it - confirm that inference code reproduces the same encoding.
joblib.dump(model,'Liver-Cirrhosis')

['Liver-Cirrhosis']