In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [3]:
df=pd.read_csv('cirrhosis_cleaned.csv')
df.head(2)

Unnamed: 0.1,Unnamed: 0,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,0,400,D,D-penicillamine,21464,F,Y,Y,Y,Y,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,12.2,4.0
1,1,4500,C,D-penicillamine,20617,F,N,Y,Y,N,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,3.0


In [4]:
x=df.drop('Status',axis=1)   #dependent variables
y=df.Status                  #independent variable

In [5]:
cat_features = x.select_dtypes(include="object").columns
num_features = x.select_dtypes(exclude="object").columns
print(cat_features)
print(num_features)

Index(['Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema'], dtype='object')
Index(['Unnamed: 0', 'N_Days', 'Age', 'Bilirubin', 'Cholesterol', 'Albumin',
       'Copper', 'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets',
       'Prothrombin', 'Stage'],
      dtype='object')


In [6]:
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder(drop='first')

preprocessor = ColumnTransformer(
    [
         ("OneHotEncoder", oh_transformer, cat_features),  #Applying Onehot Encoding to categorical features
          ("StandardScaler", numeric_transformer, num_features)  #Standardizing the numerical features
    ]
)

In [7]:
x_scaled=preprocessor.fit_transform(x) #Scaling the features

In [8]:
#Creating Train and Test Dataset
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x_scaled,y,test_size=0.2,random_state=42)

MODEL SELECTION

In [14]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score,roc_auc_score

In [16]:
models={
    "Support Vector Machine":SVC(),
    "Decision Tree":DecisionTreeClassifier(),
    "Random Forest":RandomForestClassifier(),
    "Gradient Boosting":GradientBoostingClassifier()
}
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(x_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    # Training set performance
    model_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
    model_train_f1 = f1_score(y_train, y_train_pred,average="weighted") # Calculate F1-score
    model_train_precision = precision_score(y_train, y_train_pred,average="weighted") # Calculate Precision
    model_train_recall = recall_score(y_train, y_train_pred,average="weighted") # Calculate Recall
    


    # Test set performance
    model_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
    model_test_f1 = f1_score(y_test, y_test_pred,average="weighted") # Calculate F1-score
    model_test_precision = precision_score(y_test, y_test_pred,average="weighted") # Calculate Precision
    model_test_recall = recall_score(y_test, y_test_pred,average="weighted") # Calculate Recall



    print(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Accuracy: {:.4f}".format(model_train_accuracy))
    print('- F1 score: {:.4f}'.format(model_train_f1))
    
    print('- Precision: {:.4f}'.format(model_train_precision))
    print('- Recall: {:.4f}'.format(model_train_recall))


    
    
    print('----------------------------------')
    
    print('Model performance for Test set')
    print('- Accuracy: {:.4f}'.format(model_test_accuracy))
    print('- F1 score: {:.4f}'.format(model_test_f1))
    print('- Precision: {:.4f}'.format(model_test_precision))
    print('- Recall: {:.4f}'.format(model_test_recall))


    
    print('='*35)
    print('\n')

Support Vector Machine
Model performance for Training set
- Accuracy: 0.8713
- F1 score: 0.8499
- Precision: 0.8819
- Recall: 0.8713
----------------------------------
Model performance for Test set
- Accuracy: 0.8333
- F1 score: 0.8130
- Precision: 0.7937
- Recall: 0.8333


Decision Tree
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
----------------------------------
Model performance for Test set
- Accuracy: 0.8214
- F1 score: 0.8177
- Precision: 0.8151
- Recall: 0.8214


Random Forest
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
----------------------------------
Model performance for Test set
- Accuracy: 0.7976
- F1 score: 0.7780
- Precision: 0.7594
- Recall: 0.7976


Gradient Boosting
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
----------------------------------
Model performance for Test 

In [None]:
#Gradient Boosting Algorithm gives the best metrics 

In [18]:
gb_params = {"loss": ["exponential", "log_loss"],
             "learning_rate": [0,1,3,5,10,50],
             "subsample": [1,5, 10, 15, 20],
             "n_estimators": [1,2,3,4,5,6,10]}

In [19]:
randomcv_models = [
                   ("GB", GradientBoostingClassifier(), gb_params)
                   
                   ]

In [20]:
from sklearn.model_selection import RandomizedSearchCV

model_param = {}
for name, model, params in randomcv_models:
    random = RandomizedSearchCV(estimator=model,
                                   param_distributions=params,
                                   n_iter=100,
                                   cv=3,
                                   verbose=2,
                                   n_jobs=-1,
                                   random_state=10)
    random.fit(x_train, y_train)
    model_param[name] = random.best_params_

for model_name in model_param:
    print(f"---------------- Best Params for {model_name} -------------------")
    print(model_param[model_name])

Fitting 3 folds for each of 100 candidates, totalling 300 fits
---------------- Best Params for GB -------------------
{'subsample': 1, 'n_estimators': 1, 'loss': 'log_loss', 'learning_rate': 1}
