<a href="https://colab.research.google.com/github/blackxhrt2102/cirrhosis-prediction/blob/main/Cirrhosis_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [326]:
!pip install -q kaggle

!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

!chmod 600 /root/.kaggle/kaggle.json

!kaggle datasets download -d fedesoriano/cirrhosis-prediction-dataset

cirrhosis-prediction-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)


In [327]:
import os,zipfile

data=zipfile.ZipFile('/content/cirrhosis-prediction-dataset.zip')
data.extractall('/content')

In [328]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [329]:
data=pd.read_csv('/content/cirrhosis.csv',index_col='ID')

In [330]:
# Shape of the dataset:-
data.shape

(418, 19)

In [331]:
# First few rows in data

#data.head(4)
y=data['Status']
y=y.replace({'C':1,'D':2,'CL':3})

In [332]:
# Label Encoder

data['Drug']=data['Drug'].replace({'D-penicillamine':1,'Placebo':2})
data['Sex']=data['Sex'].replace({'F':1,'M':0})
data['Ascites']=data['Ascites'].replace({'Y':1,'N':0})
data['Hepatomegaly']=data['Hepatomegaly'].replace({'Y':1,'N':0})
data['Spiders']=data['Spiders'].replace({'Y':1,'N':0})
data['Edema']=data['Edema'].replace({'N':0,'S':1,'Y':2})

In [333]:
imbalance=['Age','Ascites','Spiders','Edema']

In [334]:
# Changing age to year:-
a=data['Age']/365
data['Age']=a.astype(int)

In [335]:
reg=['Prothrombin','Platelets','Tryglicerides','SGOT','Alk_Phos','Copper','Bilirubin','Cholesterol','Age']

#-----------------------------------------------------------------------------------------------
from sklearn.preprocessing import StandardScaler

scale=StandardScaler()
data[reg]=pd.DataFrame(scale.fit_transform(data[reg]),columns=reg)
#------------------------------------------------------------------------------------------------

from sklearn.preprocessing import PowerTransformer

power=PowerTransformer(method='yeo-johnson')
data[reg]=pd.DataFrame(power.fit_transform(data[reg]),columns=reg)

In [336]:
# Dropping unnecessary columns:-

data.drop(['N_Days'],axis=1,inplace=True)

In [337]:
# Imputer
from sklearn.impute import SimpleImputer

impute=SimpleImputer(strategy='most_frequent')
col=['Drug','Ascites','Hepatomegaly','Spiders','Stage']

data[col]=pd.DataFrame(impute.fit_transform(data[col]),columns=col)
data[col]=data[col].dropna()

In [338]:
impute=SimpleImputer(strategy='median')


data=pd.DataFrame(impute.fit_transform(data.drop('Status',axis=1)),columns=data.drop('Status',axis=1).columns)

In [339]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split

In [340]:
from imblearn.over_sampling import SMOTE

strategy = {0:100, 1:100, 2:200, 3:200, 4:200, 5:200}
oversample = SMOTE(sampling_strategy=strategy)


smote=SMOTE()
data,y=smote.fit_resample(data,y)

In [341]:
data

Unnamed: 0,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,1.000000,1.835784,1.000000,0.000000,1.000000,1.0,2.000000,-0.306313,-1.787409,2.600000,1.418520,-1.558859,-0.359849,-1.671241,-1.148184,1.303973,3.000000
1,1.000000,0.362789,1.000000,0.000000,0.000000,0.0,0.000000,-0.001318,-0.758421,4.140000,-0.213673,1.809725,-1.364951,-0.444401,-0.741066,-0.308892,4.000000
2,1.000000,-1.189693,0.000000,0.000000,1.000000,1.0,1.000000,0.825221,-0.330727,3.480000,0.955848,-1.205362,0.036734,-1.061049,-1.346139,0.450350,4.000000
3,2.000000,1.472447,1.000000,0.000000,1.000000,1.0,1.000000,-0.851949,-0.706153,2.540000,-0.559883,-0.657557,-0.438250,-1.374276,0.043712,0.549215,3.000000
4,2.000000,0.456705,1.000000,0.000000,1.000000,0.0,0.000000,-0.657717,0.106489,3.530000,-0.507493,-0.886968,-1.370634,1.404745,-0.486123,-1.363396,3.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
691,1.035495,0.456705,1.000000,0.000000,1.000000,0.0,0.000000,-0.845055,0.123918,3.761481,-0.853047,-0.193460,-1.185011,0.703853,1.088263,-0.820632,3.000000
692,1.229260,1.541075,0.770740,0.000000,0.000000,0.0,0.000000,1.010685,-0.904760,3.344434,0.807585,-1.088436,-0.307434,-0.472623,-0.534403,1.237784,3.229260
693,2.000000,-0.411912,1.000000,0.000000,1.000000,1.0,0.993734,0.850451,0.325728,3.571128,0.726014,-0.669996,0.173494,0.673620,0.487739,-0.804184,3.000000
694,2.000000,0.052108,1.000000,0.000000,1.000000,0.0,0.000000,-0.775798,-0.325609,3.651588,-0.672764,-0.549759,-1.113239,0.314389,-0.573927,-0.547847,3.000000


In [349]:
model=[]

model.append(('Random Forest',RandomForestClassifier()))

In [350]:
xtrain,xvalid,ytrain,yvalid=train_test_split(data,y,test_size=0.2)

from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score

In [344]:
data.drop(['Drug','SGOT'],axis=1,inplace=True)

In [345]:
scores={}
scores1={}
for name,mod in model:
  score=cross_val_score(mod,xtrain,ytrain,cv=7,scoring='accuracy')
  scores[name]=np.mean(score)
  mod.fit(xtrain,ytrain)
  yhat=mod.predict(xvalid)
  scores1[name]=precision_score(yvalid,yhat,average='micro')



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [351]:
import joblib 

joblib.dump(model,'lungs.pkl')

['lungs.pkl']

In [348]:
9

9