In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('dataset/liver_dataset.csv')

In [3]:
data

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.90,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.00,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.40,1
...,...,...,...,...,...,...,...,...,...,...,...
578,60,Male,0.5,0.1,500,20,34,5.9,1.6,0.37,2
579,40,Male,0.6,0.1,98,35,31,6.0,3.2,1.10,1
580,52,Male,0.8,0.2,245,48,49,6.4,3.2,1.00,1
581,31,Male,1.3,0.5,184,29,32,6.8,3.4,1.00,1


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Age                         583 non-null    int64  
 1   Gender                      583 non-null    object 
 2   Total_Bilirubin             583 non-null    float64
 3   Direct_Bilirubin            583 non-null    float64
 4   Alkaline_Phosphotase        583 non-null    int64  
 5   Alamine_Aminotransferase    583 non-null    int64  
 6   Aspartate_Aminotransferase  583 non-null    int64  
 7   Total_Protiens              583 non-null    float64
 8   Albumin                     583 non-null    float64
 9   Albumin_and_Globulin_Ratio  579 non-null    float64
 10  Dataset                     583 non-null    int64  
dtypes: float64(5), int64(5), object(1)
memory usage: 50.2+ KB


In [5]:
data.isna().sum()

Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    4
Dataset                       0
dtype: int64

In [6]:
data['Albumin_and_Globulin_Ratio'] = data['Albumin_and_Globulin_Ratio'].fillna(data['Albumin_and_Globulin_Ratio'].mean())

In [7]:
data.isna().sum()

Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    0
Dataset                       0
dtype: int64

In [8]:
def binary_encode(df, column, positive_value):
    df = df.copy()
    df[column] = df[column].apply(lambda x: 1 if x == positive_value else 0)
    return df

In [9]:
data = binary_encode(data, 'Gender', 'Male')

In [10]:
data = binary_encode(data, 'Dataset', 1)

In [11]:
y = data['Dataset']
X = data.drop('Dataset', axis=1)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,random_state=0)

In [13]:
std=StandardScaler()
X_train_std=std.fit_transform(X_train)
X_test_std=std.transform(X_test)

In [36]:
import pickle
import os
scaler_path=os.path.join('models/scaler_liver.pkl')
with open(scaler_path,'wb') as scaler_file:
    pickle.dump(std,scaler_file)

In [37]:
X

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio
0,65,0,0.7,0.1,187,16,18,6.8,3.3,0.90
1,62,1,10.9,5.5,699,64,100,7.5,3.2,0.74
2,62,1,7.3,4.1,490,60,68,7.0,3.3,0.89
3,58,1,1.0,0.4,182,14,20,6.8,3.4,1.00
4,72,1,3.9,2.0,195,27,59,7.3,2.4,0.40
...,...,...,...,...,...,...,...,...,...,...
578,60,1,0.5,0.1,500,20,34,5.9,1.6,0.37
579,40,1,0.6,0.1,98,35,31,6.0,3.2,1.10
580,52,1,0.8,0.2,245,48,49,6.4,3.2,1.00
581,31,1,1.3,0.5,184,29,32,6.8,3.4,1.00


In [38]:
X_train

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio
351,37,1,1.3,0.4,195,41,38,5.3,2.1,0.6
64,57,1,1.3,0.4,259,40,86,6.5,2.5,0.6
211,42,1,2.7,1.3,219,60,180,7.0,3.2,0.8
205,45,1,2.5,1.2,163,28,22,7.6,4.0,1.1
315,22,1,0.9,0.3,179,18,21,6.7,3.7,1.2
...,...,...,...,...,...,...,...,...,...,...
277,60,1,2.0,0.8,190,45,40,6.0,2.8,0.8
9,55,1,0.7,0.2,290,53,58,6.8,3.4,1.0
359,69,0,0.8,0.2,146,42,70,8.4,4.9,1.4
192,60,1,2.3,0.6,272,79,51,6.6,3.5,1.1


# knn

In [39]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
knn=KNeighborsClassifier()
knn.fit(X_train_std,y_train)

KNeighborsClassifier()

In [40]:
Y_pred_knn=knn.predict(X_test_std)

In [41]:
ac_knn=accuracy_score(y_test,Y_pred_knn)
ac_knn

0.7457627118644068

# LR

In [42]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression()
lr.fit(X_train_std,y_train)

LogisticRegression()

In [43]:
Y_pred_lr=lr.predict(X_test_std)
ac_lr=accuracy_score(y_test,Y_pred_lr)
ac_lr

0.7796610169491526

# Decision trees

In [44]:
from sklearn.tree import DecisionTreeClassifier
dt=DecisionTreeClassifier()
dt.fit(X_train_std,y_train)

DecisionTreeClassifier()

In [45]:
Y_pred_dt=dt.predict(X_test_std)
ac_dt=accuracy_score(y_test,Y_pred_dt)
ac_dt

0.6949152542372882

# rf

In [46]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier()
rf.fit(X_train_std,y_train)

RandomForestClassifier()

In [47]:
Y_pred_rf=rf.predict(X_test_std)
ac_rf=accuracy_score(y_test,Y_pred_rf)
ac_rf

0.711864406779661

# svc

In [48]:
from sklearn.svm import SVC
sv=SVC()
sv.fit(X_train_std,y_train)

SVC()

In [49]:
Y_pred_sv=sv.predict(X_test_std)
ac_sv=accuracy_score(y_test,Y_pred_sv)
ac_sv

0.7796610169491526

In [50]:
import joblib
model_path=os.path.join('models/sv.sav')
joblib.dump(sv,model_path)

['C:/Users/my pc/Desktop/Stroke-Risk-Prediction-imp/models/sv.sav']

In [51]:
from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(X_train_std,y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [52]:
Y_pred_xgb=xgb.predict(X_test_std)
ac_xgb=accuracy_score(y_test,Y_pred_xgb)
ac_xgb



0.7288135593220338

# SGD

In [53]:
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier()
sgd.fit(X_train_std,y_train)

SGDClassifier()

In [54]:
Y_pred_sgd=sgd.predict(X_test_std)
ac_sgd=accuracy_score(y_test,Y_pred_sgd)
ac_sgd

0.6271186440677966

In [55]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train_std,y_train)

GaussianNB()

In [56]:
Y_pred_gnb=gnb.predict(X_test_std)
ac_gnb=accuracy_score(y_test,Y_pred_gnb)
ac_gnb

0.5932203389830508

In [57]:
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()
bnb.fit(X_train_std,y_train)

BernoulliNB()

In [58]:
Y_pred_bnb=bnb.predict(X_test_std)
ac_bnb=accuracy_score(y_test,Y_pred_bnb)
ac_bnb

0.6949152542372882

In [59]:
X_train

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio
351,37,1,1.3,0.4,195,41,38,5.3,2.1,0.6
64,57,1,1.3,0.4,259,40,86,6.5,2.5,0.6
211,42,1,2.7,1.3,219,60,180,7.0,3.2,0.8
205,45,1,2.5,1.2,163,28,22,7.6,4.0,1.1
315,22,1,0.9,0.3,179,18,21,6.7,3.7,1.2
...,...,...,...,...,...,...,...,...,...,...
277,60,1,2.0,0.8,190,45,40,6.0,2.8,0.8
9,55,1,0.7,0.2,290,53,58,6.8,3.4,1.0
359,69,0,0.8,0.2,146,42,70,8.4,4.9,1.4
192,60,1,2.3,0.6,272,79,51,6.6,3.5,1.1


In [66]:
y_train

351    1
64     1
211    1
205    1
315    0
      ..
277    1
9      1
359    0
192    1
559    1
Name: Dataset, Length: 524, dtype: int64

In [68]:
X.corrwith(y)

Age                           0.137351
Gender                        0.082416
Total_Bilirubin               0.220208
Direct_Bilirubin              0.246046
Alkaline_Phosphotase          0.184866
Alamine_Aminotransferase      0.163416
Aspartate_Aminotransferase    0.151934
Total_Protiens               -0.035008
Albumin                      -0.161388
Albumin_and_Globulin_Ratio   -0.162319
dtype: float64