In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer

# Load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00225/Indian%20Liver%20Patient%20Dataset%20(ILPD).csv"
data = pd.read_csv(url, header=None)
data.columns = ['age', 'gender', 'total_bilirubin', 'direct_bilirubin', 'alkaline_phosphotase', 'alamine_aminotransferase', 'aspartate_aminotransferase', 'total_proteins', 'albumin', 'albumin_and_globulin_ratio', 'class']

data.rename(columns={'total_bilirubin' :'Bilirubin Total', 'direct_bilirubin':'Bilirubin Direct', 'alkaline_phosphotase':'Alkaline Phosphatase (ALP)',  'alamine_aminotransferase' : 'ALT (SGPT)','aspartate_aminotransferase' : 'AST (SGOT)', 'total_proteins' : 'Total Protein' , 'albumin' :  'Albumin', 'albumin_and_globulin_ratio' : 'A : G Ratio'})
# Drop gender and age
data.drop(['gender', 'age'], axis=1, inplace=True)

# Fill NaN values with mean
imputer = SimpleImputer(strategy='mean')
data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

# Split features and target variable
X = data_imputed.drop('class', axis=1)
y = data_imputed['class']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define pipelines for different algorithms with hyperparameter tuning
pipe_lr = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter=1000))
])

pipe_dt = Pipeline([
    ('clf', DecisionTreeClassifier())
])

pipe_rf = Pipeline([
    ('clf', RandomForestClassifier())
])

pipe_svm = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', SVC())
])

pipe_gb = Pipeline([
    ('clf', GradientBoostingClassifier())
])

pipe_ab = Pipeline([
    ('clf', AdaBoostClassifier())
])

# Define parameter grids for each algorithm
param_grid_lr = {
    'clf__C': [0.01, 0.1, 1, 10, 100],
    'clf__solver': ['liblinear', 'lbfgs']
}

param_grid_dt = {
    'clf__max_depth': [3, 5, 7, 10],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 2, 4]
}

param_grid_rf = {
    'clf__n_estimators': [100, 200, 300],
    'clf__max_depth': [5, 10, 15],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 2, 4]
}

param_grid_svm = {
    'clf__C': [0.1, 1, 10],
    'clf__gamma': ['scale', 'auto'],
    'clf__kernel': ['linear', 'rbf']
}

param_grid_gb = {
    'clf__n_estimators': [50, 100, 150],
    'clf__learning_rate': [0.01, 0.1, 0.5],
    'clf__max_depth': [3, 5, 7]
}

param_grid_ab = {
    'clf__n_estimators': [50, 100, 150],
    'clf__learning_rate': [0.01, 0.1, 0.5]
}

# Define grid search objects for each algorithm
grid_lr = GridSearchCV(pipe_lr, param_grid=param_grid_lr, cv=5, scoring='accuracy', n_jobs=-1)
grid_dt = GridSearchCV(pipe_dt, param_grid=param_grid_dt, cv=5, scoring='accuracy', n_jobs=-1)
grid_rf = GridSearchCV(pipe_rf, param_grid=param_grid_rf, cv=5, scoring='accuracy', n_jobs=-1)
grid_svm = GridSearchCV(pipe_svm, param_grid=param_grid_svm, cv=5, scoring='accuracy', n_jobs=-1)
grid_gb = GridSearchCV(pipe_gb, param_grid=param_grid_gb, cv=5, scoring='accuracy', n_jobs=-1)
grid_ab = GridSearchCV(pipe_ab, param_grid=param_grid_ab, cv=5, scoring='accuracy', n_jobs=-1)

# Fit grid search objects to training data
grid_lr.fit(X_train, y_train)
grid_dt.fit(X_train, y_train)
grid_rf.fit(X_train, y_train)
grid_svm.fit(X_train, y_train)
grid_gb.fit(X_train, y_train)
grid_ab.fit(X_train, y_train)

# Get accuracy scores on test data
lr_accuracy = accuracy_score(y_test, grid_lr.predict(X_test))
dt_accuracy = accuracy_score(y_test, grid_dt.predict(X_test))
rf_accuracy = accuracy_score(y_test, grid_rf.predict(X_test))
svm_accuracy = accuracy_score(y_test, grid_svm.predict(X_test))
gb_accuracy = accuracy_score(y_test, grid_gb.predict(X_test))
ab_accuracy = accuracy_score(y_test, grid_ab.predict(X_test))

# Print accuracy scores
print("Logistic Regression accuracy:", lr_accuracy)
print("Decision Tree accuracy:", dt_accuracy)
print("Random Forest accuracy:", rf_accuracy)
print("SVM accuracy:", svm_accuracy)
print("Gradient Boosting accuracy:", gb_accuracy)
print("AdaBoost accuracy:", ab_accuracy)

# Find the best performing algorithm
accuracies = {
    'Logistic Regression': lr_accuracy,
    'Decision Tree': dt_accuracy,
    'Random Forest': rf_accuracy,
    'SVM': svm_accuracy,
    'Gradient Boosting': gb_accuracy,
    'AdaBoost': ab_accuracy
}

best_algorithm = max(accuracies, key=accuracies.get)
print("Best performing algorithm:", best_algorithm)

# Print classification report for the best performing algorithm
if best_algorithm == 'Logistic Regression':
    print(classification_report(y_test, grid_lr.predict(X_test)))
elif best_algorithm == 'Decision Tree':
    print(classification_report(y_test, grid_dt.predict(X_test)))
elif best_algorithm == 'Random Forest':
    print(classification_report(y_test, grid_rf.predict(X_test)))
elif best_algorithm == 'SVM':
        print(classification_report(y_test, grid_svm.predict(X_test)))
elif best_algorithm == 'Gradient Boosting':
    print(classification_report(y_test, grid_gb.predict(X_test)))
else:
    print(classification_report(y_test, grid_ab.predict(X_test)))



Logistic Regression accuracy: 0.7435897435897436
Decision Tree accuracy: 0.7008547008547008
Random Forest accuracy: 0.7435897435897436
SVM accuracy: 0.7435897435897436
Gradient Boosting accuracy: 0.7350427350427351
AdaBoost accuracy: 0.7435897435897436
Best performing algorithm: Logistic Regression
              precision    recall  f1-score   support

         1.0       0.74      1.00      0.85        87
         2.0       0.00      0.00      0.00        30

    accuracy                           0.74       117
   macro avg       0.37      0.50      0.43       117
weighted avg       0.55      0.74      0.63       117



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [3]:
from joblib import dump, load

dump(grid_lr.best_estimator_, 'liver_model.joblib')

['liver_model.joblib']

In [7]:
json_data = {
  "totalBilirubin": "2.2",
  "directBilirubin": "0.8",
  "alkalinePhosphatase": "200",
  "alamineAminotransferase": "50",
  "totalProtein": "7.2",
  "albumin": "4.0",
  "albuminAndGlobulinRatio": "1.5"
}

if 'indirectBilirubin' in json_data:
    total_bilirubin = float(json_data['indirectBilirubin']) + float(json_data['directBilirubin'])
    json_data['totalBilirubin'] = str(total_bilirubin)

# elif 'totalBilirubin' in json_data:
#     json_data.pop('totalBilirubin')

# List of elements to consider
elements_to_consider = ['totalBilirubin', 'directBilirubin', 'alkalinePhosphatase', 'alamineAminotransferase', 'totalProtein', 'albumin', 'albuminAndGlobulinRatio']
input_list = []

# Extract elements from JSON data and append to input list
for element in elements_to_consider:
    if element in json_data:
        input_list.append(json_data.pop(element))

print(input_list)



['2.2', '0.8', '200', '50', '7.2', '4.0', '1.5']


In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer

# Load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00225/Indian%20Liver%20Patient%20Dataset%20(ILPD).csv"
data = pd.read_csv(url, header=None)
data.columns = ['age', 'gender', 'total_bilirubin', 'direct_bilirubin', 'alkaline_phosphotase', 'alamine_aminotransferase', 'aspartate_aminotransferase', 'total_proteins', 'albumin', 'albumin_and_globulin_ratio', 'class']

data['gender'] = data['gender'].replace({'Male': 0, 'Female': 1})
# Fill NaN values with mean
imputer = SimpleImputer(strategy='mean')
data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

# Split features and target variable
X = data_imputed.drop('class', axis=1)
y = data_imputed['class']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define pipelines for different algorithms with hyperparameter tuning
pipe_lr = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter=1000))
])

pipe_dt = Pipeline([
    ('clf', DecisionTreeClassifier())
])

pipe_rf = Pipeline([
    ('clf', RandomForestClassifier())
])

pipe_svm = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', SVC())
])

pipe_gb = Pipeline([
    ('clf', GradientBoostingClassifier())
])

pipe_ab = Pipeline([
    ('clf', AdaBoostClassifier())
])

# Define parameter grids for each algorithm
param_grid_lr = {
    'clf__C': [0.01, 0.1, 1, 10, 100],
    'clf__solver': ['liblinear', 'lbfgs']
}

param_grid_dt = {
    'clf__max_depth': [3, 5, 7, 10],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 2, 4]
}

param_grid_rf = {
    'clf__n_estimators': [100, 200, 300],
    'clf__max_depth': [5, 10, 15],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 2, 4]
}

param_grid_svm = {
    'clf__C': [0.1, 1, 10],
    'clf__gamma': ['scale', 'auto'],
    'clf__kernel': ['linear', 'rbf']
}

param_grid_gb = {
    'clf__n_estimators': [50, 100, 150],
    'clf__learning_rate': [0.01, 0.1, 0.5],
    'clf__max_depth': [3, 5, 7]
}

param_grid_ab = {
    'clf__n_estimators': [50, 100, 150],
    'clf__learning_rate': [0.01, 0.1, 0.5]
}

# Define grid search objects for each algorithm
grid_lr = GridSearchCV(pipe_lr, param_grid=param_grid_lr, cv=5, scoring='accuracy', n_jobs=-1)
grid_dt = GridSearchCV(pipe_dt, param_grid=param_grid_dt, cv=5, scoring='accuracy', n_jobs=-1)
grid_rf = GridSearchCV(pipe_rf, param_grid=param_grid_rf, cv=5, scoring='accuracy', n_jobs=-1)
grid_svm = GridSearchCV(pipe_svm, param_grid=param_grid_svm, cv=5, scoring='accuracy', n_jobs=-1)
grid_gb = GridSearchCV(pipe_gb, param_grid=param_grid_gb, cv=5, scoring='accuracy', n_jobs=-1)
grid_ab = GridSearchCV(pipe_ab, param_grid=param_grid_ab, cv=5, scoring='accuracy', n_jobs=-1)

# Fit grid search objects to training data
grid_lr.fit(X_train, y_train)
grid_dt.fit(X_train, y_train)
grid_rf.fit(X_train, y_train)
grid_svm.fit(X_train, y_train)
grid_gb.fit(X_train, y_train)
grid_ab.fit(X_train, y_train)

# Get accuracy scores on test data
lr_accuracy = accuracy_score(y_test, grid_lr.predict(X_test))
dt_accuracy = accuracy_score(y_test, grid_dt.predict(X_test))
rf_accuracy = accuracy_score(y_test, grid_rf.predict(X_test))
svm_accuracy = accuracy_score(y_test, grid_svm.predict(X_test))
gb_accuracy = accuracy_score(y_test, grid_gb.predict(X_test))
ab_accuracy = accuracy_score(y_test, grid_ab.predict(X_test))

# Print accuracy scores
print("Logistic Regression accuracy:", lr_accuracy)
print("Decision Tree accuracy:", dt_accuracy)
print("Random Forest accuracy:", rf_accuracy)
print("SVM accuracy:", svm_accuracy)
print("Gradient Boosting accuracy:", gb_accuracy)
print("AdaBoost accuracy:", ab_accuracy)

# Find the best performing algorithm
accuracies = {
    'Logistic Regression': lr_accuracy,
    'Decision Tree': dt_accuracy,
    'Random Forest': rf_accuracy,
    'SVM': svm_accuracy,
    'Gradient Boosting': gb_accuracy,
    'AdaBoost': ab_accuracy
}

best_algorithm = max(accuracies, key=accuracies.get)
print("Best performing algorithm:", best_algorithm)

# Print classification report for the best performing algorithm
if best_algorithm == 'Logistic Regression':
    print(classification_report(y_test, grid_lr.predict(X_test)))
elif best_algorithm == 'Decision Tree':
    print(classification_report(y_test, grid_dt.predict(X_test)))
elif best_algorithm == 'Random Forest':
    print(classification_report(y_test, grid_rf.predict(X_test)))
elif best_algorithm == 'SVM':
        print(classification_report(y_test, grid_svm.predict(X_test)))
elif best_algorithm == 'Gradient Boosting':
    print(classification_report(y_test, grid_gb.predict(X_test)))
else:
    print(classification_report(y_test, grid_ab.predict(X_test)))



Logistic Regression accuracy: 0.7435897435897436
Decision Tree accuracy: 0.7264957264957265
Random Forest accuracy: 0.7264957264957265
SVM accuracy: 0.7435897435897436
Gradient Boosting accuracy: 0.7094017094017094
AdaBoost accuracy: 0.7435897435897436
Best performing algorithm: Logistic Regression
              precision    recall  f1-score   support

         1.0       0.74      1.00      0.85        87
         2.0       0.00      0.00      0.00        30

    accuracy                           0.74       117
   macro avg       0.37      0.50      0.43       117
weighted avg       0.55      0.74      0.63       117



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
from joblib import dump, load

dump(grid_lr.best_estimator_, 'liver_model_2.joblib')

['liver_model_2.joblib']