In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Step 2: Load dataset
df = pd.read_csv("project-data_(1)_(1)(1).csv")
print("Shape of dataset:", df.shape)
df.head()

Shape of dataset: (615, 13)


Unnamed: 0,category,age,sex,albumin,alkaline_phosphatase,alanine_aminotransferase,aspartate_aminotransferase,bilirubin,cholinesterase,cholesterol,creatinina,gamma_glutamyl_transferase,protein
0,no_disease,32,m,38.5,52.5,7.7,22.1,7.5,6.93,3.23,106.0,12.1,69.0
1,no_disease,32,m,38.5,70.3,18.0,24.7,3.9,11.17,4.8,74.0,15.6,76.5
2,no_disease,32,m,46.9,74.7,36.2,52.6,6.1,8.84,5.2,86.0,33.2,79.3
3,no_disease,32,m,43.2,52.0,30.6,22.6,18.9,7.33,4.74,80.0,33.8,75.7
4,no_disease,32,m,39.2,74.1,32.6,24.8,9.6,9.15,4.32,76.0,29.9,68.7


In [2]:
# Step 3: Clean column names
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")
print("Columns:", df.columns.tolist())


Columns: ['category', 'age', 'sex', 'albumin', 'alkaline_phosphatase', 'alanine_aminotransferase', 'aspartate_aminotransferase', 'bilirubin', 'cholinesterase', 'cholesterol', 'creatinina', 'gamma_glutamyl_transferase', 'protein']


In [3]:
# Step 5: Basic info
print("\nDataset Info:")
print(df.info())

# Step 6: Descriptive statistics (numeric features)
print("\nSummary statistics (numeric):")
print(df.describe().T)


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 615 entries, 0 to 614
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   category                    615 non-null    object 
 1   age                         615 non-null    int64  
 2   sex                         615 non-null    object 
 3   albumin                     614 non-null    float64
 4   alkaline_phosphatase        597 non-null    float64
 5   alanine_aminotransferase    614 non-null    float64
 6   aspartate_aminotransferase  615 non-null    float64
 7   bilirubin                   615 non-null    float64
 8   cholinesterase              615 non-null    float64
 9   cholesterol                 605 non-null    float64
 10  creatinina                  615 non-null    float64
 11  gamma_glutamyl_transferase  615 non-null    float64
 12  protein                     614 non-null    float64
dtypes: float64(10), int6

In [4]:
# Convert 'protein' column to numeric, coercing non-numeric values to NaN
df['protein'] = pd.to_numeric(df['protein'], errors='coerce')


In [5]:
# Check missing values
print("Missing values before handling:")
print(df.isnull().sum())

# Separate numerical and categorical columns
num_cols = df.select_dtypes(include=np.number).columns
cat_cols = df.select_dtypes(include=["object", "category"]).columns

# Handle missing values in numerical columns ‚Üí fill with mean
for col in num_cols:
    df[col].fillna(df[col].mean(), inplace=True)

# Handle missing values in categorical columns ‚Üí fill with mode (most frequent value)
for col in cat_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

# Verify missing values are handled
print("\nMissing values after handling:")
print(df.isnull().sum())


Missing values before handling:
category                       0
age                            0
sex                            0
albumin                        1
alkaline_phosphatase          18
alanine_aminotransferase       1
aspartate_aminotransferase     0
bilirubin                      0
cholinesterase                 0
cholesterol                   10
creatinina                     0
gamma_glutamyl_transferase     0
protein                        1
dtype: int64

Missing values after handling:
category                      0
age                           0
sex                           0
albumin                       0
alkaline_phosphatase          0
alanine_aminotransferase      0
aspartate_aminotransferase    0
bilirubin                     0
cholinesterase                0
cholesterol                   0
creatinina                    0
gamma_glutamyl_transferase    0
protein                       0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [6]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Step 2: Define target and features
target_col = "category"
X = df.drop(columns=[target_col])
y = df[target_col]

# Step 3: Encode target if categorical
if y.dtype == "object":
    le = LabelEncoder()
    y = le.fit_transform(y)

# Step 4: One-hot encode categorical features
X = pd.get_dummies(X, drop_first=True)

# Step 5: Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 6: Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 7: Define models
models = {
  "Logistic Regression": LogisticRegression(max_iter=1000),
  "Decision Tree": DecisionTreeClassifier(random_state=42),
  "Random Forest": RandomForestClassifier(random_state=42)
}

# Step 8: Cross-validation & Evaluation
cv_results = {}
for name, model in models.items():
    # Cross-validation (5-fold)
    scores = cross_val_score(model, X, y, cv=5, scoring="accuracy")
    cv_results[name] = scores.mean()

    # Train on train set & evaluate on test set
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(f"\nüîπ {name} Results:")
    print("Cross-validation Accuracy:", scores.mean())
    print("Test Accuracy:", accuracy_score(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


üîπ Logistic Regression Results:
Cross-validation Accuracy: 0.9154471544715447
Test Accuracy: 0.926829268292683
Confusion Matrix:
 [[  5   1   0   0   0]
 [  1   1   1   1   0]
 [  1   0   1   3   0]
 [  0   0   0 107   0]
 [  0   0   0   1   0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.83      0.77         6
           1       0.50      0.25      0.33         4
           2       0.50      0.20      0.29         5
           3       0.96      1.00      0.98       107
           4       0.00      0.00      0.00         1

    accuracy                           0.93       123
   macro avg       0.53      0.46      0.47       123
weighted avg       0.90      0.93      0.91       123


üîπ Decision Tree Results:
Cross-validation Accuracy: 0.8910569105691056
Test Accuracy: 0.9024390243902439
Confusion Matrix:
 [[  5   0   1   0   0]
 [  1   2   1   0   0]
 [  2   1   1   1   0]
 [  2   3   0 102   0]
 [  0   0   0   0 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [7]:
# Model Evaluation
# Step 1: Import libraries
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve

# Step 2: Store results
results = []

# Step 3: Train, cross-validate & evaluate all models
for name, model in models.items():
    # Cross-validation (5-fold)
    cv_score = cross_val_score(model, X, y, cv=5, scoring="accuracy").mean()

    # Train on train set
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Evaluation metrics
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average="weighted")
    rec = recall_score(y_test, y_pred, average="weighted")
    f1 = f1_score(y_test, y_pred, average="weighted")

    results.append({
        "Model": name,
        "CV Accuracy": cv_score,
        "Test Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1-score": f1
        })

# Step 4: Convert results to DataFrame
results_df = pd.DataFrame(results)
print("\nüìä Model Evaluation Summary:\n")
print(results_df)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


üìä Model Evaluation Summary:

                 Model  CV Accuracy  Test Accuracy  Precision    Recall  \
0  Logistic Regression     0.915447       0.926829   0.902512  0.926829   
1        Decision Tree     0.891057       0.902439   0.918383  0.902439   
2        Random Forest     0.925203       0.926829   0.893739  0.926829   

   F1-score  
0  0.910036  
1  0.906852  
2  0.909547  


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
import joblib

# --- After your evaluation loop ---

# Step 5: Save all trained models
for name, model in models.items():
    filename = f"{name.replace(' ', '_').lower()}_model.pkl"
    joblib.dump(model, filename)
    print(f"‚úÖ {name} saved as {filename}")

# Step 6: (Optional) Save the best model only
best_model_name = results_df.sort_values(by="Test Accuracy", ascending=False).iloc[0]["Model"]
best_model = models[best_model_name]
joblib.dump(best_model, "best_model.pkl")
print(f"\nüèÜ Best model ({best_model_name}) saved as best_model.pkl")


‚úÖ Logistic Regression saved as logistic_regression_model.pkl
‚úÖ Decision Tree saved as decision_tree_model.pkl
‚úÖ Random Forest saved as random_forest_model.pkl

üèÜ Best model (Logistic Regression) saved as best_model.pkl


In [9]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.49.1-py3-none-any.whl.metadata (9.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.49.1-py3-none-any.whl (10.0 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m10.0/10.0 MB[0m [31m43.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m6.9/6.9 MB[0m [31m84.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydeck, streamlit
Successfully installed pydeck-0.9.1 streamlit-1.49.1


In [13]:
!pip install streamlit pyngrok

Collecting pyngrok
  Downloading pyngrok-7.3.0-py3-none-any.whl.metadata (8.1 kB)
Downloading pyngrok-7.3.0-py3-none-any.whl (25 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.3.0


In [14]:
# app.py
%%writefile app.py
import streamlit as st
import pandas as pd
import joblib

# Load the trained model
model = joblib.load("best_model.pkl")  # Change filename if needed

# Page configuration
st.set_page_config(
    page_title="Liver Disease Predictor",
    page_icon="üíâ",
    layout="centered"
)

# Title and description
st.title("üíâ Liver Disease Prediction App")
st.markdown("""
Enter patient details in the sidebar to predict the likelihood of liver disease.
""")

# Sidebar inputs
st.sidebar.header("Patient Information")

age = st.sidebar.number_input("Age", min_value=1, max_value=120, value=30)
gender = st.sidebar.selectbox("Gender", ["Male", "Female"])
total_bilirubin = st.sidebar.number_input("Total Bilirubin", min_value=0.0, value=1.0)
direct_bilirubin = st.sidebar.number_input("Direct Bilirubin", min_value=0.0, value=0.3)
alk_phosphate = st.sidebar.number_input("Alkaline Phosphotase", min_value=0.0, value=200.0)
alamine_aminotransferase = st.sidebar.number_input("Alamine Aminotransferase", min_value=0.0, value=30.0)
aspartate_aminotransferase = st.sidebar.number_input("Aspartate Aminotransferase", min_value=0.0, value=35.0)
total_proteins = st.sidebar.number_input("Total Proteins", min_value=0.0, value=6.5)
albumin = st.sidebar.number_input("Albumin", min_value=0.0, value=3.5)
albumin_globulin_ratio = st.sidebar.number_input("Albumin and Globulin Ratio", min_value=0.0, value=1.0)

# Convert gender to numeric
gender_val = 1 if gender == "Male" else 0

# Hidden features to match model's 12 features
# Replace with actual feature names if known
feature_x = 0.0
feature_y = 0.0

# Prepare input for prediction
input_data = pd.DataFrame({
    "Age": [age],
    "Gender": [gender_val],
    "Total_Bilirubin": [total_bilirubin],
    "Direct_Bilirubin": [direct_bilirubin],
    "Alkaline_Phosphotase": [alk_phosphate],
    "Alamine_Aminotransferase": [alamine_aminotransferase],
    "Aspartate_Aminotransferase": [aspartate_aminotransferase],
    "Total_Protiens": [total_proteins],
    "Albumin": [albumin],
    "Albumin_and_Globulin_Ratio": [albumin_globulin_ratio],
    "Feature_X": [feature_x],
    "Feature_Y": [feature_y]
})

# Predict button
if st.button("Predict"):
    try:
        prediction = model.predict(input_data.values)[0]  # Use .values to avoid warnings
        st.markdown("---")
        if prediction == 1:
            st.error("‚ö†Ô∏è The patient is likely to have liver disease.")
        else:
            st.success("‚úÖ The patient is unlikely to have liver disease.")
    except ValueError as e:
        st.error(f"Prediction error: {e}")
        st.info("Please check all input values.")


Overwriting app.py


In [15]:
# --- Launch with ngrok ---
from pyngrok import ngrok
import threading, os

# üîë Add your token
ngrok.set_auth_token("32HgEzLcjbyB46LOHRgyIUHyUx9_3naMu8pjKcbB3MSgpeMVm")

# üöÄ Start Streamlit in background
def run_app():
    os.system("streamlit run app.py --server.port 8501")

thread = threading.Thread(target=run_app)
thread.start()

# üåç Public link
public_url = ngrok.connect(8501)
print("‚úÖ Streamlit App is running here:", public_url)

‚úÖ Streamlit App is running here: NgrokTunnel: "https://dc699b44535d.ngrok-free.app" -> "http://localhost:8501"
