## Data Loading

The training and testing datasets were loaded from local CSV files using pandas.
File paths were constructed dynamically to ensure portability across different
environments. An initial inspection of the training data was performed to verify
successful loading and confirm the structure of the dataset.


In [7]:
import os
import pandas as pd

# Automatically get your username + home folder
home = os.path.expanduser("~")

train_path = os.path.join(home, "Downloads", "Hepatitis-Train.csv")
test_path  = os.path.join(home, "Downloads", "Hepatitis-Test.csv")

print("Train path:", train_path)
print("Test path:", test_path)

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

train_df.head()


Train path: /Users/allisonarvizu/Downloads/Hepatitis-Train.csv
Test path: /Users/allisonarvizu/Downloads/Hepatitis-Test.csv


Unnamed: 0,Age,Sex,Steroid,Antivirals,Fatigue,Malaise,Anorexia,Liver Big,Liver Firm,Spleen Palpable,Spiders,Ascites,Varices,Bilirubin,ALK Phosphate,SGOT,Albumin,PROTIME,Histology,TARGET
0,30,Female,no,yes,yes,yes,yes,no,yes,yes,yes,yes,yes,1.0,85,18,4.0,62.16,no,2
1,50,Male,no,yes,no,yes,yes,no,yes,yes,yes,yes,yes,0.9,135,42,3.5,62.16,no,2
2,78,Male,yes,yes,no,yes,yes,yes,yes,yes,yes,yes,yes,0.7,96,32,4.0,62.16,no,2
3,31,Male,no,no,yes,yes,yes,yes,yes,yes,yes,yes,yes,0.7,46,52,4.0,80.0,no,2
4,34,Male,yes,yes,yes,yes,yes,yes,yes,yes,yes,yes,yes,1.0,104,200,4.0,62.16,no,2


## Feature Types and Preprocessing Pipeline

The dataset contains a mix of numerical and categorical features. Numerical and
categorical columns were identified programmatically to ensure flexibility and
reduce manual specification.

A preprocessing pipeline was constructed to standardize numerical features and
apply one-hot encoding to categorical variables. This approach allows consistent
and reusable preprocessing across all classification models while preventing
data leakage.


In [8]:
import numpy as np

numeric_cols = train_df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = train_df.select_dtypes(exclude=[np.number]).columns.tolist()

print("Numeric columns:", numeric_cols)
print("Categorical columns:", categorical_cols)


Numeric columns: ['Age', 'Bilirubin', 'ALK Phosphate', 'SGOT', 'Albumin', 'PROTIME', 'TARGET']
Categorical columns: ['Sex', 'Steroid', 'Antivirals', 'Fatigue', 'Malaise', 'Anorexia', 'Liver Big', 'Liver Firm', 'Spleen Palpable', 'Spiders', 'Ascites', 'Varices', 'Histology']


In [9]:
target_col = "YOUR_COLUMN_NAME"


In [11]:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

numeric_cols = train_df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = train_df.select_dtypes(exclude=[np.number]).columns.tolist()

numeric_cols = [c for c in numeric_cols if c != target_col]
categorical_cols = [c for c in categorical_cols if c != target_col]

numeric_transformer = Pipeline(
    steps=[("scaler", StandardScaler())]
)

categorical_transformer = Pipeline(
    steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

print("Preprocessor created.")
print("Numeric columns:", numeric_cols)
print("Categorical columns:", categorical_cols)


Preprocessor created.
Numeric columns: ['Age', 'Bilirubin', 'ALK Phosphate', 'SGOT', 'Albumin', 'PROTIME', 'TARGET']
Categorical columns: ['Sex', 'Steroid', 'Antivirals', 'Fatigue', 'Malaise', 'Anorexia', 'Liver Big', 'Liver Firm', 'Spleen Palpable', 'Spiders', 'Ascites', 'Varices', 'Histology']


## Target Variable and Modeling Pipeline Setup

The target variable was identified from the dataset and separated from the feature
set for both the training and testing data. Features were divided into numerical and
categorical types to support appropriate preprocessing.

A unified preprocessing pipeline was constructed using a column transformer. Numerical
features were standardized, while categorical features were one-hot encoded. This
pipeline is reused across all classification models to ensure consistent preprocessing
and fair model comparisons.


In [13]:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

target_col = train_df.columns[-1]
print("Using target column:", target_col)

X_train = train_df.drop(columns=[target_col])
y_train = train_df[target_col]

X_test = test_df.drop(columns=[target_col])
y_test = test_df[target_col]

numeric_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X_train.select_dtypes(exclude=[np.number]).columns.tolist()

print("Numeric columns:", numeric_cols)
print("Categorical columns:", categorical_cols)

numeric_transformer = Pipeline(
    steps=[("scaler", StandardScaler())]
)

categorical_transformer = Pipeline(
    steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

def evaluate_model(name, model):
    pipe = Pipeline(
        steps=[
            ("preprocess", preprocessor),
            ("clf", model),
        ]
    )
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    print(f"\n=== {name} ===")
    print(f"Accuracy : {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall   : {rec:.4f}")
    print(f"F1 Score : {f1:.4f}")

    return {
        "Model": name,
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1": f1,
        "Pipeline": pipe,
    }

results = []
results.append(evaluate_model("LinearSVC (default)", LinearSVC()))
results.append(evaluate_model("DecisionTree (default)", DecisionTreeClassifier()))
results.append(evaluate_model("RandomForest (default)", RandomForestClassifier()))
results.append(evaluate_model("KNN (default)", KNeighborsClassifier()))


Using target column: TARGET
Numeric columns: ['Age', 'Bilirubin', 'ALK Phosphate', 'SGOT', 'Albumin', 'PROTIME']
Categorical columns: ['Sex', 'Steroid', 'Antivirals', 'Fatigue', 'Malaise', 'Anorexia', 'Liver Big', 'Liver Firm', 'Spleen Palpable', 'Spiders', 'Ascites', 'Varices', 'Histology']

=== LinearSVC (default) ===
Accuracy : 0.7273
Precision: 0.7143
Recall   : 0.8333
F1 Score : 0.7692

=== DecisionTree (default) ===
Accuracy : 0.8182
Precision: 0.8333
Recall   : 0.8333
F1 Score : 0.8333

=== RandomForest (default) ===
Accuracy : 0.8182
Precision: 0.8333
Recall   : 0.8333
F1 Score : 0.8333

=== KNN (default) ===
Accuracy : 0.8182
Precision: 0.8333
Recall   : 0.8333
F1 Score : 0.8333


## Random Forest Hyperparameter Tuning

To improve model performance, hyperparameter tuning was performed on the Random Forest
classifier using randomized search with cross-validation. Several key hyperparameters,
including the number of trees, maximum tree depth, minimum samples for splits and leaves,
and feature selection strategy, were explored.

Randomized search was chosen to efficiently sample the hyperparameter space while
balancing computational cost. The best-performing model was selected based on F1 score
and used for subsequent evaluation on the test set.


In [14]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

rf_pipe = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("clf", RandomForestClassifier(random_state=42))
    ]
)

param_distributions = {
    "clf__n_estimators": randint(50, 300),
    "clf__max_depth": randint(2, 20),
    "clf__min_samples_split": randint(2, 20),
    "clf__min_samples_leaf": randint(1, 10),
    "clf__max_features": ["sqrt", "log2", None],
}

random_search = RandomizedSearchCV(
    rf_pipe,
    param_distributions=param_distributions,
    n_iter=30,             
    cv=5,                  
    scoring='f1',         
    random_state=42,
    n_jobs=-1
)

print("Running RandomizedSearchCV...")
random_search.fit(X_train, y_train)

print("\nBest Parameters Found:")
print(random_search.best_params_)

best_rf = random_search.best_estimator_
y_pred_tuned = best_rf.predict(X_test)

acc = accuracy_score(y_test, y_pred_tuned)
prec = precision_score(y_test, y_pred_tuned, zero_division=0)
rec = recall_score(y_test, y_pred_tuned, zero_division=0)
f1 = f1_score(y_test, y_pred_tuned, zero_division=0)

print("\n=== RandomForest (Tuned) ===")
print(f"Accuracy : {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall   : {rec:.4f}")
print(f"F1 Score : {f1:.4f}")

results.append({
    "Model": "RandomForest (Tuned)",
    "Accuracy": acc,
    "Precision": prec,
    "Recall": rec,
    "F1": f1,
    "Pipeline": best_rf
})


Running RandomizedSearchCV...

Best Parameters Found:
{'clf__max_depth': 2, 'clf__max_features': None, 'clf__min_samples_leaf': 3, 'clf__min_samples_split': 18, 'clf__n_estimators': 185}

=== RandomForest (Tuned) ===
Accuracy : 0.4545
Precision: 0.5000
Recall   : 0.1667
F1 Score : 0.2500


## Feature Importance from Tuned Random Forest

To better understand model behavior, feature importance scores from the tuned Random
Forest classifier were examined. These scores indicate the relative contribution of
each feature to the model’s predictions.

The top-ranked features provide insight into which clinical variables are most strongly
associated with hepatitis survival outcomes.


In [15]:
rf_clf = results[-1]["Pipeline"].named_steps["clf"]

ohe = results[-1]["Pipeline"].named_steps["preprocess"].named_transformers_["cat"].named_steps["onehot"]
cat_feature_names = ohe.get_feature_names_out(results[-1]["Pipeline"].named_steps["preprocess"].transformers_[1][2])

all_feature_names = np.concatenate([numeric_cols, cat_feature_names])

importances = rf_clf.feature_importances_

feature_importance_df = pd.DataFrame({
    "Feature": all_feature_names,
    "Importance": importances
}).sort_values("Importance", ascending=False)

top5 = feature_importance_df.head(5)
print("\nTop 5 Most Important Features (Tuned RF):")
print(top5)



Top 5 Most Important Features (Tuned RF):
        Feature  Importance
4       Albumin    0.469603
1     Bilirubin    0.184992
5       PROTIME    0.140076
29  Varices_yes    0.065931
28   Varices_no    0.035061


In [16]:
from sklearn.ensemble import StackingClassifier
from sklearn.neural_network import MLPClassifier

base_estimators = [
    ("svc", LinearSVC()),
    ("dt", DecisionTreeClassifier(random_state=42)),
    ("rf", RandomForestClassifier(random_state=42)),
    ("knn", KNeighborsClassifier())
]

stacking_clf = StackingClassifier(
    estimators=base_estimators,
    final_estimator=MLPClassifier(random_state=42, max_iter=1000),
    n_jobs=-1
)

stacking_pipe = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("clf", stacking_clf),
    ]
)

stacking_pipe.fit(X_train, y_train)
y_pred_stack = stacking_pipe.predict(X_test)

acc = accuracy_score(y_test, y_pred_stack)
prec = precision_score(y_test, y_pred_stack, zero_division=0)
rec = recall_score(y_test, y_pred_stack, zero_division=0)
f1 = f1_score(y_test, y_pred_stack, zero_division=0)

print("\n=== Stacking (LinearSVC + DT + RF + KNN → MLP) ===")
print(f"Accuracy : {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall   : {rec:.4f}")
print(f"F1 Score : {f1:.4f}")

results.append({
    "Model": "Stacking (MLP meta)",
    "Accuracy": acc,
    "Precision": prec,
    "Recall": rec,
    "F1": f1,
    "Pipeline": stacking_pipe
})

import pandas as pd

summary_df = pd.DataFrame(
    [
        {
            "Model": r["Model"],
            "Accuracy": r["Accuracy"],
            "Precision": r["Precision"],
            "Recall": r["Recall"],
            "F1": r["F1"],
        }
        for r in results
    ]
)

print("\n=== FINAL SUMMARY TABLE ===")
print(summary_df.to_string(index=False))



=== Stacking (LinearSVC + DT + RF + KNN → MLP) ===
Accuracy : 0.6364
Precision: 1.0000
Recall   : 0.3333
F1 Score : 0.5000

=== FINAL SUMMARY TABLE ===
                 Model  Accuracy  Precision   Recall       F1
   LinearSVC (default)  0.727273   0.714286 0.833333 0.769231
DecisionTree (default)  0.818182   0.833333 0.833333 0.833333
RandomForest (default)  0.818182   0.833333 0.833333 0.833333
         KNN (default)  0.818182   0.833333 0.833333 0.833333
  RandomForest (Tuned)  0.454545   0.500000 0.166667 0.250000
   Stacking (MLP meta)  0.636364   1.000000 0.333333 0.500000
