## Imports & Project Setup
✔ Sets correct project root
✔ Allows importing from src/
✔ Verifies processed data is available
✔ Prevents path-related bugs early

In [1]:
# ========== BASIC IMPORTS ==========
import sys
from pathlib import Path
import yaml
import pandas as pd
import numpy as np

# ========== PROJECT ROOT ==========
# The project root is taken to be the parent of the current working
# directory.
# NOTE(review): this relies on the kernel being started from a direct
# sub-folder of the project (e.g. notebooks/) — confirm for other setups.
PROJECT_ROOT = Path("..").resolve()

# Register the root only once: without the membership check, every re-run
# of this cell would append another duplicate entry to sys.path.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

print("Project root:", PROJECT_ROOT)
print("Processed data folder exists:",
      (PROJECT_ROOT / "data/processed").exists())


Project root: C:\Project_Final_Year
Processed data folder exists: True


## Load Config & Model Selection
✔ Reads model choice from config (no hardcoding)
✔ Makes the notebook model-agnostic
✔ Allows switching models by editing YAML only

In [2]:
# ========== LOAD CONFIG ==========
# Pass the encoding explicitly so that reading the YAML file does not depend
# on the platform's default text encoding (which is not UTF-8 on every OS).
with open(PROJECT_ROOT / "config/default.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# ========== MODEL CONFIG ==========
MODEL_NAME = config["model"]["name"]
# An empty `params:` entry is parsed as None, and the key may be absent
# altogether; fall back to an empty dict so the parameters can always be
# unpacked with ** when the model is built.
MODEL_PARAMS = config["model"].get("params") or {}

print("Selected model:", MODEL_NAME)
print("Model parameters:", MODEL_PARAMS)


Selected model: random_forest
Model parameters: {'n_estimators': 100, 'max_depth': None}


## Load Processed Train & Test Data
✔ Loads model-ready data
✔ No preprocessing here (already done in Notebook 01)
✔ Keeps pipeline modular and clean

In [3]:
# ========== LOAD PROCESSED DATA ==========
# Locations of the train / test CSV files under data/processed.
train_path, test_path = (
    PROJECT_ROOT / "data/processed/dataset_processed_train.csv",
    PROJECT_ROOT / "data/processed/dataset_processed_test.csv",
)

# The files are read as-is; this cell performs no further preprocessing.
train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

# Last expression of the cell: renders a preview of the first training rows.
train_df.head()


Train shape: (26048, 108)
Test shape: (6513, 108)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,98,99,100,101,102,103,104,105,106,label
0,-0.332401,-0.126674,1.132737,-0.146613,-0.217898,-0.03412,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,>50K
1,-1.066162,0.257445,-0.419226,-0.146613,-0.217898,1.588572,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,<=50K
2,0.03448,-0.790729,-0.419226,-0.146613,-0.217898,-0.03412,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,<=50K
3,-0.772657,-1.015108,-0.031235,-0.146613,-0.217898,-0.03412,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,<=50K
4,1.281873,-0.863434,-2.359179,-0.146613,-0.217898,-0.03412,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,<=50K


## Separate Features & Labels
✔ Clean separation of inputs and targets
✔ Required for any ML model
✔ Keeps later signal computations simple

In [4]:
# ========== SPLIT FEATURES & LABEL ==========
# Feature matrices: every column except the "label" target column.
X_train = train_df.drop("label", axis=1)
X_test = test_df.drop("label", axis=1)

# Targets: the "label" column on its own.
y_train, y_test = train_df["label"], test_df["label"]

print("X_train:", X_train.shape)
print("y_train:", y_train.shape)
print("X_test:", X_test.shape)
print("y_test:", y_test.shape)


X_train: (26048, 107)
y_train: (26048,)
X_test: (6513, 107)
y_test: (6513,)


## Define BaseModel
✔ Forces all models to expose the same interface (`train`, `predict`, `predict_proba`)
✔ Enables easy model swapping
✔ Required for confidence-based signals later
✔ Clean software-engineering design

In [5]:
# ========== BASE MODEL (ABSTRACT CONTRACT) ==========
from abc import ABC, abstractmethod

class BaseModel(ABC):
    """Abstract contract shared by every model wrapper in this notebook.

    Concrete subclasses must implement ``train``, ``predict`` and
    ``predict_proba``; the class cannot be instantiated until all three
    are provided.
    """

    def __init__(self, **params) -> None:
        """Store the keyword parameters and start without an estimator."""
        self.model = None
        self.params = params

    @abstractmethod
    def train(self, X, y):
        """Fit the wrapped estimator on features ``X`` and targets ``y``."""

    @abstractmethod
    def predict(self, X):
        """Return the predictions for ``X``."""

    @abstractmethod
    def predict_proba(self, X):
        """Return the probability estimates for ``X``."""


## Implement Models + Model Factory
✔ Any model can be added later
✔ Config controls everything
✔ Switching between supported models needs no notebook changes (a new model only needs a wrapper class and a factory entry)
✔ Fully reusable engine logic

In [7]:
# ========== MODEL IMPLEMENTATIONS ==========
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

class RandomForestModel(BaseModel):
    """BaseModel wrapper around scikit-learn's ``RandomForestClassifier``."""

    def __init__(self, **params):
        """Build the forest, forwarding ``params`` to the estimator unchanged."""
        super().__init__(**params)
        forest = RandomForestClassifier(**params)
        self.model = forest

    def train(self, X, y):
        """Fit the wrapped forest on ``X`` / ``y``; returns nothing."""
        estimator = self.model
        estimator.fit(X, y)

    def predict(self, X):
        """Return the wrapped estimator's ``predict`` output for ``X``."""
        predictions = self.model.predict(X)
        return predictions

    def predict_proba(self, X):
        """Return the wrapped estimator's ``predict_proba`` output for ``X``."""
        probabilities = self.model.predict_proba(X)
        return probabilities


class LogisticModel(BaseModel):
    """BaseModel wrapper around scikit-learn's ``LogisticRegression``.

    ``max_iter`` defaults to 1000 here, but a ``max_iter`` supplied through
    ``params`` (i.e. from the config file) takes precedence.
    """

    def __init__(self, **params):
        """Build the estimator from ``params`` on top of the local defaults."""
        super().__init__(**params)
        # Merge the default into one dict instead of passing max_iter=1000
        # next to **params: with both, a config that sets max_iter raises
        # "TypeError: got multiple values for keyword argument 'max_iter'".
        estimator_kwargs = {"max_iter": 1000, **params}
        self.model = LogisticRegression(**estimator_kwargs)

    def train(self, X, y):
        """Fit the wrapped estimator on ``X`` / ``y``; returns nothing."""
        self.model.fit(X, y)

    def predict(self, X):
        """Return the wrapped estimator's ``predict`` output for ``X``."""
        return self.model.predict(X)

    def predict_proba(self, X):
        """Return the wrapped estimator's ``predict_proba`` output for ``X``."""
        return self.model.predict_proba(X)


class SVMModel(BaseModel):
    """BaseModel wrapper around scikit-learn's ``SVC``.

    ``probability`` is always set to ``True`` because ``predict_proba`` is
    part of the interface this class implements; a ``probability`` value
    supplied through ``params`` is overridden.
    """

    def __init__(self, **params):
        """Build the estimator from ``params`` with probability enabled."""
        super().__init__(**params)
        # Merge into one dict instead of passing probability=True next to
        # **params: with both, a config that mentions `probability` raises
        # "TypeError: got multiple values for keyword argument".
        # probability=True goes last so it always wins.
        estimator_kwargs = {**params, "probability": True}
        self.model = SVC(**estimator_kwargs)

    def train(self, X, y):
        """Fit the wrapped estimator on ``X`` / ``y``; returns nothing."""
        self.model.fit(X, y)

    def predict(self, X):
        """Return the wrapped estimator's ``predict`` output for ``X``."""
        return self.model.predict(X)

    def predict_proba(self, X):
        """Return the wrapped estimator's ``predict_proba`` output for ``X``."""
        return self.model.predict_proba(X)


# ========== MODEL FACTORY ==========
def get_model(model_name, params):
    """Build the model wrapper registered under ``model_name``.

    Args:
        model_name: One of ``"random_forest"``, ``"logistic"`` or ``"svm"``.
        params: Mapping of keyword arguments forwarded to the wrapper's
            constructor. ``None`` (e.g. an empty ``params:`` entry in the
            YAML config) is treated as "no parameters".

    Returns:
        A new instance of the selected wrapper class.

    Raises:
        ValueError: If ``model_name`` is not a supported model.
    """
    # Unpacking None with ** raises TypeError, so normalise it first.
    kwargs = params or {}

    # Dispatch table; the lambdas defer construction so that only the
    # selected wrapper class is ever touched.
    builders = {
        "random_forest": lambda: RandomForestModel(**kwargs),
        "logistic": lambda: LogisticModel(**kwargs),
        "svm": lambda: SVMModel(**kwargs),
    }

    try:
        build = builders[model_name]
    except (KeyError, TypeError):
        # TypeError covers unhashable names, which the lookup cannot handle.
        raise ValueError(f"Unsupported model: {model_name}") from None
    return build()


## Train Model & Evaluate Performance
✔ Trains the selected model
✔ Uses same code for any model
✔ Evaluates performance (baseline)
✔ Confirms data + model are correct

In [8]:
# ========== EVALUATION IMPORTS ==========
from sklearn.metrics import accuracy_score, classification_report

# ========== CREATE & TRAIN MODEL ==========
# The wrapper class is chosen by MODEL_NAME, so this cell stays the same
# regardless of which supported model is selected.
model = get_model(MODEL_NAME, MODEL_PARAMS)
model.train(X_train, y_train)

print(f"Model '{MODEL_NAME}' trained successfully")

# ========== PREDICTIONS ==========
# y_pred feeds the metrics below; y_proba is computed here but not used in
# this cell.
y_pred, y_proba = model.predict(X_test), model.predict_proba(X_test)

# ========== EVALUATION ==========
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

print("\nClassification Report:")
report = classification_report(y_test, y_pred)
print(report)


Model 'random_forest' trained successfully
Accuracy: 0.8522954091816367

Classification Report:
              precision    recall  f1-score   support

       <=50K       0.88      0.93      0.90      4945
        >50K       0.73      0.62      0.67      1568

    accuracy                           0.85      6513
   macro avg       0.81      0.77      0.79      6513
weighted avg       0.85      0.85      0.85      6513



### Notebook 02 establishes a pluggable model training framework. The model choice is controlled via configuration, enabling SLDCE to operate independently of the underlying classifier.