# Method selection experiments using pycaret

## Before all
The `ipykernel` package must be installed in this virtual environment in order to execute this Jupyter Python notebook.

In [1]:
import os

# Name of the repository root directory; the check at the end of this cell
# requires the working directory to end with it.
project_root_name = "demography-predictor"

# Notebooks do not define `__file__`, so the kernel's working directory is the
# only available anchor (expected to be <root>/src/model_selection).
current_dir = os.getcwd()

# If the cwd is not the project root, climb towards the filesystem root until a
# directory whose path ends with the project name is found, then switch to it.
# Searching the ancestors (instead of a fixed "../..") keeps this cell working
# regardless of how deep below the project root the kernel was started.
if not current_dir.endswith(project_root_name):
    candidate = current_dir
    while not candidate.endswith(project_root_name):
        parent = os.path.dirname(candidate)
        if parent == candidate:
            # Reached the filesystem root without a match; leave the cwd
            # untouched so the assertion below reports the problem.
            break
        candidate = parent
    else:
        os.chdir(candidate)

# Verify the correct directory (should end with demography-predictor)
print(os.getcwd())
assert os.getcwd().endswith(project_root_name), (
    f"Working directory must be the '{project_root_name}' project root, "
    f"got: {os.getcwd()}"
)

/Users/andrew/Andrew/School/4BIT/bc_thesis/demography-predictor


In [2]:
# Standard library and third-party imports
import logging
import pandas as pd
from typing import List, Dict, Any
import matplotlib.pyplot as plt

# Pycaret imports
from pycaret.regression import *

# Custom imports
from config import setup_logging, Config


## Basic config 

In [3]:
# Get config
# Project settings object; its `dataset_path` attribute is read when the
# dataset is loaded further down in this notebook.
config = Config()

# Get logger
# Named logger for messages emitted by this notebook.
logger = logging.getLogger("method_selection")

# Setup logging
# NOTE(review): the logger above is created before logging is configured —
# confirm that `setup_logging` does not disable already-existing loggers.
setup_logging()

## Load data 

In [8]:
def load_data() -> pd.DataFrame:
    """
    Read the CSV file referenced by `config.dataset_path`.

    Returns:
        pd.DataFrame: The file contents with every column name converted
        to lower case.
    """
    raw_frame = pd.read_csv(config.dataset_path)

    # Normalise the header so columns can be addressed in lower case
    return raw_frame.rename(columns=str.lower)

## Define regression experiment functions

In [9]:
def regression_experiment(data: pd.DataFrame, target: str, session_id: int = 42) -> Any:
    """
    Set up a PyCaret regression experiment and compare the available models.

    Args:
        data (pd.DataFrame): Dataset holding the features and the target column.
        target (str): Name of the column in `data` to predict.
        session_id (int, optional): Seed forwarded to PyCaret's `setup` so that
            runs are reproducible. Defaults to 42.

    Returns:
        Any: Whatever `compare_models()` returns. With its default arguments
        PyCaret documents this as the single best-ranked trained model,
        not a list.
    """
    # The functional PyCaret API keeps the experiment internally, so the value
    # returned by setup() does not need to be held on to here.
    setup(data, target=target, session_id=session_id)

    return compare_models()

def plot_results(best_model: List[Any]) -> None:
    """
    Draw the standard diagnostic plots for a model.

    Args:
        best_model: Model object passed straight through to PyCaret's
            `plot_model`, e.g. the result of `regression_experiment`.
    """
    # Plot kinds are rendered in this order, one `plot_model` call each
    diagnostic_plots = (
        "residuals",  # Residuals
        "error",  # Prediction Error
        "feature",  # Feature Importance
        "learning",  # Learning Curve
    )

    for plot_kind in diagnostic_plots:
        plot_model(best_model, plot=plot_kind)

## Experiment 1: using whole dataset
Input: the whole dataset. Target: the `population, total` column.

In [10]:
# Column to predict; written in lower case because load_data() lower-cases
# every column name
target_column = "population, total"

# Read the full dataset
data_df = load_data()

# Compare the candidate regressors on it and keep the returned model
best_model = regression_experiment(data=data_df, target=target_column)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,"population, total"
2,Target type,Regression
3,Original data shape,"(12387, 20)"
4,Transformed data shape,"(12387, 20)"
5,Transformed train set shape,"(8670, 20)"
6,Transformed test set shape,"(3717, 20)"
7,Numeric features,18
8,Categorical features,1
9,Preprocess,True


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,3213733.9139,184301043386160.47,13379025.321,0.9997,0.4714,0.5867,0.465
xgboost,Extreme Gradient Boosting,9375911.25,685159607173120.0,25937123.6,0.9989,1.0748,3.1314,0.079
lightgbm,Light Gradient Boosting Machine,10498820.1815,667190723153447.9,25380935.046,0.9989,1.2046,4.5939,0.275
rf,Random Forest Regressor,8077063.376,866922079279225.2,29136885.0789,0.9986,0.6391,1.1605,1.799
gbr,Gradient Boosting Regressor,20904417.1636,1430283324694376.5,37722761.1835,0.9976,1.6842,13.9584,0.894
dt,Decision Tree Regressor,10510802.2618,2672199182017948.0,48211365.0204,0.9958,0.4594,0.5785,0.05
ada,AdaBoost Regressor,154467185.6466,2.929629137321344e+16,170658954.7209,0.952,3.542,173.3784,0.2
knn,K Neighbors Regressor,51435722.4,4.204407652865802e+16,203043696.0,0.9307,0.3034,0.3434,0.018
llar,Lasso Least Angle Regression,114704617.7291,4.987886162111511e+16,222804777.6562,0.9195,2.8888,96.9486,0.011
ridge,Ridge Regression,114695501.3825,4.986942525590208e+16,222782543.9711,0.9195,2.8883,96.9487,0.011
