In [1]:
%load_ext autoreload
%autoreload 2

## Import packages
import pandas as pd  # For file input/output
from scipy import optimize
from scipy.optimize._numdiff import approx_derivative

import sys
import time
import numpy as np
import pickle
import copy
import gc
#import matplotlib.pyplot as plt
#import shap
from rumbooster import rum_train
from utils import bio_to_rumboost
from datasets import load_preprocess_LPMC
from models import LPMC, LPMC_normalised, LPMC_nested_normalised, LPMC_cross_nested_normalised, LPMC_mixed_logit_tt
from benchmarks import return_dataset, prepare_model, estimate_models, prepare_labels, predict_test, predict_proba
import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, recall_score, accuracy_score

# Load common functions for the experiments
from expermients_functions import *

In [3]:
# Customize matplotlib
tex_fonts = {
    # Use LaTeX to write all text
    "text.usetex": True, 
    "font.family": "serif",
    # Use 14pt font in plots, to match 10pt font in document
    "axes.labelsize": 14,
    "font.size": 14,
    # Make the legend/label fonts a little smaller
    "legend.fontsize": 12,
    "xtick.labelsize": 12,
    "ytick.labelsize": 12
}

#plt.rcParams.update(tex_fonts)

## Experiment initialization

### Experiment parameters

In [4]:
## Experiment parameters
data_dir = "../Data/"
adjusted_hyperparms_dir = "../Data/adjusted-hyperparameters/"
train_suffix = "_train.csv"
test_suffix = "_test.csv"
hyperparameters_suffix = "_hyperparameters"
reset_crossval_indices = 0 # Set to 0 for reproducibility of the experiment over multiple executions
partial_results_dir = "../Data/Results-RealDatasets/"

recompute_Experiment_4 = True

rounding = 2

CV = 5 # Number of cross-validation
n_iter = 100 #  Number of iterations used on the random search
average_tech = "macro" #"micro"

hyperparameters_suffix = hyperparameters_suffix +'_'+ str(n_iter) + '.csv'

model_type_to_class = {"RUMBoost": "RUMBoost",
                       "Nested_RUMBoost": "Nested_RUMBoost",
                       'Cross-nested_RUMBoost': 'Cross-nested_RUMBoost',
                       "Full_Effect_RUMBoost": "Full_Effect_RUMBoost",
                       'RUMBoost_FI': 'RUMBoost_FI',
                       "MNL":"MNL",
                       "Nested_MNL": "Nested_MNL",
                       "CNL": "CNL",
                       #"Mixed Logit": "Mixed Logit"
                       }

STATIC_PARAMS = {'n_jobs': -1,
                 'num_classes':4,
                 'objective':'multiclass',
                 'boosting': 'gbdt',
                 'monotone_constraints_method': 'advanced',
                 'verbosity': -1,
                 'num_iterations':3000,
                 'early_stopping_round':100,
                 'learning_rate':0.1
                 }

### Load the data

In [5]:
datasets = {"LPMC": {
                  "name": "LPMC",
                  "mode_var": "travel_mode",
                  "individual_id": "household_id",
                  "alt_names": ["Walk", "Bike", "Public transport", "Car"]
                }
}

In [6]:
## Load the data
def load_data(dataset_id, dataset):
    train = pd.read_csv(data_dir + dataset_id + train_suffix, sep=',')
    final_test = pd.read_csv(data_dir + dataset_id + test_suffix, sep=',')

    train['distance'] = train['distance']/1000
    final_test['distance'] = final_test['distance']/1000
    # Divide the dataset into charasteristics and target variable
    X = train.loc[:, train.columns != dataset["mode_var"]]
    y = train[dataset["mode_var"]]
    final_test_X = final_test.loc[:, final_test.columns != dataset["mode_var"]]
    final_test_y = final_test[dataset["mode_var"]]

    alts = list(y.unique()) # List containing al the modes (alternatives) in the dataset

    # Extract the individual ID to later group observations using it
    groups = np.array(X[dataset["individual_id"]].values)
    X = X.drop(columns=dataset["individual_id"])
    final_test_X = final_test_X.drop(columns=dataset["individual_id"])

    # Load the hyperparameters
    try:
        adjusted_hyperparameters_file = pd.read_csv(adjusted_hyperparms_dir + dataset_id + hyperparameters_suffix , index_col=0)
        hyperparameters = adjusted_hyperparameters_file.to_dict()
    except (OSError, IOError) as e:
        print("Error while loading best_hyperparameters for dataset {} - {}...".format(dataset_id, n_iter))
        pass

    return (X, y, final_test_X, final_test_y, alts, groups, hyperparameters)

In [8]:
## Create the classifier
def create_classifier(classifier, hyperparameters, dataset, X, y, for_CV=False):
    clf_hyperparameters = copy.deepcopy(hyperparameters)
    integer_params = ['max_bin','min_data_in_leaf','num_leaves','num_iterations']
    float_params = ['bagging_fraction','feature_fraction','lambda_l1','lambda_l2','min_gain_to_split','min_sum_hessian_in_leaf', 'mu']
    choice_params = {"learning_rate": [0.05, 0.1], 
                    "bagging_freq": [0, 1, 5, 10],
                    "nest": [{0:0, 1:1, 2:2, 3:2}, {0:0, 1:0, 2:1, 3:0}],
                    "max_depth": [0]
                    }

    static_params = copy.deepcopy(STATIC_PARAMS)

    for k in list(clf_hyperparameters[classifier].keys()):
        if k == "_best_iter":
            clf_hyperparameters[classifier]['num_iterations'] = int(clf_hyperparameters[classifier][k])
            del clf_hyperparameters[classifier][k]
            continue
        if k.startswith('_'):
            del clf_hyperparameters[classifier][k]
            continue
        if np.isnan(clf_hyperparameters[classifier][k]):
            del clf_hyperparameters[classifier][k]
            continue
        if k in integer_params:
            clf_hyperparameters[classifier][k] = int(clf_hyperparameters[classifier][k])
        if k in float_params:
            clf_hyperparameters[classifier][k] = clf_hyperparameters[classifier][k]
        if k in choice_params.keys():
            clf_hyperparameters[classifier][k] = choice_params[k][int(clf_hyperparameters[classifier][k])]

    params = {**clf_hyperparameters[classifier], **static_params}
    
    return params

## Experiment 4.1: Which is the best model?

In [9]:
## Construct Experiment 4 - Accuracy, GMPCA and Time Tables
def construct_experiment_4_accuracy_table(Experiment_4_CV_scores, Experiment_4_test_scores):
    columns = ["Accuracy", "GMPCA"]

    # Compute the mean of all the stored results for all the models and construct the final table
    train_scores_df = {}
    test_scores_df = {}
    time_scores_df = {}

    Experiment_4_CV_scores_mean = copy.deepcopy(Experiment_4_CV_scores)
    Experiment_4_test_scores_round = copy.deepcopy(Experiment_4_test_scores)
    for k_clf in model_type_to_class.keys():
        for k_dataset in Experiment_4_CV_scores_mean[k_clf].keys():
            for k_score in Experiment_4_CV_scores_mean[k_clf][k_dataset].keys():
                if k_score in columns + ['Estimation time']:
                    Experiment_4_CV_scores_mean[k_clf][k_dataset][k_score] = np.round(np.mean(Experiment_4_CV_scores_mean[k_clf][k_dataset][k_score]), rounding)
                    Experiment_4_test_scores_round[k_clf][k_dataset][k_score] = np.round(Experiment_4_test_scores_round[k_clf][k_dataset][k_score], rounding)
        
        train_scores_df[k_clf] = pd.DataFrame(Experiment_4_CV_scores_mean[k_clf]).T[columns]
        test_scores_df[k_clf] = pd.DataFrame(Experiment_4_test_scores_round[k_clf]).T[columns]
        time_scores_df[k_clf] = pd.DataFrame(Experiment_4_CV_scores_mean[k_clf]).T['Estimation time']
        
    Experiment_4_CV_table = pd.concat(train_scores_df, axis=1)
    Experiment_4_test_table = pd.concat(test_scores_df, axis=1)
    Experiment_4_time_table = pd.concat(time_scores_df, axis=1)

    return (Experiment_4_CV_table, Experiment_4_test_table, Experiment_4_time_table)

## Execute the experiment

In [10]:
## Execute experiments
%load_ext autoreload
%autoreload 2
## Initialize dictionaries to store partial results
# Load the previous experiment data (deserialize)
try:
    with open(partial_results_dir + '/Experiment_4_CV_scores.pickle', 'rb') as handle:
        Experiment_4_CV_scores = pickle.load(handle)
except:
    Experiment_4_CV_scores = {}
try:
    with open(partial_results_dir + '/Experiment_4_test_scores.pickle', 'rb') as handle:
        Experiment_4_test_scores = pickle.load(handle)
except:
    Experiment_4_test_scores = {}

dataset_train, dataset_test = return_dataset([load_preprocess_LPMC], to_return = 'split')
models_train_rum = prepare_model([LPMC], dataset_train)

#rum_structure, nests, and mu
rnd_effects_attributes = ['age', 'female', 'start_time_linear', 'travel_day', 'day_of_week', 'car_ownership', 'driving_license', 'purpose_B', 'purpose_HBE', 'purpose_HBO', 'purpose_HBW', 'purpose_NHBO', 'fueltype_Average', 'fueltype_Diesel', 'fueltype_Hybrid', 'fueltype_Petrol']

MNL_utilities = {0: 'B_dur_walking_Walk*dur_walking', # + B_age_Walk*age + B_female_Walk*female + B_day_of_week_Walk*day_of_week + B_start_time_linear_Walk*start_time_linear + B_car_ownership_Walk*car_ownership + B_driving_license_Walk*driving_license + B_purpose_B_Walk*purpose_B + B_purpose_HBE_Walk*purpose_HBE + B_purpose_HBO_Walk*purpose_HBO + B_purpose_HBW_Walk*purpose_HBW + B_purpose_NHBO_Walk*purpose_NHBO + B_fueltype_Avrg_Walk*fueltype_Average + B_fueltype_Diesel_Walk*fueltype_Diesel + B_fueltype_Hybrid_Walk*fueltype_Hybrid + B_fueltype_Petrol_Walk*fueltype_Petrol + B_distance_Walk*distance',
                    1: 'ASC_Bike + B_age_Bike*age + B_female_Bike*female + B_day_of_week_Bike*day_of_week + B_start_time_linear_Bike*start_time_linear + B_car_ownership_Bike*car_ownership + B_driving_license_Bike*driving_license + B_purpose_B_Bike*purpose_B + B_purpose_HBE_Bike*purpose_HBE + B_purpose_HBO_Bike*purpose_HBO + B_purpose_HBW_Bike*purpose_HBW + B_purpose_NHBO_Bike*purpose_NHBO + B_fueltype_Avrg_Bike*fueltype_Average + B_fueltype_Diesel_Bike*fueltype_Diesel + B_fueltype_Hybrid_Bike*fueltype_Hybrid + B_fueltype_Petrol_Bike*fueltype_Petrol + B_distance_Bike*distance + B_dur_cycling_Bike*dur_cycling',
                    2: 'ASC_Public_Transport + B_age_Public_Transport*age + B_female_Public_Transport*female + B_day_of_week_Public_Transport*day_of_week + B_start_time_linear_Public_Transport*start_time_linear + B_car_ownership_Public_Transport*car_ownership + B_driving_license_Public_Transport*driving_license + B_purpose_B_Public_Transport*purpose_B + B_purpose_HBE_Public_Transport*purpose_HBE + B_purpose_HBO_Public_Transport*purpose_HBO + B_purpose_HBW_Public_Transport*purpose_HBW + B_purpose_NHBO_Public_Transport*purpose_NHBO + B_fueltype_Avrg_Public_Transport*fueltype_Average + B_fueltype_Diesel_Public_Transport*fueltype_Diesel + B_fueltype_Hybrid_Public_Transport*fueltype_Hybrid + B_fueltype_Petrol_Public_Transport*fueltype_Petrol + B_distance_Public_Transport*distance + B_dur_pt_access_Public_Transport*dur_pt_access + B_dur_pt_rail_Public_Transport*dur_pt_rail + B_dur_pt_bus_Public_Transport*dur_pt_bus + B_dur_pt_int_waiting_Public_Transport*dur_pt_int_waiting + B_dur_pt_int_walking_Public_Transport*dur_pt_int_walking + B_pt_n_interchanges_Public_Transport*pt_n_interchanges + B_cost_transit_Public_Transport*cost_transit',
                    3: 'ASC_Car + B_age_Car*age + B_female_Car*female + B_day_of_week_Car*day_of_week + B_start_time_linear_Car*start_time_linear + B_car_ownership_Car*car_ownership + B_driving_license_Car*driving_license + B_purpose_B_Car*purpose_B + B_purpose_HBE_Car*purpose_HBE + B_purpose_HBO_Car*purpose_HBO + B_purpose_HBW_Car*purpose_HBW + B_purpose_NHBO_Car*purpose_NHBO + B_fueltype_Avrg_Car*fueltype_Average + B_fueltype_Diesel_Car*fueltype_Diesel + B_fueltype_Hybrid_Car*fueltype_Hybrid + B_fueltype_Petrol_Car*fueltype_Petrol + B_distance_Car*distance + B_dur_driving_Car*dur_driving + B_cost_driving_fuel_Car*cost_driving_fuel + B_con_charge_Car*congestion_charge + B_traffic_perc_Car*driving_traffic_percent'}



for dataset_id, dataset in datasets.items():
    dataset_name = dataset["name"]
    print("\n--- {} (ID: {})".format(dataset_name, dataset_id))

    # Load the data and the hyperparameters
    X, y, final_test_X, final_test_y, alts, groups, hyperparameters = load_data(dataset_id, dataset)

    # Obtain datasets for K-Fold cross validation (the same fold splits are used across all the iterations for all models)
    train_indices = []
    test_indices = []
    crossval_pickle_file = data_dir+dataset_id+"_crossval.pickle"
    try:
        train_indices, test_indices = pickle.load(open(crossval_pickle_file, "rb"))
        if reset_crossval_indices == 1: # Reset the indices
            raise FileNotFoundError
    except (OSError, IOError) as e:
        print("Recomputing Cross-val indices...")
        for (train_index, test_index) in stratified_group_k_fold(X, y, groups, k=CV):
            train_indices.append(train_index)
            test_indices.append(test_index)
        pickle.dump([train_indices, test_indices], open(crossval_pickle_file, "wb"))


    # Get results for the selected classifier
    for classifier in model_type_to_class.keys():
        print("\n\t--- {}".format(classifier))
        sys.stdout.flush()
        it_time_init = time.perf_counter()

        n_rounds = 0

        # Create dictionary to store the results
        if not classifier in Experiment_4_CV_scores.keys():
            Experiment_4_CV_scores[classifier] = {}
        if not classifier in Experiment_4_test_scores.keys():
            Experiment_4_test_scores[classifier] = {}

        if recompute_Experiment_4==True or not (dataset_name in Experiment_4_CV_scores[classifier].keys()) or not (dataset_name in Experiment_4_test_scores[classifier].keys()):
            # Create dictionary to store the results
            Experiment_4_CV_scores[classifier][dataset_name] = {}
            Experiment_4_CV_scores[classifier][dataset_name]['Accuracy'] = []
            Experiment_4_CV_scores[classifier][dataset_name]['F1'] = []
            Experiment_4_CV_scores[classifier][dataset_name]['Recall'] = []
            Experiment_4_CV_scores[classifier][dataset_name]['GMPCA'] = []
            Experiment_4_CV_scores[classifier][dataset_name]['Estimation time'] = []
            Experiment_4_test_scores[classifier][dataset_name] = {}

            ## Applying k-Fold Cross Validation over training set
            for iteration in range(0, len(train_indices)):
                print("\t\t CV it: {}".format(iteration))
                sys.stdout.flush()

                # Create the classifier

                if classifier == "RUMBoost":
                    params = copy.deepcopy(STATIC_PARAMS)
                    params['max_depth'] = 1
                    rum_structure = bio_to_rumboost(models_train_rum[0])
                elif classifier == 'RUMBoost_FI':
                    params = copy.deepcopy(STATIC_PARAMS)
                    params['max_depth'] = 2
                    rum_structure = bio_to_rumboost(models_train_rum[0])
                    rum_structure[0]['interaction_constraints'] = [[0, 16], [2], [5], [15], [1], [3], [4], [6], [7], [8], [9], [10], [11], [12], [13], [14]]
                    rum_structure[1]['interaction_constraints'] = [[0, 16], [2], [5], [15], [1], [3], [4], [6], [7], [8], [9], [10], [11], [12], [13], [14]]
                    rum_structure[2]['interaction_constraints'] = [[0, 17], [2], [5], [15], [1], [3], [4], [6], [7], [8], [9], [10], [11], [12], [13], [14], [16], [18], [19], [20], [21], [22]]
                    rum_structure[3]['interaction_constraints'] = [[0, 16], [2], [5], [15], [1], [3], [4], [6], [7], [8], [9], [10], [11], [12], [13], [14], [17]]
                elif classifier == "Nested_RUMBoost":
                    params = copy.deepcopy(STATIC_PARAMS)
                    params['max_depth'] = 1
                    rum_structure = bio_to_rumboost(models_train_rum[0])
                    nest = {0:0, 1:1, 2:2, 3:2}
                    mu = [1, 1, 1.166746773143513],
                elif classifier == 'Cross-nested_RUMBoost':
                    params = copy.deepcopy(STATIC_PARAMS)
                    params['max_depth'] = 1
                    rum_structure = bio_to_rumboost(models_train_rum[0])
                    mu = [1.821282482078, 1.000015988]
                    alphas = np.array([[0, 1], [0, 1], [1, 0], [0.363528056, 1-0.363528056]])
                elif classifier == "Full_Effect_RUMBoost":
                    params_rde = create_classifier(classifier, hyperparameters, dataset, X, y, for_CV=True)
                    rum_structure = bio_to_rumboost(models_train_rum[0], rnd_effect_attributes=rnd_effects_attributes)
                    params = {'verbose': -1,
                    'num_classes':4,
                    'early_stopping_round':100,
                    'learning_rate': params['learning_rate'],
                    'max_depth': 1,
                    #'num_leaves':31,
                    'num_boost_round': 3000,
                    'objective':'multiclass',
                    'boosting': 'gbdt',
                    'monotone_constraints_method': 'advanced'}
                

                # Obtain training and testing data for this iteration (split of de k-Fold)
                X_train, X_test = X.loc[train_indices[iteration]], X.loc[test_indices[iteration]]
                y_train, y_test = y.loc[train_indices[iteration]], y.loc[test_indices[iteration]]

                train_set = lgb.Dataset(X_train, label=y_train, free_raw_data=False)
                test_set = lgb.Dataset(X_test, label=y_test, free_raw_data=False)

                if classifier == "MNL":
                    models_train = prepare_model([LPMC_normalised], [dataset_train[0].loc[train_indices[iteration]]])
                if classifier == "Nested_MNL":
                    models_train = prepare_model([LPMC_nested_normalised], [dataset_train[0].loc[train_indices[iteration]]])
                if classifier == "CNL":
                    models_train = prepare_model([LPMC_cross_nested_normalised], [dataset_train[0].loc[train_indices[iteration]]])
                if classifier == "Mixed Logit":
                    models_train = prepare_model([LPMC_mixed_logit_tt], [dataset_train[0].loc[train_indices[iteration]]])
                

                # Balance dataset
                #X_train, y_train = balance(X_train, y_train, X_train.shape[0], len(dataset["alt_names"]))
                time_ini = time.perf_counter()
                if classifier == "RUMBoost":
                    clf_trained = rum_train(params, train_set, rum_structure, valid_sets=[test_set])
                    n_rounds += clf_trained.best_iteration
                elif classifier == "Nested_RUMBoost":
                    clf_trained = rum_train(params, train_set, rum_structure, valid_sets=[test_set], nests=nest, mu=mu)
                    n_rounds += clf_trained.best_iteration
                elif classifier == "Full_Effect_RUMBoost":
                    clf_trained = rum_train(params, train_set, rum_structure, valid_sets=[test_set], params_rde=params_rde)
                    n_rounds += clf_trained.best_iteration
                elif classifier == 'Cross-nested_RUMBoost':
                    clf_trained = rum_train(params, train_set, rum_structure, valid_sets=[test_set], alphas=alphas, mu=mu)
                    n_rounds += clf_trained.best_iteration
                elif classifier == "RUMBoost_FI":
                    clf_trained = rum_train(params, train_set, rum_structure, valid_sets=[test_set])
                    n_rounds += clf_trained.best_iteration
                elif classifier == "MNL":
                    clf_trained = estimate_models(models_train)
                elif classifier == "Nested_MNL":
                    clf_trained = estimate_models(models_train)
                elif classifier == "CNL":
                    clf_trained = estimate_models(models_train)
                elif classifier == "Mixed Logit":
                    clf_trained = estimate_models(models_train, mixed_logit=True)
                elapsed_time = time.perf_counter() - time_ini

                if classifier == "MNL":
                    proba = predict_proba(dataset_train[0].loc[test_indices[iteration]], clf_trained[0], [0, 1, 2, 3], MNL_utilities)
                elif classifier == "Nested_MNL":
                    labels = prepare_labels([dataset_train[0].loc[test_indices[iteration]]])
                    model_test = prepare_model([LPMC_nested_normalised], [dataset_train[0].loc[test_indices[iteration]]], for_prob=True)
                    proba = predict_test(clf_trained, model_test, labels, return_prob=True)
                elif classifier == "CNL":
                    labels = prepare_labels([dataset_train[0].loc[test_indices[iteration]]])
                    model_test = prepare_model([LPMC_cross_nested_normalised], [dataset_train[0].loc[test_indices[iteration]]], for_prob=True)
                    proba = predict_test(clf_trained, model_test, labels, return_prob=True)
                elif classifier == "Mixed Logit":
                    labels = prepare_labels([dataset_train[0].loc[test_indices[iteration]]])
                    model_test = prepare_model([LPMC_mixed_logit_tt], [dataset_train[0].loc[test_indices[iteration]]], for_prob=True)
                    proba = predict_test(clf_trained, model_test, labels, return_prob=True)
                elif classifier == "Nested_RUMBoost":
                    proba, _, _ = clf_trained.predict(test_set, nests=nest, mu=mu)
                elif classifier == 'Cross-nested_RUMBoost':
                    proba, _, _ = clf_trained.predict(test_set, alphas=alphas, mu=mu)
                else:
                    proba = clf_trained.predict(test_set)
                y_score = np.argmax(proba, axis=1)

                # Compute the accuracy results
                Experiment_4_CV_scores[classifier][dataset_name]['Accuracy'] = np.append(Experiment_4_CV_scores[classifier][dataset_name]['Accuracy'], accuracy_score(y_test, y_score)*100)
                Experiment_4_CV_scores[classifier][dataset_name]['F1'] = np.append(Experiment_4_CV_scores[classifier][dataset_name]['F1'], f1_score(y_test, y_score, average=average_tech)*100)
                Experiment_4_CV_scores[classifier][dataset_name]['Recall'] = np.append(Experiment_4_CV_scores[classifier][dataset_name]['Recall'], recall_score(y_test, y_score, average=average_tech)*100)
                Experiment_4_CV_scores[classifier][dataset_name]['GMPCA'] = np.append(Experiment_4_CV_scores[classifier][dataset_name]['GMPCA'], GMPCA(proba, y_test.values)*100)
                Experiment_4_CV_scores[classifier][dataset_name]['Estimation time'] = np.append(Experiment_4_CV_scores[classifier][dataset_name]['Estimation time'], elapsed_time)

                del clf_trained
                gc.collect()

            ## Out-of-sample results
            # Create the classifier
            if classifier == "RUMBoost":
                params = copy.deepcopy(STATIC_PARAMS)
                params['num_iterations'] = int(n_rounds/CV)
                params['max_depth'] = 1
                rum_structure = bio_to_rumboost(models_train_rum[0])
            elif classifier == 'RUMBoost_FI':
                params = copy.deepcopy(STATIC_PARAMS)
                params['num_iterations'] = int(n_rounds/CV)
                params['max_depth'] = 2
                rum_structure = bio_to_rumboost(models_train_rum[0])
                rum_structure[0]['interaction_constraints'] = [[0, 16], [2], [5], [15], [1], [3], [4], [6], [7], [8], [9], [10], [11], [12], [13], [14]]
                rum_structure[1]['interaction_constraints'] = [[0, 16], [2], [5], [15], [1], [3], [4], [6], [7], [8], [9], [10], [11], [12], [13], [14]]
                rum_structure[2]['interaction_constraints'] = [[0, 17], [2], [5], [15], [1], [3], [4], [6], [7], [8], [9], [10], [11], [12], [13], [14], [16], [18], [19], [20], [21], [22]]
                rum_structure[3]['interaction_constraints'] = [[0, 16], [2], [5], [15], [1], [3], [4], [6], [7], [8], [9], [10], [11], [12], [13], [14], [17]]
            elif classifier == "Nested_RUMBoost":
                params = copy.deepcopy(STATIC_PARAMS)
                params['num_iterations'] = int(n_rounds/CV)
                params['max_depth'] = 1
                rum_structure = bio_to_rumboost(models_train_rum[0])
                nest == {0:0, 1:1, 2:2, 3:2}
                mu = [1, 1, 1.166746773143513]
            elif classifier == 'Cross-nested_RUMBoost':
                params = copy.deepcopy(STATIC_PARAMS)
                params['num_iterations'] = int(n_rounds/CV)
                params['max_depth'] = 1
                rum_structure = bio_to_rumboost(models_train_rum[0])
                mu = [1.821282482078, 1.000015988]
                alphas = np.array([[0, 1], [0, 1], [1, 0], [0.363528056, 1-0.363528056]])
            elif classifier == "Full_Effect_RUMBoost":
                params_rde = create_classifier(classifier, hyperparameters, dataset, X, y, for_CV=True)
                params_rde['num_iterations'] = int(n_rounds/CV)
                rum_structure = bio_to_rumboost(models_train_rum[0], rnd_effect_attributes=rnd_effects_attributes)
                params = {'verbose': -1,
                'num_classes':4,
                #'early_stopping_round':100,
                'learning_rate': 0.1,
                'max_depth': 1,
                #'num_leaves':31,
                'num_iterations': int(n_rounds/CV),
                'objective':'multiclass',
                'boosting': 'gbdt',
                'monotone_constraints_method': 'advanced'}

            train_set = lgb.Dataset(X, label=y, free_raw_data=False)
            test_set = lgb.Dataset(final_test_X, label=final_test_y, free_raw_data=False)

            # Balance dataset
            #X_scaled_balanced, y_balanced = balance(X_scaled, y, X_scaled.shape[0], len(dataset["alt_names"]))
            if classifier == "MNL":
                models_train = prepare_model([LPMC_normalised], dataset_train)
            elif classifier == "Nested_MNL":
                models_train = prepare_model([LPMC_nested_normalised], dataset_train)
            elif classifier == "CNL":
                models_train = prepare_model([LPMC_cross_nested_normalised], dataset_train)
            elif classifier == "Mixed Logit":
                models_train = prepare_model([LPMC_mixed_logit_tt], dataset_train)

            # Fit the classifier on training set
            time_ini = time.perf_counter()
            if classifier == "RUMBoost":
                clf_trained = rum_train(params, train_set, rum_structure, valid_sets=[train_set])
            elif classifier == "Nested_RUMBoost":
                clf_trained = rum_train(params, train_set, rum_structure, valid_sets=[train_set], nests=nest, mu=mu)
            elif classifier == 'Cross-nested_RUMBoost':
                clf_trained = rum_train(params, train_set, rum_structure, valid_sets=[test_set], alphas=alphas, mu=mu)
            elif classifier == "Full_Effect_RUMBoost":
                clf_trained = rum_train(params, train_set, rum_structure, valid_sets=[train_set], params_rde=params_rde)
            elif classifier == "RUMBoost_FI":
                clf_trained = rum_train(params, train_set, rum_structure, valid_sets=[train_set])
            elif classifier == "MNL":
                clf_trained = estimate_models(models_train)
            elif classifier == "Nested_MNL":
                clf_trained = estimate_models(models_train)
            elif classifier == "CNL":
                clf_trained = estimate_models(models_train)
            elif classifier == "Mixed Logit":
                clf_trained = estimate_models(models_train, mixed_logit=True)
            elapsed_time = time.perf_counter() - time_ini

            if classifier == "MNL":
                proba = predict_proba(dataset_test[0], clf_trained[0], [0, 1, 2, 3], MNL_utilities)
            elif classifier == "Nested_MNL":
                labels = prepare_labels(dataset_test)
                model_test = prepare_model([LPMC_nested_normalised], dataset_test, for_prob=True)
                proba = predict_test(clf_trained, model_test, labels, return_prob=True)
            elif classifier == "CNL":
                labels = prepare_labels(dataset_test)
                model_test = prepare_model([LPMC_cross_nested_normalised], dataset_test, for_prob=True)
                proba = predict_test(clf_trained, model_test, labels, return_prob=True)
            elif classifier == "Mixed Logit":
                labels = prepare_labels(dataset_test)
                model_test = prepare_model([LPMC_mixed_logit_tt], dataset_test, for_prob=True)
                proba = predict_test(clf_trained, model_test, labels, return_prob=True)
            elif classifier == "Nested_RUMBoost":
                proba, _, _ = clf_trained.predict(test_set, nests=nest, mu=mu)
            elif classifier == 'Cross-nested_RUMBoost':
                proba, _, _ = clf_trained.predict(test_set, alphas=alphas, mu=mu)
            else:
                proba = clf_trained.predict(test_set)
            
            fitted = True
            
            if classifier == "RUMBoost":
                clf_trained.save_model(partial_results_dir + "LPMC_RUMBoost.json")
            elif classifier == "Nested_RUMBoost":
                clf_trained.save_model(partial_results_dir + "LPMC_RUMBoost_Nested.json")
            elif classifier == 'Cross-nested_RUMBoost':
                clf_trained.save_model(partial_results_dir + "LPMC_RUMBoost_Cross-nested.json")
            elif classifier == "Full_Effect_RUMBoost":
                clf_trained.save_model(partial_results_dir + "LPMC_RUMBoost_Full_Effect.json")
            elif classifier == "RUMBoost_FI":
                clf_trained.save_model(partial_results_dir + "LPMC_RUMBoost_FI.json")
            elif classifier == "MNL":
                pandas_results = clf_trained[0].getEstimatedParameters()
                pandas_results.to_csv(partial_results_dir + 'LPMC_MNL.csv')
            elif classifier == "Nested_MNL":
                pandas_results = clf_trained[0].getEstimatedParameters()
                pandas_results.to_csv(partial_results_dir + 'LPMC_NL.csv')
            elif classifier == "CNL":
                pandas_results = clf_trained[0].getEstimatedParameters()
                pandas_results.to_csv(partial_results_dir + 'LPMC_CNL.csv')


            y_score = np.argmax(proba, axis=1)

            # Compute the accuracy results
            Experiment_4_test_scores[classifier][dataset_name]['Accuracy'] = accuracy_score(final_test_y, y_score)*100
            Experiment_4_test_scores[classifier][dataset_name]['F1'] = f1_score(final_test_y, y_score, average=average_tech)*100
            Experiment_4_test_scores[classifier][dataset_name]['Recall'] = recall_score(final_test_y, y_score, average=average_tech)*100
            Experiment_4_test_scores[classifier][dataset_name]['GMPCA'] = GMPCA(proba, final_test_y.values)*100
            Experiment_4_test_scores[classifier][dataset_name]['Estimation time'] = elapsed_time

            ## Market shares
            #Experiment_4_CV_scores[classifier][dataset_name]['Market_shares'] = np.round(np.sum(clf.predict_proba(X_scaled), axis=0)/X_scaled.shape[0] * 100, 3)
            #Experiment_4_test_scores[classifier][dataset_name]['Market_shares'] = np.round(np.sum(clf.predict_proba(final_test_X_scaled), axis=0)/final_test_X_scaled.shape[0] * 100, 3)
            
            ## WTP
            #Experiment_4_CV_scores[classifier][dataset_name]["WTP_history"] = None
            #Experiment_4_test_scores[classifier][dataset_name]["WTP_history"] = None
            # if dataset["WTP"] is not None:
            #     Experiment_4_CV_scores[classifier][dataset_name]["WTP_history"] = {}
            #     Experiment_4_CV_scores[classifier][dataset_name]["n_WTP_nan"] = 0
            #     Experiment_4_CV_scores[classifier][dataset_name]["n_WTP_inf"] = 0
            #     Experiment_4_test_scores[classifier][dataset_name]["WTP_history"] = {}
            #     Experiment_4_test_scores[classifier][dataset_name]["n_WTP_nan"] = 0
            #     Experiment_4_test_scores[classifier][dataset_name]["n_WTP_inf"] = 0

            #     for alt in dataset["WTP"].keys():
            #         v1_name, v2_name, d_1, d_2 = dataset["WTP"][alt]

            #         # WTP over training set 
            #         filtered_WTP, n_WTP_nan, n_WTP_inf = compute_WTP(clf, dataset, X, v1_name, v2_name, d_1, d_2, scaler)
            #         Experiment_4_CV_scores[classifier][dataset_name]["n_WTP_nan"] += n_WTP_nan
            #         Experiment_4_CV_scores[classifier][dataset_name]["n_WTP_inf"] += n_WTP_inf
            #         Experiment_4_CV_scores[classifier][dataset_name]["WTP_history"][dataset["alt_names"][alt]] = filtered_WTP

            #         # WTP over test set 
            #         filtered_WTP, n_WTP_nan, n_WTP_inf = compute_WTP(clf, dataset, final_test_X, v1_name, v2_name, d_1, d_2, scaler)
            #         Experiment_4_test_scores[classifier][dataset_name]["n_WTP_nan"] += n_WTP_nan
            #         Experiment_4_test_scores[classifier][dataset_name]["n_WTP_inf"] += n_WTP_inf
            #         Experiment_4_test_scores[classifier][dataset_name]["WTP_history"][dataset["alt_names"][alt]] = filtered_WTP

            del clf_trained
            gc.collect()

        # Store the partial experiment data (serialize)
        with open(partial_results_dir + 'Experiment_4_CV_scores.pickle', 'wb') as handle:
            pickle.dump(Experiment_4_CV_scores, handle, protocol=pickle.HIGHEST_PROTOCOL)
        with open(partial_results_dir + 'Experiment_4_test_scores.pickle', 'wb') as handle:
            pickle.dump(Experiment_4_test_scores, handle, protocol=pickle.HIGHEST_PROTOCOL)

        print("\t    + Elapsed: {} seconds".format(np.round(time.perf_counter()-it_time_init), 2))


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

--- LPMC (ID: LPMC)

	--- Mixed Logit
		 CV it: 0


[INFO] 2024-07-28 22:02:23,800 LPMC Mixed Logit <models.py:449>
[INFO] 2024-07-28 22:02:23,805 Parameters read from biogeme.toml <toml.py:66>
[INFO] 2024-07-28 22:02:58,958 *** Initial values of the parameters are obtained from the file __LPMC_mixed_logit.iter <biogeme.py:1271>
[INFO] 2024-07-28 22:02:59,076 Parameter values restored from __LPMC_mixed_logit.iter <biogeme.py:1051>
[DEBUG] 2024-07-28 22:03:07,619 Log likelihood (N = 43812):  -61705.77 <biogeme.py:853>
[DEBUG] 2024-07-28 22:03:07,619 Run simple_bounds <biogeme.py:1515>
[INFO] 2024-07-28 22:03:07,619 Optimization algorithm: hybrid Newton/BFGS with simple bounds [simple_bounds] <optimization.py:437>
[INFO] 2024-07-28 22:03:07,620 ** Optimization: Hybrid Newton 10.0%/BFGS with trust region for simple bounds <optimization.py:480>
[DEBUG] 2024-07-29 02:50:33,918 Log likelihood (N = 43812):  -61705.77 Gradient norm:      6e+05 Hessian norm:       3e+07  <biogeme.py:935>
[DEBUG] 2024-07-29 02:54:33,057 Log likelihood (N = 43812)

## Export LaTeX tables/figures

In [10]:
try:
    with open(partial_results_dir + '/Experiment_4_CV_scores_all.pickle', 'rb') as handle:
        Experiment_4_CV_scores = pickle.load(handle)
except:
    Experiment_4_CV_scores = {}
try:
    with open(partial_results_dir + '/Experiment_4_test_scores_all.pickle', 'rb') as handle:
        Experiment_4_test_scores = pickle.load(handle)
except:
    Experiment_4_test_scores = {}

In [11]:
# Obtain accuracy tables
Experiment_4_CV_table, Experiment_4_test_table, Experiment_4_time_table = construct_experiment_4_accuracy_table(Experiment_4_CV_scores, Experiment_4_test_scores)

# Obtain market shares table over the test set
#Experiment_4_MS_table = construct_experiment_4_market_shares_table(Experiment_4_test_scores, datasets)

In [12]:
Experiment_4_CV_table

Unnamed: 0_level_0,RUMBoost,RUMBoost,Nested_RUMBoost,Nested_RUMBoost,Cross-nested_RUMBoost,Cross-nested_RUMBoost,Full_Effect_RUMBoost,Full_Effect_RUMBoost,RUMBoost_FI,RUMBoost_FI,MNL,MNL,Nested_MNL,Nested_MNL,CNL,CNL
Unnamed: 0_level_1,Accuracy,GMPCA,Accuracy,GMPCA,Accuracy,GMPCA,Accuracy,GMPCA,Accuracy,GMPCA,Accuracy,GMPCA,Accuracy,GMPCA,Accuracy,GMPCA
LPMC,74.76,51.84,74.76,51.85,74.86,51.96,75.44,52.48,74.42,51.28,73.31,50.09,73.41,50.05,73.43,50.12


In [13]:
Experiment_4_test_table

Unnamed: 0_level_0,RUMBoost,RUMBoost,Nested_RUMBoost,Nested_RUMBoost,Cross-nested_RUMBoost,Cross-nested_RUMBoost,Full_Effect_RUMBoost,Full_Effect_RUMBoost,RUMBoost_FI,RUMBoost_FI,MNL,MNL,Nested_MNL,Nested_MNL,CNL,CNL
Unnamed: 0_level_1,Accuracy,GMPCA,Accuracy,GMPCA,Accuracy,GMPCA,Accuracy,GMPCA,Accuracy,GMPCA,Accuracy,GMPCA,Accuracy,GMPCA,Accuracy,GMPCA
LPMC,74.15,50.98,74.15,51.01,74.1,51.09,74.64,51.55,73.6,50.39,72.69,49.24,72.77,49.21,72.93,49.31


In [14]:
Experiment_4_time_table

Unnamed: 0,RUMBoost,Nested_RUMBoost,Cross-nested_RUMBoost,Full_Effect_RUMBoost,RUMBoost_FI,MNL,Nested_MNL,CNL
LPMC,7.86,48.53,183.91,10.9,5.01,242.14,1067.04,5120.01
