## All possible ways to create a ChoiceDataset

The different ways are listed below.

In [None]:
import os
import sys
from pathlib import Path

sys.path.append("../")

import numpy as np
import pandas as pd

from choice_learn.data import ChoiceDataset
from choice_learn.data.storage import FeaturesStorage

We will use the ModeCanada dataset for this example. We can download it directly:

In [None]:
from choice_learn.datasets import load_modecanada, load_swissmetro

# Load the ModeCanada data (as_frame=True) and preview its first rows.
canada_transport_df = load_modecanada(as_frame=True)
print(canada_transport_df.head())

Let's create a column indicating whether the considered transport alternative is an individual mode of transport or not.

In [None]:
# Flag the "car" alternative as the only individual transport mode (1); every
# other alternative gets 0. The comparison is vectorized over the whole "alt"
# column instead of calling a Python lambda once per row.
canada_transport_df["is_individual"] = (canada_transport_df["alt"] == "car").astype("int64")
print(canada_transport_df.head())

### From a single dataframe

In [None]:
# Build the dataset from the single dataframe. Each feature group is given by
# the names of its columns; rows are identified through the "alt" (item) and
# "case" (context) columns.
dataset = ChoiceDataset.from_single_df(
    df=canada_transport_df,
    items_id_column="alt",
    contexts_id_column="case",
    fixed_items_features_columns=["is_individual"],
    contexts_features_columns=["income"],
    contexts_items_features_columns=["dist", "cost", "ivt", "ovt"],
    # "one_zero": the choice column indicates if the item is chosen (1) or not (0)
    choices_column="choice",
    choice_mode="one_zero",
)
print(dataset.summary())

Another mode is possible, if the dataframe indicates the name of the chosen item instead of ones and zeros:

In [None]:
id_df = canada_transport_df.copy(deep=True)

# For each case, look up the alternative flagged as chosen (choice == 1) and
# broadcast its name to every row of that case. Matching on the "case" value
# rather than on list positions avoids assuming that the index labels run
# from 1 to n, and replaces the per-row sub-dataframe scan with one mapping.
# Assumes each case has exactly one chosen alternative: if several rows of a
# case are flagged, the first one is used; a case with none yields NaN.
chosen_rows = id_df.loc[id_df["choice"] == 1, ["case", "alt"]]
chosen_alt_by_case = chosen_rows.drop_duplicates(subset="case").set_index("case")["alt"]
one_hot_choice = id_df["case"].map(chosen_alt_by_case)

id_df["one_hot_choice"] = one_hot_choice

print(id_df.head())


In [None]:
# Same construction as with the 0/1 column, except that the choices are read
# from the "one_hot_choice" column created in the previous cell.
dataset = ChoiceDataset.from_single_df(
    df=id_df,
    items_id_column="alt",
    contexts_id_column="case",
    fixed_items_features_columns=["is_individual"],
    contexts_features_columns=["income"],
    contexts_items_features_columns=["dist", "cost", "ivt", "ovt"],
    # "items_id": the choice column holds the id (name) of the chosen item
    choices_column="one_hot_choice",
    choice_mode="items_id",
)
print(dataset.summary())

Now, let's say that you have your data split into several files:

In [None]:
# Display the distinct values of the "alt" column (the available alternatives).
canada_transport_df["alt"].unique()

In [None]:
# One row per item: its id and whether it is an individual transport mode.
fixed_items_features = pd.DataFrame(
    [
        ("car", 1),
        ("train", 0),
        ("bus", 0),
        ("air", 0),
    ],
    columns=["item_id", "is_individual"],
)
# Providing the item_id column is the safer option: presumably, without it the
# rows are matched to the items by their order, which is less safe with a
# pd.DataFrame (TODO: confirm against the ChoiceDataset documentation).

In [None]:
# One row per distinct (case, income) pair, with "case" renamed to "context_id".
contexts_features = (
    canada_transport_df[["case", "income"]]
    .drop_duplicates()
    .rename(columns={"case": "context_id"})
)
# If the context_id column does not exist, the index is used

In [None]:
# Features that vary with both the context and the item; the identifier columns
# are renamed to "context_id" and "item_id".
contexts_items_features = (
    canada_transport_df[["case", "alt", "dist", "cost", "ivt", "ovt"]]
    .rename(columns={"case": "context_id", "alt": "item_id"})
)

In [None]:
# Display the full dataframe as the cell output, for reference.
canada_transport_df

In [None]:
# Keep only the rows flagged as chosen, with their "case" and "alt" columns.
choices = canada_transport_df.loc[canada_transport_df["choice"] == 1, ["case", "alt"]]

In [None]:
# Assemble the dataset from the separate dataframes prepared in the cells above.
dataset = ChoiceDataset(
    choices=choices,
    fixed_items_features=fixed_items_features,
    contexts_features=contexts_features,
    contexts_items_features=contexts_items_features,
)