# Deep dive on FeaturesStorage

In [None]:
import os

# Remove GPU use: hide every CUDA device from this process so the notebook runs on CPU.
# The variable is set before the remaining imports so it is already in place when they load.
os.environ["CUDA_VISIBLE_DEVICES"] = ""

import sys
from pathlib import Path

# Make the parent directory importable.
# NOTE(review): presumably this is where the local choice_learn checkout lives - confirm.
sys.path.append("../")

import numpy as np
import pandas as pd

In [None]:
from choice_learn.data.storage import FeaturesStorage, OneHotStorage
from choice_learn.data import ChoiceDataset

## Different Instantiation Possibilities for Storage:
### 1 - from dict

In [None]:
# The dict must map each id to its list of feature values: {id: features}
features = {
    "customerA": [1, 2, 3],
    "customerB": [4, 5, 6],
    "customerC": [7, 8, 9],
}
# values_names labels the three positions of every feature list
storage = FeaturesStorage(
    values=features,
    values_names=["age", "income", "children_nb"],
    name="customers_features",
)

In [None]:
# Subset in order to only keep some ids.
# Indexing the storage with a list of ids gives back a FeaturesStorage object, not an array.
storage[["customerA", "customerC"]]

<choice_learn.data.storage.FeaturesStorage at 0x7fce75cdd950>

In [None]:
# Batch to access the features values: one row is returned per requested id,
# in the requested order, and the same id may be requested several times.
storage.batch[["customerA", "customerC", "customerA", "customerC"]]

array([[1, 2, 3],
       [7, 8, 9],
       [1, 2, 3],
       [7, 8, 9]])

### 2 - from list

In [None]:
# ids and features are two parallel lists: ids[i] is the key of features[i]
ids = ["customerA", "customerB", "customerC"]
features = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]

storage = FeaturesStorage(
    ids=ids,
    values=features,
    values_names=["age", "income", "children_nb"],
    name="customers",
)
# Same result as with the dict-based instantiation
storage.batch[["customerA", "customerC", "customerA", "customerC"]]

array([[1, 2, 3],
       [7, 8, 9],
       [1, 2, 3],
       [7, 8, 9]])

### 3 - from list, without ids

The ids are generated automatically as increasing integers:

In [None]:
features = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]

# No ids are given, so the rows are addressed by integer position: 0, 1, 2
storage = FeaturesStorage(
    values=features,
    values_names=["age", "income", "children_nb"],
    name="customers",
)
storage.batch[[0, 2, 0, 2]]

array([[1, 2, 3],
       [7, 8, 9],
       [1, 2, 3],
       [7, 8, 9]])

### 4 - from pandas.DataFrame

In [None]:
# Here the DataFrame has an "id" column holding the key of each row of features values
features = pd.DataFrame(
    {
        "age": [1, 4, 7],
        "income": [2, 5, 8],
        "children_nb": [3, 6, 9],
        "id": ["customerA", "customerB", "customerC"],
    }
)
storage = FeaturesStorage(values=features, name="customers")
storage.batch[["customerA", "customerC", "customerA", "customerC"]]

array([[1, 2, 3],
       [7, 8, 9],
       [1, 2, 3],
       [7, 8, 9]])

In [None]:
# Here the DataFrame has no "id" column holding the key of each row of features values,
# so the keys are given through the DataFrame 'index' instead
features = pd.DataFrame(
    {"age": [1, 4, 7], "income": [2, 5, 8], "children_nb": [3, 6, 9]},
    index=["customerA", "customerB", "customerC"],
)
storage = FeaturesStorage(values=features, name="customers")
storage.batch[["customerA", "customerC", "customerA", "customerC"]]

array([[1, 2, 3],
       [7, 8, 9],
       [1, 2, 3],
       [7, 8, 9]])

### 5 - OneHotStorage from list

In [None]:
ids = [0, 1, 2, 3, 4]
values = [4, 3, 2, 1, 0]

# The Storage maps ids[i] to values[i].
# A value is the position of the one in the one-hot vector:
# value = 4 means that the fifth entry is a one and all the others are zeros.
oh_storage = OneHotStorage(
    ids=ids,
    values=values,
    name="OneHotTest",
)

In [None]:
# Get OneHot vectors: one row per requested id, with the one placed at the
# position given by the value stored for that id (id 0 -> value 4 -> last position).
oh_storage.batch[[0, 2, 4]]

array([[0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0],
       [1, 0, 0, 0, 0]], dtype=uint8)

In [None]:
# Get the Storage value: get_element_from_index gives the raw stored value (4 here)
# rather than a one-hot vector, and .storage exposes the whole {id: value} mapping.
oh_storage.get_element_from_index(0), oh_storage.storage

(4, {0: 4, 1: 3, 2: 2, 3: 1, 4: 0})

### 6 - OneHotStorage from single list

If only the values are given, the ids are created as increasing integers.

In [None]:
# Only `values` is passed: no `ids` argument is given here.
oh_storage = OneHotStorage(values=values, name="OneHotTest")
# Same vectors as when ids = [0, 1, 2, 3, 4] was passed explicitly
oh_storage.batch[[0, 2, 4]]

array([[0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0],
       [1, 0, 0, 0, 0]], dtype=uint8)

If the values are not given, they are also created from the ids as increasing integers.

In [None]:
# Only `ids` is passed: no `values` argument is given here.
oh_storage = OneHotStorage(ids=ids, name="OneHotTest")
oh_storage.batch[[0, 2, 4]]
# Note that the order changes here: id 0 now has its one in the first position,
# whereas it was in the last position with values = [4, 3, 2, 1, 0].

array([[1, 0, 0, 0, 0],
       [0, 0, 1, 0, 0],
       [0, 0, 0, 0, 1]], dtype=uint8)

### 7 - OneHotStorage from dict

In [None]:
# Pair each id with its value to build the {id: value} mapping.
# dict(zip(...)) builds it directly, without a pass-through comprehension.
values_dict = dict(zip(ids, values))
# With a dict, ids and values are both given through the single `values` argument
oh_storage = OneHotStorage(values=values_dict, name="OneHotTest")
oh_storage.batch[[0, 2, 4]]

array([[0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0],
       [1, 0, 0, 0, 0]], dtype=uint8)

## Use of FeaturesByID and Storage in the ChoiceDataset

In [None]:
# Customers features, keyed by customer id.
# NOTE(review): `storage` is rebound to the items storage in the next cell, so this
# customers storage is not the one handed to the ChoiceDataset example below.
features = {
    "customerA": [1, 2, 3],
    "customerB": [4, 5, 6],
    "customerC": [7, 8, 9],
}
storage = FeaturesStorage(
    values=features,
    values_names=["age", "income", "children_nb"],
    name="customers_features",
)

In [None]:
# Items features, keyed by item id, stored under the name "items_features"
features = {
    "item1": [1, 2, 3],
    "item2": [4, 5, 6],
    "item3": [7, 8, 9],
    "item4": [10, 11, 12],
}
storage = FeaturesStorage(
    values=features,
    values_names=["f1", "f2", "f3"],
    name="items_features",
)

# Prices, keyed by price id, stored under the name "items_prices"
price_values = {"price1": [1], "price2": [2], "price3": [3], "price4": [4]}
price_storage = FeaturesStorage(
    values=price_values,
    values_names=["price"],
    name="items_prices",
)

fixed_items_features = np.eye(3)
# NOTE(review): `prices` is not passed to the ChoiceDataset built below.
prices = [
    [[4, 1], [4, 1], [5, 1]],
    [[5, 2], [4, 2], [6, 2]],
    [[6, 3], [7, 3], [8, 3]],
    [[4, 4], [5, 4], [4, 4]],
]
# One entry per context, then one pair per item: [id of the item features, id of the price]
contexts_items_features = [
    [["item1", "price1"], ["item2", "price2"], ["item3", "price3"]],
    [["item1", "price1"], ["item4", "price2"], ["item3", "price4"]],
    [["item1", "price1"], ["item2", "price3"], ["item3", "price4"]],
    [["item1", "price1"], ["item2", "price3"], ["item3", "price4"]],
]
choices = [0, 1, 2, 2]
contexts_features = [[0, 1], [1, 0], [1, 1], [0, 0]]

# The entries of contexts_items_features_names are the names given to the two storages.
# NOTE(review): presumably this is how each id column is linked to its storage - confirm.
dataset = ChoiceDataset(
    fixed_items_features=fixed_items_features,
    contexts_features=contexts_features,
    choices=choices,
    contexts_items_features=contexts_items_features,
    features_by_ids=[storage, price_storage],
    contexts_items_features_names=["items_features", "items_prices"],
)

## Example with the SwissMetro dataset

In [None]:
from choice_learn.datasets import load_swissmetro

# Load the SwissMetro data as a DataFrame and drop the rows whose CHOICE is 0
df = load_swissmetro(as_frame=True).loc[lambda frame: frame["CHOICE"] != 0]
df.head()

Unnamed: 0,GROUP,SURVEY,SP,ID,PURPOSE,FIRST,TICKET,WHO,LUGGAGE,AGE,...,TRAIN_CO,TRAIN_HE,SM_TT,SM_CO,SM_HE,SM_SEATS,CAR_TT,CAR_CO,CHOICE,CAR_HE
0,2.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,3.0,...,48.0,120.0,63.0,52.0,20.0,0.0,117.0,65.0,2.0,0.0
1,2.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,3.0,...,48.0,30.0,60.0,49.0,10.0,0.0,117.0,84.0,2.0,0.0
2,2.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,3.0,...,48.0,60.0,67.0,58.0,30.0,0.0,117.0,52.0,2.0,0.0
3,2.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,3.0,...,40.0,30.0,63.0,52.0,20.0,0.0,72.0,52.0,2.0,0.0
4,2.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,3.0,...,36.0,60.0,63.0,42.0,20.0,0.0,90.0,84.0,2.0,0.0


The ID column identifies a unique survey participant. Each participant answered several choice situations, so the features describing a participant are repeated across several rows. This makes it a perfect use case for FeaturesStorage.

In [None]:
# Columns describing the participant rather than a single answer
customer_columns = [
    "ID", "GROUP", "SURVEY", "SP", "PURPOSE", "FIRST", "TICKET", "WHO",
    "LUGGAGE", "AGE", "MALE", "INCOME", "GA", "ORIGIN", "DEST",
]
# Drop the repeated rows and expose the participant key as an "id" column for the storage
customer_features = (
    df[customer_columns]
    .drop_duplicates()
    .rename(columns={"ID": "id"})
)
customer_storage = FeaturesStorage(values=customer_features, name="customer_features")

# The contexts features only hold the participant id, in a column named after the storage
contexts_features = df[["ID"]].rename(columns={"ID": "customer_features"})

In [None]:
# CHOICE is shifted by one so that the retained choices start at 0
choices = df["CHOICE"].to_numpy() - 1
contexts_items_availabilities = df[["TRAIN_AV", "SM_AV", "CAR_AV"]].to_numpy()
# Stack the three alternatives along axis 1: (rows, alternatives, features)
contexts_items_features = np.stack(
    [
        df[columns].to_numpy()
        for columns in (
            ["TRAIN_TT", "TRAIN_CO", "TRAIN_HE"],
            ["SM_TT", "SM_CO", "SM_HE"],
            ["CAR_TT", "CAR_CO", "CAR_HE"],
        )
    ],
    axis=1,
)

In [None]:
# contexts_features holds the participant ids in its "customer_features" column,
# which is also the name given to customer_storage
choice_dataset = ChoiceDataset(
    choices=choices,
    contexts_features=contexts_features,
    contexts_items_features=contexts_items_features,
    contexts_items_availabilities=contexts_items_availabilities,
    features_by_ids=[customer_storage],
)

Et voilà !

In [None]:
# Query three rows of the dataset and print each element of the returned batch.
# NOTE(review): the labels assume the batch is ordered as (fixed items features,
# contexts features, contexts items features, availabilities, choices) - confirm
# against ChoiceDataset.batch.
batch = choice_dataset.batch[[0, 10, 200]]
print("Batch Fixed Items Features:", batch[0])
print("Batch Contexts Features:", batch[1])
print("Batch Contexts Items Features:", batch[2])
print("Batch Contexts Items Availabilities:", batch[3])
print("Batch Choices:", batch[4])