# RAM usage with Choice-Learn

## On the ICDM 2013 Expedia Dataset

- [Choice-Learn's FeaturesStorage](#features-storage-efficiency-with-dummy-data)
- [pandas.DataFrame on Long format](#pandasdataframe-long-format-(pylogit))
- [pandas.DataFrame on Wide format](#pandasdataframe-wide-format-(biogeme))
- [Torch-Choice](#torch-choice)
- [Plots and Illustrations](#plots-and-illustrations)

In [None]:
import sys

sys.path.append("../../")
import logging

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from choice_learn.datasets import load_expedia
from choice_learn.data import ChoiceDataset


Accessing the RAM usage is actually not that obvious. Here is a code snippet that finds all the objects referenced by an object in order to sum their memory consumption.
This is explained [here](https://stackoverflow.com/questions/13530762/how-to-know-bytes-size-of-python-object-like-arrays-and-dictionaries-the-simp).

In [None]:
import gc
import sys

def get_obj_size(obj):
    """Return the cumulated `sys.getsizeof` of `obj` and of everything it refers to.

    The object graph is explored level by level through `gc.get_referents`.
    Each object is counted once (tracked by `id`), and type objects are
    never followed nor counted.

    Parameters
    ----------
    obj : object
        Root object whose memory footprint is wanted.

    Returns
    -------
    int
        Sum, in bytes, of `sys.getsizeof` over all the visited objects.
    """
    seen_ids = {id(obj)}
    frontier = [obj]
    total = 0

    while frontier:
        # Account for every object of the current level.
        for item in frontier:
            total += sys.getsizeof(item)

        # Collect the objects referred to by the current level, skipping
        # the ones already visited and the type objects.
        fresh = {}
        for referent in gc.get_referents(*frontier):
            referent_id = id(referent)
            if referent_id in seen_ids or isinstance(referent, type):
                continue
            fresh[referent_id] = referent

        seen_ids.update(fresh)
        frontier = list(fresh.values())

    return total

## Features Storage efficiency with dummy data

In [None]:
from choice_learn.data import OneHotStorage

### Small Example
# Problem size: 100 rows, each one pointing to one of 10 locations
n_locations, n_data = 10, 100

# Location id drawn at random for every row
indexes = np.random.randint(n_locations, size=(n_data, ))

# Explicit one-hot matrix: one row per data point, one column per location
dense_features = np.zeros((n_data, n_locations))
dense_features[np.arange(n_data), indexes] = 1

# Same information handled by a OneHotStorage that only receives the location ids
storage = OneHotStorage(ids=[*range(n_locations)])

# Both representations must yield the same batch of features
assert (storage.batch[indexes] == dense_features).all()

### Dense features memory usage:
dense_memory = get_obj_size(dense_features)
print("Dense memory usage:", dense_memory)

### FeaturesByIDs memory usage:
# Storage memory usage + ids memory storage
storage_memory = get_obj_size(storage) + get_obj_size(indexes)
print("FeaturesByIDs memory usage:", storage_memory)

In [None]:
from choice_learn.data import OneHotStorage


ds_lengths = [10, 100, 1000, 10000, 100000, 10000000]

# One entry per experiment:
# (number of locations, color of the dense curve, color of the storage curve)
experiments = [
    (10, "darkblue", "turquoise"),
    (100, "cornflowerblue", "teal"),
]

for n_locations, dense_color, fbid_color in experiments:
    dense_sizes = []
    fbid_sizes = []

    for n_data in ds_lengths:
        # Random draw of the location observed on each row
        indexes = np.random.randint(n_locations, size=(n_data, ))

        # "Dense" dataset: explicit one-hot encoding of the drawn locations
        dense_dataset = np.zeros((n_data, n_locations))
        dense_dataset[np.arange(n_data), indexes] = 1

        # Features storage holding the one-hot representation of the ids
        storage = OneHotStorage(ids=list(range(n_locations)))

        # Memory footprint of the indexes plus the features storage
        fbid_sizes.append(get_obj_size(indexes) + get_obj_size(storage))
        # Memory footprint of the dense dataset
        dense_sizes.append(get_obj_size(dense_dataset))

    # Both curves of the experiment, lines first and markers on top
    plt.plot(ds_lengths, dense_sizes, label=f"w/o FeaturesById - n_locations={n_locations}", c=dense_color)
    plt.plot(ds_lengths, fbid_sizes, label=f"w/ FeaturesById - n_locations={n_locations}", c=fbid_color)
    plt.scatter(ds_lengths, dense_sizes, c=dense_color)
    plt.scatter(ds_lengths, fbid_sizes, c=fbid_color)
    plt.yscale("log")
    plt.xscale("log")
    plt.xlabel("Dataset Size")
    plt.ylabel("Memory usage (bytes)")

plt.legend()

## Expedia Dataset

You might want to run the different methods individually.\
The results are stored later in the notebook.

### With Choice-Learn and FeaturesByIDs

In [None]:
# Load the Expedia dataset without going through a DataFrame (as_frame=False),
# using the "rumnet" preprocessing.
# Presumably returns a ChoiceDataset - TODO confirm against load_expedia.
# Takes some time
dataset = load_expedia(as_frame=False, preprocessing="rumnet")

# Defining tested data lengths: the memory usage of every method is measured
# on subsets of these sizes (397618 being the largest one tested).
data_lengths = [100, 1000, 10000, 100000, 397618]

In [None]:
# Deep memory footprint of the slice `dataset[:length]`, for every tested length
clearn_memory_size = [get_obj_size(dataset[:length]) for length in data_lengths]

### pandas.DataFrame Long format (PyLogit)

The raw dataframe needs some preprocessing to get the right format. It can take some time.

In [None]:
import pandas as pd
import numpy as np

# Load the raw Expedia data as a pandas.DataFrame
expedia_df = load_expedia(as_frame=True)

# Format dates & time features: parse the timestamp, then derive the day of
# week, month and hour columns from it
expedia_df.date_time = pd.to_datetime(expedia_df.date_time, format="%Y-%m-%d %H:%M:%S")
expedia_df.loc[:, "day_of_week"] = expedia_df.loc[:, "date_time"].dt.dayofweek
expedia_df.loc[:, "month"] = expedia_df.loc[:, "date_time"].dt.month
expedia_df.loc[:, "hour"] = expedia_df.loc[:, "date_time"].dt.hour

# Filtering rare ids: an id is kept only if it appears in more than 1000
# distinct searches (srch_id); every other id is replaced by -1
for id_col in [
    "site_id",
    "visitor_location_country_id",
    "prop_country_id",
    "srch_destination_id",
]:
    # Count each (srch_id, id) pair once so that the count is per search
    value_counts = expedia_df[["srch_id", id_col]].drop_duplicates()[id_col].value_counts()
    kept_ids = value_counts.index[value_counts.gt(1000)]
    # NOTE(review): one full-column comparison per unique id; a single
    # `isin`-based mask would likely be faster - confirm equivalence on NaN first
    for id_ in expedia_df[id_col].unique():
        if id_ not in kept_ids:
            expedia_df.loc[expedia_df[id_col] == id_, id_col] = -1

# Filtering DF for price, stay length, booking window, etc.
# Keep prices within [10, 1000] and work with the log of the price
expedia_df = expedia_df[expedia_df.price_usd <= 1000]
expedia_df = expedia_df[expedia_df.price_usd >= 10]
expedia_df["log_price"] = expedia_df.price_usd.apply(np.log)
# Keep stays of at most 14 and booking windows of at most 365
expedia_df = expedia_df[expedia_df.srch_length_of_stay <= 14]
expedia_df = expedia_df[expedia_df.srch_booking_window <= 365]
# log(1 + x) so that a booking window of 0 maps to 0
expedia_df["booking_window"] = np.log(expedia_df["srch_booking_window"] + 1)
# Remaining missing values are encoded as -1
expedia_df = expedia_df.fillna(-1)

# Sorting DF columns: only the columns listed here are kept, in this order
order_cols = [
    "srch_id",
    "prop_id",
    "prop_starrating",
    "prop_review_score",
    "prop_brand_bool",
    "prop_location_score1",
    "prop_location_score2",
    "prop_log_historical_price",
    "position",
    "promotion_flag",
    "srch_length_of_stay",
    "srch_adults_count",
    "srch_children_count",
    "srch_room_count",
    "srch_saturday_night_bool",
    "orig_destination_distance",
    "random_bool",
    "day_of_week",
    "month",
    "hour",
    "log_price",
    "booking_window",
    "site_id",
    "visitor_location_country_id",
    "prop_country_id",
    "srch_destination_id",
    "click_bool",
    "booking_bool",
]
expedia_df = expedia_df[order_cols]

# Adding the no_purchase fixed effect: 0 for every existing (real) alternative
expedia_df["is_no_purchase"] = 0

# Adding the no_purchase option to the data: one extra row per search.
# df1: searches with exactly one booking. The extra row takes the column-wise
# maximum of the search and is flagged as a no-purchase that was NOT chosen.
df1 = (
    expedia_df.groupby("srch_id")
    .filter(lambda x: x.booking_bool.sum() == 1)
    .groupby("srch_id")
    .max()
    .reset_index(drop=False)
)
df1.loc[:, "is_no_purchase"] = 1
df1.loc[:, "log_price"] = 0
df1.loc[:, "booking_bool"] = 0

# df2: searches without any booking. The extra no-purchase row is the chosen
# alternative (booking_bool = 1).
df2 = (
    expedia_df.groupby("srch_id")
    .filter(lambda x: x.booking_bool.sum() == 0)
    .groupby("srch_id")
    .max()
    .reset_index(drop=False)
)
df2.loc[:, "is_no_purchase"] = 1
df2.loc[:, "log_price"] = 0
df2.loc[:, "booking_bool"] = 1

# Concatenating the created DFs: original rows followed by the no-purchase rows
expedia_df = pd.concat([expedia_df, df1, df2])

# One Hot encoding of the four id columns, then appended as new columns
site_id_one_hot = pd.get_dummies(expedia_df.site_id, prefix="site_id")
visitor_location_country_id_one_hot = pd.get_dummies(expedia_df.visitor_location_country_id, prefix="visitor_location_country_id")
srch_destination_id_one_hot =pd.get_dummies(expedia_df.srch_destination_id, prefix="srch_destination_id")
prop_country_id_one_hpt = pd.get_dummies(expedia_df.prop_country_id, prefix="prop_country_id")
expedia_df = pd.concat([expedia_df, site_id_one_hot, visitor_location_country_id_one_hot, srch_destination_id_one_hot, prop_country_id_one_hpt], axis=1)

# Sorting rows by search id
expedia_df = expedia_df.sort_values("srch_id")
# Column holding the choice indicator
choices = ["booking_bool"]

# Final selection of the columns to match the ChoiceDataset
# Features shared by all the alternatives of a search
contexts_features_names = [
    "srch_id",
    "srch_length_of_stay",
    "srch_adults_count",
    "srch_children_count",
    "srch_room_count",
    "srch_saturday_night_bool",
    "booking_window",
    "random_bool",
    "day_of_week",
    "month",
    "hour",
    "site_id",
    "visitor_location_country_id",
    "srch_destination_id",
]
# The one-hot columns of the four id columns are all added to the context features
contexts_features_names += site_id_one_hot.columns.tolist()
contexts_features_names += visitor_location_country_id_one_hot.columns.tolist()
contexts_features_names += srch_destination_id_one_hot.columns.tolist()
contexts_features_names += prop_country_id_one_hpt.columns.tolist()

# Features that vary from one alternative to another within a search
contexts_items_features_names = [
    "prop_starrating",
    "prop_review_score",
    "prop_brand_bool",
    "prop_location_score1",
    "prop_location_score2",
    "prop_log_historical_price",
    "position",
    "promotion_flag",
    "orig_destination_distance",
    "log_price",
    "prop_country_id",
]

# Long format: one row per (search, alternative) with context, item and choice columns
long_df = expedia_df[contexts_features_names + contexts_items_features_names + choices]

In [None]:
# Accessing RAM usage of the long format DataFrame.
# For each tested length, keep the rows of the first `length` distinct
# srch_id (in order of appearance) and measure the resulting sub-DataFrame.
# The unique ids do not depend on the tested length: compute them once
# instead of rescanning the whole column at every iteration.
all_srch_ids = long_df.srch_id.unique()

long_df_memory_size = []
for length in data_lengths:
    srch_ids = all_srch_ids[:length]
    sub_long_df = long_df[long_df.srch_id.isin(srch_ids)]
    long_df_memory_size.append(get_obj_size(sub_long_df))

print(long_df_memory_size)

### pandas.DataFrame wide format (Biogeme)

In [None]:
import pandas as pd

# Load the raw Expedia data as a pandas.DataFrame (used below to inspect its columns)
df = load_expedia(as_frame=True)

In [None]:
# Display the column names of the raw DataFrame
df.columns

Transformation of the pandas.DataFrame.

In [None]:

# Load the raw Expedia data as a pandas.DataFrame
expedia_df = load_expedia(as_frame=True)

# Format dates & time features: parse the timestamp, then derive the day of
# week, month and hour columns from it
expedia_df.date_time = pd.to_datetime(expedia_df.date_time, format="%Y-%m-%d %H:%M:%S")
expedia_df.loc[:, "day_of_week"] = expedia_df.loc[:, "date_time"].dt.dayofweek
expedia_df.loc[:, "month"] = expedia_df.loc[:, "date_time"].dt.month
expedia_df.loc[:, "hour"] = expedia_df.loc[:, "date_time"].dt.hour

# Filtering rare ids: an id is kept only if it appears in more than 1000
# distinct searches (srch_id); every other id is replaced by -1
for id_col in [
    "site_id",
    "visitor_location_country_id",
    "prop_country_id",
    "srch_destination_id",
]:
    # Count each (srch_id, id) pair once so that the count is per search
    value_counts = expedia_df[["srch_id", id_col]].drop_duplicates()[id_col].value_counts()
    kept_ids = value_counts.index[value_counts.gt(1000)]
    # NOTE(review): one full-column comparison per unique id; a single
    # `isin`-based mask would likely be faster - confirm equivalence on NaN first
    for id_ in expedia_df[id_col].unique():
        if id_ not in kept_ids:
            expedia_df.loc[expedia_df[id_col] == id_, id_col] = -1

# Filtering DF for price, stay length, booking window, etc.
# Keep prices within [10, 1000] and work with the log of the price
expedia_df = expedia_df[expedia_df.price_usd <= 1000]
expedia_df = expedia_df[expedia_df.price_usd >= 10]
expedia_df["log_price"] = expedia_df.price_usd.apply(np.log)
# Keep stays of at most 14 and booking windows of at most 365
expedia_df = expedia_df[expedia_df.srch_length_of_stay <= 14]
expedia_df = expedia_df[expedia_df.srch_booking_window <= 365]
# log(1 + x) so that a booking window of 0 maps to 0
expedia_df["booking_window"] = np.log(expedia_df["srch_booking_window"] + 1)
# Remaining missing values are encoded as -1
expedia_df = expedia_df.fillna(-1)

# Sorting DF columns: only the columns listed here are kept, in this order
order_cols = [
    "srch_id",
    "prop_id",
    "prop_starrating",
    "prop_review_score",
    "prop_brand_bool",
    "prop_location_score1",
    "prop_location_score2",
    "prop_log_historical_price",
    "position",
    "promotion_flag",
    "srch_length_of_stay",
    "srch_adults_count",
    "srch_children_count",
    "srch_room_count",
    "srch_saturday_night_bool",
    "orig_destination_distance",
    "random_bool",
    "day_of_week",
    "month",
    "hour",
    "log_price",
    "booking_window",
    "site_id",
    "visitor_location_country_id",
    "prop_country_id",
    "srch_destination_id",
    "click_bool",
    "booking_bool",
]
expedia_df = expedia_df[order_cols]

# Adding the no_purchase fixed effect: 0 for every existing (real) alternative
expedia_df["is_no_purchase"] = 0

# Adding the no_purchase option to the data: one extra row per search.
# df1: searches with exactly one booking. The extra row takes the column-wise
# maximum of the search and is flagged as a no-purchase that was NOT chosen.
df1 = (
    expedia_df.groupby("srch_id")
    .filter(lambda x: x.booking_bool.sum() == 1)
    .groupby("srch_id")
    .max()
    .reset_index(drop=False)
)
df1.loc[:, "is_no_purchase"] = 1
df1.loc[:, "log_price"] = 0
df1.loc[:, "booking_bool"] = 0

# df2: searches without any booking. The extra no-purchase row is the chosen
# alternative (booking_bool = 1).
df2 = (
    expedia_df.groupby("srch_id")
    .filter(lambda x: x.booking_bool.sum() == 0)
    .groupby("srch_id")
    .max()
    .reset_index(drop=False)
)
df2.loc[:, "is_no_purchase"] = 1
df2.loc[:, "log_price"] = 0
df2.loc[:, "booking_bool"] = 1

# Concatenating the created DFs: original rows followed by the no-purchase rows
expedia_df = pd.concat([expedia_df, df1, df2])

# One Hot encoding of the four id columns, then appended as new columns
site_id_one_hot = pd.get_dummies(expedia_df.site_id, prefix="site_id")
visitor_location_country_id_one_hot = pd.get_dummies(expedia_df.visitor_location_country_id, prefix="visitor_location_country_id")
srch_destination_id_one_hot = pd.get_dummies(expedia_df.srch_destination_id, prefix="srch_destination_id")
prop_country_id_one_hpt = pd.get_dummies(expedia_df.prop_country_id, prefix="prop_country_id")
expedia_df = pd.concat([expedia_df, site_id_one_hot, visitor_location_country_id_one_hot, srch_destination_id_one_hot, prop_country_id_one_hpt], axis=1)

In [None]:
# Features that vary from one alternative to another within a search
contexts_items_features_names = [
    "prop_starrating",
    "prop_review_score",
    "prop_brand_bool",
    "prop_location_score1",
    "prop_location_score2",
    "prop_log_historical_price",
    "position",
    "promotion_flag",
    "orig_destination_distance",
    "log_price",
    "prop_country_id",
]
# Features shared by all the alternatives of a search
contexts_features_names = [
    "srch_id",
    "srch_length_of_stay",
    "srch_adults_count",
    "srch_children_count",
    "srch_room_count",
    "srch_saturday_night_bool",
    "booking_window",
    "random_bool",
    "day_of_week",
    "month",
    "hour",
    "site_id",
    "visitor_location_country_id",
    "srch_destination_id",
]
# Add the one-hot columns, recognized by their prefix.
# NOTE(review): `startswith` also matches the raw id column itself (e.g.
# "site_id" or "prop_country_id"), which is already listed above, so those
# names end up listed twice - confirm this duplication is intended.
for col in expedia_df.columns:
    if col.startswith("prop_country_id"):
        contexts_items_features_names += [col]
    if col.startswith("site_id"):
        contexts_features_names += [col]
    if col.startswith("visitor_location_country_id"):
        contexts_features_names += [col]
    if col.startswith("srch_destination_id"):
        contexts_features_names += [col]

# Wide format: for each alternative slot i, take the item features of the
# i-th row of every search.
# 39 is presumably the maximum number of alternatives per search - TODO confirm.
wide_items = []
for i in range(39):
    try:
        sub_df = expedia_df.groupby("srch_id").apply(lambda x: x[contexts_items_features_names].iloc[i])
    except IndexError:
        # Add dummy row
        # NOTE(review): the try wraps the whole groupby.apply, so an IndexError
        # raised for a single search (fewer than i + 1 rows) replaces the whole
        # slot by this one-row frame indexed by 0 - confirm this is intended.
        sub_df = pd.DataFrame({col: -1 for col in contexts_items_features_names}, index=[0])
    wide_items.append(sub_df)
# Context features are read from the first row of each search
contexts_features = expedia_df.groupby("srch_id").apply(lambda x: x[contexts_features_names].iloc[0])
# Side-by-side concatenation: the 39 item slots followed by the context features
wide_df = pd.concat(wide_items+[contexts_features], axis=1)

In [None]:
# Deep memory footprint of a copy of the first `length` rows of the wide
# DataFrame, for every tested length
wide_df_memory_size = [
    get_obj_size(wide_df.iloc[:length].copy()) for length in data_lengths
]

print(wide_df_memory_size)

### Torch-Choice

For this part you will need the torch-choice package: ```pip install torch-choice```

In [None]:
import torch
from torch_choice.utils.easy_data_wrapper import EasyDatasetWrapper

In [None]:
# First create long_df with previous section
# Drop the current index, then sort the rows by search id with a fresh 0..n-1 index
long_df = long_df.reset_index(drop=True).sort_values("srch_id", ignore_index=True)

In [None]:
# Number of rows of each search, ordered by increasing srch_id
rows_per_search = long_df.srch_id.value_counts().sort_index()
# Within each search, the alternatives are numbered 0, 1, 2, ...
items_ids = [np.arange(nit) for nit in rows_per_search]
long_df["items_id"] = np.concatenate(items_ids)

In [None]:
# Memory usage of the Torch-Choice dataset wrapper for every tested length.
tc_mem_sizes = []
data_lengths = [100, 1000, 10000, 100000, 397618]

# The unique search ids do not depend on the tested length: compute them once
# instead of rescanning the whole column at every iteration.
all_srch_ids = long_df.srch_id.unique()

for length in data_lengths:
    # Rows of the first `length` distinct searches, as an independent copy
    ids = all_srch_ids[:length]
    sub_long_df = long_df[long_df.srch_id.isin(ids)].copy(deep=True)

    # Each search is at the same time a purchase record, a session and a user
    data_1 = EasyDatasetWrapper(
        main_data=sub_long_df,
        purchase_record_column='srch_id',
        choice_column='booking_bool',
        item_name_column='items_id',
        session_index_column='srch_id',
        user_index_column='srch_id',
        # observables are given here as column names of the main dataframe
        user_observable_columns=[
            'srch_length_of_stay',
            'srch_adults_count',
            'srch_children_count',
            'srch_room_count',
            'srch_saturday_night_bool',
        ],
        price_observable_columns=['log_price'],
        device="cpu",
    )
    tc_mem_sizes.append(get_obj_size(data_1))
print(tc_mem_sizes)

### Results

| Dataset size | 100 | 1.000 | 10.000 | 100.000 | 397.618 |
|---|---|---|---|---|---|
| Choice-Learn | 369.157 | 3.574.957 | 35.632.957 | 356.212.957 | 1.416.328.273 |
| Torch-Choice | 5.825.168 | 55.093.676 | 550.480.667 | 5.515.236.600 | 10.448.857.759 |
| Long format DF | 5.521.360 | 52.463.080 | 524.667.470 | 5.234.198.450 | 20.815.361.140 |
| Wide format DF | 3.834.744 | 35.262.904 | 349.542.904 | 3.492.342.904 | 13.885.163.464 |


data_lengths: [100, 1000, 10000, 100000, 397618]\
Choice-Learn: [369157, 3574957, 35632957, 356212957, 1416328273]\
Torch-Choice: [5825088, 55093564, 550480555, 5515236488, 10448857759]\
DF Long Format: [5521360, 52463080, 524667470, 5234198450, 20815361140]\
DF Wide Format: [3834744, 35262904, 349542904, 3492342904, 13885163464]

## Plots and Illustrations

In [None]:
# Shared palette and line styles; a curve picks both with the same index
colors = ["#e69f00", "#56b4e9", "#009e73", "#0072b2", "#d55e00", "#cc79a7", "#f0e442"]
linestyle = ["-", "--", "-.", ":", "-", "--", "-."]


def _draw_curves(x_values, curves):
    """Draw every (values, label, style index) curve as a line, then add its markers."""
    for values, label, style in curves:
        plt.plot(x_values, values, label=label, c=colors[style], ls=linestyle[style])
    for values, _, style in curves:
        plt.scatter(x_values, values, c=colors[style])


fig = plt.figure(figsize=(12, 5))
fig.tight_layout()
fig.subplots_adjust(hspace=0.35)

# --- (a) One-hot features: dense array vs FeaturesStorage (recorded results) ---
sizes = [10, 100, 1000, 10000, 100000, 10000000]
dense_1 = [928, 8128, 80128, 800128, 8000128, 800000128]
fbid_1 = [1036, 1756, 8956, 80956, 800956, 80000956]
dense_2 = [8128, 80128, 800128, 8000128, 80000128, 8000000128]
fbid_2 = [7892, 8612, 15812, 87812, 807812, 80007812]
plt.subplot(1, 2, 1)
_draw_curves(
    sizes,
    [
        (dense_1, 'w/o FeaturesStorage - 10 locations', 0),
        (fbid_1, 'w/ FeaturesStorage - 10 locations', 1),
    ],
)
_draw_curves(
    sizes,
    [
        (dense_2, 'w/o FeaturesStorage - 100 locations', 2),
        (fbid_2, 'w/ FeaturesStorage - 100 locations', 3),
    ],
)
plt.yscale("log")
plt.xscale("log")
plt.xlabel("Dataset Size")
plt.ylabel("Memory usage (bytes)")
plt.legend(prop={'size': 8})

# Title placed below the axes, caption-like
plt.title("(a) Memory usage efficiency of FeaturesStorage \n for one hot encoded features", y=-.3)

# --- (b) Expedia dataset: memory usage of each library (recorded results) ---
plt.subplot(1, 2, 2)
sizes = [100, 1000, 10000, 100000, 397618]
expedia_choice_learn = [369157, 3574957, 35632957, 356212957, 1416328273]
expedia_torch_choice = [5825088, 55093564, 550480555, 5515236488, 10448857759]
expedia_long = [5521360, 52463080, 524667470, 5234198450, 20815361140]
expedia_wide = [3834744, 35262904, 349542904, 3492342904, 13885163464]

_draw_curves(
    sizes,
    [
        (expedia_choice_learn, "Choice-Learn", 3),
        (expedia_torch_choice, "Torch-Choice", 1),
        (expedia_long, "PyLogit (long format)", 0),
        (expedia_wide, "Biogeme (wide format)", 2),
    ],
)
plt.legend(prop={'size': 8})
plt.yscale("log")
plt.xscale("log")
plt.xlabel("Dataset Size")
plt.ylabel("Memory usage (bytes)")
plt.title("(b) Memory usage of the Expedia dataset \n for different dataset sizes", y=-.3)

In [None]:
# Shared palette and line styles; a curve picks both with the same index
colors = ["#e69f00", "#56b4e9", "#009e73", "#0072b2", "#d55e00", "#cc79a7", "#f0e442"]
linestyle = ["-", "--", "-.", ":", "-", "--", "-."]


def _draw_curves(x_values, curves):
    """Draw every (values, label, style index) curve as a line, then add its markers."""
    for values, label, style in curves:
        plt.plot(x_values, values, label=label, c=colors[style], ls=linestyle[style])
    for values, _, style in curves:
        plt.scatter(x_values, values, c=colors[style])


fig = plt.figure(figsize=(12, 10))
fig.tight_layout()
fig.subplots_adjust(hspace=0.35)

# --- (a) One-hot features: dense array vs FeaturesStorage (recorded results) ---
sizes = [10, 100, 1000, 10000, 100000, 10000000]
dense_1 = [928, 8128, 80128, 800128, 8000128, 800000128]
fbid_1 = [1036, 1756, 8956, 80956, 800956, 80000956]
dense_2 = [8128, 80128, 800128, 8000128, 80000128, 8000000128]
fbid_2 = [7892, 8612, 15812, 87812, 807812, 80007812]
plt.subplot(2, 2, 1)
_draw_curves(
    sizes,
    [
        (dense_1, 'w/o FeaturesStorage - 10 locations', 0),
        (fbid_1, 'w/ FeaturesStorage - 10 locations', 1),
    ],
)
_draw_curves(
    sizes,
    [
        (dense_2, 'w/o FeaturesStorage - 100 locations', 2),
        (fbid_2, 'w/ FeaturesStorage - 100 locations', 3),
    ],
)
plt.yscale("log")
plt.xscale("log")
plt.xlabel("Dataset Size")
plt.ylabel("Memory usage (bytes)")
plt.legend(prop={'size': 8})

# Title placed below the axes, caption-like
plt.title("(a) Memory usage efficiency of FeaturesStorage \n for one hot encoded features", y=-.3)

# --- (b) Expedia dataset: memory usage of each library (recorded results) ---
plt.subplot(2, 2, 2)
sizes = [100, 1000, 10000, 100000, 397618]
expedia_choice_learn = [369157, 3574957, 35632957, 356212957, 1416328273]
expedia_torch_choice = [5825088, 55093564, 550480555, 5515236488, 10448857759]
expedia_long = [5521360, 52463080, 524667470, 5234198450, 20815361140]
expedia_wide = [3834744, 35262904, 349542904, 3492342904, 13885163464]

_draw_curves(
    sizes,
    [
        (expedia_choice_learn, "Choice-Learn", 3),
        (expedia_torch_choice, "Torch-Choice", 1),
        (expedia_long, "PyLogit (long format)", 0),
        (expedia_wide, "Biogeme (wide format)", 2),
    ],
)
plt.legend(prop={'size': 8})
plt.yscale("log")
plt.xscale("log")
plt.xlabel("Dataset Size")
plt.ylabel("Memory usage (bytes)")
plt.title("(b) Memory usage of the Expedia dataset \n for different dataset sizes", y=-.3)

# --- (c) Proprietary retail dataset: memory usage vs dataset size ---
plt.subplot(2, 2, 3)

### Results with our proprietary dataset
sizes = [100, 1000, 10000.0, 100000.0, 1000000.0, 4789225]
# Torch-Choice
tc = [3933312, 4854912, 14070912, 106230912, 1027830912, 4907997312]
# Wide
wide = [629748, 5954312, 59178320, 591244240, 5914324260, 28317686484]
# Long
long = [729260, 7216256, 79819560, 1000640616, 10190708220, 47241911756]
# Choice-Learn
cl = [163734, 526146, 3921306, 34453314, 319713662, 1546499942]

_draw_curves(
    sizes,
    [
        (cl, "Choice-Learn", 3),
        (tc, "Torch-Choice", 1),
        (long, "PyLogit (long format)", 0),
        (wide, "Biogeme (wide format)", 2),
    ],
)
plt.legend(prop={'size': 8})
plt.yscale("log")
plt.xscale("log")
plt.xlabel("Dataset Size")
plt.ylabel("Memory usage (bytes)")
plt.title("(c) Memory usage of our retail dataset \n for different dataset sizes", y=-.3)

# --- (d) Proprietary retail dataset: memory usage vs number of stores ---
plt.subplot(2, 2, 4)
### Results with our proprietary dataset
n_stores = [0, 10, 100, 250, 692]
cl_mem_usage = [352083911, 352084711, 352163911, 352583911, 355914823]
other_mem = [1027830912, 1027831712, 1027910912, 1028330912, 1031661824]
long_mem_usage = [4654663932, 4734664572, 5454670332, 6654679932, 10190708220]
wide_mem_usage = [378279972, 458280612, 1178286372, 2378295972, 5914324260]

_draw_curves(
    n_stores,
    [
        (cl_mem_usage, "Choice-Learn", 3),
        (other_mem, "Torch-Choice", 1),
        (long_mem_usage, "PyLogit (long format)", 0),
        (wide_mem_usage, "Biogeme (wide format)", 2),
    ],
)
plt.legend(prop={'size': 8})
# Only the y axis is logarithmic here: the x axis starts at 0 stores
plt.yscale("log")
plt.xlabel("Stores Number")
plt.ylabel("Memory usage (bytes)")
plt.title("(d) Memory usage of our retail dataset \n for different number of stores", y=-.3)
plt.show()

## FeaturesByIDs Study

In [None]:
# One entry per experiment:
# (n_fixed_features, n_different_values, color w/o FeaturesById, color w/ FeaturesById)
study_settings = [
    (10, 10, "darkblue", "turquoise"),
    (100, 100, "cornflowerblue", "teal"),
]

for n_fixed_features, n_different_values, normal_color, cd_color in study_settings:
    normal_sizes = []
    cd_sizes = []
    ds_lengths = [10, 100, 1000, 10000, 100000, 10000000]

    for dataset_len in ds_lengths:
        # Plain representation: the features are repeated on every row
        normal_dataset = np.ones((dataset_len, n_fixed_features))
        # FeaturesById-like representation: one column of ids per row plus
        # a single table holding the features of each distinct value
        cd_dataset = (np.ones((dataset_len, 1)), np.ones((n_different_values, n_fixed_features)))

        ids_column, features_table = cd_dataset
        cd_sizes.append(sys.getsizeof(ids_column) + sys.getsizeof(features_table))
        normal_sizes.append(sys.getsizeof(normal_dataset))

    setting_tag = f"({n_fixed_features}, {n_different_values})"
    plt.plot(ds_lengths, normal_sizes, label=f"w/o FeaturesById - {setting_tag}", c=normal_color)
    plt.plot(ds_lengths, cd_sizes, label=f"w/ FeaturesById - {setting_tag}", c=cd_color)
    plt.yscale("log")
    plt.xscale("log")
    plt.xlabel("Dataset Size")
    plt.ylabel("Memory usage (bytes)")

plt.legend()