# RAM usage with Choice-Learn

## On the ICDM 2013 Expedia Dataset

In [None]:
import sys

sys.path.append("../../")
import logging

import matplotlib.pyplot as plt
import numpy as np
import pandas as pf

from choice_learn.datasets import load_expedia
from choice_learn.data import ChoiceDataset


Accessing the RAM usage of an object is actually not that obvious. Here is a code snippet that finds all the objects referenced by a given object in order to add up their memory consumption.
This is explained [here](https://stackoverflow.com/questions/13530762/how-to-know-bytes-size-of-python-object-like-arrays-and-dictionaries-the-simp).

In [None]:
import gc
import sys

def get_obj_size(obj):
    marked = {id(obj)}
    obj_q = [obj]
    sz = 0

    while obj_q:
        sz += sum(map(sys.getsizeof, obj_q))

        # Lookup all the object referred to by the object in obj_q.
        all_refr = ((id(o), o) for o in gc.get_referents(*obj_q))

        # Filter object that are already marked.
        new_refr = {o_id: o for o_id, o in all_refr if o_id not in marked and not isinstance(o, type)}

        obj_q = new_refr.values()
        marked.update(new_refr.keys())

    return sz

# Dataset lengths at which the memory footprint is measured.
data_lengths = [100, 1_000, 10_000, 100_000, 397_618]

### With Choice-Learn and FeaturesByIDs

In [None]:
# Takes some time
# Load the Expedia data through choice-learn with the "rumnet" preprocessing.
# NOTE(review): as_frame=False presumably returns a ChoiceDataset rather than a
# DataFrame — confirm against the load_expedia documentation.
dataset = load_expedia(as_frame=False, preprocessing="rumnet")

In [None]:
# Measure the footprint of the first `length` entries of `dataset` for every
# tested length. The four array attributes are copied before being measured
# (presumably so that a slice is sized as an owning array — TODO confirm);
# `features_by_ids` is measured as is.
copied_attributes = (
    "shared_features_by_choice",
    "items_features_by_choice",
    "choices",
    "available_items_by_choice",
)
cl_w_fbid_memory_size = []
for length in data_lengths:
    sub_dataset = dataset[:length]
    # Generator expression: each copy is measured then released before the next one.
    mem_size = sum(
        get_obj_size(np.copy(getattr(sub_dataset, attribute)))
        for attribute in copied_attributes
    )
    mem_size += get_obj_size(sub_dataset.features_by_ids)
    cl_w_fbid_memory_size.append(mem_size)

In [None]:
# Show the sizes measured with FeaturesByIDs, one value per entry of data_lengths.
print(cl_w_fbid_memory_size)

### Choice-Learn without FeaturesByIDs

In [None]:
# Reuses the `dataset` object loaded earlier in the notebook.
# Iterate over it by batches of 1024 and keep the first two elements of each
# batch; they are used afterwards as shared_features_by_choice and
# items_features_by_choice of a new ChoiceDataset.
sfbc = []
ifbc = []
for batch in dataset.iter_batch(batch_size=1024):
    sfbc.append(batch[0])
    ifbc.append(batch[1])

# Stack the per-batch arrays along the first axis into single arrays.
sfbc = np.concatenate(sfbc, axis=0)
ifbc = np.concatenate(ifbc, axis=0)
print(sfbc.shape, ifbc.shape)

In [None]:
# Build a ChoiceDataset directly from the concatenated feature arrays (no
# features_by_ids argument is passed), reusing the availabilities and the
# choices of `dataset`.
wo_fbid_dataset = ChoiceDataset(
    shared_features_by_choice=sfbc,
    items_features_by_choice=ifbc,
    available_items_by_choice=dataset.available_items_by_choice,
    choices=dataset.choices
)

# Same measurement as for the FeaturesByIDs dataset, applied to the dataset
# built from plain arrays: copy and size the four array attributes, then add
# the size of `features_by_ids`.
measured_array_attributes = (
    "shared_features_by_choice",
    "items_features_by_choice",
    "choices",
    "available_items_by_choice",
)
wofbid_mem = []
for length in data_lengths:
    sub_dataset = wo_fbid_dataset[:length]
    # Generator expression: each copy is measured then released before the next one.
    mem_size = sum(
        get_obj_size(np.copy(getattr(sub_dataset, attribute)))
        for attribute in measured_array_attributes
    )
    mem_size += get_obj_size(sub_dataset.features_by_ids)
    wofbid_mem.append(mem_size)

In [None]:
# Show the sizes measured without FeaturesByIDs, one value per entry of data_lengths.
print(wofbid_mem)

In [None]:
# Display the sizes measured with FeaturesByIDs again, for comparison.
cl_w_fbid_memory_size

In [None]:
# Plot both footprints against the tested dataset lengths (instead of the
# implicit list index 0..4) and label the axes so the figure is readable alone.
plt.plot(data_lengths, wofbid_mem, label="without")
plt.plot(data_lengths, cl_w_fbid_memory_size, label="with")
plt.xlabel("Dataset size")
plt.ylabel("Memory size (bytes)")
plt.legend()

In [None]:
# Inspect the dtype of the first entry of the availabilities.
wo_fbid_dataset.available_items_by_choice[0].dtype

In [None]:
# Ratio between the measured sizes of the first `shared_features_by_choice`
# entry with and without FeaturesByIDs (each entry is copied before sizing).
with_fbid_size = get_obj_size(np.copy(dataset.shared_features_by_choice[0]))
without_fbid_size = get_obj_size(np.copy(wo_fbid_dataset.shared_features_by_choice[0]))
with_fbid_size / without_fbid_size

### pandas.DataFrame Long format

In [None]:
import pandas as pd
import numpy as np

# Load the Expedia data as a DataFrame, parse the timestamps and derive the
# day of week, month and hour columns from them.
expedia_df = load_expedia(as_frame=True)
logging.info("rumnet preprocessing selected, starting preprocessing...")
expedia_df["date_time"] = pd.to_datetime(expedia_df["date_time"], format="%Y-%m-%d %H:%M:%S")
for new_column, datetime_field in (
    ("day_of_week", "dayofweek"),
    ("month", "month"),
    ("hour", "hour"),
):
    expedia_df.loc[:, new_column] = getattr(expedia_df.loc[:, "date_time"].dt, datetime_field)

logging.info("Filtering ids with less than 1000 occurrences")
for id_col in [
    "site_id",
    "visitor_location_country_id",
    "prop_country_id",
    "srch_destination_id",
]:
    # Count each id once per search (duplicated (srch_id, id) pairs are dropped).
    value_counts = expedia_df[["srch_id", id_col]].drop_duplicates()[id_col].value_counts()
    # Only ids seen in strictly more than 1000 searches are kept.
    kept_ids = value_counts.index[value_counts.gt(1000)]
    # A single vectorised mask replaces the per-id Python loop, which built a
    # full boolean mask for every unique id. Missing values are excluded so
    # that they stay untouched here, exactly as with the equality-based loop.
    rare_id_mask = ~expedia_df[id_col].isin(kept_ids) & expedia_df[id_col].notna()
    expedia_df.loc[rare_id_mask, id_col] = -1

logging.info("Filtering DF for price, stay length, booking window, etc.")
# Keep rows whose price_usd lies within [10, 1000], then add its logarithm.
price_in_range = (expedia_df["price_usd"] <= 1000) & (expedia_df["price_usd"] >= 10)
expedia_df = expedia_df[price_in_range]
expedia_df["log_price"] = expedia_df["price_usd"].apply(np.log)
# Keep stays of at most 14 and booking windows of at most 365, then add
# log(1 + booking window).
search_in_range = (expedia_df["srch_length_of_stay"] <= 14) & (
    expedia_df["srch_booking_window"] <= 365
)
expedia_df = expedia_df[search_in_range]
expedia_df["booking_window"] = np.log(expedia_df["srch_booking_window"] + 1)
# Remaining missing values are encoded as -1.
expedia_df = expedia_df.fillna(-1)

logging.info("Sorting DF columns")
# The kept columns, assembled group by group in their final order.
identifier_cols = ["srch_id", "prop_id"]
property_cols = [
    "prop_starrating",
    "prop_review_score",
    "prop_brand_bool",
    "prop_location_score1",
    "prop_location_score2",
    "prop_log_historical_price",
    "position",
    "promotion_flag",
]
search_cols = [
    "srch_length_of_stay",
    "srch_adults_count",
    "srch_children_count",
    "srch_room_count",
    "srch_saturday_night_bool",
    "orig_destination_distance",
    "random_bool",
]
derived_cols = ["day_of_week", "month", "hour", "log_price", "booking_window"]
categorical_id_cols = [
    "site_id",
    "visitor_location_country_id",
    "prop_country_id",
    "srch_destination_id",
]
outcome_cols = ["click_bool", "booking_bool"]

order_cols = (
    identifier_cols
    + property_cols
    + search_cols
    + derived_cols
    + categorical_id_cols
    + outcome_cols
)
# Select the columns, which also drops every column not listed above.
expedia_df = expedia_df[order_cols]

logging.info("Creating dummy availabilities")

# Every existing row is a real offer: flag it as "not the no-purchase option".
expedia_df["is_no_purchase"] = 0

logging.info("Creating the no purchase option")


def _no_purchase_rows(frame, n_bookings, booking_value):
    """Build one no-purchase row per search having exactly `n_bookings` bookings.

    Each row holds the per-search maximum of every column, then is flagged
    with is_no_purchase=1 and log_price=0, and its booking_bool is set to
    `booking_value`.
    """
    rows = (
        frame.groupby("srch_id")
        .filter(lambda search: search.booking_bool.sum() == n_bookings)
        .groupby("srch_id")
        .max()
        .reset_index(drop=False)
    )
    rows.loc[:, "is_no_purchase"] = 1
    rows.loc[:, "log_price"] = 0
    rows.loc[:, "booking_bool"] = booking_value
    return rows


# Searches with exactly one booking: the added no-purchase row is not chosen.
df1 = _no_purchase_rows(expedia_df, n_bookings=1, booking_value=0)
# Searches without any booking: the added no-purchase row is the chosen one.
df2 = _no_purchase_rows(expedia_df, n_bookings=0, booking_value=1)
expedia_df = pd.concat([expedia_df, df1, df2])

# One-hot encode the four categorical id columns, then append the resulting
# indicator columns to the right of the DataFrame.
site_id_one_hot = pd.get_dummies(expedia_df["site_id"], prefix="site_id")
visitor_location_country_id_one_hot = pd.get_dummies(
    expedia_df["visitor_location_country_id"],
    prefix="visitor_location_country_id",
)
srch_destination_id_one_hot = pd.get_dummies(
    expedia_df["srch_destination_id"],
    prefix="srch_destination_id",
)
prop_country_id_one_hpt = pd.get_dummies(
    expedia_df["prop_country_id"],
    prefix="prop_country_id",
)
one_hot_frames = [
    site_id_one_hot,
    visitor_location_country_id_one_hot,
    srch_destination_id_one_hot,
    prop_country_id_one_hpt,
]
expedia_df = pd.concat([expedia_df, *one_hot_frames], axis=1)

logging.info("Sorting the data frame")
expedia_df = expedia_df.sort_values("srch_id")
# Name of the column holding the choice indicator.
choices = ["booking_bool"]

logging.info("DF to NDarray and creating the ChoiceDataset object")
# Context-level columns: the raw ones first, then every one-hot indicator column.
contexts_features_names = [
    "srch_id",
    "srch_length_of_stay",
    "srch_adults_count",
    "srch_children_count",
    "srch_room_count",
    "srch_saturday_night_bool",
    "booking_window",
    "random_bool",
    "day_of_week",
    "month",
    "hour",
    "site_id",
    "visitor_location_country_id",
    "srch_destination_id",
]
for one_hot_frame in (
    site_id_one_hot,
    visitor_location_country_id_one_hot,
    srch_destination_id_one_hot,
    prop_country_id_one_hpt,
):
    contexts_features_names.extend(one_hot_frame.columns.tolist())

# Item-level columns.
contexts_items_features_names = [
    "prop_starrating",
    "prop_review_score",
    "prop_brand_bool",
    "prop_location_score1",
    "prop_location_score2",
    "prop_log_historical_price",
    "position",
    "promotion_flag",
    "orig_destination_distance",
    "log_price",
    "prop_country_id",
]

# Long-format frame: context columns, then item columns, then the choice column.
long_format_columns = contexts_features_names + contexts_items_features_names + choices
long_df = expedia_df[long_format_columns]

In [None]:
# Measure the footprint of the long-format DataFrame restricted to the rows
# of the first `length` unique search ids, for every tested length.
long_df_memory_size = []
# The unique search ids do not depend on the loop variable: compute them once
# instead of rescanning the whole column at every iteration.
unique_srch_ids = long_df.srch_id.unique()
for length in data_lengths:
    srch_ids = unique_srch_ids[:length]
    sub_long_df = long_df[long_df.srch_id.isin(srch_ids)]
    long_df_memory_size.append(get_obj_size(sub_long_df))

### Results

| Dataset size | 100 | 1.000 | 10.000 | 100.000 | 397.618 |
|---|---|---|---|---|---|
| CD w. FeaturesByIDs | 398.887 | 3.869.287 | 38.573.287 | 385.613.287 | 1.533.228.295 |
| CD wo FeaturesByIDs | 190.028 | 1.887.428 | 18.861.428 | 188.601.428 | 749.908.976 |

data_lengths: [100, 1000, 10000, 100000, 397618]\
ChoiceDataset with FeaturesByIDs: [85640, 771440, 7629440, 76209440, 302994356]\
ChoiceDataset without FeaturesByIDs: [220400, 2198600, 21980600, 219800600, 873964964]\
DF Long Format: [5521360, 52463080, 524667470, 5234198450, 20815361140]