In [1]:
DTYPES = {
    'date_time': 'string',
    'site_name': 'uint8',
    'posa_continent': 'uint8',
    'user_location_country': 'uint8',
    'user_location_region': 'uint16',
    'user_location_city': 'uint16',
    'orig_destination_distance': 'float32',
    'user_id': 'uint32',
    'is_mobile': 'bool',
    'is_package': 'bool',
    'channel': 'uint8',
    'srch_ci': 'string',
    'srch_co': 'string',
    'srch_adults_cnt': 'uint8',
    'srch_children_cnt': 'uint8',
    'srch_rm_cnt': 'uint8',
    'srch_destination_id': 'uint16',
    'srch_destination_type_id': 'uint8',
    'is_booking': 'bool',
    'cnt': 'uint16',
    'hotel_continent': 'uint8',
    'hotel_country': 'uint8',
    'hotel_market': 'uint16',
    'hotel_cluster': 'uint8',
}
DATETIME_COLUMNS = ['date_time', 'srch_ci', 'srch_co']
BOOLEAN_COLUMNS = ['is_booking', 'is_mobile', 'is_package']

In [2]:
import numpy as np
import polars as pl


def map_to_polars(dtype: str):
    conversion = {
        'string': pl.String,
        'uint8': pl.UInt8,
        'uint16': pl.UInt16,
        'uint32': pl.UInt32,
        'float32': pl.Float32,
        'bool': pl.UInt8 
    }
    return conversion[dtype]

dtypes = {k: map_to_polars(v) for k, v in DTYPES.items()}
df = pl.read_csv('../data/raw/train.csv', dtypes=dtypes)
df = df.with_columns(
    *[pl.col(col).str.to_datetime() for col in DATETIME_COLUMNS]
)
for col in BOOLEAN_COLUMNS:
    df = df.replace(col, df[col] == 1)

    df = df.with_columns(new_column.alias(column_name))
instead.
  df = df.replace(col, df[col] == 1)


In [3]:
TARGET = 'is_booking'

In [4]:
df = df.with_columns(
    co_ci_diff=(pl.col('srch_co') - pl.col('srch_ci')).dt.total_days().cast(pl.Int16),
    ci_dt_diff=(pl.col('srch_ci') - pl.col('date_time')).dt.total_days().cast(pl.Int32)
)

In [5]:
numerical_columns = [column for column in df.columns if df[column].dtype not in [pl.Boolean, pl.String, pl.Datetime]]
categorical_columns = [column for column in df.columns if df[column].dtype in [pl.Boolean, pl.String]]
str_columns = [column for column in df.columns if df[column].dtype == pl.String]
datetime_columns = [column for column in df.columns if column not in numerical_columns and column not in categorical_columns]

In [6]:
data = df[['date_time'] + numerical_columns + categorical_columns]

In [7]:
x = data.drop(TARGET)
y = data[TARGET]

In [8]:
from collections.abc import Callable

from numpy.typing import NDArray
from sklearn.base import BaseEstimator


Metric = Callable[[pl.DataFrame, pl.DataFrame, BaseEstimator], float]

In [9]:
from datetime import datetime, timedelta
from math import floor

import pandas as pd

from sklearn.base import BaseEstimator, clone
from tqdm.notebook import tqdm


def get_interval(start: datetime, end: datetime, dt_column: pl.Series) -> pl.Series:
    return (dt_column >= start) & (dt_column < end)

def handle_nans(df: pd.DataFrame) -> pd.DataFrame:
    df[df.isna()] = np.nan
    return df


def blocked_cross_validation(
    start: datetime,
    end: datetime,
    training_interval_len: timedelta,
    test_interval_len: timedelta,
    estimator: BaseEstimator,
    x: pl.DataFrame,
    y: pl.DataFrame,
    dt_column: pl.Series,
    metrics: dict[str, Metric]
) -> tuple[dict[str, list[float]], list[BaseEstimator]]:
    result_metrics = {key: [] for key in metrics.keys()}
    estimators = []
    n_intervals = floor((end - test_interval_len - start) / training_interval_len)

    for i in tqdm(range(n_intervals - 1)):
        est = clone(estimator)
        training_dt_interval = start + i * training_interval_len, start + (i + 1) * training_interval_len
        test_dt_interval = training_dt_interval[-1], training_dt_interval[-1] + test_interval_len
        training_interval = get_interval(*training_dt_interval, dt_column)
        test_interval = get_interval(*test_dt_interval, dt_column)
        est = est.fit(
            handle_nans(x.filter(training_interval).to_pandas()), 
            y.filter(training_interval).to_pandas()
        )
        x_test = x.filter(test_interval)
        y_test = y.filter(test_interval)
        #x_test = handle_nans(x.filter(test_interval).to_pandas())
        #y_test = y.filter(test_interval).to_pandas()

        for key, metric in metrics.items():
            result_metrics[key].append(metric(x_test, y_test, est))
        
        estimators.append(est)

    return result_metrics, estimators        

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


cat_columns = categorical_columns.copy()
cat_columns.remove('is_booking')
train_data = x.drop('date_time')

numerical_transform = Pipeline([
    ('NumericalImputer', SimpleImputer(strategy='median', missing_values=np.nan)),
    ('Scaler', StandardScaler()),
])

cat_transform = Pipeline([
    ('CategoricalImputer', SimpleImputer(strategy='most_frequent')),
    ('OneHot', OneHotEncoder())
])


base_pipeline = Pipeline([
    (
        'TransformingColumns',
        ColumnTransformer([
            ('Numerical', numerical_transform, numerical_columns),
            ('Categorical', cat_transform, str_columns)
        ])
    ),
    ('Logreg', LogisticRegression())
])

In [11]:
from collections import defaultdict


cluster_country = x.select('hotel_country', 'hotel_market', 'hotel_cluster')
unique_values = cluster_country.unique()
country_to_hotel_data = defaultdict(list)

for row in unique_values.iter_rows():
    country_to_hotel_data[row[0]].append(row[1:])

In [20]:
x.select('hotel_cluster', 'hotel_country').n_unique()

6912

In [12]:
from sklearn.metrics import accuracy_score


def accuracy(x_test: NDArray, y_test: NDArray, estimator: BaseEstimator) -> float:
    return accuracy_score(y_test, estimator.predict(x_test))


def sample_rows(row, columns: list[str], n: int) -> list[tuple]:
    country_idx = columns.index('hotel_country')
    market_idx = columns.index('hotel_market')
    cluster_idx = columns.index('hotel_cluster')
    country = row[country_idx]
    result = [row]
    i = 0
    hd = country_to_hotel_data[country]
    while len(result) < n and i < len(hd):
        if hd[i] == (row[market_idx], row[cluster_idx]):
            i += 1
            continue

        row_copy = list(row)
        row_copy[market_idx], row_copy[cluster_idx] = hd[i]
        result.append(tuple(row_copy))
        i += 1
    return result


def single_mrr(row: tuple, x_test: pl.DataFrame, 
               estimator: BaseEstimator, n: int = 5) -> float:
    rows = sample_rows(row, x_test.columns, n)
    rows = pl.DataFrame(
        data={
            col: [row[idx] for row in rows] for idx, col in enumerate(x_test.columns)
        },
        schema=x_test.schema
    )
    preds = list(estimator.predict_proba(rows.to_pandas())[:, 1].reshape(-1))
    pred = preds[0]
    preds.sort(reverse=True)
    return 1 / (1 + preds.index(pred))


def mrr(x_test: pl.DataFrame, y_test: pl.DataFrame, 
        estimator: BaseEstimator, n: int = 5) -> float:
    x_test = x_test.filter(y_test)
    result = 0
    for row in x_test.iter_rows():
        result += single_mrr(row, x_test, estimator, n) / len(x_test)
    return result
    

In [13]:
metrics, estimators = blocked_cross_validation(
    x['date_time'].min(),
    x['date_time'].max(),
    timedelta(days=14),
    timedelta(days=7),
    base_pipeline,
    x=x.drop('date_time'),
    y=y,
    dt_column=x['date_time'],
    metrics={
        'mrr': mrr
    }
)

  0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
metrics