In [1]:
DTYPES = {
    'date_time': 'string',
    'site_name': 'uint8',
    'posa_continent': 'uint8',
    'user_location_country': 'uint8',
    'user_location_region': 'uint16',
    'user_location_city': 'uint16',
    'orig_destination_distance': 'float32',
    'user_id': 'uint32',
    'is_mobile': 'bool',
    'is_package': 'bool',
    'channel': 'uint8',
    'srch_ci': 'string',
    'srch_co': 'string',
    'srch_adults_cnt': 'uint8',
    'srch_children_cnt': 'uint8',
    'srch_rm_cnt': 'uint8',
    'srch_destination_id': 'uint16',
    'srch_destination_type_id': 'uint8',
    'is_booking': 'bool',
    'cnt': 'uint16',
    'hotel_continent': 'uint8',
    'hotel_country': 'uint8',
    'hotel_market': 'uint16',
    'hotel_cluster': 'uint8',
}
DATETIME_COLUMNS = ['date_time', 'srch_ci', 'srch_co']
BOOLEAN_COLUMNS = ['is_booking', 'is_mobile', 'is_package']

In [2]:
import numpy as np
import polars as pl


def map_to_polars(dtype: str):
    conversion = {
        'string': pl.String,
        'uint8': pl.UInt8,
        'uint16': pl.UInt16,
        'uint32': pl.UInt32,
        'float32': pl.Float32,
        'bool': pl.UInt8 
    }
    return conversion[dtype]

dtypes = {k: map_to_polars(v) for k, v in DTYPES.items()}
df = pl.read_csv('../data/raw/train.csv', dtypes=dtypes)
df = df.with_columns(
    *[pl.col(col).str.to_datetime() for col in DATETIME_COLUMNS]
)
for col in BOOLEAN_COLUMNS:
    df = df.replace(col, df[col] == 1)

    df = df.with_columns(new_column.alias(column_name))
instead.
  df = df.replace(col, df[col] == 1)


In [3]:
TARGET = 'is_booking'

In [4]:
df = df.with_columns(
    co_ci_diff=(pl.col('srch_co') - pl.col('srch_ci')).dt.total_days().cast(pl.Int16),
    ci_dt_diff=(pl.col('srch_ci') - pl.col('date_time')).dt.total_days().cast(pl.Int32)
)

In [32]:
numerical_columns = [column for column in df.columns if df[column].dtype not in [pl.Boolean, pl.String, pl.Datetime]]
categorical_columns = [column for column in df.columns if df[column].dtype in [pl.Boolean, pl.String]]
str_columns = [column for column in df.columns if df[column].dtype == pl.String]
datetime_columns = [column for column in df.columns if column not in numerical_columns and column not in categorical_columns]

In [33]:
data = df[['date_time'] + numerical_columns + categorical_columns]

In [34]:
x = data.drop(TARGET)
y = data[TARGET]

In [35]:
from collections.abc import Callable

from numpy.typing import NDArray


Metric = Callable[[NDArray, NDArray, NDArray], float]

In [112]:
from datetime import datetime, timedelta
from math import floor

import pandas as pd

from sklearn.base import BaseEstimator, clone
from tqdm.notebook import tqdm


def get_interval(start: datetime, end: datetime, dt_column: pl.Series) -> pl.Series:
    return (dt_column >= start) & (dt_column < end)

def handle_nans(df: pd.DataFrame) -> pd.DataFrame:
    df[df.isna()] = np.nan
    return df


def blocked_cross_validation(
    start: datetime,
    end: datetime,
    training_interval_len: timedelta,
    test_interval_len: timedelta,
    estimator: BaseEstimator,
    x: pl.DataFrame,
    y: pl.DataFrame,
    dt_column: pl.Series,
    metrics: dict[str, Metric]
) -> tuple[dict[str, list[float]], list[BaseEstimator]]:
    result_metrics = {key: [] for key in metrics.keys()}
    estimators = []
    n_intervals = floor((end - test_interval_len - start) / training_interval_len)

    for i in tqdm(range(n_intervals - 1)):
        est = clone(estimator)
        training_dt_interval = start + i * training_interval_len, start + (i + 1) * training_interval_len
        test_dt_interval = training_dt_interval[-1], training_dt_interval[-1] + test_interval_len
        training_interval = get_interval(*training_dt_interval, dt_column)
        test_interval = get_interval(*test_dt_interval, dt_column)
        est = est.fit(
            handle_nans(x.filter(training_interval).to_pandas()), 
            y.filter(training_interval).to_pandas()
        )
        x_test = handle_nans(x.filter(test_interval).to_pandas())
        y_test = y.filter(test_interval).to_pandas()
        test_probas_predictions = est.predict_proba(x_test)
        test_predictions = est.predict(x_test)

        for key, metric in metrics.items():
            result_metrics[key].append(metric(y_test, test_predictions, test_probas_predictions))
        
        estimators.append(est)

    return result_metrics, estimators        

In [113]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


cat_columns = categorical_columns.copy()
cat_columns.remove('is_booking')
train_data = x.drop('date_time')

numerical_transform = Pipeline([
    ('NumericalImputer', SimpleImputer(strategy='median', missing_values=np.nan)),
    ('Scaler', StandardScaler()),
])

cat_transform = Pipeline([
    ('CategoricalImputer', SimpleImputer(strategy='most_frequent')),
    ('OneHot', OneHotEncoder())
])


base_pipeline = Pipeline([
    (
        'TransformingColumns',
        ColumnTransformer([
            ('Numerical', numerical_transform, numerical_columns),
            ('Categorical', cat_transform, str_columns)
        ])
    ),
    ('Logreg', LogisticRegression())
])

In [114]:
from sklearn.metrics import accuracy_score


def accuracy(y_true: NDArray, y_pred: NDArray, probas: NDArray) -> float:
    return accuracy_score(y_true, y_pred)

In [115]:
metrics, estimators = blocked_cross_validation(
    x['date_time'].min(),
    x['date_time'].max(),
    timedelta(days=14),
    timedelta(days=7),
    base_pipeline,
    x=x.drop('date_time'),
    y=y,
    dt_column=x['date_time'],
    metrics={
        'accuracy': accuracy
    }
)

  0%|          | 0/50 [00:00<?, ?it/s]

In [116]:
metrics

{'accuracy': [0.9108279953448325,
  0.8931218466758941,
  0.9124201409617835,
  0.9108920891163276,
  0.9116017527201136,
  0.9089401576668465,
  0.9080102717031147,
  0.9048914993040842,
  0.9042779249667694,
  0.9067562336331549,
  0.9079329427639332,
  0.9116167443577791,
  0.9129467000754129,
  0.9125516588969548,
  0.9096090568026917,
  0.9065447071094102,
  0.9060179663120645,
  0.9076864576148281,
  0.9029109906915481,
  0.9054218967921897,
  0.9049106672750886,
  0.9046437487606331,
  0.9069737895986963,
  0.9061032606911045,
  0.9166078536544053,
  0.9144604032342826,
  0.9153258179818532,
  0.9145987002182547,
  0.9142649146434682,
  0.9205328281151857,
  0.9212114411111942,
  0.9180942962682878,
  0.9185104074732511,
  0.9169454531360147,
  0.9163258066107663,
  0.9199963812247348,
  0.9222884269909526,
  0.9260913074462117,
  0.9298683914169288,
  0.9306258803078361,
  0.9283309046966098,
  0.9271749907304715,
  0.9284368363698443,
  0.9279566754834317,
  0.9261252220911419