In [1]:
DTYPES = {
    'date_time': 'string',
    'site_name': 'uint8',
    'posa_continent': 'uint8',
    'user_location_country': 'uint8',
    'user_location_region': 'uint16',
    'user_location_city': 'uint16',
    'orig_destination_distance': 'float32',
    'user_id': 'uint32',
    'is_mobile': 'bool',
    'is_package': 'bool',
    'channel': 'uint8',
    'srch_ci': 'string',
    'srch_co': 'string',
    'srch_adults_cnt': 'uint8',
    'srch_children_cnt': 'uint8',
    'srch_rm_cnt': 'uint8',
    'srch_destination_id': 'uint16',
    'srch_destination_type_id': 'uint8',
    'is_booking': 'bool',
    'cnt': 'uint16',
    'hotel_continent': 'uint8',
    'hotel_country': 'uint8',
    'hotel_market': 'uint16',
    'hotel_cluster': 'uint8',
}
DATETIME_COLUMNS = ['date_time', 'srch_ci', 'srch_co']
BOOLEAN_COLUMNS = ['is_booking', 'is_mobile', 'is_package']

In [12]:
import numpy as np
import polars as pl


def map_to_polars(dtype: str):
    conversion = {
        'string': pl.String,
        'uint8': pl.UInt8,
        'uint16': pl.UInt16,
        'uint32': pl.UInt32,
        'int16': pl.Int16,
        'int32': pl.Int32,
        'float32': pl.Float32,
        'bool': pl.UInt8 
    }
    return conversion[dtype]

def map_to_np(dtype):
    conversion = {
        pl.String: 'string',
        pl.UInt8: 'uint8',
        pl.UInt16: 'uint16',
        pl.UInt32: 'uint32',
        pl.Float32: 'float32',
        pl.Int8: 'int8',
        pl.Int16: 'int16',
        pl.Int32: 'int32',
        pl.Boolean: 'bool'
    }
    return conversion.get(dtype, 'datetime')

In [13]:
dtypes = {k: map_to_polars(v) for k, v in DTYPES.items()}
df = pl.read_csv('../data/raw/train.csv', dtypes=dtypes)
df = df.with_columns(
    *[pl.col(col).str.to_datetime() for col in DATETIME_COLUMNS]
)
for col in BOOLEAN_COLUMNS:
    df = df.replace(col, df[col] == 1)

    df = df.with_columns(new_column.alias(column_name))
instead.
  df = df.replace(col, df[col] == 1)


In [14]:
TARGET = 'is_booking'

In [26]:
from src.features import FeatureExtractorPipeline, AddColumns, DropColumns, ColumnSplitter, Cast


feature_extractor = FeatureExtractorPipeline([
    AddColumns({
        'co_ci_diff': ((pl.col('srch_co') - pl.col('srch_ci')).dt.total_days(), 'int16'),
        'ci_dt_diff': ((pl.col('srch_ci') - pl.col('date_time')).dt.total_days(), 'int32'),
        'ci_weekday': (pl.col('srch_ci').dt.weekday(), 'uint8'),
        'co_weekday': (pl.col('srch_co').dt.weekday(), 'uint8'),
        'date_time_weekday': (pl.col('date_time').dt.weekday(), 'uint8')
    }),
    DropColumns(['orig_destination_distance', 'srch_ci', 'srch_co']),
    Cast({
        'is_mobile': 'uint8',
        'is_package': 'uint8'
    }),
    ColumnSplitter(num_cat_threshold=250)
])

In [27]:
data, state = feature_extractor.fit_transform(df)

In [29]:
x = data.drop(TARGET)
y = data[TARGET]

In [30]:
from collections.abc import Callable

from sklearn.base import BaseEstimator


Metric = Callable[[pl.DataFrame, pl.DataFrame, BaseEstimator], float]

In [31]:
from datetime import datetime, timedelta
from math import floor

import pandas as pd

from sklearn.base import BaseEstimator, clone
from tqdm.notebook import tqdm


def get_interval(start: datetime, end: datetime, dt_column: pl.Series) -> pl.Series:
    return (dt_column >= start) & (dt_column < end)

def handle_nans(df: pd.DataFrame) -> pd.DataFrame:
    df[df.isna()] = np.nan
    return df


def blocked_cross_validation(
    start: datetime,
    end: datetime,
    training_interval_len: timedelta,
    test_interval_len: timedelta,
    estimator: BaseEstimator,
    x: pl.DataFrame,
    y: pl.DataFrame,
    dt_column: pl.Series,
    metrics: dict[str, Metric],
) -> tuple[dict[str, list[float]], list[BaseEstimator]]:
    result_metrics = {key: [] for key in metrics.keys()}
    estimators = []
    n_intervals = floor((end - test_interval_len - start) / training_interval_len)
    for i in tqdm(range(n_intervals - 1)):
        est = clone(estimator)
        training_dt_interval = start + i * training_interval_len, start + (i + 1) * training_interval_len
        test_dt_interval = training_dt_interval[-1], training_dt_interval[-1] + test_interval_len
        training_interval = get_interval(*training_dt_interval, dt_column)
        test_interval = get_interval(*test_dt_interval, dt_column)
        est = est.fit(
            handle_nans(x.filter(training_interval).to_pandas()), 
            y.filter(training_interval).to_pandas()
        )
        x_test = x.filter(test_interval)
        y_test = y.filter(test_interval)

        for key, metric in metrics.items():
            result_metrics[key].append(metric(x_test, y_test, est))
        
        estimators.append(est)

    return result_metrics, estimators        

In [None]:
from sklearn.base import BaseEstimator, ClassifierMixin


class ModelWrapper(BaseEstimator, ClassifierMixin):
    def __init__(self, base: ClassifierMixin) -> None:
        self.base = base
    

In [21]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


cat_columns = state.categorical_columns.copy()
cat_columns.remove('is_booking')
cat_columns.remove('date_time')

numerical_transform = Pipeline([
    ('NumericalImputer', SimpleImputer(strategy='median', missing_values=np.nan)),
    ('Scaler', StandardScaler()),
])

cat_transform = Pipeline([
    ('CategoricalImputer', SimpleImputer(strategy='most_frequent')),
    ('OneHot', OneHotEncoder(handle_unknown='ignore'))
])

numerical_oh_transform = Pipeline([
    ('Imputer', SimpleImputer(strategy='most_frequent')),
    ('OneHot', OneHotEncoder(handle_unknown='ignore'))
])


base_pipeline = Pipeline([
    (
        'TransformingColumns',
        ColumnTransformer([
            ('Numerical', numerical_transform, state.numerical_columns),
            ('Categorical', cat_transform, cat_columns),
            ('NumCategorical', numerical_oh_transform, state.numerical_categorical_columns)
        ])
    ),
    ('Logreg', LogisticRegression(max_iter=1_000))
])

In [18]:
x.select('hotel_continent', 'hotel_country', 'hotel_market', 'hotel_cluster').group_by(['hotel_continent', 'hotel_country', 'hotel_market']).n_unique()

hotel_continent,hotel_country,hotel_market,hotel_cluster
u8,u8,u16,u32
0,185,185,25
6,204,27,81
2,50,214,35
6,105,12,60
2,50,627,57
…,…,…,…
6,127,0,1
6,17,1730,8
4,25,172,1
5,172,1575,1


In [28]:
from collections import defaultdict

from numpy.typing import NDArray


class DataSampler:
    '''
    Class responsible for sampling negative samples for validation
    '''

    def __init__(self, df: pl.DataFrame) -> None:
        cluster_country = df.select('hotel_country', 'hotel_market', 'hotel_cluster')
        unique_values = cluster_country.unique()
        data = defaultdict(set)

        for row in unique_values.iter_rows():
            data[row[0], row[1]].add(row[-1])

        self.hotel_data = {
            key: np.array(list(data[key]))
            for key in data.keys()
        }
        self.columns = df.columns
        self.country_idx = self.columns.index('hotel_country')
        self.market_idx = self.columns.index('hotel_market')
        self.cluster_idx = self.columns.index('hotel_cluster')
    
    def sample(self, row: tuple, n: int) -> NDArray:
        market, country = row[self.market_idx], row[self.country_idx]
        clusters = self.hotel_data[country, market]
        clusters = np.random.choice(clusters, size=n)
        row = np.array(row)[None, :].repeat(n + 1, 0)
        row[1:, self.cluster_idx] = clusters
        return row

In [29]:
from collections.abc import Generator

from sklearn.metrics import accuracy_score


def accuracy(x_test: pl.DataFrame, y_test: pl.DataFrame, estimator: BaseEstimator) -> float:
    return accuracy_score(y_test, estimator.predict(x_test))


def iterate_batched(data: pl.DataFrame, batch_size: int) -> Generator[pl.DataFrame, None, None]:
    i = 0
    j = min(batch_size, len(data))
    yield data[i:j]
    while j < len(data):
        i += batch_size
        j = min(j + batch_size, len(data))
        yield data[i:j]


def batch_mrr(x_test: pl.DataFrame, estimator: BaseEstimator, sampler: DataSampler, n: int = 5) -> float:
    rows = np.row_stack([
        sampler.sample(row, n)
        for row in x_test.iter_rows()
    ])
    rows = pl.DataFrame(
        data={
            col: rows[:, idx].astype(map_to_np(x_test[col].dtype)) for idx, col in enumerate(x_test.columns)
        },
        schema=x_test.schema
    )
    predictions = estimator.predict_proba(rows.to_pandas())[:, 1]
    predictions = predictions.reshape(len(x_test), n + 1)
    real_values = predictions[:, 0]
    predictions = np.sort(predictions, axis=1)[:, ::-1]
    indices = (predictions == real_values.reshape(-1, 1)).argmax(axis=1)
    return np.sum(1 / (1 + indices))


def mrr(x_test: pl.DataFrame, y_test: pl.DataFrame, 
        estimator: BaseEstimator, sampler: DataSampler, 
        n: int = 5, batch_size: int = 1024) -> float:
    x_test = x_test.filter(y_test)
    mrr_ = 0
    for batch in iterate_batched(x_test, batch_size):
        mrr_ += batch_mrr(batch, estimator, sampler, n)
    return mrr_ / len(x_test)


In [30]:
from functools import partial


metrics, estimators = blocked_cross_validation(
    x['date_time'].min(),
    x['date_time'].max(),
    timedelta(days=14),
    timedelta(days=7),
    base_pipeline,
    x=x.drop('date_time'),
    y=y,
    dt_column=x['date_time'],
    metrics={
        'mrr': partial(mrr, sampler=DataSampler(x), n=20, batch_size=1024),
    }
)

  0%|          | 0/50 [00:00<?, ?it/s]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [31]:
metrics

{'mrr': [0.2514054627039801,
  0.24604246116184006,
  0.2429972635089392,
  0.24775715450181135,
  0.23111330718826373,
  0.23232540591869363,
  0.2302143697276338,
  0.22953785697145926,
  0.2295741040687718,
  0.23621720125537815,
  0.23478251899859395,
  0.2376430963712376,
  0.23764499617212462,
  0.24398948499908685,
  0.23406927830775634,
  0.23713344266400052,
  0.24210865868750986,
  0.24289978666743223,
  0.23554257543078316,
  0.2394396626060353,
  0.2373548593656031,
  0.23755818593939285,
  0.23991595646844896,
  0.24684451944485625,
  0.24428859527867636,
  0.2525491350068459,
  0.23994625625729807,
  0.24107214030172242,
  0.24760294339204955,
  0.23806871577252514,
  0.2371216566632705,
  0.2365458577565375,
  0.2339456200784841,
  0.22820938983233205,
  0.2299033463857955,
  0.2290864631297007,
  0.22364609519252723,
  0.2284398370908542,
  0.23340714833080575,
  0.2356383225546567,
  0.23443292773341984,
  0.2263919187003552,
  0.23104342119812132,
  0.2306574618581730

In [33]:
import pickle


with open('../models/baseline.pkl', 'wb') as file:
    pickle.dump(estimators, file)