In [14]:
# Install pytorch-lifestream into the running kernel's environment.
# NOTE(review): the version is not pinned — consider pinning it so the notebook stays reproducible.
%pip install pytorch-lifestream

Defaulting to user installation because normal site-packages is not writeable
Collecting urllib3<1.27,>=1.21.1 (from requests->transformers==4.*->pytorch-lifestream)
  Obtaining dependency information for urllib3<1.27,>=1.21.1 from https://files.pythonhosted.org/packages/b0/53/aa91e163dcfd1e5b82d8a890ecf13314e3e149c05270cc644581f77f17fd/urllib3-1.26.18-py2.py3-none-any.whl.metadata
  Using cached urllib3-1.26.18-py2.py3-none-any.whl.metadata (48 kB)
Collecting charset-normalizer~=2.0.0 (from requests->transformers==4.*->pytorch-lifestream)
  Using cached charset_normalizer-2.0.12-py3-none-any.whl (39 kB)
Using cached urllib3-1.26.18-py2.py3-none-any.whl (143 kB)
[33mDEPRECATION: pytorch-lightning 1.6.5 has a non-standard dependency specifier torch>=1.8.*. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pytorch-lightning or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussio

In [1]:
import os
from functools import partial
from pathlib import Path

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from lightgbm import LGBMClassifier
from ptls.data_load.datasets import MemoryMapDataset, inference_data_loader
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames import PtlsDataModule
from ptls.frames.coles import ColesDataset, CoLESModule
from ptls.frames.coles.split_strategy import NoSplit, SampleSlices
from ptls.nn import AggFeatureSeqEncoder, RnnSeqEncoder, TrxEncoder
from ptls.preprocessing import PandasDataPreprocessor
from pytorch_lightning.callbacks import ModelCheckpoint
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split



In [2]:
# Directory that holds the hackathon parquet files. Set the DATA_DIR
# environment variable to run the notebook on another machine; the default
# keeps the location the notebook was originally executed with.
DATA_DIR = Path(os.environ.get("DATA_DIR", "/home/jupyter/datasphere/project"))

# Raw event tables, one row per purchased item (see the preview below).
df_train = pd.read_parquet(DATA_DIR / "train_dataset_hackaton2023_train.gzip")
df_test = pd.read_parquet(DATA_DIR / "hackaton2023_test.gzip")

In [3]:
# Show the first rows of the freshly loaded training frame.
df_train.head()

Unnamed: 0,customer_id,date_diff_post,buy_post,group_name,revenue,startdatetime,dish_name,ownareaall_sqm,format_name
0,29891,9.0,1,train,69.99,2022-12-05 12:03:58,Кинг Фри станд,300.0,Отдельно стоящий без внешней зоны
1,29891,9.0,1,train,190.0,2022-12-05 12:03:58,Чикен Тар-Тар,300.0,Отдельно стоящий без внешней зоны
2,29891,9.0,1,train,9.99,2022-12-05 12:03:58,Соус Сырный,300.0,Отдельно стоящий без внешней зоны
3,29891,9.0,1,train,119.99,2022-12-05 12:03:58,Энергет.нап. Адреналин Раш,300.0,Отдельно стоящий без внешней зоны
4,29891,9.0,1,train,119.99,2022-12-05 14:28:35,Латте (СТАНД.),300.0,Отдельно стоящий без внешней зоны


In [4]:
# Normalise column types and derive the hour-of-day feature on both frames.
# The frames are updated in place because later cells read `df_train` and
# `df_test` by name.
for df in (df_train, df_test):
    # Categorical columns are cast to plain strings before preprocessing.
    df["dish_name"] = df["dish_name"].astype(str)
    df["format_name"] = df["format_name"].astype(str)
    # Read the hour straight from the datetime accessor instead of formatting
    # it to text and parsing it back; astype(int) keeps the integer dtype.
    df["hour"] = df["startdatetime"].dt.hour.astype(int)

    # errors="ignore" makes the cell safe to re-run: on a second execution the
    # column is already gone and a plain drop would raise KeyError.
    df.drop(columns=["group_name"], inplace=True, errors="ignore")

In [5]:
# Column roles handed to the preprocessor: the client id, the event timestamp,
# the categorical and numerical event features, and the per-client column
# taken from the first item (`buy_post`).
preprocessor_options = dict(
    col_id="customer_id",
    col_event_time="startdatetime",
    cols_category=["dish_name", "format_name"],
    cols_numerical=["revenue", "ownareaall_sqm", "hour"],
    cols_first_item=["buy_post"],
    return_records=True,
)
preprocessor = PandasDataPreprocessor(**preprocessor_options)

In [6]:
%%time

# Fit the preprocessor on the training frame only, then reuse the fitted
# preprocessor to transform the test frame.
dataset_train = preprocessor.fit_transform(df_train)
dataset_test = preprocessor.transform(df_test)

CPU times: user 4min 39s, sys: 25 s, total: 5min 4s
Wall time: 4min 53s


In [7]:
# Hold out 20% of the records, then cut that hold-out in half, which yields an
# 80/10/10 train/valid/test partition. The fixed random_state keeps the split
# identical between runs.
train, valid_test = train_test_split(dataset_train, test_size=0.2, random_state=42)
valid, test = train_test_split(valid_test, test_size=0.5, random_state=42)

# Display the three partition sizes as the cell output.
len(train), len(valid), len(test)

(400000, 50000, 50000)

## Baseline: AggFeatureSeqEncoder features + LightGBM

In [8]:
# Keyword arguments for the aggregate-feature sequence encoder.
params = {
    "numeric_values": {
        # NOTE(review): `{"identity"}` is a one-element set literal, not the
        # string "identity" — confirm which form AggFeatureSeqEncoder expects.
        "revenue": {"identity"},
        # NOTE(review): "hour" sits under numeric values but carries an
        # embedding-style {"in": 24} option — confirm this is intended.
        "hour": {"in": 24},
    },
    "embeddings": {
        # Presumably "in" is the category dictionary size; both values are
        # hard-coded — TODO verify them against the fitted preprocessor.
        "dish_name": {"in": 1000},
        "format_name": {"in": 10},
    },
}

seq_encoder = AggFeatureSeqEncoder(**params)

# Wrap the encoder in a CoLES module. The optimizer (Adam, lr=0.001) and the
# scheduler (StepLR, step_size=30, gamma=0.9) are passed as factories built
# with `partial`.
model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(
        torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9
    ),
)

In [9]:
# Request a single GPU when CUDA is available, otherwise run on CPU.
gpu_count = int(torch.cuda.is_available())

trainer = pl.Trainer(max_epochs=30, gpus=gpu_count, enable_progress_bar=False)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [10]:
# Both partitions use the same loader settings, so bind them once.
make_inference_loader = partial(inference_data_loader, num_workers=0, batch_size=16)


def _stack_predictions(loader):
    """Run `trainer.predict` on `loader` and stack the per-batch outputs row-wise."""
    batch_outputs = trainer.predict(model, loader)
    return torch.vstack(batch_outputs)


train_dl = make_inference_loader(train)
train_embeds = _stack_predictions(train_dl)

test_dl = make_inference_loader(test)
test_embeds = _stack_predictions(test_dl)

Missing logger folder: /home/jupyter/work/resources/lightning_logs
  rank_zero_warn(
  rank_zero_warn(


In [11]:
def _target_array(records):
    """Collect the `buy_post` value of every record into a NumPy array."""
    return np.array([record["buy_post"] for record in records])


# Feature matrices: the stacked embeddings converted to NumPy.
X_train, X_test = (embeds.numpy() for embeds in (train_embeds, test_embeds))

# Target vectors, in the same record order as the embeddings.
y_train = _target_array(train)
y_test = _target_array(test)

In [12]:
# Classifier settings are kept in one place; verbose=-1 is passed through unchanged.
lgbm_options = {"n_estimators": 100, "verbose": -1}

lgbm = LGBMClassifier(**lgbm_options)
lgbm.fit(X_train, y_train)

In [13]:
# ROC AUC is computed from the predicted probability of the positive class
# (column 1 of predict_proba) on each partition.
train_auc = roc_auc_score(y_train, lgbm.predict_proba(X_train)[:, 1])
test_auc = roc_auc_score(y_test, lgbm.predict_proba(X_test)[:, 1])

print(f'Train score: {train_auc:.4f}')
print(f'Test score: {test_auc:.4f}')

Train score: 0.7031
Test score: 0.6890
