In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import OneHotEncoder

from lightgbm import LGBMClassifier

In [2]:
path_to_data = 'data'

In [3]:
df_train = pd.read_parquet(path_to_data + '/train_dataset_hackaton2023_train.gzip')
df_train.head()
# df_test = pd.read_parquet(path_to_data + '/hackaton2023_test.gzip')

Unnamed: 0,customer_id,date_diff_post,buy_post,group_name,revenue,startdatetime,dish_name,ownareaall_sqm,format_name
0,29891,9.0,1,train,69.99,2022-12-05 12:03:58,Кинг Фри станд,300.0,Отдельно стоящий без внешней зоны
1,29891,9.0,1,train,190.0,2022-12-05 12:03:58,Чикен Тар-Тар,300.0,Отдельно стоящий без внешней зоны
2,29891,9.0,1,train,9.99,2022-12-05 12:03:58,Соус Сырный,300.0,Отдельно стоящий без внешней зоны
3,29891,9.0,1,train,119.99,2022-12-05 12:03:58,Энергет.нап. Адреналин Раш,300.0,Отдельно стоящий без внешней зоны
4,29891,9.0,1,train,119.99,2022-12-05 14:28:35,Латте (СТАНД.),300.0,Отдельно стоящий без внешней зоны


In [4]:
df_train = df_train.drop_duplicates()

In [5]:
data = df_train.groupby(['customer_id', 'startdatetime']).agg({'revenue': 'sum', 'buy_post': 'first'}).reset_index()
data.head()

Unnamed: 0,customer_id,startdatetime,revenue,buy_post
0,29891,2022-12-05 12:03:58,389.97,1
1,29891,2022-12-05 14:28:35,119.99,1
2,29891,2022-12-15 00:37:19,269.99,1
3,29891,2022-12-20 09:20:38,144.97,1
4,29891,2022-12-21 09:46:23,184.96,1


In [6]:
data['delta'] = data.groupby('customer_id')['startdatetime'].diff() 

In [7]:
data.head()

Unnamed: 0,customer_id,startdatetime,revenue,buy_post,delta
0,29891,2022-12-05 12:03:58,389.97,1,NaT
1,29891,2022-12-05 14:28:35,119.99,1,0 days 02:24:37
2,29891,2022-12-15 00:37:19,269.99,1,9 days 10:08:44
3,29891,2022-12-20 09:20:38,144.97,1,5 days 08:43:19
4,29891,2022-12-21 09:46:23,184.96,1,1 days 00:25:45


In [8]:
data['delta_hours'] = data['delta'].dt.total_seconds() // (60*60)

In [9]:
data = data.drop('delta', axis=1).fillna(0)

In [10]:
data.head()

Unnamed: 0,customer_id,startdatetime,revenue,buy_post,delta_hours
0,29891,2022-12-05 12:03:58,389.97,1,0.0
1,29891,2022-12-05 14:28:35,119.99,1,2.0
2,29891,2022-12-15 00:37:19,269.99,1,226.0
3,29891,2022-12-20 09:20:38,144.97,1,128.0
4,29891,2022-12-21 09:46:23,184.96,1,24.0


In [11]:
data = data.groupby('customer_id').agg({
    'revenue': ['mean', 'median', 'std', 'max', 'min', 'count'],
    'delta_hours': ['mean', 'median', 'std', 'max', 'min',], 
    'startdatetime': ['min', 'max'],
    'buy_post': 'first'
}).reset_index()

data.columns = [f'{header}_{stat}' for header, stat in data.columns]

In [12]:
data.head()

Unnamed: 0,customer_id_,revenue_mean,revenue_median,revenue_std,revenue_max,revenue_min,revenue_count,delta_hours_mean,delta_hours_median,delta_hours_std,delta_hours_max,delta_hours_min,startdatetime_min,startdatetime_max,buy_post_first
0,29891,203.494,199.96,123.170275,439.98,1.0,25,55.24,24.0,72.50246,266.0,0.0,2022-12-05 12:03:58,2023-02-01 09:55:59,1
1,30477,227.024,229.99,124.933425,499.95,44.99,25,49.4,23.0,60.578462,216.0,0.0,2022-10-04 09:25:05,2022-11-25 08:53:01,1
2,31426,349.2775,274.99,327.626906,1079.97,1.0,24,49.708333,34.5,46.635944,164.0,0.0,2023-05-12 16:05:44,2023-07-01 20:54:48,1
3,44491,128.725,59.98,144.471912,344.97,49.97,4,166.75,11.5,318.351352,644.0,0.0,2023-06-10 21:59:25,2023-07-08 18:10:44,1
4,44939,554.943333,554.94,49.985,604.93,504.96,3,123.333333,172.0,107.598017,198.0,0.0,2022-12-10 11:54:04,2022-12-25 22:38:48,1


In [13]:
data['lambda'] = data['revenue_count'] / ((data['startdatetime_max'] - data['startdatetime_min']).dt.days + 1)
data = data.drop(['startdatetime_min', 'startdatetime_max'], axis=1)

In [14]:
data.head()

Unnamed: 0,customer_id_,revenue_mean,revenue_median,revenue_std,revenue_max,revenue_min,revenue_count,delta_hours_mean,delta_hours_median,delta_hours_std,delta_hours_max,delta_hours_min,buy_post_first,lambda
0,29891,203.494,199.96,123.170275,439.98,1.0,25,55.24,24.0,72.50246,266.0,0.0,1,0.431034
1,30477,227.024,229.99,124.933425,499.95,44.99,25,49.4,23.0,60.578462,216.0,0.0,1,0.480769
2,31426,349.2775,274.99,327.626906,1079.97,1.0,24,49.708333,34.5,46.635944,164.0,0.0,1,0.470588
3,44491,128.725,59.98,144.471912,344.97,49.97,4,166.75,11.5,318.351352,644.0,0.0,1,0.142857
4,44939,554.943333,554.94,49.985,604.93,504.96,3,123.333333,172.0,107.598017,198.0,0.0,1,0.1875


In [15]:
data_train, data_valid = train_test_split(data, test_size=.2, random_state=42)

In [18]:
X_train = data_train.drop(['buy_post_first', 'customer_id_'], axis=1)
y_train = data_train['buy_post_first']

X_valid = data_valid.drop(['buy_post_first', 'customer_id_'], axis=1)
y_valid = data_valid['buy_post_first']

In [19]:
model = LGBMClassifier(verbose=-1)

res = cross_val_score(model, X_train, y_train, scoring='roc_auc')
print(f'ROC-AUC: {res.mean():.3f} ± {res.std():.3f}')

ROC-AUC: 0.731 ± 0.001


In [20]:
model.fit(X_train, y_train)

In [21]:
pd.Series(model.feature_importances_, index=X_train.columns).sort_values()

delta_hours_min         0
revenue_max           202
lambda                202
revenue_mean          210
delta_hours_std       213
delta_hours_max       220
revenue_min           242
revenue_std           256
revenue_median        270
delta_hours_median    369
delta_hours_mean      387
revenue_count         429
dtype: int32

In [33]:
X_train.corr()

Unnamed: 0,revenue_mean,revenue_median,revenue_std,revenue_max,revenue_min,revenue_count,delta_hours_mean,delta_hours_median,delta_hours_std,delta_hours_max,delta_hours_min,lambda
revenue_mean,1.0,0.93555,0.614021,0.840854,0.725301,-0.163515,0.170957,0.180376,0.101493,0.074578,,-0.087631
revenue_median,0.93555,1.0,0.447379,0.675092,0.683809,-0.162116,0.175947,0.18421,0.104965,0.076654,,-0.101094
revenue_std,0.614021,0.447379,1.0,0.878893,0.001053,-0.068328,0.016444,-0.040205,0.051999,0.038232,,0.019604
revenue_max,0.840854,0.675092,0.878893,1.0,0.368512,0.015596,0.036712,0.043387,0.016526,0.029825,,-0.036691
revenue_min,0.725301,0.683809,0.001053,0.368512,1.0,-0.252427,0.25115,0.286171,0.137796,0.081327,,-0.104167
revenue_count,-0.163515,-0.162116,-0.068328,0.015596,-0.252427,1.0,-0.346464,-0.203846,-0.329529,-0.200837,,0.076713
delta_hours_mean,0.170957,0.175947,0.016444,0.036712,0.25115,-0.346464,1.0,0.633628,0.87741,0.84201,,-0.4903
delta_hours_median,0.180376,0.18421,-0.040205,0.043387,0.286171,-0.203846,0.633628,1.0,0.21487,0.215117,,-0.299137
delta_hours_std,0.101493,0.104965,0.051999,0.016526,0.137796,-0.329529,0.87741,0.21487,1.0,0.954231,,-0.427306
delta_hours_max,0.074578,0.076654,0.038232,0.029825,0.081327,-0.200837,0.84201,0.215117,0.954231,1.0,,-0.468585
