In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/burger-king-churn/train_dataset_hackaton2023_train.gzip
/kaggle/input/burger-king-churn/hackaton2023_test.gzip


In [2]:
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# Data preprocessing & feature generation

In [3]:
data = pd.read_parquet('/kaggle/input/burger-king-churn/train_dataset_hackaton2023_train.gzip')
display(data.sample(5))

Unnamed: 0,customer_id,date_diff_post,buy_post,group_name,revenue,startdatetime,dish_name,ownareaall_sqm,format_name
9684984,17917429,,0,train,149.98,2022-11-29 10:54:59,Чикенбургер новый,112.7,Отдельно стоящий без внешней зоны без туалета
15136978,24180734,14.0,1,train,60.0,2023-02-22 21:10:46,Чизбургер,315.1,Отдельно стоящий
17412333,26933716,,0,train,79.99,2022-12-27 13:20:30,Ord.Сырный и Кисло-сладкий соусы,76.1,Фудкорт без туалета
18179796,28003299,1.0,1,train,14.98,2023-07-26 19:14:12,"Up Фрустайл Лимон Лайм мал 0,4",236.7,Отдельно стоящий с внешней зоной
4624670,8220911,,0,train,220.0,2022-10-04 16:31:46,Воппер,102.0,Фудкорт с туалетом


In [4]:
has_toilet_statuses = [
    "Отдельно стоящий с внешней зоной",
    "Отдельно стоящий без внешней зоны",
    "Отдельно стоящий",
    "Фудкорт с туалетом",
    "Отдельно стоящий с внешн.зоной без туалета",
]

hasnt_toilet_statuses = [
    "Фудкорт без туалета",
    "Отдельно стоящий без внешней зоны без туалета",
    "Отдельно стоящий без туалета",
]

with_external_zone = [
    "Отдельно стоящий с внешней зоной",
    "Отдельно стоящий с внешн.зоной без туалета",
]

without_external_zone = [
    "Отдельно стоящий без внешней зоны",
    "Отдельно стоящий без внешней зоны без туалета",
]

is_foodcourt = [
    "Фудкорт с туалетом",
    "Фудкорт без туалета",
]

In [5]:
def preprocess(data, is_train=True):
    
    if is_train:
        data = data[data.format_name != 'Не заполнено']
        #slight date_diff_post aligning
        data.date_diff_post.fillna(-1, inplace=True)
        data.date_diff_post = data.date_diff_post.astype(int)
    
    # make has_toilet column and store there true or false based on has_toilet_statuses
    data['has_toilet'] = data.format_name.isin(has_toilet_statuses)

    # if has external zone then store 2, else 1, if not in with/without_external_zone then store 0
    data['has_external_zone'] = data.format_name.apply(
        lambda x: 2 if x in with_external_zone else 1 if x in without_external_zone else 0
    )

    # 2 - has external zone
    # 1 - hasnt external zone
    # 0 - not defined

    data['is_foodcourt'] = data.format_name.isin(is_foodcourt)

    data.drop('format_name', axis=1, inplace=True)
    
    if is_train:
        # filter outliers in dish_name that have less than 10000 rows
        dish_name_counts = data.dish_name.value_counts()
        dish_name_counts = dish_name_counts[dish_name_counts > 50000]
        data = data[data.dish_name.isin(dish_name_counts.index)]
    
    # create bill_id based on startdatetime and customer_id
    data['bill_id'] = data.groupby(['startdatetime', 'customer_id']).ngroup()
    
    # drop group_name
    data.drop('group_name', axis=1, inplace=True)
    
    # label encode dish_name
    le = LabelEncoder()
    data['dish_name'] = le.fit_transform(data.dish_name)
    
    if is_train:
        data = data.groupby('bill_id').agg({
            'bill_id': 'first',
            'customer_id': 'first',
            'dish_name': list,
            'startdatetime': 'first',
            'has_toilet': 'first',
            'has_external_zone': 'first',
            'is_foodcourt': 'first',
            'date_diff_post': 'first',
            'buy_post': 'first',
            'revenue': 'sum',
            'ownareaall_sqm': 'first',
        }).reset_index(drop=True)
    else:
        data = data.groupby('bill_id').agg({
            'bill_id': 'first',
            'customer_id': 'first',
            'dish_name': list,
            'startdatetime': 'first',
            'has_toilet': 'first',
            'has_external_zone': 'first',
            'is_foodcourt': 'first',
            'revenue': 'sum',
            'ownareaall_sqm': 'first',
        }).reset_index(drop=True)
    
    #datetime to timestamp
    data['startdatetime'] = [datetime.timestamp(n) for n in data['startdatetime']]
    
    data['dish_amount'] = [len(n) for n in data['dish_name']]
    
    return data

In [6]:
data = preprocess(data)
data.sample(5)

Unnamed: 0,bill_id,customer_id,dish_name,startdatetime,has_toilet,has_external_zone,is_foodcourt,date_diff_post,buy_post,revenue,ownareaall_sqm,dish_amount
1689070,1689070,38028356,[32],1681672000.0,True,0,True,-1,0,139.99,120.52,1
1908550,1908550,28850470,"[35, 52]",1683999000.0,True,0,False,8,1,119.98,319.6,2
1781785,1781785,42523601,"[30, 30, 23, 28, 52, 30]",1682678000.0,True,2,False,10,1,250.99,300.2,6
948604,948604,38524989,"[1, 7, 11, 54, 30, 44, 21, 27, 4, 28, 17, 27, 7]",1673789000.0,False,0,True,-1,0,906.99,83.8,13
540107,540107,19346224,"[36, 7, 7]",1669231000.0,False,0,True,14,1,579.98,92.3,3


In [7]:
cat_feat = ['has_external_zone', 'has_toilet', 'is_foodcourt']

# Train test split

In [8]:
x = data[[column for column in data.columns if column not in ['buy_post', 'date_diff_post', 'customer_id', 'dish_name']]]
y = data[['buy_post', 'date_diff_post']]
y_clf = y['buy_post']
y_reg = y['date_diff_post']

x_train, x_test, y_train, y_test = train_test_split(x, y)
y_train_clf, y_test_clf = y_train['buy_post'], y_test['buy_post']
y_train_reg, y_test_reg = y_train['date_diff_post'], y_test['date_diff_post'] 

In [9]:
train_pool_clf = Pool(x_train, y_train_clf, cat_features=cat_feat)
eval_pool_clf = Pool(x_test, y_test_clf, cat_features=cat_feat)
full_train_clf = Pool(x, y_clf, cat_features=cat_feat)

train_pool_reg = Pool(x_train, y_train_reg, cat_features=cat_feat)
eval_pool_reg = Pool(x_test, y_test_reg, cat_features=cat_feat)
full_train_reg = Pool(x, y_reg, cat_features=cat_feat)

# Model training

In [10]:
cbc = CatBoostClassifier(iterations=100, learning_rate=0.4, depth=8, l2_leaf_reg=10, loss_function='Logloss')
cbc.fit(train_pool_clf, eval_set=eval_pool_clf)

0:	learn: 0.5797496	test: 0.5799408	best: 0.5799408 (0)	total: 534ms	remaining: 52.8s
1:	learn: 0.5348462	test: 0.5351617	best: 0.5351617 (1)	total: 929ms	remaining: 45.5s
2:	learn: 0.5158532	test: 0.5162667	best: 0.5162667 (2)	total: 1.39s	remaining: 44.9s
3:	learn: 0.5076401	test: 0.5081021	best: 0.5081021 (3)	total: 1.79s	remaining: 43s
4:	learn: 0.5041006	test: 0.5045812	best: 0.5045812 (4)	total: 2.25s	remaining: 42.8s
5:	learn: 0.5026040	test: 0.5031273	best: 0.5031273 (5)	total: 2.68s	remaining: 42s
6:	learn: 0.5019108	test: 0.5024666	best: 0.5024666 (6)	total: 3.05s	remaining: 40.6s
7:	learn: 0.5015303	test: 0.5021058	best: 0.5021058 (7)	total: 3.48s	remaining: 40.1s
8:	learn: 0.5012973	test: 0.5018810	best: 0.5018810 (8)	total: 3.91s	remaining: 39.5s
9:	learn: 0.5011223	test: 0.5017284	best: 0.5017284 (9)	total: 4.29s	remaining: 38.6s
10:	learn: 0.5010316	test: 0.5016425	best: 0.5016425 (10)	total: 4.67s	remaining: 37.8s
11:	learn: 0.5009096	test: 0.5015203	best: 0.5015203 (11

<catboost.core.CatBoostClassifier at 0x787177f10610>

In [11]:
cbr = CatBoostRegressor(iterations=100, learning_rate=0.4, depth=8, l2_leaf_reg=10)
cbr.fit(train_pool_reg, eval_set=eval_pool_reg)

0:	learn: 16.0728678	test: 16.0691891	best: 16.0691891 (0)	total: 423ms	remaining: 41.9s
1:	learn: 16.0662657	test: 16.0623961	best: 16.0623961 (1)	total: 783ms	remaining: 38.3s
2:	learn: 16.0621837	test: 16.0585213	best: 16.0585213 (2)	total: 1.21s	remaining: 39s
3:	learn: 16.0602910	test: 16.0569597	best: 16.0569597 (3)	total: 1.58s	remaining: 37.9s
4:	learn: 16.0581120	test: 16.0548728	best: 16.0548728 (4)	total: 1.98s	remaining: 37.7s
5:	learn: 16.0563998	test: 16.0535971	best: 16.0535971 (5)	total: 2.36s	remaining: 36.9s
6:	learn: 16.0553378	test: 16.0530240	best: 16.0530240 (6)	total: 2.68s	remaining: 35.7s
7:	learn: 16.0533340	test: 16.0509530	best: 16.0509530 (7)	total: 3.06s	remaining: 35.1s
8:	learn: 16.0521507	test: 16.0499435	best: 16.0499435 (8)	total: 3.41s	remaining: 34.5s
9:	learn: 16.0512745	test: 16.0493672	best: 16.0493672 (9)	total: 3.75s	remaining: 33.8s
10:	learn: 16.0503865	test: 16.0487033	best: 16.0487033 (10)	total: 4.1s	remaining: 33.2s
11:	learn: 16.0493722	

<catboost.core.CatBoostRegressor at 0x787177f132e0>

In [12]:
'''
parameters = {'depth'         : [8, 10],
              'learning_rate' : [0.3, 0.4],
              'iterations'    : [10, 20],
              'l2_leaf_reg'   : [10, 15, 20]}
              
randomized_search_results = cbc.randomized_search(
    parameters,
    train_pool,
    n_iter=9,
    plot=True
)

randomized_search_results
'''

"\nparameters = {'depth'         : [8, 10],\n              'learning_rate' : [0.3, 0.4],\n              'iterations'    : [10, 20],\n              'l2_leaf_reg'   : [10, 15, 20]}\n              \nrandomized_search_results = cbc.randomized_search(\n    parameters,\n    train_pool,\n    n_iter=9,\n    plot=True\n)\n\nrandomized_search_results\n"

# Evaluating model

In [13]:
print(cbc.predict_proba(x_test),
      '\n\n',
      cbr.predict(x_test))

[[0.18898328 0.81101672]
 [0.21395741 0.78604259]
 [0.18258773 0.81741227]
 ...
 [0.20170275 0.79829725]
 [0.14904938 0.85095062]
 [0.116862   0.883138  ]] 

 [15.85208247 18.28288518 16.54535997 ... 15.06035996 14.73412407
 14.57807941]


In [14]:
cbc.eval_metrics(eval_pool_clf, metrics=['F1', 'Precision', 'Recall', 'Accuracy'])

{'F1': [0.8857928446307615,
  0.8857958659063362,
  0.8857958659063362,
  0.8857958659063362,
  0.8857958659063362,
  0.8857958659063362,
  0.8857958659063362,
  0.8857958659063362,
  0.8857958659063362,
  0.8857958659063362,
  0.8857958659063362,
  0.8857958659063362,
  0.8857958659063362,
  0.8857958659063362,
  0.8857958659063362,
  0.8857958659063362,
  0.8857958659063362,
  0.8857958659063362,
  0.8857958659063362,
  0.8857958659063362,
  0.8857869580308333,
  0.8857869580308333,
  0.8857879651498594,
  0.885787758680689,
  0.8857887153603213,
  0.8857887153603213,
  0.8857887153603213,
  0.8857873205088572,
  0.8857847120596787,
  0.8857960272084247,
  0.8857970343747852,
  0.8857962336738447,
  0.8857974221506304,
  0.8857974221506304,
  0.8857957955786321,
  0.8857947884002266,
  0.8857947884002266,
  0.8857941941630674,
  0.8857941941630674,
  0.8857941941630674,
  0.8857941941630674,
  0.8857933934574566,
  0.8857941941630674,
  0.8857941941630674,
  0.8857941941630674,
  0.8

In [15]:
cbr.eval_metrics(eval_pool_reg, metrics=['MAE', 'Poisson', 'Quantile', 'RMSE', 'MedianAbsoluteError'])

{'MAE': [13.238725690241663,
  13.229006718880258,
  13.222293604841202,
  13.219025374283955,
  13.21498488267643,
  13.212962539884034,
  13.211745606751826,
  13.207929953662832,
  13.206403484108716,
  13.205214331368527,
  13.204072307733606,
  13.202957080143904,
  13.202111602116062,
  13.200161000876246,
  13.199273322003505,
  13.198491400609408,
  13.197958816035575,
  13.196812623119905,
  13.19620323341511,
  13.195734778291541,
  13.195243874241225,
  13.194556004148417,
  13.194199063321815,
  13.193732462683295,
  13.193442808472568,
  13.193113592053816,
  13.192942236205624,
  13.192793536822453,
  13.192519809116748,
  13.192408033558372,
  13.19216242817311,
  13.191566433104304,
  13.190759924144645,
  13.190282401252945,
  13.189857755193316,
  13.189515613223321,
  13.189226401432423,
  13.188640913027934,
  13.188666341351297,
  13.188230921223674,
  13.187926142323006,
  13.187775799754762,
  13.18744607664862,
  13.186999966158103,
  13.186765951478595,
  13.18

# Train model on the full data

In [16]:
#cbc.load_model('/kaggle/working/cbc')
#cbr.load_model('/kaggle/working/cbc')

In [17]:
cbc = CatBoostClassifier(iterations=100, learning_rate=0.4, depth=8, l2_leaf_reg=10, loss_function='Logloss')
cbc.fit(full_train_clf, eval_set=eval_pool_clf)

0:	learn: 0.5796386	test: 0.5797814	best: 0.5797814 (0)	total: 633ms	remaining: 1m 2s
1:	learn: 0.5348528	test: 0.5350674	best: 0.5350674 (1)	total: 1.27s	remaining: 1m 2s
2:	learn: 0.5156388	test: 0.5159322	best: 0.5159322 (2)	total: 1.82s	remaining: 59s
3:	learn: 0.5073826	test: 0.5077121	best: 0.5077121 (3)	total: 2.31s	remaining: 55.4s
4:	learn: 0.5039588	test: 0.5043051	best: 0.5043051 (4)	total: 2.82s	remaining: 53.5s
5:	learn: 0.5025325	test: 0.5028911	best: 0.5028911 (5)	total: 3.46s	remaining: 54.1s
6:	learn: 0.5018764	test: 0.5022402	best: 0.5022402 (6)	total: 4.01s	remaining: 53.3s
7:	learn: 0.5015927	test: 0.5019642	best: 0.5019642 (7)	total: 4.54s	remaining: 52.2s
8:	learn: 0.5013786	test: 0.5017397	best: 0.5017397 (8)	total: 5.08s	remaining: 51.4s
9:	learn: 0.5012358	test: 0.5016086	best: 0.5016086 (9)	total: 5.56s	remaining: 50s
10:	learn: 0.5010974	test: 0.5014602	best: 0.5014602 (10)	total: 6.11s	remaining: 49.4s
11:	learn: 0.5009454	test: 0.5013061	best: 0.5013061 (11

<catboost.core.CatBoostClassifier at 0x787177f13820>

In [18]:
cbr = CatBoostRegressor(iterations=100, learning_rate=0.4, depth=8, l2_leaf_reg=10)
cbr.fit(full_train_reg, eval_set=eval_pool_reg)

0:	learn: 16.0713491	test: 16.0683444	best: 16.0683444 (0)	total: 561ms	remaining: 55.5s
1:	learn: 16.0634790	test: 16.0601729	best: 16.0601729 (1)	total: 1.1s	remaining: 54s
2:	learn: 16.0598301	test: 16.0565823	best: 16.0565823 (2)	total: 1.56s	remaining: 50.4s
3:	learn: 16.0576570	test: 16.0543137	best: 16.0543137 (3)	total: 2.02s	remaining: 48.4s
4:	learn: 16.0555959	test: 16.0521066	best: 16.0521066 (4)	total: 2.45s	remaining: 46.6s
5:	learn: 16.0543889	test: 16.0507857	best: 16.0507857 (5)	total: 2.95s	remaining: 46.2s
6:	learn: 16.0530220	test: 16.0494813	best: 16.0494813 (6)	total: 3.41s	remaining: 45.3s
7:	learn: 16.0515835	test: 16.0480759	best: 16.0480759 (7)	total: 3.86s	remaining: 44.4s
8:	learn: 16.0503993	test: 16.0468824	best: 16.0468824 (8)	total: 4.32s	remaining: 43.7s
9:	learn: 16.0496325	test: 16.0462011	best: 16.0462011 (9)	total: 4.75s	remaining: 42.7s
10:	learn: 16.0485801	test: 16.0452590	best: 16.0452590 (10)	total: 5.22s	remaining: 42.2s
11:	learn: 16.0479071	

<catboost.core.CatBoostRegressor at 0x7871aee79810>

In [19]:
cbc.save_model('cbc')
cbr.save_model('cbr')

# Predicting results

In [20]:
test_data = pd.read_parquet('/kaggle/input/burger-king-churn/hackaton2023_test.gzip')
test_data.sample(5)

Unnamed: 0,customer_id,group_name,revenue,startdatetime,dish_name,ownareaall_sqm,format_name
2041315,18390788,test,40.0,2023-06-30 16:27:10,Соус Сырный,85.0,Фудкорт без туалета
3559548,27467922,test,109.99,2023-06-07 18:34:17,Сырные Медальоны (6 шт.),361.0,Отдельно стоящий без внешней зоны
1778799,13253784,test,139.99,2022-10-18 20:40:29,Карт. Деревен. мал,245.8,Отдельно стоящий с внешней зоной
2265763,19487003,test,119.99,2023-05-04 16:54:10,"Липтон Лимон в бутылке 0,5л",297.0,Отдельно стоящий с внешней зоной
4675537,40097498,test,14.99,2023-06-17 12:41:23,Кинг Фри мал,128.8,Фудкорт без туалета


In [21]:
print(len(test_data.customer_id.unique()))
unique = test_data.customer_id.unique()
test_data = preprocess(test_data, is_train=False)
print(len(test_data['customer_id'].unique()))
test_data.head()

112334
112334


Unnamed: 0,bill_id,customer_id,dish_name,startdatetime,has_toilet,has_external_zone,is_foodcourt,revenue,ownareaall_sqm,dish_amount
0,0,28039329,"[934, 788, 789]",1662077000.0,False,0,True,1179.96,169.73,3
1,1,33602398,"[599, 81, 611, 282, 803, 860, 561, 860, 544, 8...",1662078000.0,True,2,False,619.95,306.84,13
2,2,28039329,[934],1662078000.0,False,0,True,289.99,169.73,1
3,3,29250460,"[846, 605, 666, 666, 605, 471, 183, 471, 289, ...",1662079000.0,True,2,False,828.93,300.0,13
4,4,19197657,[916],1662080000.0,True,1,False,69.99,338.3,1


In [22]:
out = pd.DataFrame()
out['customer_id'] = test_data['customer_id']
test_x = test_data[[column for column in data.columns if column not in ['buy_post', 'date_diff_post', 'customer_id', 'dish_name']]]

out['date_diff_post'] = cbr.predict(test_x)
out['buy_post'] = cbc.predict(test_x)

print(out.dtypes)
out.head()

customer_id         int64
date_diff_post    float64
buy_post            int64
dtype: object


Unnamed: 0,customer_id,date_diff_post,buy_post
0,28039329,13.225997,0
1,33602398,17.092892,1
2,28039329,12.592316,0
3,29250460,15.483408,1
4,19197657,13.414683,1


In [23]:
#aggregate predictions for paychecks to formulate a prediction for a single user
out = out.groupby('customer_id', as_index=False).mean().round(0)
out['date_diff_post'] = out['date_diff_post'].astype(int)
out['buy_post'] = out['buy_post'].astype(int)

for n, val in enumerate(out['buy_post']):
    if val == 0:
        out['date_diff_post'][n] = None
        
print(len(out.customer_id.unique()))
print(out.dtypes)
out.head()

112334
customer_id         int64
date_diff_post    float64
buy_post            int64
dtype: object


Unnamed: 0,customer_id,date_diff_post,buy_post
0,52341,17.0,1
1,69175,16.0,1
2,73427,15.0,1
3,134577,15.0,1
4,156357,17.0,1


In [24]:
out.to_csv('/kaggle/working/solution.csv', sep=';')