In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/burger-king-churn/train_dataset_hackaton2023_train.gzip
/kaggle/input/burger-king-churn/hackaton2023_test.gzip


In [2]:
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# Data preprocessing & feature generation

In [3]:
data = pd.read_parquet('/kaggle/input/burger-king-churn/train_dataset_hackaton2023_train.gzip')
display(data.sample(5))

Unnamed: 0,customer_id,date_diff_post,buy_post,group_name,revenue,startdatetime,dish_name,ownareaall_sqm,format_name
15099691,24135393,,0,train,38.32,2022-12-13 11:13:10,Up Соус Гриль Особый,125.0,Фудкорт без туалета
7735536,12169465,31.0,1,train,219.99,2023-04-14 18:23:35,Двойной Чизбургер,93.2,Фудкорт без туалета
213046,315415,13.0,1,train,249.99,2023-04-26 17:51:39,Чеддер Начос Чикен Ролл,78.6,Фудкорт без туалета
12835713,21953338,5.0,1,train,84.01,2023-07-18 17:00:01,Карт. Деревен. джун,117.55,Фудкорт с туалетом
5268814,8869794,,0,train,150.0,2023-01-26 11:25:42,Чикен Тар-Тар,349.5,Отдельно стоящий с внешней зоной


In [4]:
has_toilet_statuses = [
    "Отдельно стоящий с внешней зоной",
    "Отдельно стоящий без внешней зоны",
    "Отдельно стоящий",
    "Фудкорт с туалетом",
    "Отдельно стоящий с внешн.зоной без туалета",
]

hasnt_toilet_statuses = [
    "Фудкорт без туалета",
    "Отдельно стоящий без внешней зоны без туалета",
    "Отдельно стоящий без туалета",
]

with_external_zone = [
    "Отдельно стоящий с внешней зоной",
    "Отдельно стоящий с внешн.зоной без туалета",
]

without_external_zone = [
    "Отдельно стоящий без внешней зоны",
    "Отдельно стоящий без внешней зоны без туалета",
]

is_foodcourt = [
    "Фудкорт с туалетом",
    "Фудкорт без туалета",
]

In [5]:
def preprocess(data, is_train=True):
    
    if is_train:
        data = data[data.format_name != 'Не заполнено']
        #slight date_diff_post aligning
        data.date_diff_post.fillna(-1, inplace=True)
        data.date_diff_post = data.date_diff_post.astype(int)
    
    # make has_toilet column and store there true or false based on has_toilet_statuses
    data['has_toilet'] = data.format_name.isin(has_toilet_statuses)

    # if has external zone then store 2, else 1, if not in with/without_external_zone then store 0
    data['has_external_zone'] = data.format_name.apply(
        lambda x: 2 if x in with_external_zone else 1 if x in without_external_zone else 0
    )

    # 2 - has external zone
    # 1 - hasnt external zone
    # 0 - not defined

    data['is_foodcourt'] = data.format_name.isin(is_foodcourt)

    data.drop('format_name', axis=1, inplace=True)
    
    if is_train:
        # filter outliers in dish_name that have less than 10000 rows
        dish_name_counts = data.dish_name.value_counts()
        dish_name_counts = dish_name_counts[dish_name_counts > 50000]
        data = data[data.dish_name.isin(dish_name_counts.index)]
    
    # create bill_id based on startdatetime and customer_id
    data['bill_id'] = data.groupby(['startdatetime', 'customer_id']).ngroup()
    
    # drop group_name
    data.drop('group_name', axis=1, inplace=True)
    
    # label encode dish_name
    le = LabelEncoder()
    data['dish_name'] = le.fit_transform(data.dish_name)
    
    if is_train:
        data = data.groupby('bill_id').agg({
            'bill_id': 'first',
            'customer_id': 'first',
            'dish_name': list,
            'startdatetime': 'first',
            'has_toilet': 'first',
            'has_external_zone': 'first',
            'is_foodcourt': 'first',
            'date_diff_post': 'first',
            'buy_post': 'first',
            'revenue': 'sum',
            'ownareaall_sqm': 'first',
        }).reset_index(drop=True)
    else:
        data = data.groupby('bill_id').agg({
            'bill_id': 'first',
            'customer_id': 'first',
            'dish_name': list,
            'startdatetime': 'first',
            'has_toilet': 'first',
            'has_external_zone': 'first',
            'is_foodcourt': 'first',
            'revenue': 'sum',
            'ownareaall_sqm': 'first',
        }).reset_index(drop=True)
    
    #datetime to timestamp
    data['startdatetime'] = [datetime.timestamp(n) for n in data['startdatetime']]
    
    data['dish_amount'] = [len(n) for n in data['dish_name']]
    
    return data

In [6]:
data = preprocess(data)
data.sample(5)

Unnamed: 0,bill_id,customer_id,dish_name,startdatetime,has_toilet,has_external_zone,is_foodcourt,date_diff_post,buy_post,revenue,ownareaall_sqm,dish_amount
1993704,1993704,12878437,"[26, 42]",1684862000.0,True,0,True,16,1,149.98,125.1,2
1099791,1099791,28756773,"[50, 9]",1675444000.0,False,0,True,56,1,269.99,127.8,2
866643,866643,9563738,[12],1672865000.0,True,2,False,6,1,1.0,296.5,1
2255096,2255096,828203,[6],1687460000.0,False,0,True,20,1,10.0,101.7,1
421090,421090,21087884,"[18, 50, 9, 52, 42]",1667934000.0,True,0,True,-1,0,445.97,114.4,5


In [7]:
cat_feat = ['has_external_zone', 'has_toilet', 'is_foodcourt']

# Train test split

In [8]:
x = data[[column for column in data.columns if column not in ['buy_post', 'date_diff_post', 'customer_id', 'dish_name']]]
y = data[['buy_post', 'date_diff_post']]
y_clf = y['buy_post']
y_reg = y['date_diff_post']

x_train, x_test, y_train, y_test = train_test_split(x, y)
y_train_clf, y_test_clf = y_train['buy_post'], y_test['buy_post']
y_train_reg, y_test_reg = y_train['date_diff_post'], y_test['date_diff_post'] 

In [9]:
train_pool_clf = Pool(x_train, y_train_clf, cat_features=cat_feat)
eval_pool_clf = Pool(x_test, y_test_clf, cat_features=cat_feat)
full_train_clf = Pool(x, y_clf, cat_features=cat_feat)

train_pool_reg = Pool(x_train, y_train_reg, cat_features=cat_feat)
eval_pool_reg = Pool(x_test, y_test_reg, cat_features=cat_feat)
full_train_reg = Pool(x, y_reg, cat_features=cat_feat)

# Model training

In [10]:
cbc = CatBoostClassifier(iterations=100, learning_rate=0.4, depth=8, l2_leaf_reg=10, loss_function='Logloss')
cbc.fit(train_pool_clf, eval_set=eval_pool_clf)

0:	learn: 0.5796722	test: 0.5799911	best: 0.5799911 (0)	total: 531ms	remaining: 52.6s
1:	learn: 0.5347545	test: 0.5352576	best: 0.5352576 (1)	total: 908ms	remaining: 44.5s
2:	learn: 0.5157446	test: 0.5164024	best: 0.5164024 (2)	total: 1.34s	remaining: 43.2s
3:	learn: 0.5075106	test: 0.5082680	best: 0.5082680 (3)	total: 1.72s	remaining: 41.4s
4:	learn: 0.5039922	test: 0.5048139	best: 0.5048139 (4)	total: 2.14s	remaining: 40.7s
5:	learn: 0.5024962	test: 0.5033795	best: 0.5033795 (5)	total: 2.52s	remaining: 39.5s
6:	learn: 0.5017684	test: 0.5026971	best: 0.5026971 (6)	total: 2.88s	remaining: 38.2s
7:	learn: 0.5014475	test: 0.5024268	best: 0.5024268 (7)	total: 3.27s	remaining: 37.6s
8:	learn: 0.5012051	test: 0.5022059	best: 0.5022059 (8)	total: 3.65s	remaining: 36.9s
9:	learn: 0.5010146	test: 0.5020182	best: 0.5020182 (9)	total: 4.01s	remaining: 36.1s
10:	learn: 0.5008748	test: 0.5018970	best: 0.5018970 (10)	total: 4.38s	remaining: 35.5s
11:	learn: 0.5007811	test: 0.5018169	best: 0.5018169

<catboost.core.CatBoostClassifier at 0x7fb489bdb250>

In [11]:
cbr = CatBoostRegressor(iterations=100, learning_rate=0.4, depth=8, l2_leaf_reg=10)
cbr.fit(train_pool_reg, eval_set=eval_pool_reg)

0:	learn: 16.0738994	test: 16.0618466	best: 16.0618466 (0)	total: 421ms	remaining: 41.6s
1:	learn: 16.0669538	test: 16.0556686	best: 16.0556686 (1)	total: 755ms	remaining: 37s
2:	learn: 16.0635625	test: 16.0528762	best: 16.0528762 (2)	total: 1.17s	remaining: 37.7s
3:	learn: 16.0609090	test: 16.0505951	best: 16.0505951 (3)	total: 1.54s	remaining: 36.9s
4:	learn: 16.0586830	test: 16.0486387	best: 16.0486387 (4)	total: 1.91s	remaining: 36.3s
5:	learn: 16.0574273	test: 16.0475964	best: 16.0475964 (5)	total: 2.25s	remaining: 35.3s
6:	learn: 16.0564887	test: 16.0468653	best: 16.0468653 (6)	total: 2.57s	remaining: 34.1s
7:	learn: 16.0556790	test: 16.0461890	best: 16.0461890 (7)	total: 2.91s	remaining: 33.5s
8:	learn: 16.0544561	test: 16.0449493	best: 16.0449493 (8)	total: 3.25s	remaining: 32.8s
9:	learn: 16.0536992	test: 16.0445551	best: 16.0445551 (9)	total: 3.55s	remaining: 32s
10:	learn: 16.0525284	test: 16.0438177	best: 16.0438177 (10)	total: 3.9s	remaining: 31.5s
11:	learn: 16.0515811	te

<catboost.core.CatBoostRegressor at 0x7fb489bdb280>

In [12]:
'''
parameters = {'depth'         : [8, 10],
              'learning_rate' : [0.3, 0.4],
              'iterations'    : [10, 20],
              'l2_leaf_reg'   : [10, 15, 20]}
              
randomized_search_results = cbc.randomized_search(
    parameters,
    train_pool,
    n_iter=9,
    plot=True
)

randomized_search_results
'''

"\nparameters = {'depth'         : [8, 10],\n              'learning_rate' : [0.3, 0.4],\n              'iterations'    : [10, 20],\n              'l2_leaf_reg'   : [10, 15, 20]}\n              \nrandomized_search_results = cbc.randomized_search(\n    parameters,\n    train_pool,\n    n_iter=9,\n    plot=True\n)\n\nrandomized_search_results\n"

# Evaluating model

In [13]:
print(cbc.predict_proba(x_test),
      '\n\n',
      cbr.predict(x_test))

[[0.15457461 0.84542539]
 [0.19034098 0.80965902]
 [0.23161003 0.76838997]
 ...
 [0.13889594 0.86110406]
 [0.21149423 0.78850577]
 [0.20044068 0.79955932]] 

 [15.59972916 14.9913393  16.16155363 ... 13.84124903 16.70475869
 14.32605832]


In [14]:
cbc.eval_metrics(eval_pool_clf, metrics=['F1', 'Precision', 'Recall', 'Accuracy'])

{'F1': [0.8856536415064661,
  0.8856548555849147,
  0.8856558629280561,
  0.8856558629280561,
  0.8856558629280561,
  0.8856558629280561,
  0.8856558629280561,
  0.8856558629280561,
  0.8856558629280561,
  0.8856558629280561,
  0.8856558629280561,
  0.8856558629280561,
  0.8856582647739732,
  0.8856582647739732,
  0.8856550360056915,
  0.8856550360056915,
  0.8856550360056915,
  0.8856574115758242,
  0.8856574115758242,
  0.8856564042158703,
  0.8856572048454167,
  0.8856561974834546,
  0.8856561974834546,
  0.8856561974834546,
  0.885655964447273,
  0.8856549570732616,
  0.8856531227343158,
  0.8856529159916685,
  0.8856541301181816,
  0.8856543368580729,
  0.8856490931806388,
  0.8856480857876662,
  0.8856456838694746,
  0.8856458906251977,
  0.8856499201758843,
  0.885652141683225,
  0.8856479054041837,
  0.8856479054041837,
  0.8856479054041837,
  0.8856487060412466,
  0.8856481121551426,
  0.885643255557464,
  0.885643255557464,
  0.8856446500796444,
  0.8856426616813102,
  0.8856

In [15]:
cbr.eval_metrics(eval_pool_reg, metrics=['MAE', 'Poisson', 'Quantile', 'RMSE', 'MedianAbsoluteError'])

{'MAE': [13.229455402053036,
  13.220731794991819,
  13.215371877309014,
  13.210747652863066,
  13.207464631014817,
  13.205256627636121,
  13.20365206261914,
  13.202227560253684,
  13.200052296972386,
  13.199220848523757,
  13.197952685323575,
  13.197086111263273,
  13.19523640357754,
  13.19403881168003,
  13.19215992059149,
  13.191558905696345,
  13.190931572020581,
  13.190012731487604,
  13.1899407331038,
  13.189653807352277,
  13.188642249511728,
  13.188069358241512,
  13.187735489069418,
  13.187181894098913,
  13.186561909152879,
  13.186283436373488,
  13.186236015679242,
  13.186165804199693,
  13.185726500259438,
  13.18542926738095,
  13.185190791168525,
  13.184698132827583,
  13.183956242747138,
  13.18378157970583,
  13.183080867075407,
  13.182702293580672,
  13.182292878583368,
  13.18178801742389,
  13.18151530908573,
  13.18097545433107,
  13.180767381178459,
  13.180231385963618,
  13.179998312581738,
  13.179667418681458,
  13.1791064672227,
  13.17850792311

# Train model on the full data

In [16]:
#cbc.load_model('/kaggle/working/cbc')
#cbr.load_model('/kaggle/working/cbc')

In [17]:
cbc = CatBoostClassifier(iterations=100, learning_rate=0.4, depth=8, l2_leaf_reg=10, loss_function='Logloss')
cbc.fit(full_train_clf, eval_set=eval_pool_clf)

0:	learn: 0.5796386	test: 0.5798569	best: 0.5798569 (0)	total: 589ms	remaining: 58.3s
1:	learn: 0.5348528	test: 0.5352209	best: 0.5352209 (1)	total: 1.17s	remaining: 57.5s
2:	learn: 0.5156388	test: 0.5160914	best: 0.5160914 (2)	total: 1.69s	remaining: 54.7s
3:	learn: 0.5073826	test: 0.5079072	best: 0.5079072 (3)	total: 2.16s	remaining: 51.9s
4:	learn: 0.5039588	test: 0.5045299	best: 0.5045299 (4)	total: 2.65s	remaining: 50.4s
5:	learn: 0.5025325	test: 0.5031425	best: 0.5031425 (5)	total: 3.24s	remaining: 50.8s
6:	learn: 0.5018764	test: 0.5025042	best: 0.5025042 (6)	total: 3.75s	remaining: 49.8s
7:	learn: 0.5015927	test: 0.5022293	best: 0.5022293 (7)	total: 4.23s	remaining: 48.7s
8:	learn: 0.5013786	test: 0.5020204	best: 0.5020204 (8)	total: 4.77s	remaining: 48.2s
9:	learn: 0.5012358	test: 0.5018769	best: 0.5018769 (9)	total: 5.22s	remaining: 47s
10:	learn: 0.5010974	test: 0.5017442	best: 0.5017442 (10)	total: 5.72s	remaining: 46.3s
11:	learn: 0.5009454	test: 0.5015854	best: 0.5015854 (

<catboost.core.CatBoostClassifier at 0x7fb489bdb6a0>

In [18]:
cbr = CatBoostRegressor(iterations=100, learning_rate=0.4, depth=8, l2_leaf_reg=10)
cbr.fit(full_train_reg, eval_set=eval_pool_reg)

0:	learn: 16.0713491	test: 16.0620598	best: 16.0620598 (0)	total: 568ms	remaining: 56.3s
1:	learn: 16.0634790	test: 16.0546159	best: 16.0546159 (1)	total: 1.1s	remaining: 53.8s
2:	learn: 16.0598301	test: 16.0512285	best: 16.0512285 (2)	total: 1.57s	remaining: 50.9s
3:	learn: 16.0576570	test: 16.0491488	best: 16.0491488 (3)	total: 2s	remaining: 48s
4:	learn: 16.0555959	test: 16.0471167	best: 16.0471167 (4)	total: 2.44s	remaining: 46.4s
5:	learn: 16.0543889	test: 16.0458596	best: 16.0458596 (5)	total: 2.92s	remaining: 45.8s
6:	learn: 16.0530220	test: 16.0445896	best: 16.0445896 (6)	total: 3.44s	remaining: 45.7s
7:	learn: 16.0515835	test: 16.0432327	best: 16.0432327 (7)	total: 3.88s	remaining: 44.6s
8:	learn: 16.0503993	test: 16.0419631	best: 16.0419631 (8)	total: 4.33s	remaining: 43.8s
9:	learn: 16.0496325	test: 16.0411659	best: 16.0411659 (9)	total: 4.74s	remaining: 42.6s
10:	learn: 16.0485801	test: 16.0399025	best: 16.0399025 (10)	total: 5.21s	remaining: 42.1s
11:	learn: 16.0479071	tes

<catboost.core.CatBoostRegressor at 0x7fb489bd9f30>

In [19]:
cbc.save_model('cbc')
cbr.save_model('cbr')

# Predicting results

In [20]:
test_data = pd.read_parquet('/kaggle/input/burger-king-churn/hackaton2023_test.gzip')
test_data.sample(5)

Unnamed: 0,customer_id,group_name,revenue,startdatetime,dish_name,ownareaall_sqm,format_name
3701792,28349858,test,55.0,2022-09-03 12:37:44,Кинг Фри мал,128.5,Фудкорт без туалета
2754106,22851876,test,119.99,2023-01-05 20:48:18,Воппер Джуниор,113.0,Фудкорт без туалета
811397,7445632,test,14.98,2022-11-03 18:15:52,Up Сырный соус Пармеджано,384.0,Отдельно стоящий
3125362,24691076,test,359.98,2023-01-14 15:45:49,Наггетсы (9 шт.),86.8,Фудкорт без туалета
35678,294595,test,50.0,2022-12-23 18:37:37,Кинг Фри мал,283.8,Отдельно стоящий без внешней зоны без туалета


In [21]:
print(len(test_data.customer_id.unique()))
unique = test_data.customer_id.unique()
test_data = preprocess(test_data, is_train=False)
print(len(test_data['customer_id'].unique()))
test_data.head()

112334
112334


Unnamed: 0,bill_id,customer_id,dish_name,startdatetime,has_toilet,has_external_zone,is_foodcourt,revenue,ownareaall_sqm,dish_amount
0,0,28039329,"[934, 788, 789]",1662077000.0,False,0,True,1179.96,169.73,3
1,1,33602398,"[599, 81, 611, 282, 803, 860, 561, 860, 544, 8...",1662078000.0,True,2,False,619.95,306.84,13
2,2,28039329,[934],1662078000.0,False,0,True,289.99,169.73,1
3,3,29250460,"[846, 605, 666, 666, 605, 471, 183, 471, 289, ...",1662079000.0,True,2,False,828.93,300.0,13
4,4,19197657,[916],1662080000.0,True,1,False,69.99,338.3,1


In [22]:
out = pd.DataFrame()
out['customer_id'] = test_data['customer_id']
test_x = test_data[[column for column in data.columns if column not in ['buy_post', 'date_diff_post', 'customer_id', 'dish_name']]]

out['date_diff_post'] = cbr.predict(test_x)
out['buy_post'] = cbc.predict(test_x)
out.head()

Unnamed: 0,customer_id,date_diff_post,buy_post
0,28039329,13.225997,0
1,33602398,17.092892,1
2,28039329,12.592316,0
3,29250460,15.483408,1
4,19197657,13.414683,1


# Aggregate predictions for paychecks 
## to formulate a prediction for a single user

In [23]:
out = out.groupby('customer_id', as_index=False).mean()
for n, val in enumerate(out['buy_post']):
    if val == 0:
        out['date_diff_post'][n] = None
        
print(len(out.customer_id.unique()))
out.head()

112334


Unnamed: 0,customer_id,date_diff_post,buy_post
0,52341,16.706817,1.0
1,69175,16.076833,1.0
2,73427,15.333283,1.0
3,134577,15.251462,1.0
4,156357,16.972239,1.0


In [24]:
out.to_csv('/kaggle/working/solution.csv')