In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LogisticRegression, LinearRegression
from lightgbm import LGBMClassifier, LGBMRegressor

In [2]:
# Raw train / test tables, stored as parquet files.
df_train = pd.read_parquet('train_dataset_hackaton2023_train.gzip')
df_test = pd.read_parquet('hackaton2023_test.gzip')

# mapping.csv is read with its first column as the index; the 'dish_category'
# column is turned into a plain lookup dict once and reused for both tables.
mapping = pd.read_csv('mapping.csv', index_col=0)
dish_to_category = mapping['dish_category'].to_dict()
df_train['dish_cat'] = df_train['dish_name'].map(dish_to_category)
df_test['dish_cat'] = df_test['dish_name'].map(dish_to_category)

# Placeholder target columns so the test table can go through the same
# preprocessing function as the training table.
df_test['date_diff_post'] = 0
df_test['buy_post'] = 0

In [3]:
def _first_mode(values):
    """Return the first entry of ``values.mode()`` as a scalar, or NaN if it is empty.

    ``Series.mode()`` yields no rows when every value is null; indexing into
    that empty result would raise, so NaN is returned instead.
    """
    modes = values.mode()
    return modes.iloc[0] if len(modes) else np.nan


def preprocess(df, reference_date='2023-08-02'):
    """Aggregate row-level purchase data into one feature row per customer.

    The input is first collapsed to one row per (customer_id, startdatetime)
    pair ("check"), then to one row per customer.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``customer_id``, ``startdatetime``,
        ``revenue``, ``buy_post``, ``date_diff_post``, ``ownareaall_sqm``,
        ``format_name`` and ``dish_cat``. The frame is not modified.
    reference_date : str or datetime-like, default '2023-08-02'
        Date from which ``T`` and ``days_from_last_purchase`` are measured.

    Returns
    -------
    pandas.DataFrame
        One row per customer with flattened ``<column>_<stat>`` names
        (the id column is therefore called ``customer_id_``), followed by
        ``recency``, ``T``, ``days_from_last_purchase``, ``lambda`` and
        ``dish_cat_mode``. Column order is kept stable because downstream
        code may address columns by position.
    """
    reference_ts = pd.to_datetime(reference_date)

    # Work on a narrow copy so the caller's frame does not silently gain a
    # 'discounts' column.
    check_columns = ['customer_id', 'startdatetime', 'revenue', 'buy_post',
                     'date_diff_post', 'ownareaall_sqm', 'format_name']
    line_items = df[check_columns].assign(
        # A row with revenue exactly 1 is flagged as a discount line.
        discounts=lambda frame: frame['revenue'] == 1
    )

    # One row per check.
    data_check_agg = line_items.groupby(['customer_id', 'startdatetime']).agg({
        'revenue': 'sum',
        'buy_post': 'first',
        'date_diff_post': 'first',
        'ownareaall_sqm': 'first',
        'format_name': 'first',
        'discounts': 'sum'
    }).reset_index()

    # Time since the same customer's previous check; the first check of each
    # customer has no predecessor and is filled with 0.
    delta_seconds = (
        data_check_agg.groupby('customer_id')['startdatetime'].diff().dt.total_seconds()
    )
    data_check_agg['delta_hours'] = (delta_seconds // (60*60)).fillna(0)
    data_check_agg['delta_days'] = (delta_seconds // (24*60*60)).fillna(0)

    # One row per customer. The key order of this dict defines the output
    # column order.
    data_cust_agg = data_check_agg.groupby('customer_id').agg({
        'revenue': ['mean', 'median', 'std', 'max', 'min', 'count'],
        'delta_hours': ['mean', 'median', 'std', 'max', 'min'],
        'delta_days': ['mean', 'median', 'std', 'max', 'min'],
        'startdatetime': ['min', 'max'],
        'ownareaall_sqm': ['mean', 'median', 'std', 'max', 'min'],
        'discounts': ['mean', 'median', 'std', 'max', 'min'],
        'buy_post': 'first',
        'date_diff_post': 'first',
        'format_name': [_first_mode, 'last']
    }).reset_index()

    # Flatten the two-level header; the helper's function name is mapped back
    # to the public 'format_name_mode' column name.
    data_cust_agg.columns = [f'{header}_{stat}' for header, stat in data_cust_agg.columns]
    data_cust_agg = data_cust_agg.rename(
        columns={f'format_name_{_first_mode.__name__}': 'format_name_mode'}
    )

    first_seen = data_cust_agg['startdatetime_min']
    last_seen = data_cust_agg['startdatetime_max']
    data_cust_agg['recency'] = (last_seen - first_seen).dt.days
    data_cust_agg['T'] = (reference_ts - first_seen).dt.days
    data_cust_agg['days_from_last_purchase'] = (reference_ts - last_seen).dt.days
    # Number of checks per day of the customer's active span (+1 avoids division by zero).
    data_cust_agg['lambda'] = data_cust_agg['revenue_count'] / (data_cust_agg['recency'] + 1)
    data_cust_agg['format_name_mode'] = data_cust_agg['format_name_mode'].astype('category')

    # Most frequent dish category per customer, joined on customer_id rather
    # than on row position so a mismatch in customer sets cannot shift values.
    dish_cat_by_customer = df.groupby('customer_id')['dish_cat'].agg(_first_mode)
    data_cust_agg['dish_cat_mode'] = (
        data_cust_agg['customer_id_'].map(dish_cat_by_customer).astype('category')
    )

    data_cust_agg['format_name_last'] = data_cust_agg['format_name_last'].astype('category')
    data_cust_agg = data_cust_agg.drop(['startdatetime_min', 'startdatetime_max'], axis=1)

    return data_cust_agg

In [4]:
# Build the per-customer feature table for the training data.
data_train = preprocess(df_train)

In [5]:
# Build the per-customer feature table for the test data (its target columns
# were filled with 0 placeholders when the data was loaded).
data_test = preprocess(df_test)

In [16]:
# Preview the first rows of the test feature table.
data_test.head()

Unnamed: 0,customer_id_,revenue_mean,revenue_median,revenue_std,revenue_max,revenue_min,revenue_count,delta_hours_mean,delta_hours_median,delta_hours_std,...,ownareaall_sqm_median,ownareaall_sqm_std,ownareaall_sqm_max,ownareaall_sqm_min,buy_post_first,date_diff_post_first,recency,T,days_from_last_purchase,lambda
0,52341,563.9425,549.95,65.70033,655.93,499.94,4,192.0,168.0,178.529176,...,338.9,0.0,338.9,338.9,0,0,32,178,146,0.121212
1,69175,430.306667,399.98,62.142941,554.96,374.98,9,92.777778,24.0,131.890843,...,263.0,15.74222,263.0,227.3,0,0,34,40,5,0.257143
2,73427,254.240323,169.98,218.360745,869.95,1.0,31,45.096774,45.0,43.356164,...,144.6,85.578977,435.43,100.0,0,0,58,212,153,0.525424
3,134577,372.97,290.98,310.195853,799.93,109.99,4,35.0,0.0,70.0,...,207.6,0.0,207.6,207.6,0,0,5,316,310,0.666667
4,156357,558.963333,644.96,245.551523,749.95,281.98,3,344.0,138.0,481.285778,...,124.0,7.586383,137.14,124.0,0,0,43,227,184,0.068182


In [6]:
# Target and identifier columns that must not be passed to the model as features.
non_feature_cols = ['date_diff_post_first', 'buy_post_first', 'customer_id_']

# Training features plus the two targets (binary class and day difference).
X_train = data_train.drop(columns=non_feature_cols)
y_train_class = data_train['buy_post_first']
y_train_reg = data_train['date_diff_post_first']

# Disabled hold-out validation split, kept for reference:
# X_valid = data_valid.drop(['date_diff_post_first', 'buy_post_first', 'customer_id_', 'startdatetime_min', 'startdatetime_max'], axis=1)
# y_valid_class = data_valid['buy_post_first']
# y_valid_reg = data_valid['date_diff_post_first']

# Test features use exactly the same column selection as the training features.
X_test = data_test.drop(columns=non_feature_cols)

In [35]:
# from imblearn.over_sampling import SMOTENC
# sm = SMOTENC(random_state=2, sampling_strategy={0: 1436380, 1: 359095}, categorical_features='auto')
# X_res, y_res = sm.fit_resample(X_train, y_train_class)
# sum(y_res)/len(y_res)

In [14]:
from catboost import CatBoostClassifier

# Categorical feature columns, referenced by name instead of by position so
# that adding, removing or reordering feature columns cannot silently point
# the model at the wrong columns. In the current X_train layout these are the
# columns at positions 26, 27 and 32.
CAT_FEATURES = ['format_name_mode', 'format_name_last', 'dish_cat_mode']

# Gradient-boosted classifier for the buy_post target.
model = CatBoostClassifier(iterations=5000,
                        depth=6,
                        l2_leaf_reg = 3,
                        leaf_estimation_iterations = 10,
                        task_type="GPU",  # requires a CUDA-capable device
                        verbose=False,
                        cat_features=CAT_FEATURES,
                        class_weights = [0.7, 0.3]  # per-class weights, in class order
                        )

In [166]:
# Alternative estimators that were tried; uncomment one to override `model`.
# model = LGBMClassifier(verbose=-1, n_estimators = 500)
# model = LogisticRegression()

# Cross-validated ROC-AUC of the current `model` on the training features,
# using cross_val_score's default splitting; reported as mean ± std over folds.
res = cross_val_score(model, X_train, y_train_class, scoring='roc_auc')
print(f'ROC-AUC: {res.mean():.3f} ± {res.std():.3f}')

ROC-AUC: 0.750 ± 0.001


In [15]:
# Fit the classifier on the full training set for the final predictions.
model.fit(X_train, y_train_class)

<catboost.core.CatBoostClassifier at 0x7fc9e4ddddf0>

In [93]:
# Cross-validated F1 score of the current `model` (overwrites the previous `res`);
# reported as mean ± std over folds.
res = cross_val_score(model, X_train, y_train_class, scoring='f1')
print(f'f1-score: {res.mean():.3f} ± {res.std():.3f}')

f1-score: 0.841 ± 0.001


In [138]:
# model = LGBMRegressor(verbose=-1)
# # model = LinearRegression()

# res = cross_val_score(model, X_train, y_train_reg.fillna(0), scoring='neg_root_mean_squared_error')
# print(f'RMSE: {-res.mean():.3f} ± {res.std():.3f}')

RMSE: 16.686 ± 0.207


In [141]:
# model_class = LGBMClassifier(verbose=-1)
# model_class.fit(X_train, y_train_class)

In [144]:
# model_reg = LGBMRegressor(verbose=-1)
# model_reg.fit(X_train, y_train_reg.fillna(0))

In [16]:
# Predicted class labels for the test customers, from the fitted classifier.
pred_class = model.predict(X_test)

In [18]:
# Mean predicted label, i.e. the share of test customers predicted positive
# (assuming the labels are 0/1 — TODO confirm).
np.mean(pred_class)

0.495905068812648

In [None]:
# pred_class = model_class.predict(X_test)
# pred_reg = model_reg.predict(X_test)

In [19]:
# Submission table: one row per test customer. date_diff_post is written as a
# constant 0; buy_post carries the classifier's predicted label.
sub = pd.DataFrame({
    'customer_id': data_test['customer_id_'],
    'date_diff_post': 0,
    'buy_post': pred_class,
})

In [20]:
# Write the submission as a semicolon-separated CSV without the index column.
sub.to_csv('catboost_cw_07_sub.csv', sep=';', index=False)