In [1]:
import os

# Enter the project directory so the relative paths used below resolve.
# Guarded because a relative chdir is not idempotent: re-running this cell
# from inside app/ would otherwise look for app/app and raise FileNotFoundError.
if os.path.basename(os.getcwd()) != 'app':
    os.chdir('app/')

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LogisticRegression, LinearRegression
from lightgbm import LGBMClassifier, LGBMRegressor

In [3]:
# Directory holding the parquet inputs; relative, so it resolves against the
# current working directory.
path_to_data = 'data'

In [126]:
# Training and test purchase data, read from the parquet files under path_to_data.
df_train = pd.read_parquet(f'{path_to_data}/train_dataset_hackaton2023_train.gzip')

# Both target columns are set to 0 on the test frame so that preprocess(),
# which aggregates them, can run on it unchanged.
# NOTE(review): presumably the test file ships without targets -- confirm.
df_test = pd.read_parquet(f'{path_to_data}/hackaton2023_test.gzip').assign(
    date_diff_post=0,
    buy_post=0,
)

In [127]:
def preprocess(df, reference_date='2023-08-02'):
    """Collapse raw purchase rows into one feature row per customer.

    Rows are first summed into checks (one per ``customer_id`` /
    ``startdatetime`` pair); the checks of each customer are then summarised.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``customer_id``, ``startdatetime`` (datetime dtype),
        ``revenue``, ``buy_post`` and ``date_diff_post``. It is not modified.
    reference_date : str or datetime-like, default '2023-08-02'
        Date against which ``T`` and ``days_from_last_purchase`` are measured.
        The default reproduces the value that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per customer with flattened ``<column>_<stat>`` column names.
        The key column is named ``customer_id_`` (trailing underscore) because
        its second header level is empty. Besides the aggregates it carries
        ``recency``, ``T``, ``days_from_last_purchase`` and ``lambda``.
    """
    reference_date = pd.to_datetime(reference_date)

    # One row per check: revenue summed, targets taken from the first row.
    checks = (
        df.groupby(['customer_id', 'startdatetime'])
        .agg({'revenue': 'sum', 'buy_post': 'first', 'date_diff_post': 'first'})
        .reset_index()
    )

    # Seconds since the same customer's previous check. groupby sorts its keys,
    # so the checks are already in chronological order within each customer.
    gap_seconds = (
        checks.groupby('customer_id')['startdatetime'].diff().dt.total_seconds()
    )

    # A customer's first check has no predecessor, so its gap is missing and is
    # stored as 0. That 0 takes part in the per-customer statistics below
    # (e.g. the minimum gap is 0 for every customer).
    checks['delta_hours'] = (gap_seconds // (60 * 60)).fillna(0)
    checks['delta_days'] = (gap_seconds // (24 * 60 * 60)).fillna(0)

    summary_stats = ['mean', 'median', 'std', 'max', 'min']
    customers = checks.groupby('customer_id').agg({
        'revenue': summary_stats + ['count'],
        'delta_hours': summary_stats,
        'delta_days': summary_stats,
        'startdatetime': ['min', 'max'],
        'buy_post': 'first',
        'date_diff_post': 'first',
    }).reset_index()

    # Flatten the two-level header produced by the mixed aggregation spec.
    customers.columns = [f'{header}_{stat}' for header, stat in customers.columns]

    first_seen = customers['startdatetime_min']
    last_seen = customers['startdatetime_max']

    # Whole days between the first and the last check of the customer.
    customers['recency'] = (last_seen - first_seen).dt.days
    # Whole days from the first / last check up to the reference date.
    customers['T'] = (reference_date - first_seen).dt.days
    customers['days_from_last_purchase'] = (reference_date - last_seen).dt.days
    # Checks per day of activity; +1 avoids division by zero when recency is 0.
    customers['lambda'] = customers['revenue_count'] / (customers['recency'] + 1)

    return customers.drop(['startdatetime_min', 'startdatetime_max'], axis=1)

In [128]:
# Build the per-customer feature table for the training data.
data_train = preprocess(df_train)

In [131]:
# Preview the first rows of the training feature table.
data_train.head()

Unnamed: 0,customer_id_,revenue_mean,revenue_median,revenue_std,revenue_max,revenue_min,revenue_count,delta_hours_mean,delta_hours_median,delta_hours_std,...,delta_days_median,delta_days_std,delta_days_max,delta_days_min,buy_post_first,date_diff_post_first,recency,T,days_from_last_purchase,lambda
0,29891,203.494,199.96,123.170275,439.98,1.0,25,55.24,24.0,72.50246,...,1.0,2.998889,11.0,0.0,1,9.0,57,239,181,0.431034
1,30477,227.024,229.99,124.933425,499.95,44.99,25,49.4,23.0,60.578462,...,0.0,2.518597,9.0,0.0,1,10.0,51,301,249,0.480769
2,31426,391.399583,344.985,334.849322,1079.97,1.0,24,49.708333,34.5,46.635944,...,1.0,1.809796,6.0,0.0,1,4.0,50,81,31,0.470588
3,44491,128.725,59.98,144.471912,344.97,49.97,4,166.75,11.5,318.351352,...,0.0,13.0,26.0,0.0,1,42.0,27,52,24,0.142857
4,44939,554.943333,554.94,49.985,604.93,504.96,3,123.333333,172.0,107.598017,...,7.0,4.358899,8.0,0.0,1,9.0,15,234,219,0.1875


In [132]:
# Build the per-customer feature table for the test data with the same function.
data_test = preprocess(df_test)

In [133]:
# Preview the first rows of the test feature table.
data_test.head()

Unnamed: 0,customer_id_,revenue_mean,revenue_median,revenue_std,revenue_max,revenue_min,revenue_count,delta_hours_mean,delta_hours_median,delta_hours_std,...,delta_days_median,delta_days_std,delta_days_max,delta_days_min,buy_post_first,date_diff_post_first,recency,T,days_from_last_purchase,lambda
0,52341,563.9425,549.95,65.70033,655.93,499.94,4,192.0,168.0,178.529176,...,6.5,7.5,18.0,0.0,0,0,32,178,146,0.121212
1,69175,430.306667,399.98,62.142941,554.96,374.98,9,92.777778,24.0,131.890843,...,1.0,5.456902,13.0,0.0,0,0,34,40,5,0.257143
2,73427,254.240323,169.98,218.360745,869.95,1.0,31,45.096774,45.0,43.356164,...,1.0,1.786448,8.0,0.0,0,0,58,212,153,0.525424
3,134577,372.97,290.98,310.195853,799.93,109.99,4,35.0,0.0,70.0,...,0.0,2.5,5.0,0.0,0,0,5,316,310,0.666667
4,156357,558.963333,644.96,245.551523,749.95,281.98,3,344.0,138.0,481.285778,...,5.0,20.07486,37.0,0.0,0,0,43,227,184,0.068182


In [103]:
# data_train, data_valid = train_test_split(data_cust_agg, test_size=.2, random_state=42)

In [135]:
# Columns that are targets or identifiers and therefore not model inputs.
non_feature_cols = ['date_diff_post_first', 'buy_post_first', 'customer_id_']

X_train = data_train.drop(columns=non_feature_cols)
y_train_class = data_train['buy_post_first']
y_train_reg = data_train['date_diff_post_first']

# Hold-out split, currently disabled (data_valid is not created anywhere):
# X_valid = data_valid.drop(['date_diff_post_first', 'buy_post_first', 'customer_id_', 'startdatetime_min', 'startdatetime_max'], axis=1)
# y_valid_class = data_valid['buy_post_first']
# y_valid_reg = data_valid['date_diff_post_first']

# The test features must be the same columns as X_train.
X_test = data_test.drop(columns=non_feature_cols)

In [136]:
# Classifier for the buy_post target, scored by cross-validated ROC-AUC
# (cross_val_score's default splitting is used).
model = LGBMClassifier(verbose=-1)
# model = LogisticRegression()

res = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train_class,
    scoring='roc_auc',
)
auc_mean, auc_std = res.mean(), res.std()
print(f'ROC-AUC: {auc_mean:.3f} ± {auc_std:.3f}')

ROC-AUC: 0.749 ± 0.001


In [137]:
# Same estimator as the ROC-AUC cell (`model` comes from there), scored by F1.
res = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train_class,
    scoring='f1',
)
f1_mean, f1_std = res.mean(), res.std()
print(f'f1-score: {f1_mean:.3f} ± {f1_std:.3f}')

f1-score: 0.841 ± 0.001


In [138]:
# Regressor for the date_diff_post target, scored by cross-validated RMSE.
model = LGBMRegressor(verbose=-1)
# model = LinearRegression()

# Missing targets are trained as 0.
# NOTE(review): presumably these are customers without a later purchase -- confirm
# that 0 is the intended label for them.
reg_target = y_train_reg.fillna(0)

res = cross_val_score(
    estimator=model,
    X=X_train,
    y=reg_target,
    scoring='neg_root_mean_squared_error',
)
# The scorer returns negated RMSE, hence the sign flip on the mean.
rmse_mean, rmse_std = -res.mean(), res.std()
print(f'RMSE: {rmse_mean:.3f} ± {rmse_std:.3f}')

RMSE: 16.686 ± 0.207


In [141]:
# Fit the final classifier on the full training set (no hold-out part is kept).
model_class = LGBMClassifier(verbose=-1)
model_class.fit(X_train, y_train_class)

In [144]:
# Fit the final regressor on the full training set; missing date_diff_post
# targets are replaced by 0 before fitting.
model_reg = LGBMRegressor(verbose=-1)
model_reg.fit(X_train, y_train_reg.fillna(0))

In [149]:
# Score the test customers: predict() on the classifier yields class labels
# (not probabilities); the regressor yields date_diff_post estimates.
pred_class = model_class.predict(X_test)
pred_reg = model_reg.predict(X_test)

In [152]:
# Submission table: one row per test customer, in the row order of data_test
# (the same order X_test was built from, so the predictions line up).
sub = pd.DataFrame({
    'customer_id': data_test['customer_id_'],
    'date_diff_post': pred_reg,
    'buy_post': pred_class,
})

In [154]:
# Write the submission as a semicolon-separated CSV without the index column;
# the relative path resolves against the current working directory.
sub.to_csv('first_sub.csv', sep=';', index=False)