In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from xgboost.sklearn import xgboost_model_doc

In [2]:
# Load the purchase log and the sample submission (its `id` column is used
# later to select the user/category pairs to predict).
# Forward slashes work on Windows, Linux and macOS; the previous '\\'
# separator only resolved on Windows.
train_data = pd.read_csv('data/train.csv')
sub = pd.read_csv('data/sample_submission.csv')

#### 1. Training dataset creation and feature engineering

```order_number``` - order counter for each user<br>
```user_id``` - user ID<br>
```category``` - category<br>
```ordered```  - purchase counter for each category for each user<br>
```orders_total``` - total purchase counter for each user<br>
```rating``` - share of the user's previous orders that contain the category (```ordered / orders_total```)<br>
```total_ordered``` - purchase counter for the category summed over all users (only rows present in the submission file)<br>
```id``` - user_id / category as in submission file<br>
```target``` - target variable (the last known purchase)<br>

In [3]:
# sparse matrix for temporary use: one boolean column per cart category,
# collapsed to one row per (user, order timestamp) -- True when that order
# contained the category
train_raw = pd.get_dummies(train_data, columns=['cart'], prefix='', prefix_sep='', dtype='bool')
train_raw = train_raw.groupby(['user_id', 'order_completed_at']).any().reset_index()

# order counter for each user (0-based). groupby sorted the rows by
# (user_id, order_completed_at); this assumes order_completed_at sorts
# chronologically -- TODO confirm
train_raw['order_number'] = train_raw.groupby(['user_id']).cumcount()
train_raw = train_raw.drop('order_completed_at', axis=1)

# separate datasets by the last purchase
# (the string 'max' replaces the builtin `max`, whose use in transform is deprecated)
last_order = train_raw.groupby(['user_id'])['order_number'].transform('max') == train_raw['order_number']
train = train_raw[~last_order].groupby('user_id').sum().reset_index()
# A user with a single order has no history rows and is therefore missing from
# `train`. Keep only users present in both frames so that the two melts below
# line up row for row.
valid = train_raw[last_order]
valid = valid[valid['user_id'].isin(train['user_id'])].reset_index(drop=True)

# purchase counter for each category for each user
# (`order_number` is bookkeeping, not a category, so it is dropped before melting)
train_melt = pd.melt(train.drop(columns='order_number'),
                     id_vars=['user_id'], var_name='category', value_name='ordered')
valid_melt = pd.melt(valid.drop(columns='order_number'),
                     id_vars=['user_id'], var_name='category', value_name='target')

# The target is attached by row position below, so fail loudly if the two
# melted frames do not describe the same (user, category) pairs in the same order.
assert train_melt[['user_id', 'category']].equals(valid_melt[['user_id', 'category']]), \
    'train_melt and valid_melt are not aligned'

Train = train_melt.copy()

# total purchase counter for each user: the 0-based index of the last order
# equals the number of orders that precede it
order_number = valid[['user_id', 'order_number']].set_index('user_id')['order_number']
Train['orders_total'] = Train['user_id'].map(order_number)

# share of the user's previous orders that contain the category
Train['rating'] = Train['ordered'] / Train['orders_total']

# user_id / category as in submission file
Train['id'] = Train['user_id'].astype(str) + ';' + Train['category']

# target variable (the last known purchase)
Train['target'] = valid_melt['target'].astype(int)

# remove those users/categories who are not represented in the submission file
Train = Train[Train.id.isin(sub.id.unique())].reset_index(drop=True)
# Check: the remaining ids must be exactly the submission ids
# (the previous version compared Train with itself and was always True)
print(np.array_equal(np.sort(Train['id'].to_numpy()), np.sort(sub['id'].to_numpy())))

# purchase counter by all users (for represented users)
total_ordered = Train.groupby('category')['ordered'].sum()
Train['total_ordered'] = Train['category'].map(total_ordered)

Train.head(3)

True


Unnamed: 0,user_id,category,ordered,orders_total,rating,id,target,total_ordered
0,7,0,0,10,0.0,7;0,1,12922
1,8,0,1,7,0.142857,8;0,0,12922
2,9,0,1,45,0.022222,9;0,0,12922


In [4]:
# Random 80/20 row split of Train with a fixed seed and no stratification;
# Valid_set is used further down for the F1 threshold search.
Train_set, Valid_set = train_test_split(Train, test_size=0.2, random_state=17)

#### 2. Model training

Kfold catboost

In [118]:
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold

In [119]:
def f1(real, pred, *, threshold=0.5, **kwargs):
    """F1 score of score/probability predictions binarised at ``threshold``.

    Parameters
    ----------
    real : array-like
        True labels, passed to ``f1_score`` unchanged.
    pred : array-like
        Predicted scores. Any array-like is accepted (list, Series, ndarray);
        it is converted with ``np.asarray`` before thresholding.
    threshold : float, default 0.5
        Scores strictly greater than this value are labelled 1, the rest 0.
    **kwargs
        Forwarded unchanged to ``f1_score``.

    Returns
    -------
    The value returned by ``f1_score`` for the binarised predictions.
    """
    # np.asarray lets plain Python lists be compared element-wise.
    labels = (np.asarray(pred) > threshold).astype(int)
    return f1_score(real, labels, **kwargs)

In [120]:
# Stratified 5-fold cross-validation: one CatBoost model per fold, collected in `models`.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
X = Train.drop(columns=["user_id", "id", "target"])
y = Train.target

# Shared hyper-parameters for every fold model.
catboost_params = dict(
    iterations=5000,
    depth=10,
    l2_leaf_reg=3,
    eval_metric="F1",  # evaluate by F1
    random_seed=42,
    verbose=100,
    early_stopping_rounds=700,
    use_best_model=True,
    task_type="GPU",
    devices='0',
    auto_class_weights="SqrtBalanced",
)

models = []
for train_index, val_index in skf.split(X, y):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_valid, y_valid = X.iloc[val_index], y.iloc[val_index]

    model = CatBoostClassifier(**catboost_params)
    # The held-out fold is passed as the evaluation set for this model.
    model.fit(X_train, y_train, eval_set=(X_valid, y_valid))
    models.append(model)

0:	learn: 0.6538521	test: 0.6558371	best: 0.6558371 (0)	total: 21.4ms	remaining: 1m 46s
100:	learn: 0.6607682	test: 0.6625889	best: 0.6630456 (86)	total: 1.89s	remaining: 1m 31s
200:	learn: 0.6621224	test: 0.6639733	best: 0.6642232 (189)	total: 3.74s	remaining: 1m 29s
300:	learn: 0.6628657	test: 0.6637147	best: 0.6642232 (189)	total: 5.61s	remaining: 1m 27s
400:	learn: 0.6639919	test: 0.6638219	best: 0.6642232 (189)	total: 7.46s	remaining: 1m 25s
500:	learn: 0.6656825	test: 0.6641342	best: 0.6645065 (483)	total: 9.33s	remaining: 1m 23s
600:	learn: 0.6668648	test: 0.6648419	best: 0.6648419 (600)	total: 11.2s	remaining: 1m 21s
700:	learn: 0.6675816	test: 0.6648646	best: 0.6650261 (655)	total: 13s	remaining: 1m 19s
800:	learn: 0.6682324	test: 0.6643938	best: 0.6650261 (655)	total: 14.9s	remaining: 1m 17s
900:	learn: 0.6688224	test: 0.6643131	best: 0.6650261 (655)	total: 16.7s	remaining: 1m 16s
1000:	learn: 0.6692991	test: 0.6646138	best: 0.6650261 (655)	total: 18.6s	remaining: 1m 14s
1100

xgboost

In [5]:
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold

In [22]:
def f1(real, pred, **kwargs):
    """Return ``('custom_f1', score)`` for predictions binarised at 0.5.

    ``pred`` values strictly above 0.5 are labelled 1, the rest 0; the labels
    are scored against ``real`` with ``f1_score`` and ``kwargs`` are forwarded
    to it unchanged. The metric name is returned alongside the score.
    """
    hard_labels = (pred > 0.5).astype(int)
    score = f1_score(real, hard_labels, **kwargs)
    return 'custom_f1', score


# Fold splitter, target vector and feature matrix for the XGBoost run.
# Identifier columns and the target itself are excluded from the features.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y = Train["target"]
X = Train.drop(["user_id", "id", "target"], axis=1)

In [8]:
# Preview the feature matrix (pandas truncates the display of long frames).
X

Unnamed: 0,category,ordered,orders_total,rating,total_ordered
0,0,0,10,0.000000,12922
1,0,1,7,0.142857,12922
2,0,1,45,0.022222,12922
3,0,1,20,0.050000,12922
4,0,3,16,0.187500,12922
...,...,...,...,...,...
790444,880,2,70,0.028571,7
790445,880,1,22,0.045455,7
790446,880,2,8,0.250000,7
790447,880,1,3,0.333333,7


In [25]:
# Convert the category labels to pandas' categorical dtype so that XGBoost
# (enable_categorical=True) can consume the column. The labels are strings,
# so the categories are ordered lexicographically ('0', '1', '10', ...).
X = X.astype({"category": "category"})
X["category"]

0           0
1           0
2           0
3           0
4           0
         ... 
790444    880
790445    880
790446    880
790447    880
790448    880
Name: category, Length: 790449, dtype: category
Categories (858, object): ['0', '1', '10', '100', ..., '96', '97', '98', '99']

In [85]:
# Shared hyper-parameters for every XGBoost fold model.
# NOTE(review): n_estimators is left at the library default; if that default is
# below 700 boosting rounds, early_stopping_rounds=700 can never trigger --
# confirm this is intended.
xgb_params = dict(
    reg_lambda=12,
    enable_categorical=True,
    early_stopping_rounds=700,
)

# One model per stratified fold; the held-out fold is the evaluation set.
models_xgb = []
for train_index, val_index in skf.split(X, y):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_valid, y_valid = X.iloc[val_index], y.iloc[val_index]

    model = XGBClassifier(**xgb_params)
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])
    models_xgb.append(model)

[0]	validation_0-logloss:0.46042
[1]	validation_0-logloss:0.42331
[2]	validation_0-logloss:0.40167
[3]	validation_0-logloss:0.38831
[4]	validation_0-logloss:0.37978
[5]	validation_0-logloss:0.37419
[6]	validation_0-logloss:0.37047
[7]	validation_0-logloss:0.36806
[8]	validation_0-logloss:0.36641
[9]	validation_0-logloss:0.36529
[10]	validation_0-logloss:0.36455
[11]	validation_0-logloss:0.36405
[12]	validation_0-logloss:0.36371
[13]	validation_0-logloss:0.36346
[14]	validation_0-logloss:0.36330
[15]	validation_0-logloss:0.36319
[16]	validation_0-logloss:0.36310
[17]	validation_0-logloss:0.36305
[18]	validation_0-logloss:0.36301
[19]	validation_0-logloss:0.36300
[20]	validation_0-logloss:0.36295
[21]	validation_0-logloss:0.36296
[22]	validation_0-logloss:0.36294
[23]	validation_0-logloss:0.36298
[24]	validation_0-logloss:0.36296
[25]	validation_0-logloss:0.36295
[26]	validation_0-logloss:0.36294
[27]	validation_0-logloss:0.36297
[28]	validation_0-logloss:0.36300
[29]	validation_0-loglos

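Check the best probability threshold on the hold-out split (`Valid_set`). Note: `Valid_set` is sampled from the same `Train` rows the fold models were fitted on, so the F1 scores below are optimistic.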

In [86]:
# Display the list of fitted fold models (the repr shows their constructor parameters).
models_xgb

[XGBClassifier(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, device=None, early_stopping_rounds=700,
               enable_categorical=True, eval_metric=None, feature_types=None,
               gamma=None, grow_policy=None, importance_type=None,
               interaction_constraints=None, learning_rate=None, max_bin=None,
               max_cat_threshold=None, max_cat_to_onehot=None,
               max_delta_step=None, max_depth=None, max_leaves=None,
               min_child_weight=None, missing=nan, monotone_constraints=None,
               multi_strategy=None, n_estimators=None, n_jobs=None,
               num_parallel_tree=None, random_state=None, ...),
 XGBClassifier(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, device=None, early_stopping_rounds=700,
               enable_categori

In [87]:
# Search the probability cut-off (0.01 ... 0.99) that maximises F1 on Valid_set.
best_score = 0
proba_split = float("nan")  # stays NaN if no threshold scores above 0

# Re-bind Valid_set instead of assigning into the frame returned by
# train_test_split (which may raise SettingWithCopyWarning), and reuse the
# dtype of the training column so the category codes match the ones the models
# were fitted on. Assumes X["category"] is already categorical.
Valid_set = Valid_set.assign(category=Valid_set["category"].astype(X["category"].dtype))
valid_features = Valid_set.drop(columns=["user_id", "id", "target"])  # loop-invariant

for model in models_xgb:
    valid_pred = model.predict_proba(valid_features)[:, 1]
    for i in np.arange(0.01, 1.0, 0.01):
        # Compare the array itself (not its raw `.data` buffer) and do not
        # rebind the name `f1`, which is the metric helper defined earlier.
        score = f1_score(Valid_set.target, (valid_pred > i).astype(int))
        if score > best_score:
            best_score = score
            proba_split = i
    # best_score / proba_split carry over between models, so each line reports
    # the best result seen so far, not the best of the current model alone.
    print('At i =', "%.2f" % proba_split, 'score is : ' "%.5f" % best_score)

At i = 0.24 score is : 0.59800
At i = 0.25 score is : 0.60205
At i = 0.25 score is : 0.60212
At i = 0.25 score is : 0.60212
At i = 0.25 score is : 0.60212


#### 3. Test dataset creation

In [88]:
# Build the prediction-time features by folding the last known order (the
# training target) into the history counters.
Test = Train.assign(
    # increment counter
    orders_total=Train['orders_total'] + 1,
    # add last purchase
    ordered=Train['ordered'] + Train['target'],
).drop('target', axis=1)

# recalculate including last order
test_total_ordered = Test.groupby('category')['ordered'].sum()
Test = Test.assign(
    total_ordered=Test['category'].map(test_total_ordered),
    rating=Test['ordered'] / Test['orders_total'],
)
Test.head(3)

Unnamed: 0,user_id,category,ordered,orders_total,rating,id,total_ordered
0,7,0,1,11,0.090909,7;0,14190
1,8,0,1,8,0.125,8;0,14190
2,9,0,1,46,0.021739,9;0,14190


In [89]:
# Same categorical cast as applied to the training features.
Test = Test.astype({"category": "category"})

#### 4. Get avg predictions from 5 folds


In [90]:
# Positive-class probability from every fold model on the same feature frame.
test_features = Test.drop(columns=["user_id", "id"])
predictions = [fold_model.predict_proba(test_features)[:, 1] for fold_model in models_xgb]
predictions

[array([0.09274558, 0.11000419, 0.02168283, ..., 0.03184012, 0.01927452,
        0.0177153 ], dtype=float32),
 array([0.08988574, 0.09912319, 0.02851621, ..., 0.04063403, 0.01823885,
        0.01707705], dtype=float32),
 array([0.09152331, 0.10601023, 0.02165207, ..., 0.03695901, 0.01680276,
        0.01329909], dtype=float32),
 array([0.09196748, 0.11095194, 0.03244093, ..., 0.04163974, 0.02401349,
        0.02272913], dtype=float32),
 array([0.08873425, 0.11024296, 0.02688402, ..., 0.03028618, 0.02109117,
        0.02043149], dtype=float32)]

In [91]:
# Average the fold probabilities. Dividing by the actual number of prediction
# arrays (instead of a hard-coded 5) keeps the mean correct if the number of
# folds changes.
avg_preds = sum(predictions) / len(predictions)
# Share of positive targets in the training frame.
train_mean = Train.target.mean()

#### 5. Optimal probability threshold
Using a threshold probability level of 0.5, the average value of the predictions is less than the average value in the training set.<br>
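Based on the hypothesis that the total number of purchases made by all customers is approximately the same from one order to the next, the probability threshold is lowered from 0.5 until the share of positive predictions reaches the share of positives in the training set.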

In [92]:
# Lower the threshold from 0.5 in steps of 0.001 until the share of positive
# predictions is no longer below the share of positives in the training set.
# Each threshold is derived from an integer step and rounded, so no floating
# point error accumulates (repeated `th -= 0.001` drifted to 0.2469999...),
# and the search is bounded at 0 instead of looping indefinitely.
START_TH, TH_STEP = 0.5, 0.001
n_steps = int(round(START_TH / TH_STEP))

for step_idx in range(n_steps + 1):
    th = round(START_TH - step_idx * TH_STEP, 3)
    test_mean = (avg_preds > th).astype(int).mean()
    if not test_mean < train_mean:
        break
print('Threshold:', "%.4f" % th)
print('Train mean:', "%.5f" % train_mean)
print('New Test mean:', "%.5f" % test_mean)
th

Threshold: 0.2470
Train mean: 0.23596
New Test mean: 0.23609


0.24699999999999978

#### 6. Final predictions and submit

In [93]:
# Binarise the averaged probabilities with the calibrated threshold. The labels
# live in a separate frame so `Test` keeps only feature columns and the
# prediction cell can be re-run without an extra `target` column.
test_pred = Test[['id']].assign(target=(avg_preds > th).astype(int))

# A left join keeps every id of the sample submission, in its original order.
# An inner join would silently drop ids that have no prediction; such ids now
# default to 0 (not purchased).
submit = pd.merge(sub['id'], test_pred, on='id', how='left')
submit['target'] = submit['target'].fillna(0).astype(int)
submit.to_csv('submission_baseline_xgb_5_folds_4s.csv', index=False)

Public F1 score: **0.49153** (#17 place avanturer77). P.S. I have since changed the hyperparameters, but I ran out of submissions, so I do not yet know what score XGBoost gives now.