In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import pyarrow.parquet as pq
import bisect
import sklearn.metrics as m
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from pytorch_tabnet.tab_model import TabNetClassifier
from pytorch_tabnet.metrics import Metric
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('ignore')

In [None]:
targets = pq.read_table('data/public_train.pqt')

In [None]:
lda_emb = pd.read_csv('embeddings/lda_embeddings.csv')
als_emb = pd.read_csv('embeddings/als_embeddings.csv')
svd_emb = pd.read_csv('embeddings/svd_embeddings.csv')
train_features = pd.read_csv('data/train_features.csv')

### Пол

In [None]:
usr_targets = targets.to_pandas()
df = usr_targets.merge(als_emb, how = 'inner', on = ['user_id'])
df = df[df['is_male'] != 'NA']
df = df.dropna()
df['is_male'] = df['is_male'].astype('int')

In [None]:
df = df.merge(svd_emb, how = 'inner', on = 'user_id')
df = df.merge(train_features, how = 'inner', on = 'user_id')
df = df.merge(lda_emb, how = 'inner', on = 'user_id')

In [None]:
X = df.drop(['user_id', 'age', 'is_male'], axis = 1)
y = df['is_male']

X, X_test, y, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [17]:
cat_clf = CatBoostClassifier(random_state=42)
cat_clf.fit(X_train, y_train, verbose = False)
cat_preds = cat_clf.predict_proba(X_val)[:,1]

print(f'F1 score по полу {m.f1_score(y_val, cat_preds.round()):2.3f}')
print(f'Roc-auc по полу {m.roc_auc_score(y_val, cat_preds):2.3f}')
print(f'GINI по полу {2 * m.roc_auc_score(y_val, cat_preds) - 1:2.3f}')

F1 score по полу 0.765
Roc-auc по полу 0.847
GINI по полу 0.694


In [18]:
xgb_params = {'n_estimators': 966, 
              'max_depth': 10, 
              'learning_rate': 0.11104382079340841, 
              'subsample': 0.9661785026410122, 
              'colsample_bytree': 0.6689239632751095, 
              'lambda': 0.447535704183866, 
              'alpha': 1.0512357705126518, 
              'gamma': 4.227856770118944e-05}

xgb_clf = XGBClassifier(**xgb_params, random_state=42)
xgb_clf.fit(X_train, y_train)
xgb_preds = xgb_clf.predict_proba(X_val)[:,1]

print(f'F1 score по полу {m.f1_score(y_val, xgb_preds.round()):2.3f}')
print(f'Roc-auc по полу {m.roc_auc_score(y_val, xgb_preds):2.3f}')
print(f'GINI по полу {2 * m.roc_auc_score(y_val, xgb_preds) - 1:2.3f}')

F1 score по полу 0.764
Roc-auc по полу 0.844
GINI по полу 0.688


In [19]:
class F1_score(Metric):
    def __init__(self):
        self._name = "f1_score"
        self._maximize = True

    def __call__(self, y_true, y_score):
        return m.f1_score(y_true, y_score[:, 1].round())

In [20]:
tabnet = TabNetClassifier(device_name='cuda', seed=42)
tabnet.fit(
        X_train=X_train.values,
        y_train=y_train.values,
        eval_set=[(X_train.values, y_train.values), (X_val.values, y_val.values)],
        eval_name=["train", "eval"],
        eval_metric=[F1_score],
        max_epochs=10,
        patience=20,
        batch_size=512,
        num_workers=6,
    )
tab_preds = tabnet.predict_proba(X_val.values)[:, 1]

epoch 0  | loss: 0.69507 | train_f1_score: 0.39691 | eval_f1_score: 0.39781 |  0:04:58s
epoch 1  | loss: 0.66694 | train_f1_score: 0.62889 | eval_f1_score: 0.63009 |  0:08:59s
epoch 2  | loss: 0.53485 | train_f1_score: 0.73736 | eval_f1_score: 0.73413 |  0:14:25s
epoch 3  | loss: 0.49666 | train_f1_score: 0.76779 | eval_f1_score: 0.76293 |  0:20:07s
epoch 4  | loss: 0.48434 | train_f1_score: 0.77014 | eval_f1_score: 0.76733 |  0:25:42s
epoch 5  | loss: 0.4775  | train_f1_score: 0.77174 | eval_f1_score: 0.76907 |  0:29:04s
epoch 6  | loss: 0.47458 | train_f1_score: 0.76809 | eval_f1_score: 0.76069 |  0:31:55s
epoch 7  | loss: 0.47044 | train_f1_score: 0.77902 | eval_f1_score: 0.77319 |  0:35:35s
epoch 8  | loss: 0.46795 | train_f1_score: 0.77696 | eval_f1_score: 0.77079 |  0:38:59s
epoch 9  | loss: 0.46435 | train_f1_score: 0.78286 | eval_f1_score: 0.77426 |  0:42:47s
Stop training because you reached max_epochs = 10 with best_epoch = 9 and best_eval_f1_score = 0.77426


Обучаем блендинг из трех моделей: catboost, xgboost и tabnet:

In [21]:
meta_features = pd.DataFrame({'catboost': cat_preds, 'tabnet': tab_preds, 'xgboost': xgb_preds})
log_reg = LogisticRegression()
log_reg.fit(meta_features, y_val)

In [22]:
cat_preds_test = cat_clf.predict_proba(X_test)[:, 1]
xgb_preds_test = xgb_clf.predict_proba(X_test)[:, 1]
tab_preds_test = tabnet.predict_proba(X_test.values)[:, 1]

meta_features_test = pd.DataFrame({'catboost': cat_preds_test, 'tabnet': tab_preds_test, 'xgboost': xgb_preds_test})

In [None]:
log_preds = log_reg.predict_proba(meta_features_test, random_state=42)[:, 1]

print(f'F1 score по полу {m.f1_score(y_test.values.astype(int), log_preds.round()):2.3f}')
print(f'Roc-auc по полу {m.roc_auc_score(y_test, log_preds):2.3f}')
print(f'GINI по полу {2 * m.roc_auc_score(y_test, log_preds) - 1:2.3f}')

F1 score по полу 0.778
Roc-auc по полу 0.860
GINI по полу 0.721


### Возраст

In [24]:
def age_bucket(x):
    return bisect.bisect_left([18,25,35,45,55,65], x)

In [29]:
usr_targets = targets.to_pandas()
df = usr_targets.merge(als_emb, how = 'inner', on = ['user_id'])

age_df = df[df['age'] != 'NA']
age_df = age_df.dropna()
age_df['age'] = age_df['age'].map(age_bucket)

In [30]:
age_df = age_df.merge(svd_emb, how = 'inner', on = ['user_id'])
age_df = age_df.merge(train_features, how = 'inner', on = ['user_id'])
age_df = age_df.merge(lda_emb, how = 'inner', on = ['user_id'])

In [31]:
X = age_df.drop(['user_id', 'age', 'is_male'], axis = 1)
y = age_df['age']

X, X_test, y, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

Для предсказания возраста обучаем catboost

In [32]:
clf = CatBoostClassifier(random_state = 42)
clf.fit(X, y, verbose = False)
print(m.classification_report(y_test, clf.predict(X_test), \
    target_names = ['<18', '18-25','25-34', '35-44', '45-54', '55-65', '65+']))

              precision    recall  f1-score   support

         <18       0.00      0.00      0.00       205
       18-25       0.54      0.35      0.43      6487
       25-34       0.49      0.64      0.56     17461
       35-44       0.41      0.50      0.45     15616
       45-54       0.37      0.24      0.29      8452
       55-65       0.39      0.22      0.28      4633
         65+       0.32      0.02      0.04      1138

    accuracy                           0.45     53992
   macro avg       0.36      0.28      0.29     53992
weighted avg       0.44      0.45      0.43     53992

