# Preparation

In [0]:
import pandas as pd
import numpy as np
import tqdm
import datetime

SEED = 215
SECONDS = 946684800

aggs = ['sum', 'mean', 'std', 'min', 'max']
cats = [27, 32, 41, 45, 67, 73, 81, 88]

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier 

Распакуйте архив с данными в папку,где находится этот jupyter notebook (final.ipynb).

In [0]:
transactions_train = pd.read_csv('transactions_train.csv')
train_target = pd.read_csv('train_target.csv')
transactions_test = pd.read_csv('transactions_test.csv')
test_id = pd.read_csv('test.csv')

In [0]:
transactions_train.head()

Unnamed: 0,client_dk,trans_date,amount,small_group
0,43976,0,4.563,2
1,8417,0,48.342,0
2,17309,0,12.32,0
3,33523,0,29.005,6
4,24228,0,10.266,6


* client_dk - уникальный идентификатор клиента
* trans_date - дата совершения транзакции
* amount - сумма транзакции
* small_group - категория покупки

In [0]:
train_target.head(5)

Unnamed: 0,client_dk,27,32,41,45,67,73,81,88
0,39762,1,0,0,0,0,0,0,0
1,10586,0,0,0,1,0,0,0,0
2,40115,0,1,0,0,0,0,0,0
3,34543,0,0,0,1,0,0,0,0
4,5372,0,0,0,1,0,0,0,0


* client_dk - уникальный идентификатор клиента, соответствует полю client_dk из транзакций
* числовые названия колонок - это 8 категорий продуктов. Их названия (числа) соответствуют значениям в колонке **small_group** из данных по транзакциям. Значения в этих колонках бинарные,т.е. 0 - в исследуемую неделю не было совершено покупки в данной категории, 1 - покупка была совершена. Например, клиент с номером 34110 (первая строчка) купил товар из категории 45, а по остальным категориям покупок не совершал.

# Feature creating


*Перевод trans_date в месяц/год/день_недели, считая от 01.01.2000*

In [0]:
def to_dates(day_):
    
    date = datetime.date.fromtimestamp(SECONDS + day_ * (60 * 60 * 24))
    day = date.day
    weekday = date.weekday()
    month = date.month
    year = date.year

    return [month, year, weekday]

Создание признаков:

* Самые простые аггрегационные признаки по клиенту.
* Для каждого клиента аггрегационные признаки для транзакций по каждой категории.
* Для каждого клиента аггрегационные признаки для транзакций по месяцам, дням недели, годам.
* Для каждого клиента аггрегационные признаки для транзакций по месяцам, дням недели, годам и по категории одновременно.

In [0]:
def create_x(df, target):
    # Получаем аналоги даты в виде месяцев/дней недели/дня месяца/...
    dates_df = pd.DataFrame(df['trans_date'].apply(to_dates).values.tolist())
    dates_df.columns = ['month', 'year', 'weekday']
    df = pd.concat([df, dates_df], axis=1)
    df.drop('trans_date', axis=1, inplace=True)

    # Получаем агреггированные фичи из дат
    for date_name in ['year', 'month', 'weekday']:
        _dates_df = df.groupby(['client_dk', date_name])['amount'].agg(aggs).reset_index()
        res_lst = []
        for agg in aggs:
            _df = _dates_df.pivot(index='client_dk', columns=date_name, values=agg).fillna(0)
            _df.columns = [f'{date_name}_{i}_rur_{agg}' for i in _df.columns]
            res_lst.append(_df)
        weekdays_agg_df = pd.concat(res_lst, axis=1)
        target = pd.merge(target, weekdays_agg_df, on='client_dk')
    
    for date_name in ['year', 'month', 'weekday']:
        rang = {'year': [2000, 2001], 'month': [i for i in range(1, 13)], 'weekday': [0, 1, 2, 3, 4, 5, 6]}
        res_lst = []
        for i in rang[date_name]:
            _df = df[df[date_name] == i].groupby(['client_dk', 'small_group'])['amount'].count().reset_index().pivot('client_dk', 'small_group', 'amount').fillna(0)
            
            add_colms = list(set(_df.columns) ^ set([i for i in range(204)]))
            zeros_df = pd.DataFrame(columns=add_colms, index=_df.index)

            _df = pd.merge(_df, zeros_df, on='client_dk',)
            _df.columns = [f'{date_name}_{i}_group_{j}' for j in _df.columns]
            res_lst.append(_df)

        weekdays_df = pd.concat(res_lst, axis=1)
        target = pd.merge(target, weekdays_df, on='client_dk')

    agg_features = df.groupby('client_dk')['amount'].agg(aggs).reset_index()

    # Получаем количество транзакций для каждой группы покупок
    counts_df = df.groupby(['client_dk','small_group'])['amount'].count()\
                        .reset_index()\
                        .pivot(index='client_dk', columns='small_group',values='amount')
    add_colms = list(set(counts_df.columns) ^ set([i for i in range(204)]))
    zeros_df = pd.DataFrame(columns=add_colms, index=counts_df.index)

    counts_df = pd.merge(
        counts_df,
        zeros_df, 
        on='client_dk',
    )
    counts_df = counts_df.fillna(0)
    counts_df.columns = ['small_group_' + str(i) for i in counts_df.columns]

    agg_cat_features = df\
            .groupby(['client_dk', 'small_group'])['amount']\
            .agg(['sum','mean','std','min','max'])\
            .reset_index()
    for agg in aggs:
        _df = agg_cat_features.pivot(index='client_dk', columns='small_group', values=agg)
        _df = pd.merge(_df, zeros_df, on='client_dk',)
        _df.columns = [f'small_group_rur_{agg}_{i}' for i in _df.columns]
        target = pd.merge(target, _df.fillna(0), on='client_dk')


    agg_cat_features = df\
            .groupby(['client_dk', 'small_group'])['amount']\
            .agg(['sum','mean','std','min','max'])\
            .reset_index()
    for agg in aggs:
        _df = agg_cat_features.pivot(index='client_dk', columns='small_group', values=agg)
        _df = pd.merge(_df, zeros_df, on='client_dk',)
        _df.columns = [f'small_group_rur_{agg}_{i}' for i in _df.columns]
        target = pd.merge(target, _df.fillna(0), on='client_dk')


    target = pd.merge(target, agg_features, on='client_dk')
    target = pd.merge(target, counts_df.reset_index(), on='client_dk')

    return target.fillna(0)

In [0]:
train_data = create_x(transactions_train, train_target)
test_data = create_x(transactions_test, test_id)

In [0]:
train_data.head(1)

Unnamed: 0,client_dk,27,32,41,45,67,73,81,88,year_2000_rur_sum,year_2001_rur_sum,year_2000_rur_mean,year_2001_rur_mean,year_2000_rur_std,year_2001_rur_std,year_2000_rur_min,year_2001_rur_min,year_2000_rur_max,year_2001_rur_max,month_1_rur_sum,month_2_rur_sum,month_3_rur_sum,month_4_rur_sum,month_5_rur_sum,month_6_rur_sum,month_7_rur_sum,month_8_rur_sum,month_9_rur_sum,month_10_rur_sum,month_11_rur_sum,month_12_rur_sum,month_1_rur_mean,month_2_rur_mean,month_3_rur_mean,month_4_rur_mean,month_5_rur_mean,month_6_rur_mean,month_7_rur_mean,month_8_rur_mean,month_9_rur_mean,...,small_group_164,small_group_165,small_group_166,small_group_167,small_group_168,small_group_169,small_group_170,small_group_171,small_group_172,small_group_173,small_group_174,small_group_175,small_group_176,small_group_177,small_group_178,small_group_179,small_group_180,small_group_181,small_group_182,small_group_183,small_group_184,small_group_185,small_group_186,small_group_187,small_group_188,small_group_189,small_group_190,small_group_191,small_group_192,small_group_193,small_group_195,small_group_196,small_group_194,small_group_197,small_group_198,small_group_199,small_group_200,small_group_201,small_group_202,small_group_203
0,39762,1,0,0,0,0,0,0,0,21696.773,2698.97,21.567369,34.164177,30.541177,52.099715,0.048,0.967,292.338,307.601,3865.094,1817.035,1131.474,1219.594,2063.22,2095.204,1643.494,1864.369,737.009,1771.5,3004.255,3183.495,24.93609,25.236597,15.936254,20.671085,26.451538,24.082805,16.112686,20.26488,19.919162,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0


In [0]:
test_data.head(1)

Unnamed: 0,client_dk,year_2000_rur_sum,year_2001_rur_sum,year_2000_rur_mean,year_2001_rur_mean,year_2000_rur_std,year_2001_rur_std,year_2000_rur_min,year_2001_rur_min,year_2000_rur_max,year_2001_rur_max,month_1_rur_sum,month_2_rur_sum,month_3_rur_sum,month_4_rur_sum,month_5_rur_sum,month_6_rur_sum,month_7_rur_sum,month_8_rur_sum,month_9_rur_sum,month_10_rur_sum,month_11_rur_sum,month_12_rur_sum,month_1_rur_mean,month_2_rur_mean,month_3_rur_mean,month_4_rur_mean,month_5_rur_mean,month_6_rur_mean,month_7_rur_mean,month_8_rur_mean,month_9_rur_mean,month_10_rur_mean,month_11_rur_mean,month_12_rur_mean,month_1_rur_std,month_2_rur_std,month_3_rur_std,month_4_rur_std,month_5_rur_std,...,small_group_164,small_group_165,small_group_166,small_group_167,small_group_168,small_group_169,small_group_170,small_group_171,small_group_172,small_group_173,small_group_174,small_group_175,small_group_176,small_group_177,small_group_178,small_group_179,small_group_180,small_group_181,small_group_182,small_group_183,small_group_184,small_group_185,small_group_186,small_group_187,small_group_188,small_group_189,small_group_190,small_group_191,small_group_192,small_group_193,small_group_195,small_group_196,small_group_194,small_group_197,small_group_198,small_group_199,small_group_200,small_group_201,small_group_202,small_group_203
0,12671,44451.205,1290.185,32.781125,23.039018,64.662643,30.357497,0.088,0.315,1040.564,140.467,5637.211,6921.62,4721.675,4806.903,5165.467,4784.525,3755.472,1512.61,782.42,902.064,5007.71,1743.713,34.373238,42.991429,35.770265,38.765347,37.704139,30.867903,26.079667,21.921884,23.709697,17.687529,34.066054,18.354874,49.937235,105.533478,52.920463,62.369577,64.314238,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0


# Get useless features

Получаем ненужные признаки, чтобы выкинуть их в дальнейшем из таблицы

In [0]:
y_train = train_data.iloc[:,1:9]
X_train = train_data.drop(train_data.columns[1:9], axis=1)
X_test = test_data

In [0]:
params = {
    'iterations': 100,
    'learning_rate': None,
    'random_seed': SEED,
    'task_type': 'GPU'
}

In [0]:
models = []

for q in y_train.columns:
    print('train product '+str(q))
    
    curr_y_train = y_train.loc[:,q]
    
    modelCB = CatBoostClassifier(**params)
    modelCB.fit(X_train, curr_y_train, verbose=params['iterations'] // 5)

    models.append(modelCB)

train product 27
Learning rate set to 0.222273
0:	learn: 0.5167181	total: 19.9ms	remaining: 1.97s
20:	learn: 0.3033108	total: 393ms	remaining: 1.48s
40:	learn: 0.2929242	total: 729ms	remaining: 1.05s
60:	learn: 0.2856890	total: 1.05s	remaining: 671ms
80:	learn: 0.2791327	total: 1.37s	remaining: 322ms
99:	learn: 0.2737503	total: 1.66s	remaining: 0us
train product 32
Learning rate set to 0.222273
0:	learn: 0.5584013	total: 19.4ms	remaining: 1.92s
20:	learn: 0.3832637	total: 403ms	remaining: 1.52s
40:	learn: 0.3723371	total: 748ms	remaining: 1.08s
60:	learn: 0.3635498	total: 1.08s	remaining: 690ms
80:	learn: 0.3556086	total: 1.4s	remaining: 329ms
99:	learn: 0.3476122	total: 1.72s	remaining: 0us
train product 41
Learning rate set to 0.222273
0:	learn: 0.5296602	total: 19ms	remaining: 1.88s
20:	learn: 0.3232928	total: 389ms	remaining: 1.46s
40:	learn: 0.3154930	total: 714ms	remaining: 1.03s
60:	learn: 0.3082611	total: 1.05s	remaining: 670ms
80:	learn: 0.3012451	total: 1.37s	remaining: 322ms

In [0]:
features = []
for i in range(8): 
    featurei = list(zip(models[i].feature_importances_, X_train.columns))
    featurei.sort(key=lambda x: -x[0])
    features.append(featurei)

zeros = [[] for i in range(8)]
for i in range(8):
    for imp, name in features[i]:
        if imp == 0: zeros[i].append(name)

res = set(zeros[0])
for i in range(8):
    res = res & set(zeros[i])

bad = list(res)                                                                 # ненужные признаки

In [0]:
bad

['weekday_5_group_60',
 'small_group_rur_sum_185_x',
 'small_group_rur_max_114_x',
 'small_group_rur_std_35_x',
 'small_group_rur_std_44_x',
 'month_5_group_196',
 'month_10_group_182',
 'month_8_group_182',
 'month_7_group_98',
 'weekday_0_group_195',
 'month_9_group_59',
 'weekday_1_group_197',
 'month_10_group_166',
 'month_7_group_144',
 'small_group_105',
 'small_group_rur_sum_178_y',
 'year_2000_group_79',
 'month_5_group_63',
 'weekday_5_group_58',
 'small_group_rur_sum_100_y',
 'small_group_rur_mean_139_y',
 'month_4_group_166',
 'weekday_1_group_181',
 'small_group_rur_max_195_y',
 'small_group_rur_min_168_x',
 'small_group_rur_sum_79_x',
 'small_group_rur_mean_191_y',
 'month_4_group_152',
 'month_9_group_162',
 'weekday_0_group_132',
 'month_2_group_127',
 'month_8_group_61',
 'weekday_3_group_114',
 'weekday_6_group_43',
 'weekday_0_group_61',
 'small_group_rur_sum_182_y',
 'small_group_rur_std_198_y',
 'weekday_2_group_111',
 'small_group_rur_max_154_x',
 'weekday_1_group_

# Models

### Preparation

*P.s Для примера прогнал на 100 итерациях представленных ниже моделей, чтобы не ждать долго. На финале гнал на 2000 итерациях.*

In [0]:
y_train = train_data.iloc[:,1:9]
X_train = train_data.drop(train_data.columns[1:9], axis=1).drop(bad, axis=1).sort_index(axis=1)
X_test = test_data.drop(bad, axis=1).sort_index(axis=1)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=SEED)

In [0]:
def validation(models, name):
    results_tree = []
    for i, q in enumerate(y_val.columns):
        pred = models[i].predict_proba(X_val)[:,1]
        results_tree.append(pred)

    pred = np.array(results_tree)
    
    print(f'{name}:', end=' ')
    print(roc_auc_score(y_val, pred.T))

In [0]:
def get_predict(models):
    results_tree = []
    for i, q in enumerate(y_val.columns):
        pred = models[i].predict_proba(X_test)[:,1]
        results_tree.append(pred)

    pred = np.array(results_tree).T
    return pred

In [0]:
def save_model(pred, name):
    columns = ['cat_27','cat_32','cat_41','cat_45','cat_67','cat_73','cat_81','cat_88']
    submission = pd.DataFrame({columns[i]: pred[:, i] for i in range(8)}, index=test_id.client_dk)
    
    submission_path = '/content/drive/My Drive/submissions/{}.csv'.format(name)
    print(submission_path)
    submission.to_csv(submission_path, index=True)

### Catboost

In [0]:
params = {
    #'iterations': 2000,                                                        # Финальный параметр
    'iterations': 100,                                                          # Параметр для примера
    'learning_rate': None,
    'random_seed': SEED,
    'task_type': 'GPU'
}

In [0]:
models = []

for q in y_train.columns:
    print('train product '+str(q))

    curr_y_train = y_train.loc[:,q]
    model = CatBoostClassifier(**params)
    model.fit(X_train, curr_y_train, verbose=params['iterations'] // 5)
    models.append(model)

train product 27
Learning rate set to 0.225018
0:	learn: 0.5178318	total: 11.5ms	remaining: 1.14s
20:	learn: 0.3017063	total: 236ms	remaining: 889ms
40:	learn: 0.2926915	total: 444ms	remaining: 639ms
60:	learn: 0.2834031	total: 655ms	remaining: 419ms
80:	learn: 0.2743549	total: 863ms	remaining: 203ms
99:	learn: 0.2664608	total: 1.06s	remaining: 0us
train product 32
Learning rate set to 0.225018
0:	learn: 0.5585010	total: 11.8ms	remaining: 1.17s
20:	learn: 0.3774479	total: 238ms	remaining: 894ms
40:	learn: 0.3652849	total: 450ms	remaining: 648ms
60:	learn: 0.3549623	total: 662ms	remaining: 423ms
80:	learn: 0.3426693	total: 881ms	remaining: 207ms
99:	learn: 0.3329253	total: 1.08s	remaining: 0us
train product 41
Learning rate set to 0.225018
0:	learn: 0.5344269	total: 11.4ms	remaining: 1.13s
20:	learn: 0.3207018	total: 236ms	remaining: 886ms
40:	learn: 0.3105072	total: 457ms	remaining: 657ms
60:	learn: 0.3012770	total: 681ms	remaining: 435ms
80:	learn: 0.2904024	total: 901ms	remaining: 21

In [0]:
validation(models, "Catboost")

Catboost: 0.8290712713196606


In [0]:
pred = get_predict(models)

In [0]:
save_model(pred, "ctb")

/content/drive/My Drive/submissions/ctb.csv


Сохранение

### XGBoost

In [0]:
params = {
    #'n_estimators': 2000,                                                      # Финальный параметр
    'n_estimators': 100,                                                        # Параметр для примера
    'tree_method':       'gpu_hist',
    'predictor':         'gpu_predictor',
    'learning_rate':     0.015364,
    'random_state':      SEED,
    'max_depth':         8
}

In [0]:
models = []

for q in y_train.columns:
    print('train product '+str(q))

    curr_y_train = y_train.loc[:,q]
    model = XGBClassifier(**params)
    model.fit(X_train, curr_y_train)
    models.append(model)

train product 27
train product 32
train product 41
train product 45
train product 67
train product 73
train product 81
train product 88


In [0]:
validation(models, "XGBoost")

XGBoost: 0.8249632183018184


In [0]:
pred = get_predict(models)

In [0]:
save_model(pred, "xgb")

/content/drive/My Drive/submissions/xgb.csv


### LightGBM

In [0]:
par = {
    #'n_estimators': 2000,                                                      # Финальный параметр
    'n_estimators': 100,                                                        # Параметр для примера
    'boosting_type':     'dart',
    'learning_rate':     0.013215,
    'random_state':      SEED,
    'max_depth':         8,
    'min_data':          10,
    'num_leaves':        10
}

In [0]:
models = []

for q in y_train.columns:
    print('train product '+str(q))

    curr_y_train = y_train.loc[:,q]
    model = LGBMClassifier(**par)
    model.fit(X_train, curr_y_train)
    models.append(model)

train product 27
train product 32
train product 41
train product 45
train product 67
train product 73
train product 81
train product 88


In [0]:
validation(models, "LGBM")

LGBM: 0.8202114156077541


In [0]:
pred = get_predict(models)

In [0]:
save_model(pred, "lgb")

/content/drive/My Drive/submissions/lgb.csv


### RandomForest

In [0]:
par = {
    #'n_estimators': 2000,                                                      # Финальный параметр
    'n_estimators': 100,                                                        # Параметр для примера
    "max_depth": 10, 
    'min_samples_leaf': 20,
    'random_state': SEED,
}

In [0]:
models = []

for q in y_train.columns:
    print('train product '+str(q))

    curr_y_train = y_train.loc[:,q]
    model = RandomForestClassifier(**par)
    model.fit(X_train, curr_y_train)
    models.append(model)

train product 27
train product 32
train product 41
train product 45
train product 67
train product 73
train product 81
train product 88


In [0]:
validation(models, "RandomForest")

RandomForest: 0.8281471814519752


In [0]:
pred = get_predict(models)

In [0]:
save_model(pred, "rndf")

/content/drive/My Drive/submissions/rndf.csv


# Merge

Взятие средней всех моделей

In [0]:
df1 = pd.read_csv('ctb.csv').iloc[:, 1:].to_numpy()
df2 = pd.read_csv('xgb.csv').iloc[:, 1:].to_numpy()
df3 = pd.read_csv('lgb.csv').iloc[:, 1:].to_numpy()
df4 = pd.read_csv('rndf.csv').iloc[:, 1:].to_numpy()

pred = (df1 + df2 + df3 + df4) / 4

In [0]:
save_model(pred, "final")