# Install libraries and load data

In [None]:
#Download CatBoost
#
!pip install catboost==0.23.2

In [1]:
#Import libraries
#
import pandas as pd, os, gc
import numpy as np
import math
import copy
from itertools import combinations

import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler
from sklearn.metrics import roc_curve, auc, log_loss

from tqdm import tqdm, tqdm_notebook

from sklearn.model_selection import GroupShuffleSplit, StratifiedKFold, GroupKFold
from catboost import CatBoostClassifier

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [2]:
#Load the training and test tables from the working directory
#
train, test = (pd.read_csv(csv_name) for csv_name in ('Train.csv', 'Test.csv'))

In [3]:
#Preview the first rows of the training data
#
train.head(n=5)

Unnamed: 0,ID,join_date,sex,marital_status,birth_year,branch_code,occupation_code,occupation_category_code,P5DA,RIBP,...,AHXO,BSTQ,FM3X,K6QO,QBOL,JWFN,JZ9D,J9JW,GHYX,ECY3
0,4WKQSBB,1/2/2019,F,M,1987,1X1H,2A7I,T4MS,0,0,...,0,0,0,1,0,0,0,0,0,0
1,CP5S02H,1/6/2019,F,M,1981,UAOD,2A7I,T4MS,0,0,...,0,0,0,1,0,0,0,0,0,0
2,2YKDILJ,1/6/2013,M,U,1991,748L,QZYX,90QI,0,0,...,0,0,0,0,0,0,0,0,0,1
3,2S9E81J,1/8/2019,M,M,1990,1X1H,BP09,56SI,0,0,...,0,0,0,1,0,0,0,0,0,0
4,BHDYVFT,1/8/2019,M,M,1990,748L,NO3L,T4MS,0,0,...,0,0,0,0,0,0,1,1,0,0


In [4]:
#Preview the first rows of the test data
#
test.head(n=5)

Unnamed: 0,ID,join_date,sex,marital_status,birth_year,branch_code,occupation_code,occupation_category_code,P5DA,RIBP,...,AHXO,BSTQ,FM3X,K6QO,QBOL,JWFN,JZ9D,J9JW,GHYX,ECY3
0,F86J5PC,1/12/2018,M,M,1984,94KC,DZRV,90QI,0,0,...,0,0,0,0,0,0,0,0,0,0
1,H6141K3,1/10/2019,M,M,1996,1X1H,J9SY,90QI,0,0,...,0,0,0,1,0,0,0,0,0,0
2,RBAYUXZ,1/1/2020,F,W,1968,UAOD,2A7I,T4MS,0,0,...,0,0,0,1,0,0,0,0,0,0
3,KCBILBQ,1/2/2019,M,M,1989,94KC,2A7I,T4MS,0,0,...,0,0,0,0,0,0,0,0,0,0
4,LSEC1ZJ,1/2/2020,F,M,1982,UAOD,0KID,T4MS,0,0,...,0,0,0,0,0,0,1,0,0,0


# Data preparation

In [5]:
#Number of purchased products per client (for test the count excludes the one hidden product).
#Every column from position 8 onwards is a product flag. A 'sum' column left over from a
#previous run of this cell is dropped first so that re-running does not add it to the total.
#
train['sum'] = train.drop(columns='sum', errors='ignore').iloc[:, 8:].sum(axis=1)

test['sum'] = test.drop(columns='sum', errors='ignore').iloc[:, 8:].sum(axis=1)

In [6]:
#Codes of the 21 products, in the order used throughout the notebook
#
names_products = [
    'P5DA', 'RIBP', '8NN1', '7POT', '66FJ', 'GYSR', 'SOP4',
    'RVSZ', 'PYUQ', 'LJR9', 'N2MW', 'AHXO', 'BSTQ', 'FM3X',
    'K6QO', 'QBOL', 'JWFN', 'JZ9D', 'J9JW', 'GHYX', 'ECY3',
]

In [7]:
#Normalise the lower-case 'f' marital-status code to 'F' in the training data
lowercase_f = train['marital_status'].eq('f')
train.loc[lowercase_f, 'marital_status'] = 'F'

In [9]:
#Co-occurrence statistics: for every combination of products that some client holds together,
#count the clients holding it. Uses all train clients plus test clients with more than one
#known product.
#Only observed combinations are stored; absent keys mean a count of zero, which is how the
#probability cell reads this dict (values.get(key, 0)). This avoids pre-allocating an entry
#for every possible subset of the product columns.
#
values = {}
cols = train.columns[8:-1]  #product columns (everything between the client info and 'sum')
history = pd.concat([train, test.loc[test['sum'] > 1]])
for owned_flags in tqdm_notebook(history[cols].to_numpy()):
    res = [c for c, flag in zip(cols, owned_flags) if flag == 1]

    for j in range(1, len(res) + 1):
        for comb in combinations(res, j):
            key = tuple(sorted(comb))
            values[key] = values.get(key, 0) + 1

HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [10]:
#Account for the one hidden product of every test client in the product count.
#NOTE: not idempotent - run once per kernel session.
#
test['sum'] = test['sum'] + 1

In [11]:
#Reproduce the test situation on train: for every product a client holds, emit one row where
#that product is masked (set to 0) and recorded as the prediction target 'product_pred'.
#df_train_true keeps the unmasked product flags for the same row; 'ID2' is a running row id.
#
X_train = []
X_train_columns = train.columns[:-1]
df_train_true = []
client_index = 0

for line in tqdm_notebook(train.values):

    info = line[:8]
    info_products = line[8:-1]
    indexes = [k for k, flag in enumerate(info_products) if flag == 1]

    #One output row per held product; the masked position is addressed directly
    #instead of scanning every product position to find it.
    for i in indexes:

        client_index += 1

        info_products_transformed = list(info_products)
        info_products_transformed[i] = 0

        df_train_true.append(info_products)
        X_train.append(
            list(info) + info_products_transformed +
            [X_train_columns[8 + i], client_index])

X_train = pd.DataFrame(
    X_train,
    columns=[
        'ID', 'join_date', 'sex', 'marital_status', 'birth_year', 'branch_code',
        'occupation_code', 'occupation_category_code', 'P5DA', 'RIBP', '8NN1',
        '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ', 'LJR9', 'N2MW', 'AHXO',
        'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D', 'J9JW', 'GHYX', 'ECY3',
        'product_pred', 'ID2'
    ])
df_train_true = pd.DataFrame(
    df_train_true,
    columns=[
        'P5DA', 'RIBP', '8NN1', '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ',
        'LJR9', 'N2MW', 'AHXO', 'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D',
        'J9JW', 'GHYX', 'ECY3'
    ])

HBox(children=(FloatProgress(value=0.0, max=29132.0), HTML(value='')))




In [12]:
#Build the test feature rows and remember which "ID X PCODE" pairs are already known to be 1
#
X_test = []
true_values = []
test_product_names = test.columns[8:]
for client_index, line in enumerate(tqdm_notebook(test.values), start=1):

    info, info_products = line[:8], line[8:-1]
    indexes = [pos for pos, flag in enumerate(info_products) if flag == 1]

    X_test.append([*info, *info_products, client_index])

    #line[0] is the client ID
    true_values.extend(line[0] + ' X ' + true for true in test_product_names[indexes])

X_test = pd.DataFrame(
    X_test,
    columns=[
        'ID', 'join_date', 'sex', 'marital_status', 'birth_year', 'branch_code',
        'occupation_code', 'occupation_category_code', 'P5DA', 'RIBP', '8NN1',
        '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ', 'LJR9', 'N2MW', 'AHXO',
        'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D', 'J9JW', 'GHYX', 'ECY3',
        'ID2'
    ])

HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




In [13]:
#Compare the shape of the raw train table with the expanded one
#
(train.shape, X_train.shape)

((29132, 30), (66353, 31))

In [14]:
#Train data after the expansion
#
X_train.head(n=2)

Unnamed: 0,ID,join_date,sex,marital_status,birth_year,branch_code,occupation_code,occupation_category_code,P5DA,RIBP,...,FM3X,K6QO,QBOL,JWFN,JZ9D,J9JW,GHYX,ECY3,product_pred,ID2
0,4WKQSBB,1/2/2019,F,M,1987,1X1H,2A7I,T4MS,0,0,...,0,1,0,0,0,0,0,0,RVSZ,1
1,4WKQSBB,1/2/2019,F,M,1987,1X1H,2A7I,T4MS,0,0,...,0,0,0,0,0,0,0,0,K6QO,2


In [15]:
#Test data after the transformation
#
X_test.head(n=2)

Unnamed: 0,ID,join_date,sex,marital_status,birth_year,branch_code,occupation_code,occupation_category_code,P5DA,RIBP,...,BSTQ,FM3X,K6QO,QBOL,JWFN,JZ9D,J9JW,GHYX,ECY3,ID2
0,F86J5PC,1/12/2018,M,M,1984,94KC,DZRV,90QI,0,0,...,0,0,0,0,0,0,0,0,0,1
1,H6141K3,1/10/2019,M,M,1996,1X1H,J9SY,90QI,0,0,...,0,0,1,0,0,0,0,0,0,2


In [16]:
#Unmasked product flags for each expanded train row
#
df_train_true.head(n=2)

Unnamed: 0,P5DA,RIBP,8NN1,7POT,66FJ,GYSR,SOP4,RVSZ,PYUQ,LJR9,...,AHXO,BSTQ,FM3X,K6QO,QBOL,JWFN,JZ9D,J9JW,GHYX,ECY3
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0


# Reshaping data

In [17]:
#Collect the selected columns of train and test as (n_rows, 1) arrays, in a common order
#
append_features = [
    'P5DA', 'RIBP', '8NN1', '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ',
    'LJR9', 'N2MW', 'AHXO', 'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D',
    'J9JW', 'GHYX', 'ECY3', 'ID', 'ID2', 'join_date', 'sex', 'marital_status',
    'branch_code', 'occupation_code', 'occupation_category_code', 'birth_year'
]

features_train = [X_train[name].values.reshape(-1, 1) for name in append_features]
features_test = [X_test[name].values.reshape(-1, 1) for name in append_features]
columns = [np.array([name]) for name in append_features]

#Target: name of the masked product
y_train = X_train[['product_pred']]

In [18]:
#Stack the column arrays side by side and rebuild both frames with identical column order
features_train = np.hstack(features_train)
features_test = np.hstack(features_test)
columns = np.array(columns).ravel()

X_train = pd.DataFrame(features_train, columns=columns)
X_test = pd.DataFrame(features_test, columns=columns)

# Encoding

In [19]:
#Encode the target (name of the masked product) as integer class labels
#
target_names = y_train.iloc[:, 0]
le = LabelEncoder()
le.fit(target_names)

y_train = pd.DataFrame({'target': le.transform(target_names)})

# Add new features

In [20]:
#Derive features from the join date: days since 2010-01-01, day, month, year, weekday and
#day of year of joining; plus the client's age at joining.
#pd.Timestamp is used for the reference date because the pd.datetime alias was deprecated in
#pandas 1.0 and later removed.
#
reference_date = pd.Timestamp(2010, 1, 1)
for df in [X_train, X_test]:
    df['join_date'] = pd.to_datetime(df.join_date, format='%d/%m/%Y')

    df['from_begin'] = (df.join_date - reference_date).dt.days

    df['join_day'] = df['join_date'].dt.day
    df['join_month'] = df['join_date'].dt.month
    df['join_year'] = df['join_date'].dt.year
    df['dayofweek'] = df['join_date'].dt.weekday
    df['day_of_year'] = df['join_date'].dt.dayofyear

    df['age'] = (df['join_year'] - df['birth_year']).astype(float)

In [21]:
#Functions of adding statistic features
#
def create_relative_group_feature(df, base_col, group_col, op):
    """Add the ratio of ``base_col`` to its per-group aggregate as a new column.

    The aggregate ``op`` (e.g. 'mean', 'std') of ``base_col`` is computed within
    each ``group_col`` group and broadcast back to the rows. The result is stored
    in ``df`` in place under the name ``<base_col>_to_<op>_<group_col>``.
    Nothing is returned.
    """
    group_stat = df.groupby([group_col])[base_col].transform(op)
    ratio_name = '%s_to_%s_%s' % (base_col, op, group_col)
    df[ratio_name] = df[base_col] / group_stat


def create_relative_group_feature_list(df, base_cols, group_cols, ops):
    """Add a relative group feature for every (group, base column, aggregate) combination.

    Iterates group columns outermost, then base columns, then aggregates, and
    delegates each combination to ``create_relative_group_feature``, which
    writes the new column into ``df`` in place.
    """
    combos = [(base_col, group_col, op)
              for group_col in group_cols
              for base_col in base_cols
              for op in ops]
    for base_col, group_col, op in combos:
        create_relative_group_feature(df, base_col, group_col, op)

In [22]:
#Stack train on top of test so group statistics are computed over both.
#The original row labels are kept, so labels repeat across the two parts.
#
common = pd.concat((X_train, X_test), axis=0)

In [23]:
#Years from the earliest join year of each branch up to 2020
common['branch_since'] = 2020 - common.groupby('branch_code')['join_year'].transform('min')

#Mean client age within each branch and within each occupation
for mean_age_col, group_key in (('age_in_branch', 'branch_code'),
                                ('age_in_occupation', 'occupation_code')):
    common[mean_age_col] = common.groupby(group_key)['age'].transform('mean')

In [24]:
#Per-product aggregate features:
#  from_arise_product_<p>  days between the join date and the earliest join date of any holder of p
#  <p>_sum                 number of holders of p in the client's branch
#  <p>__age_mean           mean age of clients with the same flag value for p
#  <p>__sum_join_year      number of holders of p among clients who joined in the same year
#The aggregation is named by the string 'sum' rather than by passing the Python builtin,
#whose implicit mapping to the pandas implementation is deprecated.
for product in tqdm_notebook(names_products):
    first_join_with_product = common.loc[common[product] == 1, 'join_date'].min()
    common['from_arise_product_'+product] = (common['join_date'] - first_join_with_product).dt.days
    common[product+'_'+'sum'] = common.groupby('branch_code')[product].transform('sum')
    common[product+'_'+'_age_mean'] = common.groupby(product)['age'].transform('mean')
    common[product+'_'+'_sum_join_year'] = common.groupby('join_year')[product].transform('sum')

HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))




In [25]:
#Approximate days since the branch opened (earliest 'from_begin' in the branch) and since the
#first holder of each product joined that branch.
#`cols` here is the product-column index defined in the statistics cell.
#
for code in tqdm_notebook(common.branch_code.unique()):
    in_branch = common.branch_code == code
    branch_days = common.loc[in_branch, 'from_begin']

    common.loc[in_branch, 'from_arise_branch'] = branch_days - branch_days.min()

    for product in cols:
        first_holder_day = common.loc[in_branch & (common[product] == 1), 'from_begin'].min()
        common.loc[in_branch, 'from_arise'+'_'+product] = branch_days - first_holder_day

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [26]:
#For every product flag add its ratio to the group mean and group std over several groupings
#
relative_group_keys = [
    'sex', 'marital_status', 'branch_code', 'occupation_code',
    'occupation_category_code', 'join_year', 'age'
]
relative_ops = ['mean', 'std']

for c in names_products:
    common[c] = common[c].astype(int)
    create_relative_group_feature_list(common, [c], relative_group_keys, relative_ops)

In [27]:
#Split the concatenated frame back into train and test.
#The boundary is taken from the current train frame instead of a hard-coded row count, and the
#parts are copied so that later column assignments do not write into slices of `common`.
#
n_train_rows = len(X_train)
X_train = common.iloc[:n_train_rows].copy()
X_test = common.iloc[n_train_rows:].copy()

In [28]:
#Product column names: everything between the 8 client-info columns and the trailing 'sum'
#
cols = train.columns[slice(8, -1)]

In [29]:
#Score every product for every row from the co-occurrence counts in `values`.
#For each combination size k, a product's score is the total count of the size-k combinations
#(drawn from the row's held products plus the candidate) that contain the candidate,
#normalised over all products; the scores are averaged over the sizes with a non-zero total.
#The best product and its score go to 'max_product' / 'prob_max_product'.
#
#Differences from the per-row bookkeeping this replaces:
#  * the best product is recomputed for every row (np.argmax, first maximum wins), so a row
#    without any usable statistic can no longer inherit the previous row's result - it gets
#    the first product with a NaN score instead;
#  * the two result columns are assigned once per frame instead of via a boolean-mask
#    lookup per row;
#  * the score vector length follows len(cols).
#
prob_products_train = []
prob_products_test = []
for df, prob_rows in ((X_train, prob_products_train), (X_test, prob_products_test)):
    max_products = []
    max_probas = []

    for _, row in tqdm_notebook(df.iterrows(), total=len(df)):

        ones_cols = [c for c in cols if row[c] == 1]

        value_prob = np.zeros(len(cols))
        totals = 0  #number of combination sizes that contributed

        for k in range(1, len(ones_cols) + 2):
            values_prob = []
            for c in cols:
                temp = ones_cols + [c]
                probability = 0
                for prod in combinations(temp, k):
                    if c in prod:
                        probability += values.get(tuple(sorted(prod)), 0)
                values_prob.append(probability)

            total_sum = sum(values_prob)
            if total_sum == 0:
                continue
            totals += 1
            value_prob += [n / total_sum for n in values_prob]

        value_prob /= totals

        best = int(np.argmax(value_prob))
        max_products.append(cols[best])
        max_probas.append(value_prob[best])
        prob_rows.append(value_prob)

    df['max_product'] = max_products
    df['prob_max_product'] = max_probas

prob_products_train = pd.DataFrame(prob_products_train)
prob_products_train.columns = cols

prob_products_test = pd.DataFrame(prob_products_test)
prob_products_test.columns = cols

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [31]:
#Encode the most probable product with the same label encoder as the target.
#NOTE: not idempotent - the column holds integers afterwards.
#
for frame in (X_train, X_test):
    frame['max_product'] = le.transform(frame['max_product'])

In [32]:
#Attach the per-client product count.
#A left join on 'ID' keeps every row of X_train / X_test in its original order, which the
#positionally aligned y_train and df_train_true rely on; validate= raises instead of silently
#duplicating rows if a client ID is not unique in the lookup table.
#
X_train = X_train.merge(train[['ID', 'sum']], on='ID', how='left', validate='many_to_one')
X_test = X_test.merge(test[['ID', 'sum']], on='ID', how='left', validate='many_to_one')

In [33]:
#Add string copies of some numeric features so they can also be used as categorical features
#
categorical_sources = ('dayofweek', 'from_begin', 'day_of_year', 'max_product', 'sum')
for df in [X_train, X_test]:
    for source in categorical_sources:
        df[source + '_cat'] = df[source].astype(str)

# Model

In [34]:
#Categorical features of the multiclass model
#
cat_features = [
    'sex',
    'marital_status',
    'branch_code',
    'occupation_category_code',
    'occupation_code',
    'dayofweek_cat',
    'from_begin_cat',
    'max_product_cat',
    'sum_cat',
    'day_of_year_cat',
]

In [35]:
#Columns used for prediction: everything except the identifiers and the raw join date
#
cols = X_train.columns.drop(['ID', 'ID2', 'join_date'])

In [36]:
#Multiclass model: predict which product was masked.
#Trained on several group-aware shuffle splits (all rows of a client stay on the same side);
#the test-set probabilities of every split are collected in `probs`.
#
n_splits = 5  #single source of truth for the number of splits and the score average

model_cat = CatBoostClassifier(
    **{
        'depth': 5,
        'n_estimators': 15000,
        'learning_rate': 0.01,
        'random_state': 567,
        'task_type': 'GPU',
        'thread_count': 1,
        "verbose": 100,
        "use_best_model": True,
        'nan_mode': 'Max',
    })
scoring = 0

probs = []

group_kfold = GroupShuffleSplit(n_splits=n_splits, random_state=42)
for train_index, test_index in group_kfold.split(X_train, y_train,
                                                 np.array(X_train['ID'])):
    X_real_train, X_valid = X_train.iloc[train_index], X_train.iloc[test_index]
    y_real_train, y_valid = y_train.iloc[train_index], y_train.iloc[test_index]

    model_cat.fit(
        X_real_train[cols],
        y_real_train,
        cat_features=cat_features,
        eval_set=[(X_valid[cols], y_valid)],
        early_stopping_rounds=200,
    )
    #Best validation MultiClass loss of this split
    scoring += model_cat.get_best_score()['validation']['MultiClass']

    proba = model_cat.predict_proba(X_test[cols])
    probs.append(proba)
scoring /= n_splits
print('MEAN SCORE =', scoring)

0:	learn: 2.9437560	test: 2.9433273	best: 2.9433273 (0)	total: 101ms	remaining: 25m 21s
100:	learn: 0.9597030	test: 0.9549032	best: 0.9549032 (100)	total: 6.92s	remaining: 17m
200:	learn: 0.6721500	test: 0.6705317	best: 0.6705317 (200)	total: 13.7s	remaining: 16m 48s
300:	learn: 0.5668173	test: 0.5681142	best: 0.5681142 (300)	total: 20.3s	remaining: 16m 33s
400:	learn: 0.5146089	test: 0.5193380	best: 0.5193380 (400)	total: 26.8s	remaining: 16m 16s
500:	learn: 0.4831062	test: 0.4908390	best: 0.4908390 (500)	total: 33.2s	remaining: 16m
600:	learn: 0.4610289	test: 0.4717783	best: 0.4717783 (600)	total: 39.6s	remaining: 15m 48s
700:	learn: 0.4446615	test: 0.4580457	best: 0.4580457 (700)	total: 45.9s	remaining: 15m 36s
800:	learn: 0.4320560	test: 0.4476789	best: 0.4476789 (800)	total: 52.2s	remaining: 15m 25s
900:	learn: 0.4219822	test: 0.4398666	best: 0.4398666 (900)	total: 58.5s	remaining: 15m 15s
1000:	learn: 0.4138414	test: 0.4335849	best: 0.4335849 (1000)	total: 1m 4s	remaining: 15m 3s

8800:	learn: 0.2922049	test: 0.3797061	best: 0.3797053 (8798)	total: 8m 23s	remaining: 5m 54s
8900:	learn: 0.2915393	test: 0.3796600	best: 0.3796581 (8899)	total: 8m 29s	remaining: 5m 49s
9000:	learn: 0.2908600	test: 0.3796355	best: 0.3796319 (8997)	total: 8m 35s	remaining: 5m 43s
9100:	learn: 0.2902080	test: 0.3795464	best: 0.3795464 (9100)	total: 8m 40s	remaining: 5m 37s
9200:	learn: 0.2895489	test: 0.3795149	best: 0.3795097 (9185)	total: 8m 46s	remaining: 5m 31s
9300:	learn: 0.2889075	test: 0.3794659	best: 0.3794655 (9291)	total: 8m 51s	remaining: 5m 25s
9400:	learn: 0.2881460	test: 0.3793905	best: 0.3793893 (9398)	total: 8m 57s	remaining: 5m 20s
9500:	learn: 0.2875113	test: 0.3793492	best: 0.3793350 (9472)	total: 9m 3s	remaining: 5m 14s
9600:	learn: 0.2868699	test: 0.3792765	best: 0.3792759 (9598)	total: 9m 8s	remaining: 5m 8s
9700:	learn: 0.2862417	test: 0.3792511	best: 0.3792511 (9700)	total: 9m 14s	remaining: 5m 2s
9800:	learn: 0.2856132	test: 0.3792207	best: 0.3792156 (9794)	to

6300:	learn: 0.3087819	test: 0.3886645	best: 0.3886645 (6300)	total: 6m 7s	remaining: 8m 27s
6400:	learn: 0.3078655	test: 0.3884989	best: 0.3884974 (6399)	total: 6m 13s	remaining: 8m 21s
6500:	learn: 0.3069837	test: 0.3882805	best: 0.3882805 (6500)	total: 6m 18s	remaining: 8m 15s
6600:	learn: 0.3061001	test: 0.3880972	best: 0.3880950 (6599)	total: 6m 24s	remaining: 8m 9s
6700:	learn: 0.3052525	test: 0.3878980	best: 0.3878980 (6700)	total: 6m 30s	remaining: 8m 3s
6800:	learn: 0.3043758	test: 0.3877342	best: 0.3877342 (6800)	total: 6m 35s	remaining: 7m 57s
6900:	learn: 0.3035502	test: 0.3876189	best: 0.3876164 (6896)	total: 6m 41s	remaining: 7m 51s
7000:	learn: 0.3026816	test: 0.3874617	best: 0.3874609 (6999)	total: 6m 47s	remaining: 7m 45s
7100:	learn: 0.3018989	test: 0.3873463	best: 0.3873463 (7100)	total: 6m 52s	remaining: 7m 39s
7200:	learn: 0.3010277	test: 0.3871984	best: 0.3871984 (7200)	total: 6m 58s	remaining: 7m 33s
7300:	learn: 0.3001969	test: 0.3870476	best: 0.3870476 (7300)	t

2000:	learn: 0.3696073	test: 0.4087236	best: 0.4087236 (2000)	total: 2m 7s	remaining: 13m 47s
2100:	learn: 0.3669234	test: 0.4071425	best: 0.4071425 (2100)	total: 2m 13s	remaining: 13m 37s
2200:	learn: 0.3644436	test: 0.4058501	best: 0.4058501 (2200)	total: 2m 19s	remaining: 13m 28s
2300:	learn: 0.3621874	test: 0.4045828	best: 0.4045828 (2300)	total: 2m 24s	remaining: 13m 19s
2400:	learn: 0.3601496	test: 0.4035248	best: 0.4035248 (2400)	total: 2m 30s	remaining: 13m 9s
2500:	learn: 0.3583394	test: 0.4025669	best: 0.4025669 (2500)	total: 2m 36s	remaining: 12m 59s
2600:	learn: 0.3564402	test: 0.4016453	best: 0.4016453 (2600)	total: 2m 41s	remaining: 12m 50s
2700:	learn: 0.3544454	test: 0.4007456	best: 0.4007456 (2700)	total: 2m 47s	remaining: 12m 42s
2800:	learn: 0.3527795	test: 0.3999036	best: 0.3999036 (2800)	total: 2m 53s	remaining: 12m 33s
2900:	learn: 0.3510790	test: 0.3990602	best: 0.3990602 (2900)	total: 2m 59s	remaining: 12m 26s
3000:	learn: 0.3495831	test: 0.3983559	best: 0.39835

10800:	learn: 0.2828684	test: 0.3817442	best: 0.3817388 (10791)	total: 10m 31s	remaining: 4m 5s
10900:	learn: 0.2821766	test: 0.3817218	best: 0.3817189 (10895)	total: 10m 37s	remaining: 3m 59s
11000:	learn: 0.2814986	test: 0.3816684	best: 0.3816684 (11000)	total: 10m 43s	remaining: 3m 53s
11100:	learn: 0.2809060	test: 0.3816018	best: 0.3816018 (11100)	total: 10m 48s	remaining: 3m 47s
11200:	learn: 0.2802576	test: 0.3815544	best: 0.3815543 (11199)	total: 10m 54s	remaining: 3m 42s
11300:	learn: 0.2796024	test: 0.3815439	best: 0.3815402 (11296)	total: 11m	remaining: 3m 36s
11400:	learn: 0.2789336	test: 0.3815212	best: 0.3815176 (11396)	total: 11m 6s	remaining: 3m 30s
11500:	learn: 0.2784143	test: 0.3815057	best: 0.3814982 (11495)	total: 11m 11s	remaining: 3m 24s
11600:	learn: 0.2777680	test: 0.3815024	best: 0.3814971 (11579)	total: 11m 17s	remaining: 3m 18s
11700:	learn: 0.2771516	test: 0.3814916	best: 0.3814852 (11653)	total: 11m 23s	remaining: 3m 12s
11800:	learn: 0.2765452	test: 0.3814

6800:	learn: 0.3045921	test: 0.3860202	best: 0.3860202 (6800)	total: 6m 43s	remaining: 8m 6s
6900:	learn: 0.3037622	test: 0.3858890	best: 0.3858862 (6897)	total: 6m 49s	remaining: 8m
7000:	learn: 0.3029300	test: 0.3857473	best: 0.3857451 (6997)	total: 6m 55s	remaining: 7m 54s
7100:	learn: 0.3021470	test: 0.3856191	best: 0.3856179 (7096)	total: 7m 1s	remaining: 7m 49s
7200:	learn: 0.3013023	test: 0.3854757	best: 0.3854757 (7200)	total: 7m 7s	remaining: 7m 43s
7300:	learn: 0.3005208	test: 0.3853633	best: 0.3853633 (7300)	total: 7m 13s	remaining: 7m 37s
7400:	learn: 0.2996926	test: 0.3852288	best: 0.3852282 (7399)	total: 7m 19s	remaining: 7m 30s
7500:	learn: 0.2989055	test: 0.3851076	best: 0.3851048 (7499)	total: 7m 24s	remaining: 7m 24s
7600:	learn: 0.2980739	test: 0.3850186	best: 0.3850186 (7600)	total: 7m 30s	remaining: 7m 18s
7700:	learn: 0.2973352	test: 0.3849244	best: 0.3849244 (7700)	total: 7m 36s	remaining: 7m 12s
7800:	learn: 0.2965853	test: 0.3847819	best: 0.3847819 (7800)	total

4700:	learn: 0.3302436	test: 0.3844108	best: 0.3844108 (4700)	total: 4m 49s	remaining: 10m 34s
4800:	learn: 0.3293095	test: 0.3841218	best: 0.3841218 (4800)	total: 4m 55s	remaining: 10m 27s
4900:	learn: 0.3282424	test: 0.3838137	best: 0.3838137 (4900)	total: 5m	remaining: 10m 19s
5000:	learn: 0.3272935	test: 0.3835739	best: 0.3835716 (4999)	total: 5m 6s	remaining: 10m 12s
5100:	learn: 0.3264623	test: 0.3833460	best: 0.3833460 (5100)	total: 5m 12s	remaining: 10m 5s
5200:	learn: 0.3255072	test: 0.3830828	best: 0.3830817 (5199)	total: 5m 17s	remaining: 9m 58s
5300:	learn: 0.3245742	test: 0.3827911	best: 0.3827911 (5300)	total: 5m 23s	remaining: 9m 51s
5400:	learn: 0.3235916	test: 0.3825725	best: 0.3825709 (5398)	total: 5m 28s	remaining: 9m 44s
5500:	learn: 0.3227170	test: 0.3823379	best: 0.3823379 (5500)	total: 5m 34s	remaining: 9m 37s
5600:	learn: 0.3219357	test: 0.3821044	best: 0.3821044 (5600)	total: 5m 39s	remaining: 9m 30s
5700:	learn: 0.3210432	test: 0.3819009	best: 0.3818998 (5699)

In [37]:
#Average the test-set probabilities over all splits.
#The shape and the number of splits are taken from `probs` instead of being hard-coded.
#
new_a = np.mean(probs, axis=0)

In [38]:
#Wrap the averaged probabilities in a frame whose columns are the decoded product names
#
class_ids = np.arange(new_a.shape[1])
y_test = pd.DataFrame(new_a, columns=le.inverse_transform(class_ids))

In [39]:
#Products a test client already holds get probability 1.0; the predictions for the remaining
#products of that client are rescaled so that they sum to 1.
#Vectorised over the whole frame; rows of X_test and y_test are matched by position.
#
owned = X_test[names_products].to_numpy() == 1
scores = y_test[names_products].to_numpy(dtype=float)

#Per-client total of the predictions for products that are not already held
unowned_total = np.where(owned, 0.0, scores).sum(axis=1, keepdims=True)

y_test[names_products] = np.where(owned, 1.0, scores / unowned_total)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [40]:
#Reshape the predictions to the submission layout: one "ID X PCODE" row per client and product.
#Rows are ordered client by client, products in y_test column order.
#Built in one pass, without per-cell lookups and without shadowing the builtin `id`.
#
pcodes = list(y_test.columns)
df_answer = pd.DataFrame({
    'ID X PCODE': [client_id + ' X ' + pcode
                   for client_id in X_test['ID'] for pcode in pcodes],
    'Label': y_test.to_numpy().ravel(),
})
df_answer.head()

Unnamed: 0,ID X PCODE,Label
0,F86J5PC X 66FJ,2.6e-05
1,F86J5PC X 7POT,2.2e-05
2,F86J5PC X 8NN1,3e-06
3,F86J5PC X AHXO,3.7e-05
4,F86J5PC X BSTQ,6e-06


In [42]:
#CatBoost parameters of the second (per-product, binary) model
#
best_params = dict(
    depth=4,
    grow_policy='Depthwise',
    iterations=10000,
    learning_rate=0.03,
    random_state=567,
    reg_lambda=6.0,
    task_type='GPU',
    thread_count=1,
    verbose=500,
    use_best_model=True,
)

In [43]:
#Categorical features of the per-product models (no 'max_product_cat')
#
cat_features = [
    'sex',
    'marital_status',
    'branch_code',
    'occupation_category_code',
    'occupation_code',
    'dayofweek_cat',
    'from_begin_cat',
    'sum_cat',
    'day_of_year_cat',
]

In [44]:
#One binary model per product: predict whether the client holds the product, using the
#remaining features plus the co-occurrence score of that product ('probobility').
#Each product is trained on group-aware folds (all rows of a client stay in one fold) and the
#test predictions of the folds are averaged.


#Dataframe of predictions (explicit copy: the per-product columns are overwritten below)
#
df_predictions = X_test[names_products].copy()

#Model
#
model_cat = CatBoostClassifier(**best_params)

#Number of folds; also used for averaging predictions and scores
#
n_splits = 5

#Accumulates the best validation Logloss of every (product, fold) model
#
scoring = 0

#Features shared by all product models and the client IDs used as fold groups
#
base_cols = list(
    X_train.drop(columns=[
        'ID', 'ID2', 'join_date', 'max_product', 'max_product_cat',
        'prob_max_product'
    ]).columns)
groups = np.array(X_train['ID'])

#Start model
#
for product in tqdm_notebook(names_products):

    print('=' * 25)
    print(product)

    #Columns used to train and predict: the product being predicted is not a feature
    cols = [c for c in base_cols if c != product]

    #Test features are identical for every fold of this product
    testing = X_test[cols].copy()
    testing['probobility'] = prob_products_test[product]

    predictions_cat = np.zeros(len(testing))
    group_kfold = GroupKFold(n_splits=n_splits)

    #Train on each fold and blend the test predictions
    for i, (train_index, test_index) in enumerate(
            group_kfold.split(X_train, df_train_true[product], groups), start=1):
        print('Fold', i)

        X_real_train = X_train.iloc[train_index][cols].copy()
        X_valid = X_train.iloc[test_index][cols].copy()
        y_real_train = df_train_true[product].iloc[train_index]
        y_valid = df_train_true[product].iloc[test_index]

        X_real_train['probobility'] = prob_products_train[product].iloc[train_index]
        X_valid['probobility'] = prob_products_train[product].iloc[test_index]

        #Training
        model_cat.fit(
            X_real_train,
            y_real_train,
            cat_features=cat_features,
            eval_set=[(X_valid, y_valid)],
            early_stopping_rounds=600,
        )

        #Add score
        scoring += model_cat.get_best_score()['validation']['Logloss']

        #Probability of the positive class
        predictions_cat += model_cat.predict_proba(testing)[:, 1]

    #Mean predictions
    predictions_cat /= n_splits

    #Add
    df_predictions[product] = predictions_cat

#Average over every (product, fold) model
scoring /= n_splits * len(names_products)
print('MEAN SCORE =', scoring)

HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))

P5DA
Fold 1
0:	learn: 0.5722136	test: 0.5722304	best: 0.5722304 (0)	total: 38.6ms	remaining: 6m 26s
500:	learn: 0.0013850	test: 0.0014579	best: 0.0014520 (483)	total: 13.3s	remaining: 4m 12s
1000:	learn: 0.0006703	test: 0.0015469	best: 0.0014520 (483)	total: 26.6s	remaining: 3m 59s
bestTest = 0.001452002421
bestIteration = 483
Shrink model to first 484 iterations.
Fold 2
0:	learn: 0.5771102	test: 0.5770018	best: 0.5770018 (0)	total: 25.6ms	remaining: 4m 15s
500:	learn: 0.0014357	test: 0.0010751	best: 0.0010751 (500)	total: 13s	remaining: 4m 7s
1000:	learn: 0.0007628	test: 0.0011799	best: 0.0010719 (507)	total: 26.8s	remaining: 4m 1s
bestTest = 0.001071850734
bestIteration = 507
Shrink model to first 508 iterations.
Fold 3
0:	learn: 0.5708681	test: 0.5708602	best: 0.5708602 (0)	total: 28.6ms	remaining: 4m 46s
500:	learn: 0.0014165	test: 0.0026551	best: 0.0026551 (500)	total: 13.3s	remaining: 4m 12s
1000:	learn: 0.0008442	test: 0.0026891	best: 0.0025959 (704)	total: 26.8s	remaining: 4m 1

1000:	learn: 0.0098165	test: 0.0156108	best: 0.0154717 (755)	total: 27.4s	remaining: 4m 6s
bestTest = 0.01547174868
bestIteration = 755
Shrink model to first 756 iterations.
Fold 5
0:	learn: 0.6048701	test: 0.6050826	best: 0.6050826 (0)	total: 25.5ms	remaining: 4m 15s
500:	learn: 0.0128021	test: 0.0155536	best: 0.0155241 (472)	total: 13.4s	remaining: 4m 14s
1000:	learn: 0.0105019	test: 0.0153919	best: 0.0153800 (919)	total: 27.1s	remaining: 4m 3s
1500:	learn: 0.0088767	test: 0.0154757	best: 0.0153695 (1033)	total: 40.9s	remaining: 3m 51s
bestTest = 0.01536947766
bestIteration = 1033
Shrink model to first 1034 iterations.
66FJ
Fold 1
0:	learn: 0.5930066	test: 0.5923816	best: 0.5923816 (0)	total: 26.1ms	remaining: 4m 20s
500:	learn: 0.0145689	test: 0.0160006	best: 0.0159842 (375)	total: 13.3s	remaining: 4m 13s
1000:	learn: 0.0124117	test: 0.0160878	best: 0.0159754 (673)	total: 27.5s	remaining: 4m 7s
bestTest = 0.01597537491
bestIteration = 673
Shrink model to first 674 iterations.
Fold 2

Fold 1
0:	learn: 0.5886273	test: 0.5879772	best: 0.5879772 (0)	total: 27.8ms	remaining: 4m 38s
500:	learn: 0.0676260	test: 0.0745085	best: 0.0745085 (500)	total: 13.7s	remaining: 4m 20s
1000:	learn: 0.0610239	test: 0.0722640	best: 0.0722640 (1000)	total: 27.2s	remaining: 4m 4s
1500:	learn: 0.0564226	test: 0.0715537	best: 0.0715389 (1478)	total: 41.1s	remaining: 3m 52s
2000:	learn: 0.0526092	test: 0.0710967	best: 0.0710886 (1998)	total: 54.8s	remaining: 3m 38s
2500:	learn: 0.0492775	test: 0.0711323	best: 0.0710524 (2087)	total: 1m 8s	remaining: 3m 25s
bestTest = 0.07105237764
bestIteration = 2087
Shrink model to first 2088 iterations.
Fold 2
0:	learn: 0.5854821	test: 0.5847257	best: 0.5847257 (0)	total: 29.8ms	remaining: 4m 57s
500:	learn: 0.0676543	test: 0.0728315	best: 0.0728315 (500)	total: 13.8s	remaining: 4m 21s
1000:	learn: 0.0611632	test: 0.0709924	best: 0.0709861 (996)	total: 27.5s	remaining: 4m 6s
1500:	learn: 0.0567080	test: 0.0708083	best: 0.0707468 (1323)	total: 41.1s	remain

bestTest = 0.01848797701
bestIteration = 1325
Shrink model to first 1326 iterations.
Fold 4
0:	learn: 0.6045258	test: 0.6045695	best: 0.6045695 (0)	total: 25.9ms	remaining: 4m 18s
500:	learn: 0.0146340	test: 0.0166062	best: 0.0166062 (498)	total: 13.4s	remaining: 4m 14s
1000:	learn: 0.0125162	test: 0.0165436	best: 0.0165177 (775)	total: 27s	remaining: 4m 3s
1500:	learn: 0.0109100	test: 0.0166484	best: 0.0165072 (1113)	total: 40.9s	remaining: 3m 51s
bestTest = 0.01650722499
bestIteration = 1113
Shrink model to first 1114 iterations.
Fold 5
0:	learn: 0.6013070	test: 0.6014913	best: 0.6014913 (0)	total: 25.8ms	remaining: 4m 18s
500:	learn: 0.0135345	test: 0.0205362	best: 0.0205350 (492)	total: 13.7s	remaining: 4m 19s
1000:	learn: 0.0114058	test: 0.0199697	best: 0.0199638 (980)	total: 27.3s	remaining: 4m 5s
1500:	learn: 0.0097003	test: 0.0197829	best: 0.0197606 (1470)	total: 41.5s	remaining: 3m 54s
2000:	learn: 0.0082301	test: 0.0198353	best: 0.0197606 (1470)	total: 55.2s	remaining: 3m 40s

1000:	learn: 0.0754655	test: 0.0877124	best: 0.0877124 (1000)	total: 27.9s	remaining: 4m 10s
1500:	learn: 0.0701216	test: 0.0870917	best: 0.0870839 (1499)	total: 41.8s	remaining: 3m 56s
2000:	learn: 0.0656956	test: 0.0871215	best: 0.0869719 (1697)	total: 55.8s	remaining: 3m 43s
2500:	learn: 0.0617892	test: 0.0870155	best: 0.0869177 (2337)	total: 1m 9s	remaining: 3m 29s
bestTest = 0.08691774209
bestIteration = 2337
Shrink model to first 2338 iterations.
Fold 2
0:	learn: 0.6073872	test: 0.6077749	best: 0.6077749 (0)	total: 27.4ms	remaining: 4m 34s
500:	learn: 0.0818521	test: 0.0896248	best: 0.0896248 (500)	total: 13.7s	remaining: 4m 20s
1000:	learn: 0.0752353	test: 0.0880867	best: 0.0880867 (1000)	total: 27.7s	remaining: 4m 8s
1500:	learn: 0.0697129	test: 0.0876224	best: 0.0875369 (1421)	total: 41.6s	remaining: 3m 55s
2000:	learn: 0.0652426	test: 0.0873624	best: 0.0873460 (1962)	total: 55.5s	remaining: 3m 41s
2500:	learn: 0.0612752	test: 0.0873900	best: 0.0873246 (2018)	total: 1m 9s	rema

1500:	learn: 0.0081317	test: 0.0128062	best: 0.0126816 (1148)	total: 42.6s	remaining: 4m 1s
bestTest = 0.01268163926
bestIteration = 1148
Shrink model to first 1149 iterations.
JZ9D
Fold 1
0:	learn: 0.5692732	test: 0.5691917	best: 0.5691917 (0)	total: 28.3ms	remaining: 4m 43s
500:	learn: 0.0001164	test: 0.0005083	best: 0.0005061 (488)	total: 14.2s	remaining: 4m 29s
1000:	learn: 0.0000498	test: 0.0004782	best: 0.0004778 (973)	total: 28.3s	remaining: 4m 14s
1500:	learn: 0.0000324	test: 0.0004690	best: 0.0004690 (1500)	total: 42.3s	remaining: 3m 59s
2000:	learn: 0.0000250	test: 0.0004666	best: 0.0004653 (1860)	total: 56s	remaining: 3m 43s
bestTest = 0.0004653435141
bestIteration = 1860
Shrink model to first 1861 iterations.
Fold 2
0:	learn: 0.5692355	test: 0.5692104	best: 0.5692104 (0)	total: 25.1ms	remaining: 4m 10s
500:	learn: 0.0001682	test: 0.0009508	best: 0.0008425 (222)	total: 13.7s	remaining: 4m 18s
bestTest = 0.0008424842277
bestIteration = 222
Shrink model to first 223 iterations

1000:	learn: 0.0339772	test: 0.0439452	best: 0.0439197 (892)	total: 29s	remaining: 4m 20s
1500:	learn: 0.0308480	test: 0.0438608	best: 0.0437904 (1109)	total: 43.2s	remaining: 4m 4s
bestTest = 0.04379035835
bestIteration = 1109
Shrink model to first 1110 iterations.

MEAN SCORE = 0.030684226951777867


In [45]:
#Predicted probabilities per product (first rows)
df_predictions.head(n=5)

Unnamed: 0,P5DA,RIBP,8NN1,7POT,66FJ,GYSR,SOP4,RVSZ,PYUQ,LJR9,...,AHXO,BSTQ,FM3X,K6QO,QBOL,JWFN,JZ9D,J9JW,GHYX,ECY3
0,0.000105,0.000716,0.00019,0.000462,0.000363,2.3e-05,0.000607,0.999824,0.002176,0.000473,...,0.000542,0.000209,4.6e-05,0.986661,0.018351,0.0002,2.7e-05,5.1e-05,0.005993,0.003172
1,0.000105,0.003102,0.000195,0.000461,0.000583,2.1e-05,0.000864,0.999886,0.008545,0.000473,...,0.000542,0.000209,5.1e-05,0.999057,0.335262,0.052571,2.7e-05,5.2e-05,0.272945,0.14753
2,0.000105,0.002625,0.000196,0.000514,0.000675,0.000125,0.001011,0.999916,0.010279,0.000461,...,0.001361,0.000209,4.6e-05,0.999824,0.919369,0.007947,2.7e-05,0.000119,0.010806,0.049664
3,0.000105,0.0006,0.00019,0.000435,0.000372,2e-05,0.000614,0.999858,0.001311,0.000473,...,0.000548,0.000209,4.6e-05,0.992774,0.007501,0.000121,2.7e-05,5.1e-05,0.003986,0.003317
4,0.000105,0.000572,0.00019,0.000435,0.000362,3.1e-05,0.000474,0.019888,0.007452,0.000473,...,0.001542,0.000601,4.6e-05,0.000891,0.01051,7.1e-05,0.999577,0.997908,0.003665,0.001229


In [46]:
#Reshape the per-product predictions to the submission layout and set the pairs that are
#already known from the test data (true_values) to 1.0.
#The override is a single vectorised .loc assignment: element-wise chained assignment
#(df['Label'].iloc[i] = ...) does not write back under pandas copy-on-write and scanned the
#true_values list once per row.
#
df_predicts = df_predictions.copy()
df_predicts['ID'] = X_test['ID'].drop_duplicates()
products = df_predicts[names_products]
df_predicts = df_predicts.melt(id_vars='ID',
                               value_vars=list(products.columns),
                               var_name="PCODE",
                               value_name="Label")
df_predicts['ID X PCODE'] = df_predicts['ID'] + ' X ' + df_predicts['PCODE']
df_predicts = df_predicts[['ID X PCODE', 'Label']].copy()

is_known = df_predicts['ID X PCODE'].isin(true_values)
df_predicts.loc[is_known, 'Label'] = 1.0
df_predicts.head()

HBox(children=(FloatProgress(value=0.0, max=210000.0), HTML(value='')))




Unnamed: 0,ID X PCODE,Label
0,F86J5PC X P5DA,0.000105
1,H6141K3 X P5DA,0.000105
2,RBAYUXZ X P5DA,0.000105
3,KCBILBQ X P5DA,0.000105
4,LSEC1ZJ X P5DA,0.000105


In [47]:
#Calibrating predictions
#
def rep(x):
    """Zero out a value below 0.0001 in the second position of a row.

    ``x`` is a row as passed by ``DataFrame.apply(..., axis=1)`` (a Series) or any
    other mutable sequence. The element at position 1 is replaced by 0 when it is
    smaller than 0.0001; ``x`` is modified in place and returned.

    A Series is addressed through ``.iloc`` so that position 1 is always meant,
    regardless of the row's labels.
    """
    cells = x.iloc if isinstance(x, pd.Series) else x
    if cells[1] < 0.0001:
        cells[1] = 0
    return x
#Set labels below 0.0001 to 0 in both prediction tables.
#'Label' is the second column of both frames; the column-wise mask replaces a row-by-row apply.
df_predicts_new = df_predicts.assign(
    Label=df_predicts['Label'].mask(df_predicts['Label'] < 0.0001, 0))
df_answer_new = df_answer.assign(
    Label=df_answer['Label'].mask(df_answer['Label'] < 0.0001, 0))

In [48]:
#Ensembling: average the labels of the per-product models and the multiclass model,
#matched on 'ID X PCODE'
#
multiclass_labels = df_answer_new.rename(columns={'Label': 'lbl'})
df = df_predicts_new.merge(multiclass_labels, on='ID X PCODE')
df['Label'] = df['Label'].add(df['lbl']).div(2)
df = df.drop(columns=['lbl'])
df.head()

Unnamed: 0,ID X PCODE,Label
0,F86J5PC X P5DA,5.3e-05
1,H6141K3 X P5DA,0.000125
2,RBAYUXZ X P5DA,5.3e-05
3,KCBILBQ X P5DA,5.3e-05
4,LSEC1ZJ X P5DA,5.3e-05


In [49]:
#Calibrating again
#
def rep(x):
    """Zero out a value below 0.0001 in the second position of a row.

    Same helper as in the first calibration cell, redefined so this cell runs on
    its own. ``x`` is a row as passed by ``DataFrame.apply(..., axis=1)`` (a Series)
    or any other mutable sequence; it is modified in place and returned.

    A Series is addressed through ``.iloc`` so that position 1 is always meant,
    regardless of the row's labels.
    """
    cells = x.iloc if isinstance(x, pd.Series) else x
    if cells[1] < 0.0001:
        cells[1] = 0
    return x
#Set averaged labels below 0.0001 to 0 ('Label' is the second column of df);
#the column-wise mask replaces a row-by-row apply.
df = df.assign(Label=df['Label'].mask(df['Label'] < 0.0001, 0))

In [50]:
#Write the blended predictions as the first submission file
#
df = df.reset_index(drop=True)
df.to_csv('submis_1.csv', index=False)

Open Zimnat-lgb_best_score.ipynb