# Download libraries and data

In [None]:
#Download CatBoost
#
!pip install catboost==0.23.2

In [1]:
#Import libraries
#
import pandas as pd, os, gc
import numpy as np
import math
import copy
from itertools import combinations

import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler
from sklearn.metrics import roc_curve, auc, log_loss

from tqdm import tqdm, tqdm_notebook

from sklearn.model_selection import GroupShuffleSplit, StratifiedKFold, train_test_split, GroupKFold
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [3]:
#Download data
#
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')
sub = pd.read_csv('SampleSubmission.csv')

In [4]:
train.head()

Unnamed: 0,ID,join_date,sex,marital_status,birth_year,branch_code,occupation_code,occupation_category_code,P5DA,RIBP,...,AHXO,BSTQ,FM3X,K6QO,QBOL,JWFN,JZ9D,J9JW,GHYX,ECY3
0,4WKQSBB,1/2/2019,F,M,1987,1X1H,2A7I,T4MS,0,0,...,0,0,0,1,0,0,0,0,0,0
1,CP5S02H,1/6/2019,F,M,1981,UAOD,2A7I,T4MS,0,0,...,0,0,0,1,0,0,0,0,0,0
2,2YKDILJ,1/6/2013,M,U,1991,748L,QZYX,90QI,0,0,...,0,0,0,0,0,0,0,0,0,1
3,2S9E81J,1/8/2019,M,M,1990,1X1H,BP09,56SI,0,0,...,0,0,0,1,0,0,0,0,0,0
4,BHDYVFT,1/8/2019,M,M,1990,748L,NO3L,T4MS,0,0,...,0,0,0,0,0,0,1,1,0,0


In [5]:
test.head()

Unnamed: 0,ID,join_date,sex,marital_status,birth_year,branch_code,occupation_code,occupation_category_code,P5DA,RIBP,...,AHXO,BSTQ,FM3X,K6QO,QBOL,JWFN,JZ9D,J9JW,GHYX,ECY3
0,F86J5PC,1/12/2018,M,M,1984,94KC,DZRV,90QI,0,0,...,0,0,0,0,0,0,0,0,0,0
1,H6141K3,1/10/2019,M,M,1996,1X1H,J9SY,90QI,0,0,...,0,0,0,1,0,0,0,0,0,0
2,RBAYUXZ,1/1/2020,F,W,1968,UAOD,2A7I,T4MS,0,0,...,0,0,0,1,0,0,0,0,0,0
3,KCBILBQ,1/2/2019,M,M,1989,94KC,2A7I,T4MS,0,0,...,0,0,0,0,0,0,0,0,0,0
4,LSEC1ZJ,1/2/2020,F,M,1982,UAOD,0KID,T4MS,0,0,...,0,0,0,0,0,0,1,0,0,0


# Data preparing

In [6]:
replace_train=list(set(train['occupation_code'].unique().tolist())-set(test['occupation_code']))
replace_test=list(set(test['occupation_code'].unique().tolist())-set(train['occupation_code']))

train['occupation_code']=train['occupation_code'].replace(replace_train,np.nan)
test['occupation_code']=test['occupation_code'].replace(replace_test,np.nan)
train['occupation_code'].fillna(train['occupation_category_code'],inplace=True)
test['occupation_code'].fillna(test['occupation_category_code'],inplace=True)

In [7]:
#Adding amount of purchased products for each client(for test without 1 missing)
#
train['sum'] = train.iloc[:, 8:].T.sum()

test['sum'] = test.iloc[:, 8:].T.sum()+1

In [8]:
train.loc[train.marital_status == 'f', 'marital_status'] = 'F'

In [9]:
#Renaming features to prevent any repeating
#
train['sex'] += '_sex'
train['marital_status'] += '_marital_status'
train['branch_code'] += '_branch_code'
train['occupation_code'] += '_occupation_code'
train['occupation_category_code'] += '_occupation_category_code'
test['sex'] += '_sex'
test['marital_status'] += '_marital_status'
test['branch_code'] += '_branch_code'
test['occupation_code'] += '_occupation_code'
test['occupation_category_code'] += '_occupation_category_code'

In [10]:
names_products = [
    'P5DA', 'RIBP', '8NN1', '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ',
    'LJR9', 'N2MW', 'AHXO', 'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D',
    'J9JW', 'GHYX', 'ECY3'
]

In [16]:
#Make spliting train clients info. Trying to reproduce the situation with test
#
X_train = []
X_train_columns = train.columns[:-1]
df_train_true = []
client_index = 0

for line in tqdm_notebook(train.values):

    info = line[:8]
    info_products = line[8:-1]
    indexes = [k for k, i in enumerate(info_products) if i == 1]

    for i in indexes:

        client_index += 1

        for k in range(len(info_products)):

            if k == i:

                info_products_transformed = list(copy.copy(info_products))
                df_train_true.append(info_products)
                info_products_transformed[i] = 0

                X_train.append(
                    list(info) + info_products_transformed +
                    [X_train_columns[8 + k]] + [client_index])

X_train = pd.DataFrame(X_train)
df_train_true = pd.DataFrame(df_train_true)
df_train_true.columns = [
    'P5DA', 'RIBP', '8NN1', '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ',
    'LJR9', 'N2MW', 'AHXO', 'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D',
    'J9JW', 'GHYX', 'ECY3'
]
X_train.columns = [
    'ID', 'join_date', 'sex', 'marital_status', 'birth_year', 'branch_code',
    'occupation_code', 'occupation_category_code', 'P5DA', 'RIBP', '8NN1',
    '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ', 'LJR9', 'N2MW', 'AHXO',
    'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D', 'J9JW', 'GHYX', 'ECY3',
    'product_pred', 'ID2'
]

HBox(children=(FloatProgress(value=0.0, max=29132.0), HTML(value='')))




In [17]:
#Make info about true values in data of predictions
#
X_test = []
true_values = []
client_index = 0
for line in tqdm_notebook(test.values):

    client_index += 1

    info = line[:8]
    info_products = line[8:-1]
    indexes = [k for k, i in enumerate(info_products) if i == 1]

    X_test.append(list(info) + list(info_products) + [client_index])

    for true in test.columns[8:][indexes]:
        true_values.append(line[0] + ' X ' + true)

X_test = pd.DataFrame(X_test)
X_test.columns = [
    'ID', 'join_date', 'sex', 'marital_status', 'birth_year', 'branch_code',
    'occupation_code', 'occupation_category_code', 'P5DA', 'RIBP', '8NN1',
    '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ', 'LJR9', 'N2MW', 'AHXO',
    'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D', 'J9JW', 'GHYX', 'ECY3',
    'ID2'
]

HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




In [18]:
#Checking shapes
#
train.shape, X_train.shape

((29132, 30), (66353, 31))

In [19]:
#Look of train data after alters
#
X_train.head(2)

Unnamed: 0,ID,join_date,sex,marital_status,birth_year,branch_code,occupation_code,occupation_category_code,P5DA,RIBP,...,FM3X,K6QO,QBOL,JWFN,JZ9D,J9JW,GHYX,ECY3,product_pred,ID2
0,4WKQSBB,1/2/2019,F_sex,M_marital_status,1987,1X1H_branch_code,2A7I_occupation_code,T4MS_occupation_category_code,0,0,...,0,1,0,0,0,0,0,0,RVSZ,1
1,4WKQSBB,1/2/2019,F_sex,M_marital_status,1987,1X1H_branch_code,2A7I_occupation_code,T4MS_occupation_category_code,0,0,...,0,0,0,0,0,0,0,0,K6QO,2


In [20]:
#Look of test data after alters
#
X_test.head(2)

Unnamed: 0,ID,join_date,sex,marital_status,birth_year,branch_code,occupation_code,occupation_category_code,P5DA,RIBP,...,BSTQ,FM3X,K6QO,QBOL,JWFN,JZ9D,J9JW,GHYX,ECY3,ID2
0,F86J5PC,1/12/2018,M_sex,M_marital_status,1984,94KC_branch_code,DZRV_occupation_code,90QI_occupation_category_code,0,0,...,0,0,0,0,0,0,0,0,0,1
1,H6141K3,1/10/2019,M_sex,M_marital_status,1996,1X1H_branch_code,J9SY_occupation_code,90QI_occupation_category_code,0,0,...,0,0,1,0,0,0,0,0,0,2


In [21]:
#It is true values for train data
#
df_train_true.head(2)

Unnamed: 0,P5DA,RIBP,8NN1,7POT,66FJ,GYSR,SOP4,RVSZ,PYUQ,LJR9,...,AHXO,BSTQ,FM3X,K6QO,QBOL,JWFN,JZ9D,J9JW,GHYX,ECY3
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0


# Reshaping data

In [22]:
#Make data with reshape
#
features_train = []
features_test = []
columns = []

append_features = [
    'P5DA', 'RIBP', '8NN1', '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ',
    'LJR9', 'N2MW', 'AHXO', 'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D',
    'J9JW', 'GHYX', 'ECY3', 'ID', 'ID2', 'join_date', 'sex', 'marital_status',
    'branch_code', 'occupation_code', 'occupation_category_code', 'birth_year'
]
for f in append_features:

    features_train.append(X_train[f].values.reshape(-1, 1))
    features_test.append(X_test[f].values.reshape(-1, 1))

    columns.append(np.array([f]))

y_train = X_train[['product_pred']]

In [23]:
features_train = np.concatenate(features_train, axis=1)
features_test = np.concatenate(features_test, axis=1)
columns = np.concatenate(np.array(columns))

X_train = pd.DataFrame(features_train)
X_train.columns = columns

X_test = pd.DataFrame(features_test)
X_test.columns = columns

# Add new features

In [24]:
#Reformatting date of join to some features: year, month, day, day of week, day of year of join; add age of clients
#
for df in [X_train, X_test]:
    df['join_date'] = pd.to_datetime(df.join_date, format='%d/%m/%Y')

    df['from_begin'] = (df.join_date - pd.datetime(2010, 1, 1)).dt.days

    df['join_day'] = df['join_date'].dt.day
    df['join_month'] = df['join_date'].dt.month
    df['join_year'] = df['join_date'].dt.year
    df['dayofweek'] = df['join_date'].dt.weekday
    df['day_of_year'] = df['join_date'].dt.dayofyear

    df['age'] = (df['join_year'] - df['birth_year']).astype(float)

In [25]:
#Concating train and test data
#
common = X_train.append(X_test)

In [26]:
common['branch_start']=pd.datetime.now().year-common.groupby('branch_code')['join_year'].transform('min')

In [27]:
def transform(df, row):
    df[row[0]]=common.groupby(row[1])[row[2]].transform(row[3])

In [28]:
row_features = [['nuniq_people', 'branch_code', 'ID', 'nunique'],
                ['nuniq_branch_in_year', 'join_year', 'branch_code', 'nunique'], 
                ['nuniq_year', 'branch_code', 'join_year', 'nunique'], 
                ['nuniq_month', 'branch_code', 'join_month', 'nunique'], 
                ['mean_age_in_branch', 'branch_code', 'age', 'mean'],
                ['std_age_in_branch', 'branch_code', 'age', 'std'],
                ['median_age_in_branch', 'branch_code', 'age', 'median'],
                ['mean_age_in_occupation', 'occupation_code', 'age', 'mean'],
                ['std_age_in_occupation', 'occupation_code', 'age', 'std'],
                ['median_age_in_occupation', 'occupation_code', 'age', 'median']]
for row in row_features:
    transform(common,row)

In [29]:
common['birth_year_binary']= pd.cut(common['birth_year'], bins=5)

common['branch_ocupation']=common['branch_code']+'_'+common['occupation_code']
common['branch_ocupcode']=common['branch_code']+'_'+common['occupation_category_code']

In [30]:
common['Number_of_Insurance_Bought']=common.iloc[:, :21].sum(axis=1)

def mapper(common):
    if common['Number_of_Insurance_Bought']==1:
        return 'One'
    elif (common['Number_of_Insurance_Bought']>1) & (common['Number_of_Insurance_Bought']<5):
        return 'Medium'
    elif (common['Number_of_Insurance_Bought']>4 )& (common['Number_of_Insurance_Bought']<8):
        return 'High'    
    else:
        return 'Too High'   
common['Insurance_Count']=common.apply(lambda common:mapper(common) ,axis = 1)
del common['Number_of_Insurance_Bought']

In [31]:
for name in [
        'sex', 'marital_status', 'occupation_code', 'occupation_category_code',
        'birth_year_binary', 'branch_ocupation', 'branch_ocupcode', 'Insurance_Count'
]:
    freq = (common.groupby(name).size()) / len(common)
    common[name + '_freq'] = common[name].apply(lambda x: freq[x])
    common[name + '_freq'] = common[name + '_freq'].astype(float)

In [32]:
le_ins = LabelEncoder()
common['Insurance_Count'] = le_ins.fit_transform(common['Insurance_Count'])

In [33]:
for i,row in tqdm_notebook(common.iterrows()):
    res = []
    for c in names_products:
        if row[c] == 1:
            res.append(c)
    common.loc[common.index == i, 'product_comb'] = '_'.join(sorted(res))
common['product_comb'] = le_ins.fit_transform(common['product_comb'])

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [34]:
for col in tqdm_notebook(names_products):
    for cols in names_products:
        if col!=cols:
            common[col+'_'+cols]=common.groupby(col)[cols].transform(sum)

HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))




In [35]:
common.drop(
    columns=['birth_year_binary', 'branch_ocupation', 'branch_ocupcode'],
    inplace=True)

In [36]:
#Approximate counting of days after open branches and after first buy of each product
#
for code in tqdm_notebook(common.branch_code.unique()):
    common.loc[common.branch_code == code, 'from_arise_branch'] = \
    common.loc[common.branch_code == code, 'from_begin'] - common.loc[common.branch_code == code, 'from_begin'].min()
    for product in names_products:
        common.loc[common.branch_code == code, 'from_arise_product_'+product+'_in_branch'] = \
        common.loc[common.branch_code == code, 'from_begin'] - common.loc[(common.branch_code == code)&(common[product]==1), 'from_begin'].min()

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [37]:
for product in tqdm_notebook(names_products):
    common['from_arise_product_'+product] = (common['join_date'] - common.loc[common[product] == 1, 'join_date'].min()).dt.days
    common[product+'_'+'sum_in_branch']=common.groupby('branch_code')[product].transform(sum)
    common[product+'_'+'_age_mean']=common.groupby(product)['age'].transform('mean')
    common[product+'_'+'_age_std']=common.groupby(product)['age'].transform('std')
    common[product+'_'+'_age_median']=common.groupby(product)['age'].transform('median')
    common[product+'_'+'_sum_join_year']=common.groupby('join_year')[product].transform(sum)

HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))




In [41]:
#Splitting concating data to train and test
#
X_train = common[:66353]
X_test = common[66353:]

# Encoding

In [46]:
#Encoding of target values that look like names of missing products
#
le = LabelEncoder()
le.fit(y_train.iloc[:, 0])

y_train = pd.DataFrame(le.transform(y_train.iloc[:, 0]))
y_train.columns = ['target']

In [48]:
#Merging amount of purchased products
#
X_train = X_train.merge(train[['ID', 'sum']])
X_test = X_test.merge(test[['ID', 'sum']])

In [49]:
#Adding features with replacing to string type to use them like cat_features
#
for df in [X_train, X_test]:
    df['dayofweek_cat'] = df['dayofweek'].astype(str)
    df['from_begin_cat'] = df['from_begin'].astype(str)
    df['birth_year'] = df['birth_year'].astype(str)
    df['join_year_cat'] = df['join_year'].astype(str)
    df['sum_cat'] = df['sum'].astype(str)
    df['day_of_year_cat'] = df['day_of_year'].astype(str)

# Model

## Model main

In [50]:
#Names of cat_features
#
cat_features = [
    'sex',
    'marital_status',
    'branch_code',
    'occupation_category_code',
    'occupation_code',
    'dayofweek_cat',
    'from_begin_cat',
    'sum_cat',
    'birth_year',
    'join_year_cat'
]

In [51]:
model_cat = CatBoostClassifier(
    **{
        'depth': 5,
        'n_estimators': 15000,
        'learning_rate': 0.01,
        'random_state': 567,
        'task_type': 'GPU',
        'thread_count': 1,
        "verbose": 100,
        "use_best_model": True,
        'nan_mode': 'Max',
    })

probs = []
probs_train = []
i = 1
scoring = 0
group_kfold = GroupKFold(n_splits=5)
cols = X_train.drop(columns=['ID', 'ID2', 'join_date']).columns
for train_index, test_index in group_kfold.split(X_train, y_train,
                                                 np.array(X_train['ID'])):
    X_real_train, X_valid = X_train.iloc[train_index], X_train.iloc[test_index]
    y_real_train, y_valid = y_train.iloc[train_index], y_train.iloc[test_index]
    print('Fold', i)
    model_cat.fit(
        X_real_train[cols],
        y_real_train,
        cat_features=cat_features,
        eval_set=[(X_valid[cols], y_valid)],
        early_stopping_rounds = 200,
    )
    scoring += model_cat.get_best_score()['validation']['MultiClass']

    proba = model_cat.predict_proba(X_test[cols])
    probs.append(proba)
    probs_train.append(model_cat.predict_proba(X_train[cols]))
    i += 1
scoring /= 5
print('MEAN SCORE =', scoring)

Fold 1
0:	learn: 2.9445576	test: 2.9428904	best: 2.9428904 (0)	total: 77.7ms	remaining: 19m 26s
100:	learn: 0.9699479	test: 0.9618732	best: 0.9618732 (100)	total: 5.8s	remaining: 14m 15s
200:	learn: 0.6838025	test: 0.6798409	best: 0.6798409 (200)	total: 11.4s	remaining: 13m 58s
300:	learn: 0.5793816	test: 0.5790330	best: 0.5790330 (300)	total: 16.8s	remaining: 13m 41s
400:	learn: 0.5248620	test: 0.5276316	best: 0.5276316 (400)	total: 22s	remaining: 13m 21s
500:	learn: 0.4913727	test: 0.4965360	best: 0.4965360 (500)	total: 27.3s	remaining: 13m 11s
600:	learn: 0.4687746	test: 0.4759556	best: 0.4759556 (600)	total: 32.6s	remaining: 13m 1s
700:	learn: 0.4517229	test: 0.4605643	best: 0.4605643 (700)	total: 37.9s	remaining: 12m 53s
800:	learn: 0.4381602	test: 0.4486232	best: 0.4486232 (800)	total: 43.2s	remaining: 12m 45s
900:	learn: 0.4274216	test: 0.4394789	best: 0.4394789 (900)	total: 48.4s	remaining: 12m 37s
1000:	learn: 0.4178731	test: 0.4313687	best: 0.4313687 (1000)	total: 53.6s	remai

8800:	learn: 0.2866961	test: 0.3724647	best: 0.3724497 (8715)	total: 7m 21s	remaining: 5m 11s
8900:	learn: 0.2859461	test: 0.3724325	best: 0.3724298 (8897)	total: 7m 26s	remaining: 5m 6s
9000:	learn: 0.2852204	test: 0.3723815	best: 0.3723774 (8996)	total: 7m 31s	remaining: 5m 1s
9100:	learn: 0.2845444	test: 0.3723280	best: 0.3723280 (9100)	total: 7m 36s	remaining: 4m 56s
9200:	learn: 0.2837712	test: 0.3722565	best: 0.3722565 (9200)	total: 7m 41s	remaining: 4m 50s
9300:	learn: 0.2830607	test: 0.3722009	best: 0.3722009 (9300)	total: 7m 46s	remaining: 4m 45s
9400:	learn: 0.2823363	test: 0.3721711	best: 0.3721694 (9358)	total: 7m 51s	remaining: 4m 40s
9500:	learn: 0.2816317	test: 0.3721397	best: 0.3721264 (9494)	total: 7m 56s	remaining: 4m 35s
9600:	learn: 0.2809993	test: 0.3721344	best: 0.3721242 (9538)	total: 8m 1s	remaining: 4m 30s
9700:	learn: 0.2803175	test: 0.3721418	best: 0.3721211 (9633)	total: 8m 6s	remaining: 4m 25s
9800:	learn: 0.2796034	test: 0.3721264	best: 0.3721145 (9724)	to

7500:	learn: 0.2986146	test: 0.3708438	best: 0.3708386 (7471)	total: 6m 16s	remaining: 6m 16s
7600:	learn: 0.2978320	test: 0.3707964	best: 0.3707964 (7600)	total: 6m 21s	remaining: 6m 11s
7700:	learn: 0.2971115	test: 0.3707068	best: 0.3707068 (7700)	total: 6m 26s	remaining: 6m 6s
7800:	learn: 0.2962688	test: 0.3706367	best: 0.3706354 (7799)	total: 6m 31s	remaining: 6m 1s
7900:	learn: 0.2954515	test: 0.3706102	best: 0.3705969 (7888)	total: 6m 36s	remaining: 5m 56s
8000:	learn: 0.2946311	test: 0.3705748	best: 0.3705715 (7997)	total: 6m 41s	remaining: 5m 51s
8100:	learn: 0.2938061	test: 0.3705295	best: 0.3705261 (8087)	total: 6m 46s	remaining: 5m 46s
8200:	learn: 0.2930550	test: 0.3704819	best: 0.3704692 (8189)	total: 6m 51s	remaining: 5m 41s
8300:	learn: 0.2922324	test: 0.3704177	best: 0.3704177 (8300)	total: 6m 56s	remaining: 5m 35s
8400:	learn: 0.2914426	test: 0.3703328	best: 0.3703322 (8395)	total: 7m 1s	remaining: 5m 30s
8500:	learn: 0.2906545	test: 0.3702983	best: 0.3702943 (8489)	t

7100:	learn: 0.3066156	test: 0.3763372	best: 0.3763372 (7100)	total: 5m 57s	remaining: 6m 37s
7200:	learn: 0.3058966	test: 0.3762786	best: 0.3762786 (7200)	total: 6m 2s	remaining: 6m 32s
7300:	learn: 0.3051038	test: 0.3761495	best: 0.3761495 (7300)	total: 6m 7s	remaining: 6m 27s
7400:	learn: 0.3043295	test: 0.3760555	best: 0.3760538 (7399)	total: 6m 12s	remaining: 6m 22s
7500:	learn: 0.3035415	test: 0.3759647	best: 0.3759647 (7500)	total: 6m 18s	remaining: 6m 18s
7600:	learn: 0.3027972	test: 0.3758383	best: 0.3758347 (7596)	total: 6m 23s	remaining: 6m 13s
7700:	learn: 0.3019187	test: 0.3756992	best: 0.3756992 (7700)	total: 6m 28s	remaining: 6m 8s
7800:	learn: 0.3011553	test: 0.3755568	best: 0.3755561 (7799)	total: 6m 33s	remaining: 6m 2s
7900:	learn: 0.3003440	test: 0.3754536	best: 0.3754521 (7898)	total: 6m 38s	remaining: 5m 57s
8000:	learn: 0.2995573	test: 0.3753286	best: 0.3753271 (7998)	total: 6m 43s	remaining: 5m 52s
8100:	learn: 0.2987896	test: 0.3752060	best: 0.3752060 (8100)	to

3600:	learn: 0.3392124	test: 0.3769761	best: 0.3769761 (3600)	total: 3m 7s	remaining: 9m 52s
3700:	learn: 0.3379441	test: 0.3765148	best: 0.3765148 (3700)	total: 3m 11s	remaining: 9m 45s
3800:	learn: 0.3366381	test: 0.3761183	best: 0.3761183 (3800)	total: 3m 16s	remaining: 9m 39s
3900:	learn: 0.3353981	test: 0.3757377	best: 0.3757377 (3900)	total: 3m 21s	remaining: 9m 33s
4000:	learn: 0.3341714	test: 0.3754291	best: 0.3754283 (3997)	total: 3m 26s	remaining: 9m 27s
4100:	learn: 0.3329611	test: 0.3751036	best: 0.3751036 (4100)	total: 3m 31s	remaining: 9m 22s
4200:	learn: 0.3319368	test: 0.3748113	best: 0.3748090 (4197)	total: 3m 36s	remaining: 9m 16s
4300:	learn: 0.3308520	test: 0.3745480	best: 0.3745480 (4300)	total: 3m 41s	remaining: 9m 10s
4400:	learn: 0.3297650	test: 0.3743114	best: 0.3743106 (4399)	total: 3m 46s	remaining: 9m 4s
4500:	learn: 0.3286908	test: 0.3740695	best: 0.3740695 (4500)	total: 3m 51s	remaining: 8m 58s
4600:	learn: 0.3276579	test: 0.3738413	best: 0.3738413 (4600)	

200:	learn: 0.6761444	test: 0.7050895	best: 0.7050895 (200)	total: 14.5s	remaining: 17m 47s
300:	learn: 0.5715305	test: 0.6024932	best: 0.6024932 (300)	total: 21.5s	remaining: 17m 29s
400:	learn: 0.5181647	test: 0.5508480	best: 0.5508480 (400)	total: 28.1s	remaining: 17m 4s
500:	learn: 0.4840808	test: 0.5186106	best: 0.5186106 (500)	total: 34.9s	remaining: 16m 50s
600:	learn: 0.4617105	test: 0.4980052	best: 0.4980052 (600)	total: 41.7s	remaining: 16m 38s
700:	learn: 0.4452419	test: 0.4829969	best: 0.4829969 (700)	total: 48.5s	remaining: 16m 28s
800:	learn: 0.4315767	test: 0.4711263	best: 0.4711263 (800)	total: 57s	remaining: 16m 50s
900:	learn: 0.4202764	test: 0.4615710	best: 0.4615710 (900)	total: 1m 3s	remaining: 16m 36s
1000:	learn: 0.4112617	test: 0.4540898	best: 0.4540898 (1000)	total: 1m 10s	remaining: 16m 22s
1100:	learn: 0.4027244	test: 0.4473620	best: 0.4473620 (1100)	total: 1m 17s	remaining: 16m 12s
1200:	learn: 0.3954945	test: 0.4418959	best: 0.4418959 (1200)	total: 1m 23s	r

9000:	learn: 0.2848238	test: 0.3997300	best: 0.3997300 (8999)	total: 9m 34s	remaining: 6m 22s
9100:	learn: 0.2841335	test: 0.3997097	best: 0.3997093 (9098)	total: 9m 39s	remaining: 6m 15s
9200:	learn: 0.2834343	test: 0.3997057	best: 0.3996978 (9150)	total: 9m 44s	remaining: 6m 8s
9300:	learn: 0.2826697	test: 0.3996663	best: 0.3996599 (9260)	total: 9m 49s	remaining: 6m 1s
9400:	learn: 0.2818948	test: 0.3996026	best: 0.3996021 (9399)	total: 9m 54s	remaining: 5m 53s
9500:	learn: 0.2811680	test: 0.3995901	best: 0.3995835 (9493)	total: 9m 59s	remaining: 5m 46s
9600:	learn: 0.2804824	test: 0.3995799	best: 0.3995784 (9599)	total: 10m 3s	remaining: 5m 39s
9700:	learn: 0.2797218	test: 0.3995514	best: 0.3995498 (9697)	total: 10m 8s	remaining: 5m 32s
9800:	learn: 0.2790184	test: 0.3995658	best: 0.3995486 (9725)	total: 10m 13s	remaining: 5m 25s
9900:	learn: 0.2783224	test: 0.3995481	best: 0.3995409 (9897)	total: 10m 18s	remaining: 5m 18s
10000:	learn: 0.2776369	test: 0.3995318	best: 0.3995216 (998

In [52]:
pd.DataFrame(model_cat.feature_importances_,
                     index=cols,
                     columns=['importance']).query('importance>1')

Unnamed: 0,importance
RVSZ,3.425022
sex,1.030568
marital_status,6.894693
occupation_code,2.663958
birth_year,2.52465
from_begin,1.836085
day_of_year,1.021888
age,2.946634
mean_age_in_occupation,1.728786
std_age_in_occupation,2.435206


In [53]:
#Meaning predict values
#
new_a = np.ones((10000,21)) * 0.0
for r in probs:
    new_a += r
new_a /= 5

In [54]:
y_test = pd.DataFrame(new_a)
y_test.columns = le.inverse_transform(y_test.columns)

In [55]:
for i,row in tqdm_notebook(X_test.iterrows()):
    summ = 0
    
    for c in names_products:
        if row[c] == 1:
            y_test.loc[y_test.index == i, c] = 1.0
        else:
            summ += y_test.loc[y_test.index == i, c].values[0]
    for c in names_products:
        if row[c] != 1.0:
            y_test.loc[y_test.index == i, c] /= summ

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [56]:
#Reformat predict values to necessary view and replace that are given
#
answer_mass = []
for i in range(X_test.shape[0]):
    id = X_test['ID'].iloc[i]
    for c in y_test.columns:
        answer_mass.append([id + ' X ' + c, y_test[c].iloc[i]])

df_answer = pd.DataFrame(answer_mass)
df_answer.columns = ['ID X PCODE', 'Label']
df_answer.head()

Unnamed: 0,ID X PCODE,Label
0,F86J5PC X 66FJ,7.9e-05
1,F86J5PC X 7POT,8e-05
2,F86J5PC X 8NN1,2e-06
3,F86J5PC X AHXO,1.7e-05
4,F86J5PC X BSTQ,2.3e-05


In [72]:
#Blending models
#
df_new = pd.read_csv('submiss.csv')
df = df_new.merge(
    df_answer.rename(columns={'Label': 'lbl'}))
df['Label'] = df['Label']*0.8 + df['lbl']*0.2
df.drop(['lbl'], axis=1, inplace=True)
df_new = pd.read_csv('submis_1.csv')
df = df.merge(
    df_new.rename(columns={'Label': 'lbl'}))
df['Label'] = df['Label']*0.6 + df['lbl']*0.4
df.drop(['lbl'], axis=1, inplace=True)
df.head()

Unnamed: 0,ID X PCODE,Label
0,0021EE1 X P5DA,5e-06
1,0029J1L X P5DA,0.000114
2,004QK71 X P5DA,9.1e-05
3,005AP9V X P5DA,2.1e-05
4,0096G27 X P5DA,1.5e-05


In [73]:
#Back to the first view
#
df_replaced = copy.copy(df)
df_replaced['ID'] = list(map(lambda x: x.split(' X ')[0], df_replaced['ID X PCODE']))
df_replaced['PCODE'] = list(map(lambda x: x.split(' X ')[1], df_replaced['ID X PCODE']))
df_replaced = df_replaced.set_index(['ID','PCODE'])['Label'].unstack().reset_index()

In [74]:
#Extra read data
#
train_base = pd.read_csv('Train.csv')
test_base = pd.read_csv('Test.csv')

In [76]:
#Making dict with known values from statistics
#
dict_post = {}
for pr1 in names_products:
    for pr2 in names_products:
        if pr1 == pr2:
            continue
        stats = train_base[train_base[pr1] == 1][pr2].value_counts()

        stats0 = stats.get(0, 0)
        stats1 = stats.get(1, 0)

        threshold = 0.999

        if stats0 + stats1 > 100 and \
                (stats0 / (stats0 + stats1) > threshold or stats1 / (stats1 + stats0) > threshold ):
            if (stats1 == 0 or stats0 > stats1):
                dict_post[(pr1, pr2)] = (1,0)
            elif stats0 == 0 or stats1 > stats0:
                dict_post[(pr1, pr2)] = (1,1)

In [77]:
#Replacing known values from our predictions
#
for i, row in tqdm_notebook(df_replaced.iterrows()):
    for key in dict_post.keys():
        if row[key[0]] == 1:
            if row[key[1]] != dict_post[key][1]:
                if dict_post[key][1] == 1:
                    for c in names_products:
                        if c != key[1] and row[c] != 1:
                            df_replaced.loc[df_replaced.index == i, c] = 1e-53
                        elif c==key[1]:
                            df_replaced.loc[df_replaced.index == i, key[1]] = 0.9999999999999999999999999999999999

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [78]:
#Making submission view
#
products =df_replaced[['P5DA', 'RIBP', '8NN1',
       '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ', 'LJR9', 'N2MW', 'AHXO',
       'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D', 'J9JW', 'GHYX', 'ECY3']]
df_replaced = df_replaced.melt(id_vars=['ID'], value_vars=products, var_name = "PCODE", value_name="Label" )
df_replaced['ID X PCODE'] = df_replaced['ID'] + ' X ' + df_replaced['PCODE']
df_replaced = df_replaced[['ID X PCODE', 'Label']]
df_replaced.head()

Unnamed: 0,ID X PCODE,Label
0,0021EE1 X P5DA,1e-53
1,0029J1L X P5DA,0.0001138897
2,004QK71 X P5DA,9.056966e-05
3,005AP9V X P5DA,2.055958e-05
4,0096G27 X P5DA,1.524471e-05


In [79]:
df_replaced.to_csv('submis.csv', index = False)

Submit