# Import libraries and load data

In [1]:
import pandas as pd
import numpy as np
import copy
from itertools import combinations

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
from sklearn.preprocessing import LabelEncoder

from tqdm import tqdm, tqdm_notebook

pd.set_option('display.max_columns', 100)

import warnings
warnings.filterwarnings("ignore")
import time

In [171]:
# Read the three competition files: training rows, test rows and the
# submission template.
train, test, sub = (
    pd.read_csv(file_name)
    for file_name in ('Train.csv', 'Test.csv', 'SampleSubmission.csv')
)

# Checking the data

In [172]:
train.head(2)

Unnamed: 0,ID,join_date,sex,marital_status,birth_year,branch_code,occupation_code,occupation_category_code,P5DA,RIBP,8NN1,7POT,66FJ,GYSR,SOP4,RVSZ,PYUQ,LJR9,N2MW,AHXO,BSTQ,FM3X,K6QO,QBOL,JWFN,JZ9D,J9JW,GHYX,ECY3
0,4WKQSBB,1/2/2019,F,M,1987,1X1H,2A7I,T4MS,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0
1,CP5S02H,1/6/2019,F,M,1981,UAOD,2A7I,T4MS,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0


In [173]:
test.head(2)

Unnamed: 0,ID,join_date,sex,marital_status,birth_year,branch_code,occupation_code,occupation_category_code,P5DA,RIBP,8NN1,7POT,66FJ,GYSR,SOP4,RVSZ,PYUQ,LJR9,N2MW,AHXO,BSTQ,FM3X,K6QO,QBOL,JWFN,JZ9D,J9JW,GHYX,ECY3
0,F86J5PC,1/12/2018,M,M,1984,94KC,DZRV,90QI,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,H6141K3,1/10/2019,M,M,1996,1X1H,J9SY,90QI,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0


In [174]:
sub.head(2)

Unnamed: 0,ID X PCODE,Label
0,F86J5PC X P5DA,0
1,F86J5PC X RIBP,0


# Replacing occupation codes not shared by train and test with the occupation category code

In [175]:
# Occupation codes that occur in only one of the two frames are replaced by
# the coarser occupation category code, so train and test share one vocabulary.
shared_codes = (set(train['occupation_code'].dropna()) &
                set(test['occupation_code'].dropna()))

for frame in (train, test):
    # True only for codes present in BOTH frames; missing codes are never in
    # shared_codes, so they fall back to the category code as well.
    keep_code = frame['occupation_code'].isin(shared_codes)
    # The result is assigned back to the column instead of calling
    # fillna(inplace=True) on a column selection, which is chained assignment
    # and is not guaranteed to write through to the frame.
    frame['occupation_code'] = frame['occupation_code'].where(
        keep_code, frame['occupation_category_code'])

# Getting right format

In [176]:
# Split every training client into one row per product held, reproducing the
# test situation: in each emitted row one held product is hidden (set to 0)
# and becomes the `target` to predict.
#
# Columns are selected by name rather than by position, so the last product
# column cannot be dropped by an off-by-one slice and extra columns in the CSV
# cannot shift the layout.
info_cols = [
    'ID', 'join_date', 'sex', 'marital_status', 'birth_year', 'branch_code',
    'occupation_code', 'occupation_category_code'
]
product_cols = [
    'P5DA', 'RIBP', '8NN1', '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ',
    'LJR9', 'N2MW', 'AHXO', 'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D',
    'J9JW', 'GHYX', 'ECY3'
]

info_values = train[info_cols].values
product_values = train[product_cols].values

X_train = []
client_index = 0  # running id of the emitted (client, hidden product) rows

for info, owned in tqdm_notebook(zip(info_values, product_values),
                                 total=len(train)):
    # One output row per product the client currently holds.
    for k in np.flatnonzero(owned == 1):
        client_index += 1

        masked = list(owned)
        masked[k] = 0  # hide the product that serves as the label

        X_train.append(
            list(info) + masked + [product_cols[k], client_index])

X_train = pd.DataFrame(X_train,
                       columns=info_cols + product_cols + ['target', 'ID2'])
train = X_train.copy()

In [177]:
# Bring the test frame into the same layout as the expanded training frame and
# remember which "ID X PRODUCT" pairs are already held by each test client.
#
# Columns are selected by name rather than by position, so the last product
# column cannot be dropped by an off-by-one slice.
info_cols = [
    'ID', 'join_date', 'sex', 'marital_status', 'birth_year', 'branch_code',
    'occupation_code', 'occupation_category_code'
]
product_cols = [
    'P5DA', 'RIBP', '8NN1', '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ',
    'LJR9', 'N2MW', 'AHXO', 'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D',
    'J9JW', 'GHYX', 'ECY3'
]

info_values = test[info_cols].values
product_values = test[product_cols].values

X_test = []
true_values = []  # submission keys of products a test client already owns
client_index = 0

for info, owned in tqdm_notebook(zip(info_values, product_values),
                                 total=len(test)):
    client_index += 1

    X_test.append(list(info) + list(owned) + [client_index])

    client_id = info[0]  # first info column is the client ID
    true_values.extend(client_id + ' X ' + product_cols[k]
                       for k in np.flatnonzero(owned == 1))

X_test = pd.DataFrame(X_test, columns=info_cols + product_cols + ['ID2'])
test = X_test.copy()

In [178]:
# Normalise the lower-case 'f' marital-status code to 'F'.
train['marital_status'] = train['marital_status'].replace({'f': 'F'})

In [179]:
# Stack train on top of test so features are engineered on both at once.
# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x;
# index labels are kept as-is, exactly as append did.
df = pd.concat([train, test])

# Feature Engineering

In [180]:
def create_date_featues(df):
    """Add calendar features derived from ``join_date`` to ``df``.

    The ``join_date`` column is parsed once with ``pd.to_datetime`` and four
    columns are written into ``df`` in place: ``Join_Year``, ``Join_Month``,
    ``Join_Day`` and ``DayOfyear``. ``join_date`` itself is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``join_date`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the four columns added.
    """
    # Parse once; the original re-parsed the column for every feature.
    joined = pd.to_datetime(df['join_date'])

    df['Join_Year'] = joined.dt.year
    df['Join_Month'] = joined.dt.month
    df['Join_Day'] = joined.dt.day
    df['DayOfyear'] = joined.dt.dayofyear

    return df

In [181]:
df = create_date_featues(df)

In [182]:
# Cut birth_year into 5 bins with pd.cut; df holds train and test stacked
# together, so the bin edges are derived from both.
df['birth_year_bin'] = pd.cut(df['birth_year'], bins=5)

In [183]:
%%time
# Product indicator columns.
columns = [
    'P5DA', 'RIBP', '8NN1', '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ',
    'LJR9', 'N2MW', 'AHXO', 'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D',
    'J9JW', 'GHYX', 'ECY3'
]
# For each product: sum of its indicator over all rows of the same branch.
# The aggregation is named by string ('sum') rather than passing the Python
# builtin, which pandas deprecates as a groupby callable.
for col in columns:
    df[col + '_' + 'sum'] = df.groupby('branch_code')[col].transform('sum')

Wall time: 116 ms


In [184]:
# For each product: sum of its indicator over all rows that joined in the same
# year. 'sum' is passed by name instead of the Python builtin, which pandas
# deprecates as a groupby callable.
for col in columns:
    df[col + '_' +
       'Join_year_sum'] = df.groupby('Join_Year')[col].transform('sum')

In [189]:
# Turn join_date into real timestamps, then measure for every product how many
# days after that product's earliest holder each row joined.
df['join_date'] = pd.to_datetime(df['join_date'])
for col in columns:
    first_join_with_product = df.loc[df[col] == 1, 'join_date'].min()
    days_after = df['join_date'] - first_join_with_product
    df['from_arise_col_' + col] = days_after.dt.days

In [194]:
# Count the products flagged on each row. The product columns are addressed by
# name (the `columns` list) instead of the positional slice 8:29, so the count
# does not silently change if the column layout does.
df['Number_of_Insurance_Bought'] = df[columns].sum(axis=1)


def mapper(df):
    """Bucket a row's ``Number_of_Insurance_Bought`` into a coarse label.

    ``df`` is a single row (anything indexable by column name). Returns
    'One' for exactly 1, 'Medium' for values strictly between 1 and 5,
    'High' for values strictly between 4 and 8, and 'Too High' otherwise.
    """
    bought = df['Number_of_Insurance_Bought']

    if bought == 1:
        return 'One'
    if 1 < bought < 5:
        return 'Medium'
    if 4 < bought < 8:
        return 'High'
    # Fallback for everything else, which also covers a count of zero.
    return 'Too High'


# Label every row with its product-count bucket, then discard the raw count.
df['Insurance_Count'] = df.apply(mapper, axis=1)
df.drop(columns=['Number_of_Insurance_Bought'], inplace=True)

In [195]:
# Years between a branch's earliest join year in the data and 2020.
earliest_join_year = df.groupby('branch_code')['Join_Year'].transform('min')
df['branch_since'] = 2020 - earliest_join_year

In [196]:
# (new feature, grouping key, column whose distinct values are counted)
nunique_specs = [
    ('Unique_customers_per_branch', 'branch_code', 'ID'),
    ('Unique_Insurance_per_branch', 'branch_code', 'target'),
    ('Unique_year_per_branch', 'branch_code', 'Join_Year'),
    ('Unique_month_per_branch', 'branch_code', 'Join_Month'),
    ('Unique_branch_per_year', 'Join_Year', 'branch_code'),
]
for feature_name, group_key, counted_col in nunique_specs:
    df[feature_name] = df.groupby(group_key)[counted_col].transform('nunique')

In [197]:
# Age at joining is only a helper column: it feeds the group means below and
# is removed again at the end of the cell.
df['Age'] = df['Join_Year'] - df['birth_year']

# Mean joining age within the row's branch and within its occupation.
for feature_name, group_key in (('Average_Age_per_branch', 'branch_code'),
                                ('Average_Age_per_occupation',
                                 'occupation_code')):
    df[feature_name] = df.groupby(group_key)['Age'].transform('mean')

# Mean joining age of rows sharing the same value of each product flag.
for col in columns:
    df[col + '_' + 'meanAge'] = df.groupby(col)['Age'].transform('mean')

df.drop(columns=['Age'], inplace=True)

In [198]:
# df was built by stacking train on test without renumbering, so index labels
# repeat; rebuild a unique 0..n-1 index (the old labels are discarded).
df.reset_index(drop=True, inplace=True)

In [199]:
# Product indicator column names (the same 21 codes as the `columns` list
# defined earlier in the notebook).
names_products = [
    'P5DA', 'RIBP', '8NN1', '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ',
    'LJR9', 'N2MW', 'AHXO', 'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D',
    'J9JW', 'GHYX', 'ECY3'
]

In [200]:
# Glue the codes of all products flagged on a row into one key: the codes are
# sorted and joined with '_' (an empty string when no product is flagged).
#
# Built in a single pass over the flag matrix. The previous version paired
# iterrows() with a full-length boolean mask per row, which is quadratic in
# the number of rows.
held_flags = df[names_products].to_numpy() == 1
df['product_comb'] = [
    '_'.join(sorted(code for code, held in zip(names_products, row_flags)
                    if held))
    for row_flags in held_flags
]

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [201]:
df.head()

Unnamed: 0,ID,join_date,sex,marital_status,birth_year,branch_code,occupation_code,occupation_category_code,P5DA,RIBP,8NN1,7POT,66FJ,GYSR,SOP4,RVSZ,PYUQ,LJR9,N2MW,AHXO,BSTQ,FM3X,K6QO,QBOL,JWFN,JZ9D,J9JW,GHYX,ECY3,target,ID2,Join_Year,Join_Month,Join_Day,DayOfyear,birth_year_bin,P5DA_sum,RIBP_sum,8NN1_sum,7POT_sum,66FJ_sum,GYSR_sum,SOP4_sum,RVSZ_sum,PYUQ_sum,LJR9_sum,N2MW_sum,AHXO_sum,BSTQ_sum,FM3X_sum,...,from_arise_col_8NN1,from_arise_col_7POT,from_arise_col_66FJ,from_arise_col_GYSR,from_arise_col_SOP4,from_arise_col_RVSZ,from_arise_col_PYUQ,from_arise_col_LJR9,from_arise_col_N2MW,from_arise_col_AHXO,from_arise_col_BSTQ,from_arise_col_FM3X,from_arise_col_K6QO,from_arise_col_QBOL,from_arise_col_JWFN,from_arise_col_JZ9D,from_arise_col_J9JW,from_arise_col_GHYX,from_arise_col_ECY3,Insurance_Count,branch_since,Unique_customers_per_branch,Unique_Insurance_per_branch,Unique_year_per_branch,Unique_month_per_branch,Unique_branch_per_year,Average_Age_per_branch,Average_Age_per_occupation,P5DA_meanAge,RIBP_meanAge,8NN1_meanAge,7POT_meanAge,66FJ_meanAge,GYSR_meanAge,SOP4_meanAge,RVSZ_meanAge,PYUQ_meanAge,LJR9_meanAge,N2MW_meanAge,AHXO_meanAge,BSTQ_meanAge,FM3X_meanAge,K6QO_meanAge,QBOL_meanAge,JWFN_meanAge,JZ9D_meanAge,J9JW_meanAge,GHYX_meanAge,ECY3_meanAge,product_comb
0,4WKQSBB,2019-01-02,F,M,1987,1X1H,2A7I,T4MS,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,RVSZ,1,2019.0,1.0,2.0,2.0,"(1979.4, 1995.2]",0,14,0,5,1,0,0,1598,8,0,4,0,8,0,...,3284.0,3284.0,3282.0,2191.0,3282.0,3284.0,3284.0,3284.0,2919.0,3278.0,3277.0,2554.0,3282.0,3284.0,2912.0,3278.0,3278.0,3277.0,3281.0,One,2.0,1738,14,3,1,14.0,37.532588,38.035144,38.412348,38.350148,38.411398,38.41096,38.405632,38.414079,38.429297,38.36121,38.396291,38.419403,38.365864,38.382695,38.413804,38.413013,38.384302,37.673714,38.4074,38.429089,38.431485,38.408466,38.360615,K6QO
1,4WKQSBB,2019-01-02,F,M,1987,1X1H,2A7I,T4MS,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,K6QO,2,2019.0,1.0,2.0,2.0,"(1979.4, 1995.2]",0,14,0,5,1,0,0,1598,8,0,4,0,8,0,...,3284.0,3284.0,3282.0,2191.0,3282.0,3284.0,3284.0,3284.0,2919.0,3278.0,3277.0,2554.0,3282.0,3284.0,2912.0,3278.0,3278.0,3277.0,3281.0,One,2.0,1738,14,3,1,14.0,37.532588,38.035144,38.412348,38.350148,38.411398,38.41096,38.405632,38.414079,38.429297,38.46995,38.396291,38.419403,38.365864,38.382695,38.413804,38.413013,38.436159,37.673714,38.4074,38.429089,38.431485,38.408466,38.360615,RVSZ
2,CP5S02H,2019-01-06,F,M,1981,UAOD,2A7I,T4MS,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,RVSZ,3,2019.0,1.0,6.0,6.0,"(1979.4, 1995.2]",0,54,6,13,6,0,18,4191,360,44,25,8,37,0,...,3288.0,3288.0,3286.0,2195.0,3286.0,3288.0,3288.0,3288.0,2923.0,3282.0,3281.0,2558.0,3286.0,3288.0,2916.0,3282.0,3282.0,3281.0,3285.0,One,9.0,4714,18,10,1,14.0,37.961993,38.035144,38.412348,38.350148,38.411398,38.41096,38.405632,38.414079,38.429297,38.36121,38.396291,38.419403,38.365864,38.382695,38.413804,38.413013,38.384302,37.673714,38.4074,38.429089,38.431485,38.408466,38.360615,K6QO
3,CP5S02H,2019-01-06,F,M,1981,UAOD,2A7I,T4MS,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,K6QO,4,2019.0,1.0,6.0,6.0,"(1979.4, 1995.2]",0,54,6,13,6,0,18,4191,360,44,25,8,37,0,...,3288.0,3288.0,3286.0,2195.0,3286.0,3288.0,3288.0,3288.0,2923.0,3282.0,3281.0,2558.0,3286.0,3288.0,2916.0,3282.0,3282.0,3281.0,3285.0,One,9.0,4714,18,10,1,14.0,37.961993,38.035144,38.412348,38.350148,38.411398,38.41096,38.405632,38.414079,38.429297,38.46995,38.396291,38.419403,38.365864,38.382695,38.413804,38.413013,38.436159,37.673714,38.4074,38.429089,38.431485,38.408466,38.360615,RVSZ
4,2YKDILJ,2013-01-06,M,U,1991,748L,QZYX,90QI,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,SOP4,5,2013.0,1.0,6.0,6.0,"(1979.4, 1995.2]",7,2031,144,259,366,2,416,10711,1142,254,161,38,391,133,...,1097.0,1097.0,1095.0,4.0,1095.0,1097.0,1097.0,1097.0,732.0,1091.0,1090.0,367.0,1095.0,1097.0,725.0,1091.0,1091.0,1090.0,1094.0,Medium,10.0,10919,21,11,6,7.0,38.560827,31.423673,38.412348,38.350148,38.411398,38.41096,38.405632,38.414079,38.429297,38.46995,38.396291,38.419403,38.365864,38.382695,38.413804,38.413013,38.436159,37.673714,38.4074,38.429089,38.431485,38.408466,40.115056,ECY3_RVSZ


## Interaction Feature

In [202]:
# Treat the join year and the birth-year bin as plain strings.
for text_col in ('Join_Year', 'birth_year_bin'):
    df[text_col] = df[text_col].astype(str)

In [203]:
# Interaction keys: branch code joined with the occupation code / occupation
# category code, separated by '_'.
df['bc_oc'] = df['branch_code'].str.cat(df['occupation_code'], sep='_')
df['bc_occ'] = df['branch_code'].str.cat(df['occupation_category_code'],
                                         sep='_')

## Label Encoding

In [204]:
# Replace each categorical column by integer codes. The single encoder is
# refitted for every column, so after the loop `le` holds the mapping of the
# last column only. (LabelEncoder comes from the notebook's import cell.)
le = LabelEncoder()
categorical_cols = ('product_comb', 'Insurance_Count', 'sex',
                    'marital_status', 'branch_code',
                    'occupation_category_code', 'occupation_code')
for col in categorical_cols:
    df[col] = le.fit_transform(df[col])

## Frequency Encoding

In [205]:
# Frequency encoding: share of all rows that carry the same product_comb.
# Series.map does the lookup in one vectorised call instead of a Python
# lambda per row.
fe_pol = df.groupby('product_comb').size() / len(df)
df['product_comb_fe'] = df['product_comb'].map(fe_pol)

In [206]:
# Frequency encoding: bc_occ is overwritten by the share of all rows that
# carry the same branch/occupation-category key. Series.map does the lookup in
# one vectorised call instead of a Python lambda per row.
fe_pol = df.groupby('bc_occ').size() / len(df)
df['bc_occ'] = df['bc_occ'].map(fe_pol)

In [207]:
# Frequency encoding: bc_oc is overwritten by the share of all rows that carry
# the same branch/occupation key. Series.map does the lookup in one vectorised
# call instead of a Python lambda per row.
fe_pol = df.groupby('bc_oc').size() / len(df)
df['bc_oc'] = df['bc_oc'].map(fe_pol)

In [208]:
# Frequency encoding: birth_year_bin is overwritten by the share of all rows
# falling into the same bin, stored as float. Series.map does the lookup in
# one vectorised call instead of a Python lambda per row.
fe_pol = df.groupby('birth_year_bin').size() / len(df)
df['birth_year_bin'] = df['birth_year_bin'].map(fe_pol).astype(float)

In [209]:
# Frequency encoding: share of all rows that carry the same occupation_code.
# Series.map does the lookup in one vectorised call instead of a Python
# lambda per row.
fe_pol = df.groupby('occupation_code').size() / len(df)
df['occupation_code_fe'] = df['occupation_code'].map(fe_pol)

In [210]:
# Frequency encoding: occupation_category_code is overwritten by the share of
# all rows that carry the same category. Series.map does the lookup in one
# vectorised call instead of a Python lambda per row.
fe_pol = df.groupby('occupation_category_code').size() / len(df)
df['occupation_category_code'] = df['occupation_category_code'].map(fe_pol)

In [211]:
# Frequency encoding: share of all rows that carry the same sex code.
# Series.map does the lookup in one vectorised call instead of a Python
# lambda per row.
fe_pol = df.groupby('sex').size() / len(df)
df['sex_fe'] = df['sex'].map(fe_pol)

In [212]:
# Frequency encoding: share of all rows that fall into the same
# Insurance_Count bucket. Series.map does the lookup in one vectorised call
# instead of a Python lambda per row.
fe_pol = df.groupby('Insurance_Count').size() / len(df)
df['Insurance_Count_fe'] = df['Insurance_Count'].map(fe_pol)

In [213]:
# Join_Year was cast to str in the interaction-feature section; convert it
# back to a numeric column for modelling.
df['Join_Year'] = df['Join_Year'].astype(float)

In [214]:
# Relationship between products: for every ordered pair (col, cols) of
# different products, sum the `cols` flag over all rows that share the same
# value of the `col` flag. 'sum' is passed by name instead of the Python
# builtin, which pandas deprecates as a groupby callable.
for col in columns:
    for cols in columns:
        if col != cols:
            df[col + '_' + cols] = df.groupby(col)[cols].transform('sum')

In [215]:
# Number of rows (non-null IDs) that share the same product_comb_fe value.
df['num_freq'] = df.groupby('product_comb_fe')['ID'].transform('count')

## Getting back train and test

In [239]:
# Cut the engineered frame back into its train (top) and test (bottom) parts.
# Row counts are read before either name is rebound. Explicit copies are taken
# because later cells add and overwrite columns on both frames, which must not
# operate on views of df. Slicing from `len(df) - n_test_rows` also behaves
# correctly for an empty test part, where `df[-0:]` would return everything.
n_train_rows = train.shape[0]
n_test_rows = test.shape[0]
train = df.iloc[:n_train_rows].copy()
test = df.iloc[len(df) - n_test_rows:].copy()

In [48]:
len(train),train['target'].nunique()

(66353, 21)

## Removing records whose (branch, target) pair occurs fewer than 3 times

In [240]:
# Keep only rows whose (branch_code, target) pair occurs more than twice.
pair_size = train.groupby(['branch_code',
                           'target'])['target'].transform('count')
train = train[pair_size > 2]

In [54]:
len(train), train['target'].nunique()

(66290, 20)

## Label Encoding Target

In [241]:
# Integer-encode the product code to predict; `te` is kept so that predicted
# class indices can be translated back to product codes later.
te = LabelEncoder().fit(train['target'])
train['target'] = te.transform(train['target'])

# StratifiedKFold

In [60]:
#LGB model
#
# Stratified K-fold training of LightGBM. Each fold's model is scored on its
# validation part and predicts the test set; test probabilities are averaged
# over the folds.
err = []  # validation log-loss per fold
y_pred_tot_lgb = 0

fold = StratifiedKFold(n_splits=3, shuffle=True, random_state=1997)
non_feature_cols = ['join_date', 'ID', 'ID2', 'target']
x = train.drop(columns=non_feature_cols)
y = train[['target']]
ID = test['ID']
testing = test.drop(columns=non_feature_cols)
# enumerate supplies the fold number; previously `i` stayed at 1 for all folds.
for i, (train_index, test_index) in enumerate(fold.split(x, y), start=1):
    x_train, x_val = x.iloc[train_index], x.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    m = LGBMClassifier(n_estimators=10000,
                       n_jobs=-1,
                       random_state=69,
                       learning_rate=0.01,
                       max_depth=5,
                       num_leaves=128,
                       colsample_bytree=0.5,
                       colsample_bynode=0.5,
                       min_data_in_leaf=40,
                       bagging_freq=2,
                       bagging_fraction=0.9,
                       reg_alpha=0.5,
                       reg_lambda=1)
    # NOTE(review): early_stopping_rounds / verbose as fit() arguments are
    # tied to the LightGBM release this notebook was run with; newer releases
    # may expect callbacks instead - confirm against the installed version.
    m.fit(x_train,
          y_train,
          eval_set=[(x_train, y_train), (x_val, y_val)],
          early_stopping_rounds=20,
          eval_metric='multi_logloss',
          verbose=200)
    pred_y = m.predict_proba(x_val)
    fold_loss = log_loss(y_val, pred_y)  # computed once, printed and stored
    print(i, " err_lgm: ", fold_loss)
    err.append(fold_loss)
    pred_test = m.predict_proba(testing)
    y_pred_tot_lgb += pred_test
# Average over however many folds were actually run.
y_pred_tot_lgb = y_pred_tot_lgb / fold.get_n_splits()
np.mean(err)

Training until validation scores don't improve for 20 rounds
[200]	training's multi_logloss: 0.425229	valid_1's multi_logloss: 0.442599
[400]	training's multi_logloss: 0.342772	valid_1's multi_logloss: 0.377401
[600]	training's multi_logloss: 0.312031	valid_1's multi_logloss: 0.363581
[800]	training's multi_logloss: 0.291081	valid_1's multi_logloss: 0.358438
[1000]	training's multi_logloss: 0.27363	valid_1's multi_logloss: 0.355791
[1200]	training's multi_logloss: 0.258673	valid_1's multi_logloss: 0.354585
Early stopping, best iteration is:
[1366]	training's multi_logloss: 0.247805	valid_1's multi_logloss: 0.354216
1  err_lgm:  0.35421631499025946
Training until validation scores don't improve for 20 rounds
[200]	training's multi_logloss: 0.418809	valid_1's multi_logloss: 0.455144
[400]	training's multi_logloss: 0.33489	valid_1's multi_logloss: 0.393109
[600]	training's multi_logloss: 0.3046	valid_1's multi_logloss: 0.381151
[800]	training's multi_logloss: 0.283089	valid_1's multi_logl

0.36476640700836144

In [62]:
#XGB model
#
# Stratified K-fold training of XGBoost on the same features (`x`, `y`,
# `testing` come from the LGB cell). Each fold's model is scored on its
# validation part and predicts the test set; test probabilities are averaged
# over the folds.
err = []  # validation log-loss per fold
y_pred_tot_xgb = 0

fold = StratifiedKFold(n_splits=3, shuffle=True, random_state=1997)

# enumerate supplies the fold number; previously `i` stayed at 1 for all folds.
for i, (train_index, test_index) in enumerate(fold.split(x, y), start=1):
    x_train, x_val = x.iloc[train_index], x.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    m = XGBClassifier(
        n_estimators=10000,
        eta=0.1,
        n_jobs=-1,
        random_state=69,
        reg_alpha=0.5,  #reg_lambda=1.2 
        colsample_bytree=0.8,
        colsample_bylevel=0.8,
        colsample_bynode=0.8,
        subsample=0.9,
        gamma=1.5,
        max_depth=7)
    # NOTE(review): early_stopping_rounds / eval_metric as fit() arguments are
    # tied to the XGBoost release this notebook was run with; newer releases
    # may expect them on the constructor - confirm against the installed
    # version.
    m.fit(x_train,
          y_train,
          eval_set=[(x_train, y_train), (x_val, y_val)],
          early_stopping_rounds=20,
          eval_metric='mlogloss',
          verbose=20)
    pred_y = m.predict_proba(x_val)
    fold_loss = log_loss(y_val, pred_y)  # computed once, printed and stored
    # Label corrected: this cell reports the XGBoost error, not LightGBM's.
    print(i, " err_xgb: ", fold_loss)
    err.append(fold_loss)
    pred_test = m.predict_proba(testing)
    y_pred_tot_xgb += pred_test
# Average over however many folds were actually run.
y_pred_tot_xgb = y_pred_tot_xgb / fold.get_n_splits()
np.mean(err)

[0]	validation_0-mlogloss:2.23928	validation_1-mlogloss:2.24216
Multiple eval metrics have been passed: 'validation_1-mlogloss' will be used for early stopping.

Will train until validation_1-mlogloss hasn't improved in 20 rounds.
[20]	validation_0-mlogloss:0.55627	validation_1-mlogloss:0.58586
[40]	validation_0-mlogloss:0.36262	validation_1-mlogloss:0.41120
[60]	validation_0-mlogloss:0.30980	validation_1-mlogloss:0.37351
[80]	validation_0-mlogloss:0.28610	validation_1-mlogloss:0.36247
[100]	validation_0-mlogloss:0.27054	validation_1-mlogloss:0.35819
[120]	validation_0-mlogloss:0.25927	validation_1-mlogloss:0.35662
[140]	validation_0-mlogloss:0.25005	validation_1-mlogloss:0.35568
[160]	validation_0-mlogloss:0.24265	validation_1-mlogloss:0.35481
[180]	validation_0-mlogloss:0.23612	validation_1-mlogloss:0.35453
[200]	validation_0-mlogloss:0.23081	validation_1-mlogloss:0.35423
[220]	validation_0-mlogloss:0.22614	validation_1-mlogloss:0.35409
[240]	validation_0-mlogloss:0.22208	validation_

0.36375298214269725

# Averaging submission

In [66]:
# Weighted blend of the two models (60 % LightGBM, 40 % XGBoost). The columns
# of the result are renamed from class indices back to product codes.
pred = 0.6 * y_pred_tot_lgb + 0.4 * y_pred_tot_xgb
class_indices = np.arange(pred.shape[1])
y_test = pd.DataFrame(pred, columns=te.inverse_transform(class_indices))

In [69]:
%%time
# One submission row per (test client, predicted product): the key is
# "<ID> X <PCODE>", the label is the blended probability.
client_ids = ID.tolist()
product_codes = list(y_test.columns)

df_answer = pd.DataFrame({
    # Client-major order, matching the row-major flattening of y_test below.
    'ID X PCODE': [
        client_id + ' X ' + code
        for client_id in client_ids
        for code in product_codes
    ],
    'Label': y_test.to_numpy().ravel(),
})

# Products a client already holds are set to probability 1. A single
# isin/.loc update replaces the per-row list scan and the chained
# `df_answer['Label'].iloc[i] = ...` assignment, which is not guaranteed to
# write through to the frame.
already_owned = df_answer['ID X PCODE'].isin(true_values)
df_answer.loc[already_owned, 'Label'] = 1.0
df_answer.head()

HBox(children=(FloatProgress(value=0.0, max=200000.0), HTML(value='')))


Wall time: 37 s


Unnamed: 0,ID X PCODE,Label
0,F86J5PC X 66FJ,7.7e-05
1,F86J5PC X 7POT,0.000107
2,F86J5PC X 8NN1,1.4e-05
3,F86J5PC X AHXO,0.000102
4,F86J5PC X BSTQ,2.1e-05


In [70]:
# Keep just the two submission columns, on a fresh 0..n-1 index.
sub1 = df_answer[['ID X PCODE', 'Label']].reset_index(drop=True)

In [71]:
# Re-read the sample submission to start from a fresh template (the same file
# was first loaded at the top of the notebook).
sub = pd.read_csv('SampleSubmission.csv')

In [72]:
# Order the template and the predictions by their submission key.
sub = sub.sort_values(by=['ID X PCODE'])
sub1 = sub1.sort_values(by=['ID X PCODE'])

In [73]:
# Copy the predicted labels into the submission template, matching rows by
# their "ID X PCODE" key. Template rows without a prediction keep the label
# they already had.
#
# The lookup is key-based, so it neither depends on both frames having been
# sorted identically beforehand nor fails when a predicted key is missing from
# the template, as the previous positional assignment did.
actual = sub1
findl = actual['ID X PCODE'].values
replacel = actual['Label'].values

predicted = pd.Series(replacel, index=findl)
# Series.map needs unique keys; keep the last value should a key repeat.
predicted = predicted[~predicted.index.duplicated(keep='last')]

matched = sub['ID X PCODE'].map(predicted)
sub['Label'] = matched.fillna(sub['Label'])

In [81]:
# Write the final submission file (without the DataFrame index column).
sub.to_csv('submiss.csv',index=False)

Open Zimnat_insurance_cat_target+multy.ipynb