# Insurance-Recommendation-Challenge-Solution

## Install and import necessary libraries

In [None]:
# Install CatBoost
!pip install catboost

In [None]:
#Import libraries
import pandas as pd, os, gc
import numpy as np
import math
import copy
from itertools import combinations

import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler
from sklearn.metrics import roc_curve, auc, log_loss

from tqdm import tqdm, tqdm_notebook

from sklearn.model_selection import GroupShuffleSplit, StratifiedKFold, train_test_split, GroupKFold
from catboost import CatBoostClassifier
# from xgboost import XGBClassifier

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Mount Google Drive at /content/drive so the CSV files read below are reachable.
# NOTE(review): this cell only works inside Google Colab.

from google.colab import drive
drive.mount('/content/drive')

## Load Data

In [None]:
# Read the three competition CSVs from the mounted Drive folder.

# Folder that holds Train.csv, Test.csv and SampleSubmission.csv
path = '/content/drive/MyDrive/COLAB/DATASETS/Zimnat Insurance Competition'

train = pd.read_csv(f'{path}/Train.csv')
test = pd.read_csv(f'{path}/Test.csv')
sub = pd.read_csv(f'{path}/SampleSubmission.csv')

In [None]:
# Preview the first rows of the raw training table.
train.head()

In [None]:
# Preview the first rows of the raw test table.
test.head()

## Data preparation

In [None]:
# Occupation codes that occur in only one of train/test are blanked out and
# replaced by the row's (coarser) occupation category code.
replace_train = list(set(train['occupation_code'].unique().tolist()) - set(test['occupation_code']))
replace_test = list(set(test['occupation_code'].unique().tolist()) - set(train['occupation_code']))

train['occupation_code'] = train['occupation_code'].replace(replace_train, np.nan)
test['occupation_code'] = test['occupation_code'].replace(replace_test, np.nan)

# Assign the filled column back instead of calling fillna(..., inplace=True) on
# the column selection: with pandas copy-on-write the in-place call operates on
# a temporary object and is not guaranteed to update the frame.
train['occupation_code'] = train['occupation_code'].fillna(train['occupation_category_code'])
test['occupation_code'] = test['occupation_code'].fillna(test['occupation_category_code'])

In [None]:
# Number of products held by each client: the row-wise total of every column
# from position 8 onwards. The test total is incremented by one to account for
# the single product that is hidden in each test row.
train['sum'] = train.iloc[:, 8:].sum(axis=1)
test['sum'] = test.iloc[:, 8:].sum(axis=1) + 1

In [None]:
# Fold the lower-case 'f' marital status into 'F' (train only).
train['marital_status'] = train['marital_status'].replace('f', 'F')

In [None]:
# Append "_<column name>" to every value of the categorical columns so that the
# same raw code appearing in two different columns can no longer collide.
for frame in (train, test):
    for column in ('sex', 'marital_status', 'branch_code', 'occupation_code',
                   'occupation_category_code'):
        frame[column] = frame[column] + '_' + column

In [None]:
# Codes of the 21 product columns. Later cells iterate over this list to read
# the per-product 0/1 flags and to name the derived per-product features.
names_products = [
    'P5DA', 'RIBP', '8NN1', '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ',
    'LJR9', 'N2MW', 'AHXO', 'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D',
    'J9JW', 'GHYX', 'ECY3'
]

In [None]:
# Split every train client into one row per owned product, reproducing the
# test situation: in each generated row exactly one owned product is hidden
# (flag set to 0) and its column name becomes the prediction target.
info_columns = [
    'ID', 'join_date', 'sex', 'marital_status', 'birth_year', 'branch_code',
    'occupation_code', 'occupation_category_code'
]

X_train = []
X_train_columns = train.columns[:-1]  # all columns except the trailing 'sum'
df_train_true = []
client_index = 0  # running id of the generated rows (stored as 'ID2')

for line in tqdm_notebook(train.values):

    info = line[:8]             # client attributes
    info_products = line[8:-1]  # product flags, without the trailing 'sum'
    indexes = [k for k, i in enumerate(info_products) if i == 1]

    for i in indexes:
        client_index += 1

        # Copy the flags and hide product i. The hidden position is known
        # directly, so no scan over all product positions is needed.
        info_products_transformed = list(info_products)
        info_products_transformed[i] = 0

        df_train_true.append(info_products)
        X_train.append(
            list(info) + info_products_transformed +
            [X_train_columns[8 + i]] + [client_index])

X_train = pd.DataFrame(X_train)
df_train_true = pd.DataFrame(df_train_true)
# Reuse the shared product list instead of repeating the 21 codes by hand.
df_train_true.columns = names_products
X_train.columns = info_columns + names_products + ['product_pred', 'ID2']

In [None]:
# Build the test matrix (one row per client, numbered by ID2) and collect the
# "ID X PCODE" keys of the products each test client is already known to own.
test_product_columns = test.columns[8:]
X_test = []
true_values = []
client_index = 0
for client_index, line in enumerate(tqdm_notebook(test.values), start=1):

    info, info_products = line[:8], line[8:-1]
    indexes = [pos for pos, flag in enumerate(info_products) if flag == 1]

    X_test.append([*info, *info_products, client_index])

    true_values.extend(
        line[0] + ' X ' + true for true in test_product_columns[indexes])

X_test = pd.DataFrame(X_test)
X_test.columns = [
    'ID', 'join_date', 'sex', 'marital_status', 'birth_year', 'branch_code',
    'occupation_code', 'occupation_category_code',
    'P5DA', 'RIBP', '8NN1', '7POT', '66FJ', 'GYSR', 'SOP4',
    'RVSZ', 'PYUQ', 'LJR9', 'N2MW', 'AHXO', 'BSTQ', 'FM3X',
    'K6QO', 'QBOL', 'JWFN', 'JZ9D', 'J9JW', 'GHYX', 'ECY3',
    'ID2'
]

In [None]:
# Checking shapes: X_train holds one row per (client, owned product) pair, so
# it is expected to have more rows than the raw train table.
train.shape, X_train.shape

In [None]:
# Peek at the expanded training rows (including 'product_pred' and 'ID2').
X_train.head(2)

In [None]:
# Peek at the test rows in the same layout (no 'product_pred' column).
X_test.head(2)

## Reshaping data

In [None]:
# Pull the shared columns out of train and test as (n_rows, 1) arrays, in one
# fixed order (products first), and keep the hidden-product name as the target.
append_features = [
    'P5DA', 'RIBP', '8NN1', '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ',
    'LJR9', 'N2MW', 'AHXO', 'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D',
    'J9JW', 'GHYX', 'ECY3', 'ID', 'ID2', 'join_date', 'sex', 'marital_status',
    'branch_code', 'occupation_code', 'occupation_category_code', 'birth_year'
]

features_train = [X_train[f].values.reshape(-1, 1) for f in append_features]
features_test = [X_test[f].values.reshape(-1, 1) for f in append_features]
columns = [np.array([f]) for f in append_features]

y_train = X_train[['product_pred']]

In [None]:
# Glue the per-column arrays back into two frames that share the same column
# order. NOTE(review): concatenating text and numeric columns presumably leaves
# every column with object dtype - confirm before relying on numeric dtypes.
columns = np.concatenate(np.array(columns))
features_train = np.concatenate(features_train, axis=1)
features_test = np.concatenate(features_test, axis=1)

X_train = pd.DataFrame(features_train, columns=columns)
X_test = pd.DataFrame(features_test, columns=columns)

## Feature engineering

In [None]:
# Turn the join date into calendar features (day, month, year, weekday, day of
# year), a day counter relative to 2010-01-01, and the client's age at joining.
# pd.Timestamp replaces the `pd.datetime` alias, which pandas 2.0 removed.
origin = pd.Timestamp(2010, 1, 1)
for df in [X_train, X_test]:
    df['join_date'] = pd.to_datetime(df['join_date'], format='%d/%m/%Y')

    # Days elapsed between the fixed origin and the join date.
    df['from_begin'] = (df['join_date'] - origin).dt.days

    df['join_day'] = df['join_date'].dt.day
    df['join_month'] = df['join_date'].dt.month
    df['join_year'] = df['join_date'].dt.year
    df['dayofweek'] = df['join_date'].dt.weekday
    df['day_of_year'] = df['join_date'].dt.dayofyear

    # Age in the year the client joined.
    df['age'] = (df['join_year'] - df['birth_year']).astype(float)

In [None]:
# Stack train on top of test so aggregate features are computed over both.
# pd.concat replaces DataFrame.append, which pandas 2.0 removed. The original
# row labels are kept on purpose, so labels repeat between the train and test
# parts; later cells must therefore address rows by position or boolean mask.
common = pd.concat([X_train, X_test])

In [None]:
# Years between the current calendar year and the earliest join year seen in
# each branch. pd.Timestamp.now() replaces the removed `pd.datetime` alias.
# NOTE(review): the value depends on the year in which the notebook is run.
common['branch_start'] = pd.Timestamp.now().year - common.groupby('branch_code')['join_year'].transform('min')

In [None]:
def transform(df, row):
    """Add one group-wise aggregate column to ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that is both aggregated and extended with the new column.
    row : sequence of four items
        ``[new_column, group_column, value_column, aggregation]``; the
        aggregation is anything ``GroupBy.transform`` accepts (e.g. 'mean').

    The aggregation is computed on ``df`` itself rather than on a global
    frame, so the function works for whichever frame the caller passes in.
    """
    new_column, group_column, value_column, aggregation = row[0], row[1], row[2], row[3]
    df[new_column] = df.groupby(group_column)[value_column].transform(aggregation)

In [None]:
# Group-wise aggregates to add to `common`.
# Each entry reads: [new column, group-by column, aggregated column, aggregation].
row_features = [
    # distinct counts
    ['nuniq_people', 'branch_code', 'ID', 'nunique'],
    ['nuniq_branch_in_year', 'join_year', 'branch_code', 'nunique'],
    ['nuniq_year', 'branch_code', 'join_year', 'nunique'],
    ['nuniq_month', 'branch_code', 'join_month', 'nunique'],
    # age statistics per branch
    ['mean_age_in_branch', 'branch_code', 'age', 'mean'],
    ['std_age_in_branch', 'branch_code', 'age', 'std'],
    ['median_age_in_branch', 'branch_code', 'age', 'median'],
    # age statistics per occupation
    ['mean_age_in_occupation', 'occupation_code', 'age', 'mean'],
    ['std_age_in_occupation', 'occupation_code', 'age', 'std'],
    ['median_age_in_occupation', 'occupation_code', 'age', 'median'],
]
for spec in row_features:
    transform(common, spec)

In [None]:
# Five equal-width birth-year bins plus two crossed categoricals
# (branch x occupation code, branch x occupation category). These helper
# columns are only used for the frequency encoding further down.
common['birth_year_binary'] = pd.cut(common['birth_year'], bins=5)

for crossed_name, right_column in (('branch_ocupation', 'occupation_code'),
                                   ('branch_ocupcode', 'occupation_category_code')):
    common[crossed_name] = common['branch_code'] + '_' + common[right_column]

In [None]:
# Count the visible products per row. The product columns are selected by name
# rather than by position (`iloc[:, :21]`), so the count no longer depends on
# the products being the first 21 columns of `common`.
common['Number_of_Insurance_Bought'] = common[names_products].sum(axis=1)

def mapper(common):
    """Bucket a row's 'Number_of_Insurance_Bought' into a coarse label.

    ``common`` is a single row (anything indexable by column name).
    Returns 'One' for exactly 1, 'Medium' for values strictly between 1 and 5,
    'High' for values strictly between 4 and 8, and 'Too High' otherwise.
    """
    bought = common['Number_of_Insurance_Bought']
    if bought == 1:
        return 'One'
    if 1 < bought < 5:
        return 'Medium'
    if 4 < bought < 8:
        return 'High'
    # Fallback for every remaining value - note that this includes a count of 0.
    return 'Too High'
# Label every row with its bucket, then discard the raw count it came from.
common['Insurance_Count'] = common.apply(mapper, axis=1)
common.drop(columns=['Number_of_Insurance_Bought'], inplace=True)

In [None]:
# Frequency encoding: for each categorical column add "<name>_freq", the share
# of rows in `common` (train and test together) that carry the same value.
n_rows = len(common)
for name in ('sex', 'marital_status', 'occupation_code',
             'occupation_category_code', 'birth_year_binary',
             'branch_ocupation', 'branch_ocupcode', 'Insurance_Count'):
    freq = common.groupby(name).size() / n_rows
    common[name + '_freq'] = common[name].apply(lambda value: freq[value])
    common[name + '_freq'] = common[name + '_freq'].astype(float)

In [None]:
# Replace the text buckets of 'Insurance_Count' with integer codes.
le_ins = LabelEncoder()
le_ins.fit(common['Insurance_Count'])
common['Insurance_Count'] = le_ins.transform(common['Insurance_Count'])

In [None]:
# Encode each row's set of visible products as one categorical id: the owned
# product codes, sorted and joined by '_', then label-encoded.
#
# The combination is built positionally. Selecting rows with
# `common.index == i` is unsafe here because train and test rows share index
# labels, so a test row would overwrite the value of the train row carrying the
# same label; that lookup also made the loop quadratic in the number of rows.
sorted_products = sorted(names_products)
owned_flags = (common[sorted_products] == 1).to_numpy()
common['product_comb'] = [
    '_'.join(code for code, owned in zip(sorted_products, row_flags) if owned)
    for row_flags in owned_flags
]
common['product_comb'] = le_ins.fit_transform(common['product_comb'])

In [None]:
# For every ordered pair of distinct products (A, B) add the column "A_B":
# the total of B's flag over all rows sharing the same value of A's flag.
# The aggregation is named by the string 'sum'; passing the Python builtin
# `sum` to GroupBy.transform is deprecated in recent pandas releases.
for col in tqdm_notebook(names_products):
    for cols in names_products:
        if col != cols:
            common[col + '_' + cols] = common.groupby(col)[cols].transform('sum')

In [None]:
# The bin/cross helper columns were only needed for frequency encoding.
helper_columns = ['birth_year_binary', 'branch_ocupation', 'branch_ocupcode']
common = common.drop(columns=helper_columns)

In [None]:
# Approximate "days since the branch opened" and "days since the branch first
# sold each product", both measured on the `from_begin` day counter and taken
# from the earliest matching row in `common`.
for code in tqdm_notebook(common.branch_code.unique()):
    in_branch = common.branch_code == code
    branch_days = common.loc[in_branch, 'from_begin']

    common.loc[in_branch, 'from_arise_branch'] = branch_days - branch_days.min()

    for product in names_products:
        # Earliest day on which a row of this branch shows the product as owned.
        first_sale_day = common.loc[in_branch & (common[product] == 1), 'from_begin'].min()
        common.loc[in_branch, 'from_arise_product_' + product + '_in_branch'] = \
            branch_days - first_sale_day

In [None]:
# Per-product features:
#   * days between the row's join date and the earliest join date of any row
#     owning the product,
#   * number of owners of the product in the row's branch and join year,
#   * mean / std / median age of the rows sharing the row's flag for the product.
# Aggregations are named by string ('sum'); passing the Python builtin `sum`
# to GroupBy.transform is deprecated in recent pandas releases.
for product in tqdm_notebook(names_products):
    first_join = common.loc[common[product] == 1, 'join_date'].min()
    common['from_arise_product_' + product] = (common['join_date'] - first_join).dt.days
    common[product + '_' + 'sum_in_branch'] = common.groupby('branch_code')[product].transform('sum')
    common[product + '_' + '_age_mean'] = common.groupby(product)['age'].transform('mean')
    common[product + '_' + '_age_std'] = common.groupby(product)['age'].transform('std')
    common[product + '_' + '_age_median'] = common.groupby(product)['age'].transform('median')
    common[product + '_' + '_sum_join_year'] = common.groupby('join_year')[product].transform('sum')

In [None]:
# Split the stacked frame back into its train and test parts.
# The boundary is the number of training rows, taken from y_train (one target
# per expanded training row), instead of a hard-coded row count that silently
# breaks when the input data changes.
n_train_rows = len(y_train)
X_train = common[:n_train_rows]
X_test = common[n_train_rows:]

## Encoding

In [None]:
# Encode the target - the code of the hidden product - as integer class ids.
# `le` is kept so predictions can be mapped back to product codes later.
le = LabelEncoder()
le.fit(y_train.iloc[:, 0])

y_train = pd.DataFrame({'target': le.transform(y_train.iloc[:, 0])})

In [None]:
# Attach the per-client product count ('sum') computed on the raw tables.
# The join key is stated explicitly and `validate` asserts that each ID occurs
# at most once on the right-hand side; a duplicated ID would otherwise multiply
# rows silently and break the row-for-row alignment with y_train.
X_train = X_train.merge(train[['ID', 'sum']], on='ID', validate='many_to_one')
X_test = X_test.merge(test[['ID', 'sum']], on='ID', validate='many_to_one')

In [None]:
# Add string-typed copies of some numeric features so they can be passed to
# the model as categorical features. 'birth_year' is converted in place; the
# others get a new "*_cat" column next to the numeric original.
string_copies = {
    'dayofweek_cat': 'dayofweek',
    'from_begin_cat': 'from_begin',
    'birth_year': 'birth_year',
    'join_year_cat': 'join_year',
    'sum_cat': 'sum',
    'day_of_year_cat': 'day_of_year',
}
for df in [X_train, X_test]:
    for target_column, source_column in string_copies.items():
        df[target_column] = df[source_column].astype(str)

## Model

In [None]:
# Categorical features handed to CatBoost via `cat_features`.
# NOTE(review): 'day_of_year_cat' is created as a string column in the previous
# cell but is not listed here, so it is not declared categorical - confirm
# whether that omission is intended.
cat_features = ['sex','marital_status','branch_code','occupation_category_code','occupation_code','dayofweek_cat','from_begin_cat',\
                'sum_cat','birth_year','join_year_cat']

In [None]:
# Grouped K-fold training: all expanded rows of one client (same 'ID') stay in
# the same fold, so a client never appears in both the fit and validation part.
# Each fold refits `model_cat`, adds its best validation MultiClass loss to
# `scoring`, and stores class probabilities for the test set and the full
# training set.
n_folds = 5  # single source of truth for the number of folds

model_cat = CatBoostClassifier(depth=5, n_estimators=15000, learning_rate=0.01,
                               random_state=567, task_type='GPU',
                               thread_count=1, verbose=100,
                               use_best_model=True, nan_mode='Max')

probs = []        # per-fold test probabilities
probs_train = []  # per-fold probabilities on the whole training set
scoring = 0
group_kfold = GroupKFold(n_splits=n_folds)
# 'ID'/'ID2' are identifiers and 'join_date' is a raw timestamp: not features.
cols = X_train.drop(columns=['ID', 'ID2', 'join_date']).columns

fold_splits = group_kfold.split(X_train, y_train, np.array(X_train['ID']))
for fold, (train_index, test_index) in enumerate(fold_splits, start=1):
    X_real_train, X_valid = X_train.iloc[train_index], X_train.iloc[test_index]
    y_real_train, y_valid = y_train.iloc[train_index], y_train.iloc[test_index]
    print('Fold', fold)
    model_cat.fit(
        X_real_train[cols],
        y_real_train,
        cat_features=cat_features,
        eval_set=[(X_valid[cols], y_valid)],
        early_stopping_rounds=200,
    )
    scoring += model_cat.get_best_score()['validation']['MultiClass']

    proba = model_cat.predict_proba(X_test[cols])
    probs.append(proba)
    probs_train.append(model_cat.predict_proba(X_train[cols]))

# Average over the folds actually configured, not a hard-coded 5.
scoring /= n_folds
print('MEAN SCORE =', scoring)

In [None]:
# Features with importance above 1. `model_cat` is refit in every fold of the
# loop above, so these importances come from the most recent fit only.
pd.DataFrame(model_cat.feature_importances_,
                     index=cols,
                     columns=['importance']).query('importance>1')

In [None]:
# Average the test-set class probabilities over all folds.
# np.mean over the stacked fold arrays adapts to however many folds were run
# and to the actual (n_test_rows, n_classes) shape, replacing the hard-coded
# (10000, 21) accumulator and the division by 5.
new_a = np.mean(probs, axis=0)

In [None]:
# Wrap the averaged probabilities in a frame whose columns are product codes:
# column position j is mapped back through the target encoder to its code.
class_positions = np.arange(new_a.shape[1])
y_test = pd.DataFrame(new_a, columns=le.inverse_transform(class_positions))

In [None]:
# Post-process the predictions row by row:
#   * products the client already owns get probability 1.0,
#   * the probabilities of all other products are rescaled to sum to 1.
# Done with array operations; the previous per-row, per-product
# `y_test.index == i` lookups scanned the whole frame for every cell.
owned = (X_test[names_products] == 1).to_numpy()
predicted = y_test[names_products].to_numpy(dtype=float)

not_owned_proba = np.where(owned, 0.0, predicted)
not_owned_total = not_owned_proba.sum(axis=1, keepdims=True)

# A row whose remaining products sum to 0 divides by zero; its owned entries
# are overwritten with 1.0 below, so the warning is silenced.
with np.errstate(divide='ignore', invalid='ignore'):
    rescaled = not_owned_proba / not_owned_total

y_test[names_products] = np.where(owned, 1.0, rescaled)

In [None]:
# Reshape the predictions into the long submission layout: one row per
# (client, product) with key "<ID> X <PCODE>" and the predicted probability.
# Rows are emitted client by client, products in the column order of y_test.
# The loop variable no longer shadows the builtin `id`, and values are read
# from plain arrays instead of one `.iloc` lookup per cell.
client_ids = X_test['ID'].tolist()
product_codes = list(y_test.columns)
probabilities = y_test.to_numpy()

answer_mass = [
    [client_id + ' X ' + code, probabilities[row_pos, col_pos]]
    for row_pos, client_id in enumerate(client_ids)
    for col_pos, code in enumerate(product_codes)
]

df_answer = pd.DataFrame(answer_mass)
df_answer.columns = ['ID X PCODE', 'Label']
df_answer.head()

## Submission

In [None]:
# Submit: write the long-format predictions to 'submis_1.csv' in the current
# working directory, without the DataFrame index.
df_answer.to_csv('submis_1.csv', index = False)

Public leaderboard -> 0.027052459 <br>
Private Leaderboard -> 0.026766397