## Generating predictions for the submission dataset

In [1]:
from catboost import CatBoostClassifier, Pool
import pandas as pd
from collections import Counter
from tqdm.notebook import tqdm
import numpy as np

### 0. Loading and preparing data

In [2]:
# Read the processed test-features parquet file into a DataFrame.
df = pd.read_parquet('../data/processed/test_features.parquet')
# Last expression of the cell: shows (n_rows, n_columns) of the loaded frame.
df.shape

(62096, 959)

In [3]:
# Replace whatever index the file carried with a fresh 0..n-1 index
# (the old index is discarded, not kept as a column).
df = df.reset_index(drop=True)

In [4]:
# Preview the first rows of the loaded feature table.
df.head()

Unnamed: 0,user_id,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,...,deltas_no_mean,deltas_no_std,deltas_no_max,deltas_no_median,deltas_no_num_outliers,secs_elapsed_mean,secs_elapsed_std,secs_elapsed_max,secs_elapsed_median,device_count
0,5uwns89zht,FEMALE,35.0,facebook,0,en,direct,direct,untracked,Moweb,...,5196.666667,6359.828632,15022.0,1405.0,1.0,14898.375,21621.413801,64883.0,2018.5,2.0
1,jtl0dijy2j,-unknown-,-1.0,basic,0,en,direct,direct,untracked,Moweb,...,1831.882353,2244.092765,8105.0,664.0,1.0,13164.157895,18194.475733,61881.0,3228.0,2.0
2,xx0ulgorjt,-unknown-,-1.0,basic,0,en,direct,direct,linked,Web,...,791.036364,1185.291862,4609.0,280.0,2.0,16820.258621,36679.560932,270618.0,8635.5,1.0
3,6c6puo6ix0,-unknown-,-1.0,basic,0,en,direct,direct,linked,Web,...,1490.444444,3739.483865,12042.0,77.0,1.0,11181.909091,30370.377866,106481.0,231.0,1.0
4,czqhjk3yfe,-unknown-,-1.0,basic,0,en,direct,direct,untracked,Web,...,5992.0,12386.32721,35580.0,186.0,1.0,23895.947368,53805.073963,219369.0,1123.0,1.0


In [5]:
# Remove the label column so it is not fed to the model as a feature.
# `errors='ignore'` keeps this cell re-runnable: a second execution (or a
# feature file that lacks the column) no longer raises KeyError.
df = df.drop(columns='country_destination', errors='ignore')

In [6]:
# Model input: every column except the user identifier. `df` itself keeps
# `user_id` so the ids can be attached to the predictions later.
x = df.drop(columns=['user_id'])

### 1. Preparing categorical features

In [7]:
# Columns that get their missing values filled and are cast to the pandas
# 'category' dtype before prediction.
cat_features = [
    'gender', 'signup_method', 'signup_flow', 'language',
    'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked',
    'signup_app', 'first_device_type', 'first_browser',
    'dow_registered', 'hr_registered', 'age_group',
]

In [8]:
# Fill gaps with an empty string and convert each categorical column to the
# 'category' dtype.
# The result is assigned back to `x[col]` on purpose: calling
# `x[col].fillna(..., inplace=True)` operates on a temporary Series, which
# triggers a chained-assignment warning on pandas 2.x and leaves `x`
# unchanged under Copy-on-Write, so NaNs would survive into the model input.
for col in cat_features:
    x[col] = x[col].fillna('').astype('category')

### 2. Loading model

In [9]:
# Create an empty classifier and populate it from the saved model file.
model = CatBoostClassifier()
# As the last expression in the cell, the value returned by load_model is
# what the notebook displays below (a CatBoostClassifier repr).
model.load_model('../models/model2.cbm')

<catboost.core.CatBoostClassifier at 0x7f24bd16cca0>

### 3. Predicting Country of Destination

In [10]:
# x_pool = Pool(x, cat_features=cat_features)

In [11]:
# preds = model.predict(x, prediction_type='Class')
# preds = [el[0] for el in preds.tolist()]

In [12]:
# Counter(preds)

### 3.2 Predicting Country of Destination using multiple predictions where applicable

In [13]:
# classes = list(model.classes_)
# classes

# preds = model.predict_proba(x)
# preds = preds.tolist()
# # preds = [el[0] for el in preds]
# preds_df = pd.DataFrame(preds)
# preds_df.shape

# preds_df['preds'] = preds

# preds_df['amax'] = preds_df.preds.apply(lambda x: max(x))

# preds_df.head()

# def find_nth_max_index(x, ix=1):
#     second_max = sorted(x)[-ix]
#     return x.index(second_max)


# def get_nth_best_result(x, ix=1):
#     return classes[find_nth_max_index(x, ix=ix)]

# find_nth_max_index(preds_df.loc[0].preds), get_nth_best_result(preds_df.loc[0].preds)

# preds_df['second'] = preds_df.preds.apply(lambda x: get_nth_best_result(x, ix=2))
# preds_df['first'] = preds_df.preds.apply(lambda x: get_nth_best_result(x))

# preds_df.head()

# preds_df['result'] = preds_df[['first', 'second']].apply(lambda x: list(x), axis=1)

# preds_df.head()

# mask = preds_df.amax < 0.65
# mask.sum()

# preds_df.loc[~mask, 'result'] = preds_df.loc[~mask, 'first']

# preds_df['id'] = df['user_id']

# preds_df.sample(10, random_state=42)

# submission = preds_df[['id', 'result']].explode('result')
# submission.shape

# submission.head()

# submission[submission.id == 'ycr4e6e5qv']



### 3.3 Predicting 5 Countries of Destination per each user

In [14]:
# Class labels known to the loaded model. The helper functions below index
# into this list with positions taken from each probability row.
classes = list(model.classes_)
classes

['AU', 'CA', 'DE', 'ES', 'FR', 'GB', 'IT', 'NDF', 'NL', 'PT', 'US', 'other']

In [15]:
# Per-class scores for every row of `x`, converted straight to nested
# Python lists (one inner list per row).
preds = model.predict_proba(x).tolist()

In [16]:
# One row per prediction: 'preds' stores the full list of scores and
# 'amax' the largest score in that list.
preds_df = pd.DataFrame({'preds': preds})
preds_df['amax'] = preds_df['preds'].map(max)

In [17]:
# Each 'preds' cell is a list of per-class scores; 'amax' is its maximum.
preds_df.head()

Unnamed: 0,preds,amax
0,"[0.0008875477231299356, 0.0018805796276288864,...",0.690769
1,"[0.00029260611175401977, 0.0009368319392331375...",0.922361
2,"[0.0001592182064140176, 0.002028397378663301, ...",0.89928
3,"[0.00028890401365596426, 0.0016611787305380538...",0.893099
4,"[0.0015819269488524252, 0.019569871009474507, ...",0.528054


In [18]:
def find_max_n_indeces(x, n=1):
    """Return the positions of the ``n`` largest values in ``x``, best first.

    Args:
        x: An indexable sequence of comparable scores (anything supporting
            ``len`` and integer indexing, e.g. a list or a 1-D array).
        n: How many positions to return. If ``n`` exceeds ``len(x)``, all
            positions are returned.

    Returns:
        A list of distinct integer positions ordered by descending score.
        Equal scores are ordered by ascending position.
    """
    # Rank the positions themselves rather than looking values up again
    # with `x.index(value)`: `.index` always returns the first match, so
    # tied scores used to yield the same position more than once.
    # Python's sort is stable even with reverse=True, so ties keep their
    # original left-to-right order.
    ranked = sorted(range(len(x)), key=lambda i: x[i], reverse=True)
    return ranked[:n]


def get_best_n_result(x, n=5):
    """Return the labels for the ``n`` highest-scoring positions of ``x``.

    Positions come from ``find_max_n_indeces`` and are looked up in the
    notebook-level ``classes`` list; the output keeps that ranking order.
    """
    return [classes[position] for position in find_max_n_indeces(x, n=n)]

In [19]:
# Sanity check on the first prediction row: the top-5 positions and the
# labels they map to, shown side by side as a tuple.
(
    find_max_n_indeces(preds_df.loc[0, 'preds'], n=5),
    get_best_n_result(preds_df.loc[0, 'preds'], n=5),
)

([7, 10, 11, 4, 3], ['NDF', 'US', 'other', 'FR', 'ES'])

In [20]:
# For every row, store the five best-scoring labels as a list.
preds_df['best5'] = preds_df['preds'].map(
    lambda scores: get_best_n_result(scores, n=5)
)

In [21]:
# Confirm that 'best5' holds a list of five labels per row.
preds_df.head()

Unnamed: 0,preds,amax,best5
0,"[0.0008875477231299356, 0.0018805796276288864,...",0.690769,"[NDF, US, other, FR, ES]"
1,"[0.00029260611175401977, 0.0009368319392331375...",0.922361,"[NDF, US, other, IT, FR]"
2,"[0.0001592182064140176, 0.002028397378663301, ...",0.89928,"[NDF, US, other, FR, IT]"
3,"[0.00028890401365596426, 0.0016611787305380538...",0.893099,"[NDF, US, other, FR, IT]"
4,"[0.0015819269488524252, 0.019569871009474507, ...",0.528054,"[US, NDF, other, FR, IT]"


In [22]:
# Attach the user id to each prediction row.
# `preds` was computed from `x`, which was derived from `df` without
# reordering, so row i of `preds_df` belongs to row i of `df`. Copy the ids
# positionally (as a plain array) so the result does not depend on the two
# frames happening to share the same index labels.
preds_df['id'] = df['user_id'].to_numpy()

In [23]:
# Confirm that every prediction row now carries its user id.
preds_df.head()

Unnamed: 0,preds,amax,best5,id
0,"[0.0008875477231299356, 0.0018805796276288864,...",0.690769,"[NDF, US, other, FR, ES]",5uwns89zht
1,"[0.00029260611175401977, 0.0009368319392331375...",0.922361,"[NDF, US, other, IT, FR]",jtl0dijy2j
2,"[0.0001592182064140176, 0.002028397378663301, ...",0.89928,"[NDF, US, other, FR, IT]",xx0ulgorjt
3,"[0.00028890401365596426, 0.0016611787305380538...",0.893099,"[NDF, US, other, FR, IT]",6c6puo6ix0
4,"[0.0015819269488524252, 0.019569871009474507, ...",0.528054,"[US, NDF, other, FR, IT]",czqhjk3yfe


In [24]:
# Long format: turn each list in 'best5' into one row per label, repeating
# the 'id' (and the original row index) for every label of that user.
submission = preds_df.loc[:, ['id', 'best5']].explode(column='best5')
submission.shape

(310480, 2)

In [25]:
# One row per (id, label) pair; the index repeats the source row number.
submission.head()

Unnamed: 0,id,best5
0,5uwns89zht,NDF
0,5uwns89zht,US
0,5uwns89zht,other
0,5uwns89zht,FR
0,5uwns89zht,ES


### 4. Saving NDCG-aware submission

In [31]:
# Give the label column its final name; 'id' is already named correctly.
submission = submission.rename(columns={'best5': 'country'})

In [32]:
# Write the submission file. index=False leaves out the DataFrame index,
# which after explode() only repeats the source row number.
submission.to_csv('../data/results/submission8.csv', index=False)