## Generatig predicitons for the submission dataset

In [1]:
from catboost import CatBoostClassifier, Pool
import pandas as pd
from collections import Counter
from tqdm.notebook import tqdm
import numpy as np
import glob

### 0. Loading and preparing data

In [2]:
# df = pd.read_parquet('../data/processed/test_features.parquet')
# df = pd.read_parquet('../data/processed/test_features_uncorr.parquet')
df = pd.read_parquet('../data/processed/test_features.parquet')
df.shape

(62096, 1105)

In [4]:
df['nan_counts'] = df.isnull().sum(axis=1)

In [5]:
if 'train_flag' in df:
    df.drop('train_flag', inplace=True, axis=1)
    df.reset_index(drop=True, inplace=True)
    df.shape

In [6]:
df.head()

Unnamed: 0,user_id,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,...,deltas_no_num_outliers,secs_elapsed_mean,secs_elapsed_std,secs_elapsed_max,secs_elapsed_min,secs_elapsed_median,secs_elapsed_mode,secs_elapsed_mode_count,device_count,nan_counts
0,5uwns89zht,FEMALE,35.0,facebook,0,en,direct,direct,untracked,Moweb,...,1.0,14898.375,21621.413801,64883.0,0.0,2018.5,0.0,1.0,2.0,331
1,jtl0dijy2j,-unknown-,-1.0,basic,0,en,direct,direct,untracked,Moweb,...,1.0,13164.157895,18194.475733,61881.0,0.0,3228.0,0.0,1.0,2.0,327
2,xx0ulgorjt,-unknown-,-1.0,basic,0,en,direct,direct,linked,Web,...,2.0,16820.258621,36679.560932,270618.0,0.0,8635.5,768.0,2.0,1.0,330
3,6c6puo6ix0,-unknown-,-1.0,basic,0,en,direct,direct,linked,Web,...,1.0,11181.909091,30370.377866,106481.0,0.0,231.0,0.0,1.0,1.0,329
4,czqhjk3yfe,-unknown-,-1.0,basic,0,en,direct,direct,untracked,Web,...,1.0,23895.947368,53805.073963,219369.0,0.0,1123.0,0.0,2.0,1.0,322


In [7]:
df.drop('country_destination', axis=1, inplace=True)

In [8]:
x = df.drop('user_id', axis=1)

In [9]:
cat_features = [
    'gender',
    'signup_method',
    'signup_flow',
    'language',
    'affiliate_channel',
    'affiliate_provider',
    'first_affiliate_tracked',
    'signup_app',
    'first_device_type',
    'first_browser',
    'dow_registered',
    'hr_registered',
    'age_group',
    'dow_registered',
    'day_registered',
    'month_registered',
    'year_registered',
]

In [10]:
cat_features_remained = list(set(cat_features).intersection(set(df)))
cat_features_removed = set(cat_features) - set(df)
len(cat_features), len(cat_features_remained), len(cat_features_removed)                                      

(17, 16, 0)

In [11]:
for col in cat_features_remained:
    x[col].fillna('', inplace=True)
    x[col] = x[col].astype('category')

### 2. Loading model

In [12]:
def find_latest_model(path='../models/', ext='cbm'):
    files = glob.glob(f"{path}*.{ext}")
    files = [file for file in files if len(file) > 25]
    files = sorted(files)
    return files[-1]

In [13]:
path = find_latest_model()
path

'../models/model_2021_09_07_13_13_22.cbm'

In [14]:
model = CatBoostClassifier()
model.load_model(path)

<catboost.core.CatBoostClassifier at 0x7f5ea7c55ca0>

### 3.3 Predicting 5 Countries of Destination per each user

In [15]:
classes = list(model.classes_)
classes

['AU', 'CA', 'DE', 'ES', 'FR', 'GB', 'IT', 'NDF', 'NL', 'PT', 'US', 'other']

In [16]:
preds = model.predict_proba(x)
preds = preds.tolist()

In [17]:
preds_df = pd.DataFrame({'preds': preds})
preds_df['amax'] = preds_df.preds.apply(lambda x: max(x))

In [41]:
preds_df.head()

Unnamed: 0,preds,amax
0,"[0.0010269467471738652, 0.0014246948595342296,...",0.747805
1,"[0.0004502293971102514, 0.00125392177918135, 0...",0.935231
2,"[0.000261753904187894, 0.001617964762779733, 0...",0.925779
3,"[0.000669060777995511, 0.002272208988472939, 0...",0.900147
4,"[0.002998427260432197, 0.01306027545451322, 0....",0.573226


In [22]:
def find_max_n_indeces(x, n=1):
    n_indeces = sorted(x, reverse=True)[:n]
    n_indeces = [x.index(el) for el in n_indeces]
    return n_indeces


def get_best_n_result(x, n=5):
    n_indeces = find_max_n_indeces(x, n=n)
    best_n = [classes[ix] for ix in n_indeces]
    return best_n

In [23]:
find_max_n_indeces(preds_df.loc[0].preds, n=5), get_best_n_result(preds_df.loc[0].preds, n=5)

([7, 10, 11, 4, 6], ['NDF', 'US', 'other', 'FR', 'IT'])

In [43]:
find_max_n_indeces(preds_df.loc[0].preds, n=5), get_best_n_result(preds_df.loc[0].preds, n=5)

([7, 10, 11, 4, 6], ['NDF', 'US', 'other', 'FR', 'IT'])

In [24]:
preds_df['best5'] = preds_df.preds.apply(lambda x: get_best_n_result(x, n=5))

In [25]:
preds_df.head()

Unnamed: 0,preds,amax,best5
0,"[0.0014153491591661668, 0.0019351088592688157,...",0.738517,"[NDF, US, other, FR, IT]"
1,"[0.0003372142934274278, 0.0009500093615831471,...",0.933253,"[NDF, US, other, FR, IT]"
2,"[0.0005100003462029771, 0.0024536754525149116,...",0.902827,"[NDF, US, other, FR, IT]"
3,"[0.0006337672109018841, 0.001682966073952498, ...",0.900345,"[NDF, US, other, FR, IT]"
4,"[0.0014454263928507245, 0.010040377251178444, ...",0.61781,"[US, NDF, other, FR, IT]"


In [45]:
preds_df.head()

Unnamed: 0,preds,amax,best5
0,"[0.0010269467471738652, 0.0014246948595342296,...",0.747805,"[NDF, US, other, FR, IT]"
1,"[0.0004502293971102514, 0.00125392177918135, 0...",0.935231,"[NDF, US, other, FR, IT]"
2,"[0.000261753904187894, 0.001617964762779733, 0...",0.925779,"[NDF, US, other, FR, IT]"
3,"[0.000669060777995511, 0.002272208988472939, 0...",0.900147,"[NDF, US, other, FR, IT]"
4,"[0.002998427260432197, 0.01306027545451322, 0....",0.573226,"[US, NDF, other, FR, IT]"


In [26]:
preds_df['id'] = df['user_id']

In [27]:
preds_df.head()

Unnamed: 0,preds,amax,best5,id
0,"[0.0014153491591661668, 0.0019351088592688157,...",0.738517,"[NDF, US, other, FR, IT]",5uwns89zht
1,"[0.0003372142934274278, 0.0009500093615831471,...",0.933253,"[NDF, US, other, FR, IT]",jtl0dijy2j
2,"[0.0005100003462029771, 0.0024536754525149116,...",0.902827,"[NDF, US, other, FR, IT]",xx0ulgorjt
3,"[0.0006337672109018841, 0.001682966073952498, ...",0.900345,"[NDF, US, other, FR, IT]",6c6puo6ix0
4,"[0.0014454263928507245, 0.010040377251178444, ...",0.61781,"[US, NDF, other, FR, IT]",czqhjk3yfe


In [28]:
submission = preds_df[['id', 'best5']].explode('best5')
submission.shape

(310480, 2)

In [29]:
submission.head()

Unnamed: 0,id,best5
0,5uwns89zht,NDF
0,5uwns89zht,US
0,5uwns89zht,other
0,5uwns89zht,FR
0,5uwns89zht,IT


### 4 Saving NDCG aware submission

In [30]:
submission.columns = ['id', 'country']

In [31]:
path[-24:-4]

'_2021_09_07_13_13_22'

In [32]:
submission.to_csv(f'../data/results/submission{path[-24:-4]}.csv', index=False)

In [31]:
def get_sum_of_top_n(x, n=1):
    return sum(sorted(x, reverse=True)[:n])

In [39]:
preds_df['sum5'] = preds_df.preds.apply(lambda x: get_sum_of_top_n(x, 5))
preds_df['sum4'] = preds_df.preds.apply(lambda x: get_sum_of_top_n(x, 4))

In [104]:
preds_df.head()

Unnamed: 0,preds,amax,best5,id,sum5,sum4
0,"[0.0017519423826669101, 0.0025576312800302354,...",0.722504,"[NDF, US, other, FR, ES]",5uwns89zht,0.98146,0.975482
1,"[0.0003099675259051736, 0.0012446993304778772,...",0.915085,"[NDF, US, other, FR, IT]",jtl0dijy2j,0.992252,0.988651
2,"[0.0004179162173000709, 0.0018952815415556588,...",0.907168,"[NDF, US, other, FR, IT]",xx0ulgorjt,0.988325,0.983484
3,"[0.0005074904824394839, 0.002024573985089579, ...",0.89367,"[NDF, US, other, FR, IT]",6c6puo6ix0,0.987371,0.981406
4,"[0.0016562760789289195, 0.0210273290185944, 0....",0.535922,"[US, NDF, other, FR, IT]",czqhjk3yfe,0.910988,0.873365


In [129]:
s = preds_df[['id', 'preds']].explode('preds').copy(deep=True)
s.shape

(745152, 2)

In [130]:
s.preds = s.preds.astype(float)

In [131]:
classes

['AU', 'CA', 'DE', 'ES', 'FR', 'GB', 'IT', 'NDF', 'NL', 'PT', 'US', 'other']

In [132]:
s['country'] = classes * int(len(s) / len(classes))

In [133]:
s.head()

Unnamed: 0,id,preds,country
0,5uwns89zht,0.001752,AU
0,5uwns89zht,0.002558,CA
0,5uwns89zht,0.002094,DE
0,5uwns89zht,0.005978,ES
0,5uwns89zht,0.009647,FR


In [134]:
type(s.loc[0].preds)

pandas.core.series.Series

In [135]:
s = s.sort_values(['id', 'preds'], ascending=[True, False])
s.reset_index(drop=True, inplace=True)

In [136]:
ss = s[['id', 'preds']].groupby('id', as_index=False).preds.cumsum()

In [137]:
s['preds_sum'] = ss

In [138]:
s.head(15)

Unnamed: 0,id,preds,country,preds_sum
0,0010k6l0om,0.894683,NDF,0.894683
1,0010k6l0om,0.060999,US,0.955681
2,0010k6l0om,0.016941,other,0.972623
3,0010k6l0om,0.007212,FR,0.979835
4,0010k6l0om,0.006491,IT,0.986326
5,0010k6l0om,0.003739,ES,0.990065
6,0010k6l0om,0.003323,GB,0.993388
7,0010k6l0om,0.003198,CA,0.996586
8,0010k6l0om,0.001122,DE,0.997708
9,0010k6l0om,0.000962,NL,0.99867


In [183]:
best = s.groupby('id', as_index=False).head(4)
best.shape

(186288, 4)

In [184]:
best.head()

Unnamed: 0,id,preds,country,preds_sum
0,0010k6l0om,0.894683,NDF,0.894683
1,0010k6l0om,0.060999,US,0.955681
2,0010k6l0om,0.016941,other,0.972623
12,0031awlkjq,0.827403,NDF,0.827403
13,0031awlkjq,0.116693,US,0.944096


In [185]:
result = s[s.preds_sum < 0.90]
result.shape, s.shape, result.id.nunique(), s.id.nunique()

((86578, 4), (745152, 4), 45815, 62096)

In [186]:
result = pd.concat([best, result])
result = result.drop_duplicates(['id', 'country'])
result.shape, s.shape, result.id.nunique(), s.id.nunique()

((192736, 4), (745152, 4), 62096, 62096)

In [187]:
result[['id', 'country']].to_csv('../data/results/submission_cumsum_head3_90.csv', index=False)
result.shape

(192736, 4)

In [33]:
'gender' in df

True

In [4]:
df.shape

(62096, 1105)

In [5]:
users = pd.get_dummies(df[['gender', 'user_id', 'country_destination']], columns=['gender'])

In [6]:
users.head()

Unnamed: 0,user_id,country_destination,gender_-unknown-,gender_FEMALE,gender_MALE,gender_OTHER
213451,5uwns89zht,,0,1,0,0
213452,jtl0dijy2j,,1,0,0,0
213453,xx0ulgorjt,,1,0,0,0
213454,6c6puo6ix0,,1,0,0,0
213455,czqhjk3yfe,,1,0,0,0
