## Generating predictions for the submission dataset

In [25]:
from catboost import CatBoostClassifier, Pool
import pandas as pd
from collections import Counter
import numpy as np

### 0. Loading and preparing data

In [2]:
# Load the processed test-set feature table; the shape is displayed as a quick sanity check.
df = pd.read_parquet('../data/processed/features_test.parquet')
df.shape

(61664, 336)

In [3]:
# Model inputs: every column of df except the user_id identifier.
x = df.drop(columns='user_id')

### 1. Splitting train data into stratified train, validation and test sets

In [4]:
# Columns handled as categorical: each one is NaN-filled and cast to the
# pandas 'category' dtype in the next cell, and the same list is passed to
# catboost.Pool as cat_features.
cat_features = [
    'gender',
    'signup_method',
    'signup_flow',
    'language',
    'affiliate_channel',
    'affiliate_provider',
    'first_affiliate_tracked',
    'signup_app',
    'first_device_type',
    'first_browser',
    'dow_registered',
    'hr_registered'
]

In [5]:
# Replace missing values with '' and cast every categorical column to the
# 'category' dtype. The result is assigned back to x[col] instead of calling
# fillna(..., inplace=True) on x[col]: that chained in-place call acts on a
# temporary Series under pandas Copy-on-Write and would leave x unfilled.
for col in cat_features:
    x[col] = x[col].fillna('').astype('category')

### 2. Loading model

In [18]:
# Create an empty classifier and populate it from the saved model file.
model = CatBoostClassifier()
model.load_model('../models/model1.cbm')

<catboost.core.CatBoostClassifier at 0x7f1369e77460>

### 3. Predicting Country of Destination

In [19]:
# Wrap the features in a CatBoost Pool with the categorical columns declared.
# NOTE(review): x_pool is not referenced by any later cell visible here; the
# predict / predict_proba calls below receive `x` directly.
x_pool = Pool(x, cat_features=cat_features)

In [None]:
# Top-1 class label per row; each predicted row is unwrapped with [0] so that
# `preds` ends up as a flat list of labels.
class_rows = model.predict(x, prediction_type='Class').tolist()
preds = [row[0] for row in class_rows]

In [21]:
Counter(preds)

Counter({'NDF': 46876, 'US': 14691, 'other': 50, 'FR': 46, 'IT': 1})

### 3.2 Predicting Country of Destination using multiple predictions where applicable

In [45]:
# Class labels as exposed by the loaded model; get_nth_best_result() below
# uses this list to translate a probability position into a label.
classes = list(model.classes_)
classes

['AU', 'CA', 'DE', 'ES', 'FR', 'GB', 'IT', 'NDF', 'NL', 'PT', 'US', 'other']

In [130]:
# Class probabilities as plain Python lists (one list per row). The list form
# is kept in `preds` because it is stored in the `preds` column further down.
preds = model.predict_proba(x).tolist()
preds_df = pd.DataFrame(preds)
preds_df.shape

(61664, 12)

In [131]:
preds_df['preds'] = preds

In [132]:
# Highest class probability of each row.
preds_df['amax'] = preds_df['preds'].apply(max)

In [133]:
preds_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,preds,amax
0,0.000578,0.002447,0.001743,0.005855,0.010774,0.005971,0.008226,0.677725,0.002294,0.000546,0.252005,0.031836,"[0.0005779137823761972, 0.002447165487599636, ...",0.677725
1,0.000271,0.001503,0.000369,0.002284,0.007125,0.002967,0.004562,0.919998,0.00039,0.000359,0.048673,0.011498,"[0.0002714279874518555, 0.0015025634956925442,...",0.919998
2,0.000413,0.002987,0.000417,0.003089,0.006431,0.002637,0.005533,0.912368,0.00038,0.000317,0.054302,0.011125,"[0.00041309920474994876, 0.002987160209617467,...",0.912368
3,0.000471,0.001796,0.000461,0.004815,0.004574,0.002477,0.004116,0.900181,0.000379,0.0004,0.065988,0.014343,"[0.0004712294642750235, 0.001795560675008842, ...",0.900181
4,0.001789,0.020239,0.008261,0.030256,0.048493,0.036123,0.068212,0.179902,0.01091,0.011019,0.478735,0.10606,"[0.001789468640802202, 0.020239206634337554, 0...",0.478735


In [134]:
def find_nth_max_index(x, ix=1):
    """Return the position of the ``ix``-th largest value in ``x``.

    Args:
        x: Indexable sequence of comparable values (e.g. a list of class
            probabilities).
        ix: 1-based rank; ``1`` selects the largest value, ``2`` the second
            largest, and so on.

    Returns:
        The integer index into ``x`` of the value at that rank. Equal values
        are ranked in order of appearance, so different ranks always map to
        different positions.

    Raises:
        IndexError: If ``ix`` is smaller than 1 or larger than ``len(x)``.
    """
    if not 1 <= ix <= len(x):
        raise IndexError(f'rank {ix} is out of range for {len(x)} values')
    # Rank positions rather than values: looking a value up again with
    # x.index(value) returns the first match, so tied values would all resolve
    # to the same position. sorted() stays stable with reverse=True.
    ranked = sorted(range(len(x)), key=lambda pos: x[pos], reverse=True)
    return ranked[ix - 1]


def get_nth_best_result(x, ix=1, labels=None):
    """Return the class label whose score ranks ``ix``-th in ``x``.

    Args:
        x: Sequence of per-class scores, ordered like ``labels``.
        ix: 1-based rank of the score to pick (``1`` = highest).
        labels: Optional sequence of class labels. When omitted, the
            notebook-level ``classes`` list is used, as before.

    Returns:
        The label stored at the position of the selected score.
    """
    if labels is None:
        labels = classes
    return labels[find_nth_max_index(x, ix=ix)]

In [135]:
find_nth_max_index(preds_df.loc[0].preds), get_nth_best_result(preds_df.loc[0].preds)

(7, 'NDF')

In [136]:
# Runner-up label first, then the top label, keeping the original column order.
for column, rank in (('second', 2), ('first', 1)):
    preds_df[column] = preds_df['preds'].apply(
        lambda probs: get_nth_best_result(probs, ix=rank)
    )

In [137]:
preds_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,preds,amax,second,first
0,0.000578,0.002447,0.001743,0.005855,0.010774,0.005971,0.008226,0.677725,0.002294,0.000546,0.252005,0.031836,"[0.0005779137823761972, 0.002447165487599636, ...",0.677725,US,NDF
1,0.000271,0.001503,0.000369,0.002284,0.007125,0.002967,0.004562,0.919998,0.00039,0.000359,0.048673,0.011498,"[0.0002714279874518555, 0.0015025634956925442,...",0.919998,US,NDF
2,0.000413,0.002987,0.000417,0.003089,0.006431,0.002637,0.005533,0.912368,0.00038,0.000317,0.054302,0.011125,"[0.00041309920474994876, 0.002987160209617467,...",0.912368,US,NDF
3,0.000471,0.001796,0.000461,0.004815,0.004574,0.002477,0.004116,0.900181,0.000379,0.0004,0.065988,0.014343,"[0.0004712294642750235, 0.001795560675008842, ...",0.900181,US,NDF
4,0.001789,0.020239,0.008261,0.030256,0.048493,0.036123,0.068212,0.179902,0.01091,0.011019,0.478735,0.10606,"[0.001789468640802202, 0.020239206634337554, 0...",0.478735,NDF,US


In [138]:
# Ranked pair [top label, runner-up label] for every row.
preds_df['result'] = pd.Series(
    [[top, runner_up] for top, runner_up in zip(preds_df['first'], preds_df['second'])],
    index=preds_df.index,
)

In [139]:
preds_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,preds,amax,second,first,result
0,0.000578,0.002447,0.001743,0.005855,0.010774,0.005971,0.008226,0.677725,0.002294,0.000546,0.252005,0.031836,"[0.0005779137823761972, 0.002447165487599636, ...",0.677725,US,NDF,"[NDF, US]"
1,0.000271,0.001503,0.000369,0.002284,0.007125,0.002967,0.004562,0.919998,0.00039,0.000359,0.048673,0.011498,"[0.0002714279874518555, 0.0015025634956925442,...",0.919998,US,NDF,"[NDF, US]"
2,0.000413,0.002987,0.000417,0.003089,0.006431,0.002637,0.005533,0.912368,0.00038,0.000317,0.054302,0.011125,"[0.00041309920474994876, 0.002987160209617467,...",0.912368,US,NDF,"[NDF, US]"
3,0.000471,0.001796,0.000461,0.004815,0.004574,0.002477,0.004116,0.900181,0.000379,0.0004,0.065988,0.014343,"[0.0004712294642750235, 0.001795560675008842, ...",0.900181,US,NDF,"[NDF, US]"
4,0.001789,0.020239,0.008261,0.030256,0.048493,0.036123,0.068212,0.179902,0.01091,0.011019,0.478735,0.10606,"[0.001789468640802202, 0.020239206634337554, 0...",0.478735,NDF,US,"[US, NDF]"


In [140]:
# Flag rows whose best probability is below the cut-off, then count them.
low_confidence_cutoff = 0.65
mask = preds_df['amax'] < low_confidence_cutoff
mask.sum()

21887

In [141]:
# Rows outside the mask keep only their top label instead of the label pair.
confident = ~mask
preds_df.loc[confident, 'result'] = preds_df.loc[confident, 'first']

In [142]:
# Attach ids by position: preds_df was built from predictions on `x`, which
# was derived from `df`, so row i of both frames describes the same user.
# Assigning the Series itself would align on index labels and silently yield
# NaN ids whenever df's index is not the default 0..n-1 range.
assert len(preds_df) == len(df), 'prediction rows must match input rows'
preds_df['id'] = df['user_id'].to_numpy()

In [143]:
preds_df.sample(10, random_state=42)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,preds,amax,second,first,result,id
56320,0.000586,0.002352,0.000951,0.006002,0.009454,0.006064,0.008225,0.893691,0.000693,0.000408,0.057902,0.013671,"[0.0005861863170349066, 0.0023516672143977764,...",0.893691,US,NDF,NDF,6i6oe2pq90
15712,0.000409,0.001753,0.000294,0.001995,0.006271,0.002547,0.004392,0.923978,0.000329,0.000254,0.046665,0.011112,"[0.00040943721713411144, 0.0017530625799288865...",0.923978,US,NDF,NDF,io2xkaqu6h
3732,0.000213,0.001078,0.000218,0.001576,0.002227,0.001174,0.001582,0.936217,0.000336,0.000142,0.046047,0.009191,"[0.00021253732231914826, 0.0010781847446486594...",0.936217,US,NDF,NDF,1zmcx1p3xu
21993,0.000567,0.001486,0.000732,0.005199,0.006932,0.002898,0.003417,0.901046,0.00039,0.000673,0.065777,0.010882,"[0.0005670184542179818, 0.00148605245597758, 0...",0.901046,US,NDF,NDF,litjt696rv
6682,0.002335,0.011288,0.006072,0.085135,0.073497,0.029284,0.025426,0.319069,0.00885,0.012805,0.316088,0.110151,"[0.0023352425752437077, 0.011288058452726091, ...",0.319069,US,NDF,"[NDF, US]",ycr4e6e5qv
53238,0.000255,0.002004,0.000329,0.001676,0.004854,0.001775,0.004483,0.935086,0.000387,0.000245,0.040047,0.008859,"[0.0002547908901068263, 0.002004080563396204, ...",0.935086,US,NDF,NDF,2e8urjyhwp
22876,0.001371,0.004586,0.002075,0.003522,0.007551,0.002514,0.002611,0.783965,0.000867,0.000836,0.153241,0.03686,"[0.0013709516773848266, 0.0045858476075972265,...",0.783965,US,NDF,NDF,dmew2au9zm
47706,0.000662,0.001857,0.000411,0.00294,0.006014,0.003265,0.003277,0.920422,0.000385,0.000459,0.043792,0.016515,"[0.0006624997136144932, 0.0018574907181551862,...",0.920422,US,NDF,NDF,ybazcnmmrg
56201,0.003617,0.003175,0.003559,0.016131,0.020863,0.008571,0.007974,0.724606,0.00217,0.000723,0.151593,0.057018,"[0.0036173196018861975, 0.003174843369357812, ...",0.724606,US,NDF,NDF,6rjejcdak2
20274,0.000678,0.004385,0.000933,0.010821,0.046518,0.013971,0.008514,0.057509,0.003511,0.000901,0.687726,0.164533,"[0.0006776935272795066, 0.004385459094757413, ...",0.687726,other,US,US,yvu0efzcip


In [144]:
# One output row per (id, label): list-valued results are expanded into
# consecutive rows by explode().
id_and_result = preds_df[['id', 'result']]
submission = id_and_result.explode('result')
submission.shape

(83551, 2)

In [145]:
submission.head()

Unnamed: 0,id,result
0,5uwns89zht,NDF
1,jtl0dijy2j,NDF
2,xx0ulgorjt,NDF
3,6c6puo6ix0,NDF
4,czqhjk3yfe,US


In [146]:
submission[submission.id == 'ycr4e6e5qv']

Unnamed: 0,id,result
6682,ycr4e6e5qv,NDF
6682,ycr4e6e5qv,US


### 3.3 Saving NDCG aware submission

In [147]:
# Use the column names of the sample submission file: id, country.
submission = submission.rename(columns={'result': 'country'})

In [148]:
submission.to_csv('../data/results/submission4.csv', index=False)

### 4. Assembling submission data

In [21]:
sample = pd.read_csv('../data/original/sample_submission_NDF.csv')
sample.head()

Unnamed: 0,id,country
0,5uwns89zht,NDF
1,jtl0dijy2j,NDF
2,xx0ulgorjt,NDF
3,6c6puo6ix0,NDF
4,czqhjk3yfe,NDF


In [25]:
# Build the top-1 submission from class labels derived inside this cell.
# The name `preds` is re-bound to per-class probability lists in section 3.2,
# so reusing it here would write probability vectors into `country` when the
# notebook is run top to bottom.
top1_labels = np.asarray(model.predict(x, prediction_type='Class')).ravel()
submission = pd.DataFrame({'id': df['user_id'], 'country': top1_labels})
submission.shape

(61664, 2)

In [26]:
submission.head()

Unnamed: 0,id,country
0,5uwns89zht,NDF
1,jtl0dijy2j,NDF
2,xx0ulgorjt,NDF
3,6c6puo6ix0,NDF
4,czqhjk3yfe,US


In [27]:
submission.to_csv('../data/results/submission1.csv', index=False)