In [1]:
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import numpy as np
%matplotlib inline

In [2]:
from catboost import CatBoostClassifier

In [3]:
# Read the competition train and test tables (in that order) from the working directory.
df_train, df_test = (
    pd.read_csv(csv_name)
    for csv_name in ('onetwotrip_challenge_train.csv', 'onetwotrip_challenge_test.csv')
)

In [4]:
# Model inputs: every training column whose name contains the substring 'field'.
features = [column for column in df_train.columns if 'field' in column]

In [5]:
# Echo the selected feature column names as the cell output to sanity-check the filter.
features

['field0',
 'field1',
 'field2',
 'field3',
 'field4',
 'field5',
 'field6',
 'field7',
 'field8',
 'field9',
 'field10',
 'field11',
 'field12',
 'field13',
 'field14',
 'field15',
 'field16',
 'field17',
 'field18',
 'field19',
 'field20',
 'field21',
 'field22',
 'field23',
 'field24',
 'field25',
 'field26',
 'field27',
 'field28',
 'field29']

In [6]:
# Target columns: names containing 'goal' but not 'indicator', minus the last match.
# NOTE(review): the trailing [:-1] drops the final matching column — confirm which
# column that is and that excluding it is intended.
goals = [
    column
    for column in df_train.columns
    if 'goal' in column and 'indicator' not in column
][:-1]

In [7]:
# 5-fold stratified splitter. No shuffling is requested, so the folds are already
# deterministic. random_state is omitted on purpose: it has no effect without
# shuffle=True, and recent scikit-learn releases raise ValueError when it is
# passed together with shuffle=False.
kf = StratifiedKFold(n_splits=5)

In [8]:
# One independent CatBoost model per goal column, all built with the same seed and
# with logging silenced; clfs[i] is the model for goals[i].
clfs = [
    CatBoostClassifier(logging_level='Silent', random_state=0)
    for _ in range(len(goals))
]

In [9]:
def train_clfs(X, goals, clfs):
    """Fit one classifier per target column, in place.

    Parameters
    ----------
    X : feature table handed unchanged to every ``fit`` call.
    goals : pandas.DataFrame whose columns are the targets; the column at
        position ``i`` is the target for ``clfs[i]``.
    clfs : sequence of estimators exposing ``fit(X, y)``.

    Returns
    -------
    The same ``clfs`` object that was passed in, after each used model has been fitted.
    """
    n_targets = goals.shape[1]
    for position in range(n_targets):
        # Targets are matched to models by column position, not by column name.
        target = goals.iloc[:, position]
        clfs[position].fit(X, target)
    return clfs

def predict_goals_proba(X, clfs, goals_cols):
    """Predict, for every goal, the probability taken from column 1 of ``predict_proba``.

    Parameters
    ----------
    X : feature table handed unchanged to each classifier's ``predict_proba``.
    clfs : sequence of fitted classifiers; ``clfs[i]`` fills column ``goals_cols[i]``.
    goals_cols : output column names, in the same order as ``clfs``.

    Returns
    -------
    pandas.DataFrame
        One column per entry of ``goals_cols``. When ``X`` carries an index the
        result is indexed identically, so predictions stay attached to the rows
        they were computed for; otherwise a default 0..n-1 index is used.
    """
    # Column 1 of predict_proba is the second class (label 1 for 0/1 targets).
    predicted = {
        goals_cols[i]: np.asarray(clf.predict_proba(X))[:, 1]
        for i, clf in enumerate(clfs)
    }
    if not predicted:
        # No classifiers: keep returning an empty frame that only has the columns.
        return pd.DataFrame([], columns=goals_cols)
    # Build the frame in one step and reuse X's index instead of discarding it.
    return pd.DataFrame(predicted, index=getattr(X, 'index', None), columns=goals_cols)

def score_probas(goals_probas, goals_true):
    """Return the mean ROC AUC over the goal columns of ``goals_probas``.

    Parameters
    ----------
    goals_probas : pandas.DataFrame of predicted scores, one column per goal.
    goals_true : table holding the true labels under the same column names;
        extra columns are ignored because only the columns of ``goals_probas``
        are looked up.

    Returns
    -------
    The ``np.mean`` of the per-goal ``roc_auc_score`` values.
    """
    per_goal_auc = [
        roc_auc_score(goals_true[goal_name], goals_probas[goal_name])
        for goal_name in goals_probas.columns
    ]
    return np.mean(per_goal_auc)

In [10]:
# Stratification label: 1 for rows where at least one goal column equals 1, else 0.
# Multiplying the boolean mask by 1 turns True/False into 1/0.
Y = df_train[goals].eq(1).any(axis='columns') * 1

In [11]:
# cross validation
# Per fold: call fit on the shared per-goal models in `clfs` using the training
# rows, predict the held-out rows, and record the mean ROC AUC over all goals.
scores = []
for train_idx, test_idx in kf.split(df_train[features], Y):
    # kf.split yields positional row numbers, so rows are selected with .iloc;
    # .loc would treat them as index labels and is only correct while the
    # frame still has its default 0..n-1 index.
    tmp_train, tmp_test = df_train.iloc[train_idx], df_train.iloc[test_idx]
    train_clfs(tmp_train[features], tmp_train[goals], clfs)
    probas = predict_goals_proba(tmp_test[features], clfs, goals)
    scores.append(score_probas(probas, tmp_test[goals]))
print(f"Score {np.mean(scores)} ± {np.std(scores)}")

Score 0.8023737503929688 ± 0.0020414703790985794


In [None]:
# Final fit: call fit on every per-goal model in `clfs` using all training rows.
# train_clfs fits the models in place and returns the same list.
train_clfs(X=df_train[features], goals=df_train[goals], clfs=clfs)

In [None]:
# Per-goal probabilities for the test rows, one column per entry of `goals`.
probas = predict_goals_proba(X=df_test[features], clfs=clfs, goals_cols=goals)

In [None]:
# Save the per-goal probabilities to sub2.csv. to_csv is called with its defaults,
# so the frame's index is written as the first (unnamed) column.
# NOTE(review): confirm the expected submission layout (index column / row ids).
probas.to_csv('sub2.csv')