In [1]:
import pandas as pd
import numpy as np

import feather

from itertools import combinations
from tqdm import tqdm_notebook as tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed

from scipy.sparse.csgraph import connected_components

from sklearn.cross_validation import KFold

import gc

In [2]:
from sklearn.metrics import roc_auc_score

def roc_auc(y, y_pred):
    auc = roc_auc_score(y, y_pred)
    if auc < 0.5:
        return 1 - auc
    return auc

In [3]:
def tqdm_pool(pool, func, array):
    progress = tqdm(total=len(array))
    
    futures = []
    for a in array:
        f = pool.submit(func, a)
        f.add_done_callback(lambda x: progress.update())
        futures.append(f)

    results = [f.result() for f in as_completed(futures)]
    progress.close()
    return results

In [4]:
from collections import Counter
import numpy as np

def fit_mtv(x, y, m, C):
    cnt = Counter(x[y == 1])
    cnt_all = Counter(x)

    probs = {}

    for k, v in cnt.items():
        v_all = cnt_all[k]
        probs[k] = (v + C * m) / (v_all + C)

    return probs

def transform_mtv(x, probs, m):
    res = [probs.get(v, m) for v in x]
    return np.array(res, dtype='float32')

def fit_transform_mtv(x_train, y_train, x_test, C):
    m = y_train.mean()
    probs = fit_mtv(x_train, y_train, m, C)
    return transform_mtv(x_test, probs, m)

def mtv_cv(cv, x, y, C):
    train_res = np.zeros(len(y), dtype='float32')

    for train_idx, test_idx in cv:
        x_train = x[train_idx]
        y_train = y[train_idx]
        x_test = x[test_idx]

        result = fit_transform_mtv(x_train, y_train, x_test, C)
        train_res[test_idx] = result

    return train_res

In [5]:
!mkdir tmp_val
!mkdir tmp_test

mkdir: cannot create directory ‘tmp_val’: File exists
mkdir: cannot create directory ‘tmp_test’: File exists


In [6]:
def mtv_calc(params):
    cv = params['cv']
    y_train = params['target_array_train']

    feature_name = params['feature_name']
    x_train = params['feature_array_train']
    x_test = params['feature_array_test']

    auc_check = params['auc_feature_selection']
    auc_threshold = params['auc_threshold']
    
    C = params['C']

    result = {}
    result['feature_name'] = feature_name
    
    train_res = mtv_cv(cv, x_train, y_train, C)
    auc = roc_auc(y_train, train_res)
    result['feature_auc'] = auc    

    if not auc_check or (auc_check and auc >= auc_threshold):
        result['train_res'] = train_res
        test_res = fit_transform_mtv(x_train, y_train, x_test, C)
        result['test_res'] = test_res
    else:
        result['train_res'] = None
        result['test_res'] = None

    return result

def mtv_apply_parallel(pool, cv, df_train, df_test, target, features, C, auc_selection=False):
    y = df_train[target].values

    jobs_params = []
    
    for f in features:
        params = {
            'C': C,
            'cv': cv,
            'feature_name': f,
            'target_array_train': y,
            'feature_array_train': df_train[f].values,
            'feature_array_test': df_test[f].values,
            'auc_feature_selection': auc_selection,
            'auc_threshold': 0.55,
        }

        jobs_params.append(params)

    return tqdm_pool(pool, mtv_calc, jobs_params)

In [7]:
def mtv_pair_calc(params):
    C = params['C']

    cv = params['cv']
    y_train = params['target_array_train']
    
    feature_name_1 = params['feature_name_1']
    feature_name_2 = params['feature_name_2']
    
    auc_check = params['auc_feature_selection']
    auc_threshold = params['auc_threshold']

    x_1_train = params['feature_array_1_train']
    x_2_train = params['feature_array_2_train']
    x_train = x_1_train + '_' + x_2_train

    result = {}
    result['feature_name'] = (feature_name_1, feature_name_2)

    train_res = mtv_cv(cv, x_train, y_train, C)
    auc = roc_auc(y_train, train_res)
    result['feature_auc'] = auc    

    if not auc_check or (auc_check and auc >= auc_threshold):
        result['train_res'] = train_res

        x_1_test = params['feature_array_1_test']
        x_2_test = params['feature_array_2_test']
        x_test = x_1_test + '_' + x_2_test

        test_res = fit_transform_mtv(x_train, y_train, x_test, C)
        result['test_res'] = test_res

    else:
        result['train_res'] = None
        result['test_res'] = None

    return result

def mtv_pairs_apply_parallel(pool, cv, df_train, df_test, target, combs, C, auc_selection=False):
    y = df_train[target].values
    jobs_params = []

    for c1, c2 in combs:
        params = {
            'C': C,
            'cv': cv,

            'target_array_train': y,

            'feature_name_1': c1,
            'feature_name_2': c2,
            
            'feature_array_1_train': df_train[c1].values,
            'feature_array_2_train': df_train[c2].values,
        
            'feature_array_1_test': df_test[c1].values,
            'feature_array_2_test': df_test[c2].values,
            
            'auc_feature_selection': auc_selection,
            'auc_threshold': 0.55,
            
        }

        jobs_params.append(params)

    return tqdm_pool(pool, mtv_pair_calc, jobs_params)

In [25]:
def mtv_triple_calc(params):
    C = params['C']

    cv = params['cv']
    y_train = params['target_array_train']
    
    feature_name_1 = params['feature_name_1']
    feature_name_2 = params['feature_name_2']
    feature_name_3 = params['feature_name_3']

    auc_check = params['auc_feature_selection']
    auc_threshold = params['auc_threshold']
    
    x_1_train = params['feature_array_1_train']
    x_2_train = params['feature_array_2_train']
    x_3_train = params['feature_array_3_train']
    x_train = x_1_train + '_' + x_2_train + '_' + x_3_train

    result = {}
    result['feature_name'] = (feature_name_1, feature_name_2, feature_name_3)

    train_res = mtv_cv(cv, x_train, y_train, C)
    auc = roc_auc(y_train, train_res)
    result['feature_auc'] = auc    

    if not auc_check or (auc_check and auc >= auc_threshold):
        result['train_res'] = train_res

        x_1_test = params['feature_array_1_test']
        x_2_test = params['feature_array_2_test']
        x_3_test = params['feature_array_3_test']
        x_test = x_1_test + '_' + x_2_test + '_' + x_3_test

        test_res = fit_transform_mtv(x_train, y_train, x_test, C)
        result['test_res'] = test_res

    else:
        result['train_res'] = None
        result['test_res'] = None

    return result

def mtv_triple_apply_parallel(pool, cv, df_train, df_test, target, combs, C, auc_selection=False):
    y = df_train[target].values
    jobs_params = []

    for c1, c2, c3 in combs:
        params = {
            'C': C,
            'cv': cv,

            'target_array_train': y,

            'feature_name_1': c1,
            'feature_name_2': c2,
            'feature_name_3': c3,

            'feature_array_1_train': df_train[c1].values,
            'feature_array_2_train': df_train[c2].values,
            'feature_array_3_train': df_train[c3].values,

            'feature_array_1_test': df_test[c1].values,
            'feature_array_2_test': df_test[c2].values,
            'feature_array_3_test': df_test[c3].values,

            'auc_feature_selection': auc_selection,
            'auc_threshold': 0.55,
        }

        jobs_params.append(params)

    return tqdm_pool(pool, mtv_triple_calc, jobs_params)

In [9]:
def select_comb3_candidates(features, res, res2):
    good_single = {r['feature_name'] for r in res if r['train_res'] is not None}

    good_pairs = set()

    for r in res2:
        if r['train_res'] is None:
            continue
        (c1, c2) = r['feature_name']
        good_pairs.add((c1, c2))
        good_pairs.add((c2, c1))

    combs3 = list(combinations(features, 3))

    good_comb3_candidates = []

    for comb in combs3:
        if set(comb) & good_single:
            good_comb3_candidates.append(comb)
            continue

        comb2of3 = set(combinations(comb, 2))

        if comb2of3 & good_pairs:
            good_comb3_candidates.append(comb)
            continue

    return good_comb3_candidates

In [10]:
val = True

if val:
    df_train = feather.read_dataframe('feather/df_train.feather')
    df_test = feather.read_dataframe('feather/df_val.feather')
    out = 'tmp_val'
else:
    df_train = feather.read_dataframe('feather/df_train_all.feather')
    df_test = feather.read_dataframe('feather/df_test.feather')
    out = 'tmp_test'

In [14]:
len(df_train), len(df_test)

(1600000, 400000)

In [15]:
to_exlude = ['decision']

features = list(df_train.columns)
for c in to_exlude:
    features.remove(c)

target = 'decision'
C = 12
cv = KFold(len(df_train), n_folds=3, shuffle=False, random_state=1)

In [16]:
with ProcessPoolExecutor(max_workers=20) as pool:
    res = mtv_apply_parallel(pool, cv, df_train, df_test, target, features, C)
    gc.collect()    

    combs = list(combinations(features, 2))

    res2 = mtv_pairs_apply_parallel(pool, cv, df_train, df_test, target, combs, C)
    gc.collect()

    good_comb3_candidates = select_comb3_candidates(features, res, res2)
    gc.collect()

    res3 = mtv_triple_apply_parallel(pool, cv, df_train, df_test, target, good_comb3_candidates, C)
    gc.collect()






In [17]:
res2_sorted = sorted(res2, key=lambda x: -x['feature_auc'])
len_res2_notnull = len([r for r in res2_sorted if r['train_res'] is not None])

res3_sorted = sorted(res3, key=lambda x: -x['feature_auc'])

In [18]:
top_values = []
res23 = []

for r in res2_sorted + res3_sorted:
    if r['train_res'] is None:
        continue

    top_values.append(r['train_res'][:2000])
    res23.append(r)

corr = np.corrcoef(top_values)

_, comps = connected_components(corr > 0.99)
comps, len(comps)

(array([  0,   1,   2, ..., 283, 283, 283], dtype=int32), 9318)

In [19]:
final_res23 = []

seen = set()
added_3 = 0

for i, comp in enumerate(comps):
    if comp in seen:
        continue
    seen.add(comp)

    final_res23.append(res23[i])

    if i >= len_res2_notnull:
        added_3 = added_3 + 1
        if added_3 >= 500:
            break

In [20]:
feature_names_all = []
X_train_all = []
X_test_all = []

for r in res:
    if r['train_res'] is None:
        continue
    X_train_all.append(r['train_res'])
    X_test_all.append(r['test_res'])
    feature_names_all.append(r['feature_name'])

for r in final_res23:
    if r['train_res'] is None:
        continue
    X_train_all.append(r['train_res'])
    X_test_all.append(r['test_res'])
    feature_names_all.append('&'.join(r['feature_name']))

In [21]:
X_train = np.array(X_train_all).T
X_test = np.array(X_test_all).T

In [24]:
X_train.shape

(1600000, 798)

In [25]:
X_test.shape

(400000, 798)

In [26]:
del X_train_all, X_test_all

In [11]:
import pickle

In [28]:
np.save(out + '/X_train.npy', X_train)
np.save(out + '/X_test.npy', X_test)

with open(out + '/features.bin', 'wb') as f:
    pickle.dump(feature_names_all, f)

In [29]:
y_train = df_train.decision.values
y_test = df_test.decision.values

In [30]:
import xgboost as xgb

In [31]:
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=feature_names_all, missing=np.nan)
dval = xgb.DMatrix(X_test, label=y_test, feature_names=feature_names_all, missing=np.nan)
watchlist = [(dtrain, 'train'), (dval, 'val')]

In [34]:
xgb_pars = {
    'eta': 0.3,
    'gamma': 0,
    'max_depth': 6,
    'min_child_weight': 1,
    'max_delta_step': 0,
    'subsample': 1,
    'colsample_bytree': 1,
    'colsample_bylevel': 1,
    'lambda': 1,
    'alpha': 0,
    'tree_method': 'approx',
# not deafauts
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'nthread': 20,
    'seed': 42,
    'silent': 1
}

n_estimators = 300

In [35]:
model = xgb.train(xgb_pars, dtrain, num_boost_round=n_estimators, verbose_eval=1,
                  evals=watchlist)

[0]	train-auc:0.771995	val-auc:0.7733
[1]	train-auc:0.77827	val-auc:0.778838
[2]	train-auc:0.781124	val-auc:0.781578
[3]	train-auc:0.783138	val-auc:0.783398
[4]	train-auc:0.784463	val-auc:0.784683
[5]	train-auc:0.785752	val-auc:0.785897
[6]	train-auc:0.786886	val-auc:0.786882
[7]	train-auc:0.787755	val-auc:0.787596
[8]	train-auc:0.788747	val-auc:0.788512
[9]	train-auc:0.789656	val-auc:0.789368
[10]	train-auc:0.790604	val-auc:0.790262
[11]	train-auc:0.791349	val-auc:0.790955
[12]	train-auc:0.792132	val-auc:0.791598
[13]	train-auc:0.792708	val-auc:0.792112
[14]	train-auc:0.793423	val-auc:0.792725
[15]	train-auc:0.794186	val-auc:0.793307
[16]	train-auc:0.794624	val-auc:0.793678
[17]	train-auc:0.795176	val-auc:0.794089
[18]	train-auc:0.795603	val-auc:0.794412
[19]	train-auc:0.796153	val-auc:0.794876
[20]	train-auc:0.796652	val-auc:0.795246
[21]	train-auc:0.797022	val-auc:0.795518
[22]	train-auc:0.797445	val-auc:0.795821
[23]	train-auc:0.797894	val-auc:0.796173
[24]	train-auc:0.798314	val-a

In [36]:
xgb_pars = {
    'eta': 0.3,
    'gamma': 0,
    'max_depth': 10,
    'min_child_weight': 1,
    'max_delta_step': 0,
    'subsample': 1,
    'colsample_bytree': 1,
    'colsample_bylevel': 1,
    'lambda': 1,
    'alpha': 0,
    'tree_method': 'approx',
# not deafauts
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'nthread': 20,
    'seed': 42,
    'silent': 1
}

n_estimators = 300

In [37]:
model = xgb.train(xgb_pars, dtrain, num_boost_round=n_estimators, verbose_eval=1,
                  evals=watchlist)

[0]	train-auc:0.786543	val-auc:0.781259
[1]	train-auc:0.793264	val-auc:0.786857
[2]	train-auc:0.796577	val-auc:0.789617
[3]	train-auc:0.799173	val-auc:0.791589
[4]	train-auc:0.801457	val-auc:0.7931
[5]	train-auc:0.803346	val-auc:0.794228
[6]	train-auc:0.805162	val-auc:0.795295
[7]	train-auc:0.806663	val-auc:0.796173
[8]	train-auc:0.808287	val-auc:0.797071
[9]	train-auc:0.809741	val-auc:0.797954
[10]	train-auc:0.811333	val-auc:0.798805
[11]	train-auc:0.813002	val-auc:0.799547
[12]	train-auc:0.814467	val-auc:0.800233
[13]	train-auc:0.81579	val-auc:0.801021
[14]	train-auc:0.817086	val-auc:0.801521
[15]	train-auc:0.818147	val-auc:0.802007
[16]	train-auc:0.82006	val-auc:0.80292
[17]	train-auc:0.821399	val-auc:0.803568
[18]	train-auc:0.822848	val-auc:0.804203
[19]	train-auc:0.82397	val-auc:0.804759
[20]	train-auc:0.825229	val-auc:0.805181
[21]	train-auc:0.826775	val-auc:0.805891
[22]	train-auc:0.828153	val-auc:0.806526
[23]	train-auc:0.829438	val-auc:0.806941
[24]	train-auc:0.83067	val-auc:0

In [10]:
val = False

df_train = feather.read_dataframe('feather/df_train_all.feather')
df_test = feather.read_dataframe('feather/df_test.feather')
out = 'tmp_test'

In [23]:
to_exlude = ['decision']

features = list(df_train.columns)
for c in to_exlude:
    features.remove(c)

target = 'decision'
C = 12
cv = KFold(len(df_train), n_folds=3, shuffle=False, random_state=1)

In [12]:
with open('tmp_val/features.bin', 'rb') as f:
    feature_names_all = pickle.load(f)

In [19]:
feature_orig = [s.split('&') for s in feature_names_all]
feature_orig_1 = [s[0] for s in feature_orig if len(s) == 1]
feature_orig_2 = [tuple(s) for s in feature_orig if len(s) == 2]
feature_orig_3 = [tuple(s) for s in feature_orig if len(s) == 3]

In [26]:
with ProcessPoolExecutor(max_workers=20) as pool:
    res = mtv_apply_parallel(pool, cv, df_train, df_test, target, feature_orig_1, C, auc_selection=False)
    gc.collect()    

    res2 = mtv_pairs_apply_parallel(pool, cv, df_train, df_test, target, feature_orig_2, C, auc_selection=False)
    gc.collect()

    res3 = mtv_triple_apply_parallel(pool, cv, df_train, df_test, target, feature_orig_3, C)
    gc.collect()

In [27]:
feature_names_all = []
X_train_all = []
X_test_all = []

for r in res:
    if r['train_res'] is None:
        continue
    X_train_all.append(r['train_res'])
    X_test_all.append(r['test_res'])
    feature_names_all.append(r['feature_name'])

for r in res2 + res3:
    if r['train_res'] is None:
        continue
    X_train_all.append(r['train_res'])
    X_test_all.append(r['test_res'])
    feature_names_all.append('&'.join(r['feature_name']))

In [28]:
X_train = np.array(X_train_all).T
X_test = np.array(X_test_all).T

In [31]:
y_train = df_train.decision.values

In [33]:
import xgboost as xgb

In [30]:
xgb_pars = {
    'eta': 0.3,
    'gamma': 0,
    'max_depth': 6,
    'min_child_weight': 1,
    'max_delta_step': 0,
    'subsample': 1,
    'colsample_bytree': 1,
    'colsample_bylevel': 1,
    'lambda': 1,
    'alpha': 0,
    'tree_method': 'approx',
# not deafauts
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'nthread': 20,
    'seed': 42,
    'silent': 1
}

n_estimators = 100

In [34]:
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=feature_names_all, missing=np.nan)
watchlist = [(dtrain, 'train')]

dtest = xgb.DMatrix(X_test, feature_names=feature_names_all, missing=np.nan)


In [35]:
model = xgb.train(xgb_pars, dtrain, num_boost_round=n_estimators, verbose_eval=1,
                  evals=watchlist)

[0]	train-auc:0.769196
[1]	train-auc:0.775669
[2]	train-auc:0.778812
[3]	train-auc:0.780118
[4]	train-auc:0.7814
[5]	train-auc:0.782615
[6]	train-auc:0.783865
[7]	train-auc:0.784916
[8]	train-auc:0.785895
[9]	train-auc:0.786609
[10]	train-auc:0.787494
[11]	train-auc:0.788228
[12]	train-auc:0.789061
[13]	train-auc:0.789566
[14]	train-auc:0.790252
[15]	train-auc:0.790866
[16]	train-auc:0.791389
[17]	train-auc:0.791987
[18]	train-auc:0.792364
[19]	train-auc:0.792757
[20]	train-auc:0.793328
[21]	train-auc:0.793917
[22]	train-auc:0.794274
[23]	train-auc:0.794712
[24]	train-auc:0.795068
[25]	train-auc:0.79536
[26]	train-auc:0.795726
[27]	train-auc:0.796086
[28]	train-auc:0.796422
[29]	train-auc:0.796744
[30]	train-auc:0.797148
[31]	train-auc:0.79748
[32]	train-auc:0.797701
[33]	train-auc:0.797979
[34]	train-auc:0.79824
[35]	train-auc:0.798485
[36]	train-auc:0.798757
[37]	train-auc:0.799065
[38]	train-auc:0.799295
[39]	train-auc:0.79956
[40]	train-auc:0.79984
[41]	train-auc:0.800099
[42]	trai

In [39]:
pred = model.predict(dtest, ntree_limit=50)
with open('results_xgb6_50.txt', 'w') as f:
    for p in pred:
        f.write('%.5f\n' % p)

In [40]:
pred = model.predict(dtest, ntree_limit=75)
with open('results_xgb6_75.txt', 'w') as f:
    for p in pred:
        f.write('%.5f\n' % p)

In [38]:
pred = model.predict(dtest, ntree_limit=100)
with open('results_xgb6_100.txt', 'w') as f:
    for p in pred:
        f.write('%.5f\n' % p)