In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
from pandas.api.types import is_numeric_dtype
from sklearn.metrics import roc_auc_score

import matplotlib.pyplot as plt
import seaborn as sns

# Train

In [2]:
df_trade_final = pd.read_pickle('df_trade_final.pkl')

In [None]:
features = df_trade_final.columns.drop(['CustomerIdx', 'IsinIdx', 'CustomerInterest',
                                        'ActualMaturityDateKey', 'IssueDateKey',
                                        'week_start',
                                        'next_week_start'])

In [5]:
df_trade_final.fillna(0, inplace=True)
df_trade_final.head()

Unnamed: 0,week_start,CustomerIdx,IsinIdx,BuySell,CustomerInterest,Sector,Subsector,Region_x,Country,TickerIdx,...,TradeStatus_NotTraded.sum_ByIsin,TradeStatus_Unknown.sum_ByIsin,ExecutedBuy.sum_ByIsin,ExecutedSell.sum_ByIsin,ExecutedBalance.sum_ByIsin,CustomerIdx.count,CustomerIdx.nunique,all_period,past_period,remaining_period
0,2018-01-01,2,17040,0,1.0,0,21,0,10,238,...,62.0,301.0,272.0,128.0,144.0,696.0,548.0,36524,187,36337
1,2018-01-01,2,17040,1,0.0,0,21,0,10,238,...,62.0,301.0,272.0,128.0,144.0,696.0,548.0,36524,187,36337
2,2018-01-01,9,3428,0,1.0,1,23,1,75,1116,...,44.0,25.0,83.0,116.0,-33.0,437.0,382.0,7305,445,6860
3,2018-01-01,9,3428,1,0.0,1,23,1,75,1116,...,44.0,25.0,83.0,116.0,-33.0,437.0,382.0,7305,445,6860
4,2018-01-01,9,5964,0,0.0,1,23,1,75,328,...,22.0,243.0,147.0,135.0,12.0,441.0,306.0,10958,868,10090


In [6]:
last_week = df_trade_final.week_start.max()
last_week

Timestamp('2018-04-16 00:00:00')

In [15]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df_trade_final[features] = scaler.fit_transform(df_trade_final[features])

In [16]:
X = df_trade_final.loc[df_trade_final.week_start != last_week, features].values
y = df_trade_final[df_trade_final.week_start != last_week].CustomerInterest.values

X_val = df_trade_final.loc[df_trade_final.week_start == last_week, features].values
y_val = df_trade_final[df_trade_final.week_start == last_week].CustomerInterest.values

print('Train:', X.shape, y.shape)
print('Val:', X_val.shape, y_val.shape)

Train: (986488, 66) (986488,)
Val: (490695, 66) (490695,)


In [31]:
%%time
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(10, random_state=1986, n_jobs=-1)
classifier.fit(X, y)

y_pred = classifier.predict_proba(X_val)
print(roc_auc_score(y_val, y_pred[:,1]))

0.55877297939202
CPU times: user 40.8 s, sys: 2.88 s, total: 43.7 s
Wall time: 13.3 s


In [19]:
import tensorflow as tf
import tflearn

tf.reset_default_graph()

targets = np.unique(y)
y2 = (y.reshape((-1, 1)) == targets).astype(np.uint8)
y_val2 = (y_val.reshape((-1, 1)) == targets).astype(np.uint8)

input_layer = tflearn.input_data((None, len(features)))
net = tflearn.fully_connected(input_layer, 256, activation='relu')
net = tflearn.dropout(net, 0.8)
net = tflearn.fully_connected(net, 256, activation='relu')
net = tflearn.dropout(net, 0.8)
net = tflearn.fully_connected(net, 128, activation='relu')
net = tflearn.dropout(net, 0.8)
net = tflearn.fully_connected(net, 64, activation='relu')
net = tflearn.dropout(net, 0.8)
net = tflearn.fully_connected(net, len(targets), activation='softmax')
net = tflearn.regression(net, learning_rate=1e-5)
model = tflearn.DNN(net, tensorboard_verbose=3, tensorboard_dir='tensorboard')

  from ._conv import register_converters as _register_converters


Instructions for updating:
Use the retry module or similar alternatives.
Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [20]:
model.fit(X, y2, n_epoch=20, batch_size=15000, show_metric=True, shuffle=True, validation_set=(X_val, y_val2))

Training Step: 1319  | total loss: [1m[32m0.47687[0m[0m | time: 27.763s
| Adam | epoch: 020 | loss: 0.47687 - acc: 0.7361 -- iter: 975000/986488
Training Step: 1320  | total loss: [1m[32m0.47675[0m[0m | time: 30.441s
| Adam | epoch: 020 | loss: 0.47675 - acc: 0.7358 | val_loss: 0.36907 - val_acc: 0.9648 -- iter: 986488/986488
--


In [21]:
y_pred = model.predict(X_val)
print(roc_auc_score(y_val, y_pred[:,1]))

0.5439490123582332


In [17]:
%%time
import xgboost as xgb

classifier = xgb.XGBClassifier()
classifier.fit(X, y)

y_pred = classifier.predict_proba(X_val)
print(roc_auc_score(y_val, y_pred[:,1]))

0.566214609779676
CPU times: user 3min 27s, sys: 767 ms, total: 3min 27s
Wall time: 3min 27s


In [18]:
%%time
classifier = RandomForestClassifier(50, random_state=1986, n_jobs=-1)
classifier.fit(X, y)

y_pred = classifier.predict_proba(X_val)
print(roc_auc_score(y_val, y_pred[:,1]))

0.5848527032885595
CPU times: user 4min 33s, sys: 6.45 s, total: 4min 40s
Wall time: 1min 12s


In [19]:
%%time
classifier = RandomForestClassifier(100, random_state=1986, n_jobs=-1)
classifier.fit(X, y)

y_pred = classifier.predict_proba(X_val)
print(roc_auc_score(y_val, y_pred[:,1]))

0.5923716525039213
CPU times: user 9min 24s, sys: 12.5 s, total: 9min 36s
Wall time: 2min 28s


In [41]:
%%time
from sklearn.ensemble import ExtraTreesClassifier

classifier = ExtraTreesClassifier(100, criterion='entropy', random_state=1986, n_jobs=-1)
classifier.fit(X, y)

y_pred = classifier.predict_proba(X_val)
print(roc_auc_score(y_val, y_pred[:,1]))

0.5955335717215446
CPU times: user 5min 16s, sys: 18.7 s, total: 5min 34s
Wall time: 2min 12s


In [21]:
%%time
classifier = RandomForestClassifier(bootstrap=False, criterion='gini',
                                    max_depth=29, max_features=0.3375, n_estimators=414,
                                    random_state=1986, n_jobs=-1)
classifier.fit(X, y)

y_pred = classifier.predict_proba(X_val)
print(roc_auc_score(y_val, y_pred[:,1]))

0.6017560464303687
CPU times: user 2h 25min 31s, sys: 51.5 s, total: 2h 26min 23s
Wall time: 37min 54s


In [33]:
%%time
from lightgbm import LGBMClassifier

classifier = LGBMClassifier(random_state=123, metric='auc', num_leaves=31, max_depth=24,
                            n_estimators=251, learning_rate=0.2687098558722311,
                            colsample_bytree=0.6398839827468012)

classifier.fit(X, y)

y_pred = classifier.predict_proba(X_val)
print(roc_auc_score(y_val, y_pred[:,1]))

0.5893729097773983
CPU times: user 1min 34s, sys: 846 ms, total: 1min 34s
Wall time: 25.6 s


In [None]:
%%time
from catboost import CatBoostClassifier, Pool

# diz a modelo quais features sao categoricas
cat_features = list(range(0, 17)) + [18]
column_description_vector = ['numeric'] * df_trade_final[features].shape[1]
for x in cat_features[:]:
    column_description_vector[x] = 'factor'

cat_train = Pool(X, y, cat_features, column_description=column_description_vector)
cat_val = Pool(X_val, cat_features=cat_features, column_description=column_description_vector)

classifier = CatBoostClassifier(iterations=100, random_state=1986, custom_metric='AUC')
classifier.fit(cat_train)

y_pred = classifier.predict_proba(cat_val)
print(roc_auc_score(y_val, y_pred[:,1]))

# Submission

In [None]:
%%time
classifier.fit(df_trade_final[features].values,
               df_trade_final.CustomerInterest.values)

In [None]:
features_importance = zip(classifier.feature_importances_, features)
for importance, feature in sorted(features_importance, reverse=True):
    print("%s: %f%%" % (feature, importance))

In [34]:
df_challenge_final = pd.read_pickle('df_challenge_final.pkl')

In [35]:
features = df_challenge_final.columns.drop(['CustomerIdx', 'IsinIdx', 'CustomerInterest',
                                            'ActualMaturityDateKey', 'IssueDateKey', 'DateKey', 'PredictionIdx',
                                            'week_start',
                                            'next_week_start'])
features

Index(['BuySell', 'Sector', 'Subsector', 'Region_x', 'Country', 'TickerIdx',
       'Seniority', 'Currency', 'ActivityGroup', 'Region_y', 'Activity',
       'RiskCaptain', 'Owner', 'CompositeRating', 'IndustrySector',
       'IndustrySubgroup', 'MarketIssue', 'IssuedAmount', 'CouponType',
       'pca0_mean', 'pca0_min', 'pca0_max', 'pca0_incressing', 'pca0_skew',
       'pca0_kurt', 'pca0_std', 'pca1_mean', 'pca1_min', 'pca1_max',
       'pca1_incressing', 'pca1_skew', 'pca1_kurt', 'pca1_std', 'pca2_mean',
       'pca2_min', 'pca2_max', 'pca2_incressing', 'pca2_skew', 'pca2_kurt',
       'pca2_std', 'pca3_mean', 'pca3_min', 'pca3_max', 'pca3_incressing',
       'pca3_skew', 'pca3_kurt', 'pca3_std', 'pca4_mean', 'pca4_min',
       'pca4_max', 'pca4_incressing', 'pca4_skew', 'pca4_kurt', 'pca4_std',
       'BuySell_Buy.sum', 'BuySell_Sell.sum', 'TradeStatus_Done.sum',
       'TradeStatus_IOI.sum', 'TradeStatus_NotTraded.sum',
       'TradeStatus_Unknown.sum', 'ExecutedBuy.sum', 'Executed

In [36]:
df_challenge_final[features].head()

Unnamed: 0,BuySell,Sector,Subsector,Region_x,Country,TickerIdx,Seniority,Currency,ActivityGroup,Region_y,...,TradeStatus_NotTraded.sum_ByIsin,TradeStatus_Unknown.sum_ByIsin,ExecutedBuy.sum_ByIsin,ExecutedSell.sum_ByIsin,ExecutedBalance.sum_ByIsin,CustomerIdx.count,CustomerIdx.nunique,all_period,past_period,remaining_period
0,0,0,22,0,83,2740,5,19,0,0,...,5.0,111.0,50.0,66.0,-16.0,381.0,327.0,2923,1866,1057
1,1,2,3,2,60,3450,5,11,0,5,...,29.0,29.0,72.0,57.0,15.0,573.0,541.0,2659,696,1963
2,1,2,3,2,36,2573,5,11,0,5,...,11.0,28.0,48.0,45.0,3.0,431.0,412.0,2922,899,2023
3,1,2,32,2,74,2540,5,9,0,5,...,20.0,39.0,60.0,69.0,-9.0,149.0,125.0,4383,1512,2871
4,1,0,2,2,4,1662,5,9,0,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4383,1628,2755


In [37]:
X_test = scaler.transform(df_challenge_final[features].values)

In [None]:
y_pred = classifier.predict_proba(X_test)

In [None]:
sns.distplot(y_pred[:,1]);

In [None]:
df_submission = pd.read_csv('data/sample_submission.csv')
df_submission.CustomerInterest = y_pred[:,1]
df_submission.to_csv('camargo-sub_et_hist.csv', index=False)
df_submission.head()

# Tunning

In [28]:
classifier = LGBMClassifier(random_state=123, metric='auc')

In [29]:
from bayes_opt import BayesianOptimization


param_grid = {
    'num_leaves': (15, 40),
    'max_depth': (-1, 25),
    'n_estimators': (100, 300),
    'learning_rate': (0.01, 0.3),
    'colsample_bytree': (0.5, 1)
}

def model_cv(**params):
    params['max_depth'] = int(params['max_depth'])
    params['n_estimators'] = -1 if params['n_estimators'] < 0 else int(params['n_estimators'])
    params['num_leaves'] = int(params['num_leaves'])
    
    classifier.set_params(**params).fit(X, y)
    y_pred = classifier.predict_proba(X_val)
    return roc_auc_score(y_val, y_pred[:,1])


bo = BayesianOptimization(model_cv, param_grid)
bo.maximize(5, 100)
print('BayesianOptimization) Score: %.16f   Params: %s' % (abs(bo.res['max']['max_val']),
                                                           bo.res['max']['max_params']))
classifier.set_params(**bo.res['max']['max_params'])

[31mInitialization[0m
[94m------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |   learning_rate |   max_depth |   n_estimators |   num_leaves | 
    1 | 00m16s | [35m   0.58796[0m | [32m            0.6059[0m | [32m         0.1067[0m | [32m    22.3567[0m | [32m      110.9568[0m | [32m     28.3495[0m | 
    2 | 00m12s |    0.55049 |             0.5211 |          0.1361 |      1.3112 |       169.1231 |      36.8582 | 
    3 | 00m15s |    0.57962 |             0.6385 |          0.1133 |     12.8993 |       106.9373 |      15.4553 | 
    4 | 00m24s | [35m   0.58906[0m | [32m            0.9807[0m | [32m         0.1929[0m | [32m     3.5742[0m | [32m      150.8450[0m | [32m     17.1983[0m | 
    5 | 00m28s |    0.55520 |             0.9256 |          0.2422 |      1.5926 |       258.8668 |      17.4518 | 
[31mBayesian Optimization[0m
[94m------------

  " state: %s" % convergence_dict)


   18 | 00m38s |    0.59118 |             0.7002 |          0.2440 |     24.9906 |       264.9232 |      39.6599 | 
   19 | 00m28s |    0.58974 |             0.6937 |          0.1329 |     14.0280 |       174.1122 |      15.1092 | 
   20 | 00m24s |    0.59304 |             0.6923 |          0.2973 |     23.5061 |       136.8241 |      39.8724 | 
   21 | 00m45s |    0.57169 |             0.8009 |          0.0167 |     24.9070 |       192.7410 |      39.9323 | 
   22 | 00m27s |    0.58862 |             0.9736 |          0.2265 |     13.4593 |       122.2621 |      39.3657 | 
   23 | 00m32s |    0.58947 |             0.5000 |          0.2756 |      8.7072 |       288.7215 |      28.4327 | 
   24 | 00m41s |    0.59108 |             0.9879 |          0.1601 |     24.7587 |       226.9123 |      22.1201 | 
   25 | 00m21s |    0.58892 |             0.5270 |          0.2919 |     15.3266 |       142.5515 |      28.9114 | 
   26 | 00m37s |    0.59512 |             0.5438 |          0.1190 |    

  " state: %s" % convergence_dict)


   34 | 00m41s |    0.58192 |             0.9899 |          0.2040 |      4.5510 |       218.3965 |      15.0624 | 
   35 | 00m25s |    0.59325 |             0.9758 |          0.1398 |     24.9483 |       101.3657 |      16.1385 | 
   36 | 00m34s |    0.58954 |             0.9407 |          0.1069 |     24.6700 |       167.0308 |      15.1658 | 
   37 | 01m04s |    0.58006 |             0.9506 |          0.0191 |     12.7123 |       245.4855 |      39.7974 | 
   38 | 00m40s |    0.58187 |             0.9904 |          0.2960 |      9.6748 |       206.4892 |      39.5023 | 
   39 | 00m28s |    0.58840 |             0.8369 |          0.2793 |     -0.4726 |       140.7905 |      39.8752 | 
   40 | 00m43s |    0.59688 |             0.8009 |          0.1521 |     24.5538 |       276.6487 |      15.4883 | 


  " state: %s" % convergence_dict)


   41 | 00m27s |    0.58693 |             0.9895 |          0.1889 |     24.1451 |       114.7760 |      15.3369 | 
   42 | 00m38s |    0.59574 |             0.9994 |          0.2308 |     12.5675 |       197.6931 |      15.0581 | 
   43 | 00m42s |    0.57729 |             0.8743 |          0.0239 |     16.0751 |       149.8836 |      39.9176 | 
   44 | 00m29s |    0.59313 |             0.6337 |          0.2882 |     24.8863 |       180.7069 |      24.2339 | 
   45 | 00m23s |    0.58974 |             0.5262 |          0.2710 |      6.5776 |       139.2319 |      15.2811 | 
   46 | 00m47s |    0.58998 |             0.8658 |          0.2738 |     24.8746 |       277.4535 |      28.8841 | 


  " state: %s" % convergence_dict)


   47 | 00m30s |    0.59473 |             0.5097 |          0.2307 |     -0.7532 |       209.1952 |      27.4366 | 
   48 | 00m25s |    0.58942 |             0.6200 |          0.2816 |     -0.2128 |       117.1261 |      39.8694 | 
   49 | 00m32s |    0.58934 |             0.5261 |          0.2690 |     12.5632 |       217.6322 |      28.4228 | 
   50 | 00m38s |    0.58608 |             0.5212 |          0.1164 |     14.6564 |       279.6271 |      15.0333 | 
   51 | 00m40s | [35m   0.60069[0m | [32m            0.6399[0m | [32m         0.2687[0m | [32m    24.7589[0m | [32m      251.1466[0m | [32m     31.5713[0m | 
   52 | 00m30s |    0.59692 |             0.5553 |          0.2622 |     20.0782 |       185.4860 |      15.0155 | 
   53 | 00m52s |    0.59293 |             0.9705 |          0.2559 |     24.8261 |       263.9879 |      27.2381 | 
   54 | 00m38s |    0.59175 |             0.5037 |          0.2853 |     24.2665 |       250.4973 |      39.8292 | 
   55 | 00m38s |  

  " state: %s" % convergence_dict)


   57 | 00m33s |    0.57854 |             0.9805 |          0.0523 |     -0.6037 |       100.4809 |      15.1554 | 
   58 | 00m38s |    0.59308 |             0.9886 |          0.2019 |     24.8338 |       155.9004 |      26.4739 | 
   59 | 01m09s |    0.59411 |             0.9954 |          0.0381 |     -0.3444 |       291.0290 |      22.8047 | 
   60 | 00m52s |    0.59662 |             0.9689 |          0.2310 |     -0.7788 |       250.1398 |      39.0172 | 
   61 | 00m45s |    0.59155 |             0.9572 |          0.2805 |     24.7025 |       215.5943 |      26.8272 | 
   62 | 01m05s |    0.57067 |             0.9149 |          0.0102 |     -0.9674 |       238.6549 |      32.1840 | 
   63 | 00m31s |    0.59424 |             0.9854 |          0.2222 |     10.5136 |       100.0389 |      25.2664 | 
   64 | 00m39s |    0.59475 |             0.9980 |          0.2083 |     -0.5283 |       141.9202 |      25.5771 | 


  " state: %s" % convergence_dict)


   65 | 00m54s |    0.59736 |             0.9120 |          0.1826 |     -0.2339 |       260.0480 |      39.7467 | 
   66 | 01m04s |    0.58744 |             0.8662 |          0.0353 |     24.9517 |       255.0783 |      33.1455 | 


  " state: %s" % convergence_dict)


   67 | 00m36s |    0.58878 |             0.5586 |          0.2936 |     23.5738 |       220.8894 |      39.9880 | 
   68 | 00m56s |    0.58823 |             1.0000 |          0.3000 |     17.4158 |       283.0834 |      40.0000 | 
   69 | 00m42s |    0.58828 |             0.9312 |          0.2916 |     -0.7715 |       195.5763 |      31.3125 | 


  " state: %s" % convergence_dict)


   70 | 00m55s |    0.58430 |             0.9674 |          0.2979 |     10.7783 |       296.1275 |      15.3704 | 
   71 | 00m44s |    0.58182 |             0.9320 |          0.2846 |     24.8538 |       207.2020 |      38.9765 | 


  " state: %s" % convergence_dict)


   72 | 00m51s |    0.58925 |             0.9869 |          0.0567 |     24.6797 |       175.9303 |      39.3588 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   73 | 00m45s |    0.58688 |             0.8466 |          0.1771 |     -0.9857 |       208.2313 |      39.5048 | 
   74 | 00m55s |    0.57571 |             0.9756 |          0.0200 |     11.3717 |       187.3202 |      24.7738 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   75 | 00m45s |    0.58134 |             0.7216 |          0.2971 |      8.6896 |       264.9482 |      31.0264 | 


  " state: %s" % convergence_dict)


   76 | 00m36s |    0.56691 |             0.9210 |          0.0237 |     -0.9152 |       100.2217 |      26.9990 | 
   77 | 00m35s |    0.59434 |             0.9316 |          0.1196 |      9.6202 |       106.0449 |      39.9218 | 
   78 | 00m52s |    0.58623 |             0.9924 |          0.2895 |     24.3354 |       264.4387 |      15.1567 | 
   79 | 00m40s |    0.58633 |             1.0000 |          0.3000 |     25.0000 |       146.4720 |      37.5108 | 


  " state: %s" % convergence_dict)


   80 | 00m31s |    0.53054 |             0.5000 |          0.0100 |     -1.0000 |       142.1619 |      15.2071 | 
   81 | 00m36s |    0.58010 |             0.9980 |          0.2784 |     13.9707 |       150.6024 |      15.6614 | 
   82 | 00m37s |    0.56732 |             0.9322 |          0.0680 |      4.9766 |       154.6001 |      28.1945 | 
   83 | 00m40s |    0.58758 |             0.9953 |          0.0591 |      9.4109 |       130.6009 |      24.2884 | 
   84 | 00m29s |    0.59327 |             0.8005 |          0.2302 |     24.7188 |       115.8021 |      39.6963 | 
   85 | 00m41s |    0.58024 |             0.9575 |          0.2543 |     -0.2006 |       188.3751 |      39.7297 | 
   86 | 00m45s |    0.58812 |             0.7340 |          0.2942 |     24.1799 |       287.4516 |      15.3024 | 
   87 | 00m27s |    0.57766 |             0.5758 |          0.2998 |      4.7759 |       137.4260 |      29.8679 | 
   88 | 00m33s |    0.58835 |             0.9813 |          0.2009 |    

  " state: %s" % convergence_dict)


  104 | 00m46s |    0.59006 |             0.7732 |          0.2883 |     24.9523 |       250.6254 |      24.6963 | 
  105 | 00m40s |    0.58397 |             0.5000 |          0.3000 |     19.2771 |       273.4998 |      39.3562 | 
BayesianOptimization) Score: 0.6006942301283495   Params: {'num_leaves': 31.57127605192371, 'max_depth': 24.75887277264046, 'n_estimators': 251.1465886227144, 'learning_rate': 0.2687098558722311, 'colsample_bytree': 0.6398839827468012}


LGBMClassifier(boosting_type='gbdt', class_weight=None,
        colsample_bytree=0.6398839827468012,
        learning_rate=0.2687098558722311, max_depth=24.75887277264046,
        metric='auc', min_child_samples=20, min_child_weight=0.001,
        min_split_gain=0.0, n_estimators=251.1465886227144, n_jobs=-1,
        num_leaves=31.57127605192371, objective=None, random_state=123,
        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0)