In [1]:
import followthemoney_predict as ftmp
from followthemoney_predict.pipelines.xref import util
from followthemoney import model as ftm_model
import followthemoney as ftm

In [2]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV

In [3]:
import pandas as pd
import seaborn as sns
import pylab as py
import numpy as np

from tqdm.notebook import tqdm

The keymap.all_axes rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
  import matplotlib as mpl


In [4]:
import time

In [5]:
%autoreload 2
pd.options.display.max_columns = None

In [6]:
df = pd.read_parquet('../data/xref.aleph.all.parquet')

In [7]:
base_model = ftmp.pipelines.xref.models.XrefXGBoost()
train, test = base_model.prepair_train_test(df)
del df

In [8]:
train_X = util.xarray(train.features)
test_X = util.xarray(test.features)

In [9]:
train_groups = train.left_id + train.right_id

In [10]:
param_grid = {
    "n_jobs": [1],
    "objective": ["binary:logistic"],
    "n_estimators": [120, 140, 160],
    "max_depth": [12, 14, 16, 18],
    "learning_rate": [0.1, 0.2, 0, 3],
    "subsample": [0.8, 0.9, 1.0],
    "colsample_bytree": [0.6, 0.7, 0.8, 0.9, 1.0],
    "colsample_bylevel": [0.9, 1.0],
    "min_child_weight": [0.6, 8.0, 10.0],
    "gamma": [1.0, 1.2, 1.4],
    "reg_lambda": [0.15, 0.2, 0.25],
}


fit_params = {'eval_metric': 'auc',
              'early_stopping_rounds': 10,
              'eval_set': [(test_X, test.judgement)],
              'sample_weight_eval_set': [test.weight]}

In [11]:
clf = xgb.XGBClassifier()
rs_clf = RandomizedSearchCV(clf, param_grid, n_iter=300,
                            n_jobs=8, verbose=4, cv=2,
                            scoring='roc_auc', refit=True,
                            pre_dispatch='n_jobs', random_state=42)
print("Randomized search..")
rs_clf.fit(train_X, train.judgement, groups=train_groups, **fit_params)

Randomized search..
Fitting 2 folds for each of 300 candidates, totalling 600 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  17 tasks      | elapsed: 18.8min
[Parallel(n_jobs=8)]: Done  90 tasks      | elapsed: 108.7min
[Parallel(n_jobs=8)]: Done 213 tasks      | elapsed: 221.5min
[Parallel(n_jobs=8)]: Done 384 tasks      | elapsed: 391.7min
[Parallel(n_jobs=8)]: Done 600 out of 600 | elapsed: 602.5min finished


[0]	validation_0-auc:0.93270
Will train until validation_0-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.93049
[2]	validation_0-auc:0.93363
[3]	validation_0-auc:0.93618
[4]	validation_0-auc:0.93599
[5]	validation_0-auc:0.93775
[6]	validation_0-auc:0.93840
[7]	validation_0-auc:0.93852
[8]	validation_0-auc:0.93878
[9]	validation_0-auc:0.93869
[10]	validation_0-auc:0.93887
[11]	validation_0-auc:0.93913
[12]	validation_0-auc:0.93954
[13]	validation_0-auc:0.93989
[14]	validation_0-auc:0.93998
[15]	validation_0-auc:0.94005
[16]	validation_0-auc:0.94017
[17]	validation_0-auc:0.94012
[18]	validation_0-auc:0.94017
[19]	validation_0-auc:0.94039
[20]	validation_0-auc:0.94050
[21]	validation_0-auc:0.94072
[22]	validation_0-auc:0.94074
[23]	validation_0-auc:0.94064
[24]	validation_0-auc:0.94077
[25]	validation_0-auc:0.94104
[26]	validation_0-auc:0.94106
[27]	validation_0-auc:0.94092
[28]	validation_0-auc:0.94092
[29]	validation_0-auc:0.94093
[30]	validation_0-auc:0.94107
[31]	validation_

RandomizedSearchCV(cv=2,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n_estimators=100,...
                   param_distributions={'colsample_bylevel': [0.9, 1.0],
                                        'colsample_bytree': [0.6, 0.7, 0.8, 0.9,
                           

In [12]:
best_score = rs_clf.best_score_
best_params = rs_clf.best_params_
print("Best score: {}".format(best_score))
print("Best params: ")
for param_name in sorted(best_params.keys()):
    print('%s: %r' % (param_name, best_params[param_name]))

Best score: 0.9757834849957818
Best params: 
colsample_bylevel: 1.0
colsample_bytree: 0.9
gamma: 1.2
learning_rate: 0.2
max_depth: 18
min_child_weight: 0.6
n_estimators: 120
n_jobs: 1
objective: 'binary:logistic'
reg_lambda: 0.2
subsample: 0.8


In [13]:
predict = rs_clf.predict_proba(test_X)
base_model.describe_predictions(test, predict)

Num Positive Predictions: 1040283
Num Negavie Predictions: 612792
Certain Positives
    [pos] Flixflax AG in Liquidation (co: ch, re: CH-020, id: CHE-16) vs <NA> (co: ch, re: CH-020, ad: CH, 89, id: CHE-16)-> { F: 0.00, T: 100.00 }
    [pos] <NA> (co: ch, re: CH-020, id: CHE-10) vs THAMA AG (co: ch, re: CH-020, ad: CH, 84, id: CHE-10)-> { F: 0.00, T: 100.00 }
    [pos] Livermore Real Estate 1 AG in Liquidation (re: CH-020, ad: CH, 80, id: CHE-11) vs <NA> (co: ch, re: CH-020, ad: CH, 80, id: CHE-11)-> { F: 0.00, T: 100.00 }
    [pos] Муниципальное учреждение здравоохранения Тамбовская центральная районная больница (co: ru, ad: 676950) vs <NA> (co: ru, ad: 676950)-> { F: 0.00, T: 100.00 }
    [pos] OPERETTENBUEHNE HOMBRECHTIKON (re: CH-020, ad: CH, 86, id: CHE-10) vs <NA> (co: ch, re: CH-020, ad: CH, 86, id: CHE-10)-> { F: 0.00, T: 100.00 }
Certain Negatives
    [neg] Pierre Poilievre (bi: 1979-0, na: ca) vs A. D. Premadasa (bi: 1948-1, na: lk)-> { F: 100.00, T: 0.00 }
    [neg] Hr. Slam