In [1]:
import pandas as pd
import numpy as np

import sys
sys.path.insert(0, '../python')
from preprocessing import load_suffixes, prepare_input
from recalibrator import Recalibrator
from utils import match
from confidence_intervals import confidence_intervals

In [7]:
# Identifiers of the trios used in this notebook. The cells below read one
# '<trio>.csv' file per identifier from the working directory.
trios = [
    "ajt",
    "chd",
    "corpas",
    "yri",
]

# One-off pre-processing that produces those '<trio>.csv' files from
# '../data/<trio>/'. Uncomment it for the first run of the notebook, then
# comment it out again so the files are not recomputed on every run.
#
# for trio in trios:
#     data_dir = '../data/' + trio + '/'
#     df = load_suffixes(data_dir)
#     df.to_csv(trio + '.csv', index=False)

For each trio, train on the remaining trios (leave-one-trio-out), predict on the held-out trio, and store its predictions in a dictionary keyed by trio name.

In [None]:

# Leave-one-trio-out evaluation.
#
# For every trio in `trios`, a Recalibrator is trained on the CSVs of all the
# other trios and evaluated on the held-out one. The outcome is stored in
# `results_cum[<held-out trio>]`, a dict with these keys:
#   'y_test'      - target column 'justchild^GT' of the held-out trio
#   'preds_naive' - column 'abortus^GT' of the held-out trio, used as-is
#   'idx'         - {contamination value: boolean row mask over the test set}
#   'preds_meta', 'preds_lr', 'preds_xgb', 'preds_ci'
#                 - predictions of the corresponding Recalibrator models
results_cum = {}

for test in trios:
    results = {}

    # Build the training frame with a single concat instead of growing it with
    # DataFrame.append inside a loop: `append` was removed in pandas 2.0 and
    # re-copies the accumulated frame on every call. The default
    # ignore_index=False keeps the per-file row labels, exactly as the
    # append-based version did.
    df_train = pd.concat(
        [pd.read_csv(train + '.csv') for train in trios if train != test]
    )

    df_test = pd.read_csv(test + '.csv')

    # 'justchild^GT' is the prediction target; it is passed to prepare_input as
    # a target column (presumably so it is excluded from the features - see
    # preprocessing.prepare_input to confirm).
    X_train = prepare_input(df_train, target_cols=['justchild^GT'])
    y_train = df_train['justchild^GT'].values
    X_test = prepare_input(df_test, target_cols=['justchild^GT'])
    y_test = df_test['justchild^GT'].values

    contaminations = df_test['contamination'].values
    # np.unique already returns its values in sorted order.
    contamination_values = list(np.unique(contaminations))

    # Split the test set by contamination value. Only the masks (`idx`) are
    # stored in `results`; X_tests / y_tests stay available in the notebook
    # namespace for inspection.
    X_tests = {}
    y_tests = {}
    idx = {}

    for contamination in contamination_values:
        idx[contamination] = contaminations == contamination
        X_tests[contamination] = X_test[idx[contamination]]
        y_tests[contamination] = y_test[idx[contamination]]

    results['y_test'] = y_test
    results['preds_naive'] = df_test['abortus^GT'].values
    results['idx'] = idx

    r = Recalibrator()
    print(test)  # label the training log that follows with the held-out trio
    r.train(X_train, y_train)

    results['preds_meta'] = r.model_meta.predict(X_test)
    results['preds_lr'] = r.predict_lr(X_test)
    results['preds_xgb'] = r.predict_xgb(X_test)
    results['preds_ci'] = r.model_ci.predict(X_test)

    results_cum[test] = results


ajt




Training logistic regression




Training XGB
[0]	validation_0-merror:0.054629
Will train until validation_0-merror hasn't improved in 20 rounds.
[1]	validation_0-merror:0.052079
[2]	validation_0-merror:0.051673
[3]	validation_0-merror:0.051002
[4]	validation_0-merror:0.04958
[5]	validation_0-merror:0.049152
[6]	validation_0-merror:0.047837
[7]	validation_0-merror:0.046786
[8]	validation_0-merror:0.046094
[9]	validation_0-merror:0.04579
[10]	validation_0-merror:0.044693
[11]	validation_0-merror:0.0443
[12]	validation_0-merror:0.043578
[13]	validation_0-merror:0.043176
[14]	validation_0-merror:0.042766
[15]	validation_0-merror:0.042364
[16]	validation_0-merror:0.041924
[17]	validation_0-merror:0.041625
[18]	validation_0-merror:0.041121
[19]	validation_0-merror:0.040758
[20]	validation_0-merror:0.040446
[21]	validation_0-merror:0.040113
[22]	validation_0-merror:0.039763
[23]	validation_0-merror:0.03937
[24]	validation_0-merror:0.039126
[25]	validation_0-merror:0.038844
[26]	validation_0-merror:0.038575
[27]	validation_0

[236]	validation_0-merror:0.030736
[237]	validation_0-merror:0.030741
[238]	validation_0-merror:0.030753
[239]	validation_0-merror:0.030736
[240]	validation_0-merror:0.030702
[241]	validation_0-merror:0.030698
[242]	validation_0-merror:0.030702
[243]	validation_0-merror:0.030689
[244]	validation_0-merror:0.030664
[245]	validation_0-merror:0.030659
[246]	validation_0-merror:0.030651
[247]	validation_0-merror:0.030647
[248]	validation_0-merror:0.030647
[249]	validation_0-merror:0.030634
[250]	validation_0-merror:0.030668
[251]	validation_0-merror:0.030608
[252]	validation_0-merror:0.030591
[253]	validation_0-merror:0.0306
[254]	validation_0-merror:0.0306
[255]	validation_0-merror:0.03057
[256]	validation_0-merror:0.030514
[257]	validation_0-merror:0.030531
[258]	validation_0-merror:0.030544
[259]	validation_0-merror:0.030518
[260]	validation_0-merror:0.030523
[261]	validation_0-merror:0.03054
[262]	validation_0-merror:0.030501
[263]	validation_0-merror:0.030471
[264]	validation_0-merror:

[471]	validation_0-merror:0.029361
[472]	validation_0-merror:0.029374
[473]	validation_0-merror:0.029374
[474]	validation_0-merror:0.029365
[475]	validation_0-merror:0.029386
[476]	validation_0-merror:0.029395
[477]	validation_0-merror:0.029374
[478]	validation_0-merror:0.029356
[479]	validation_0-merror:0.029365
[480]	validation_0-merror:0.029352
[481]	validation_0-merror:0.029344
[482]	validation_0-merror:0.029339
[483]	validation_0-merror:0.029318
[484]	validation_0-merror:0.029327
[485]	validation_0-merror:0.029322
[486]	validation_0-merror:0.029327
[487]	validation_0-merror:0.029301
[488]	validation_0-merror:0.029305
[489]	validation_0-merror:0.029267
[490]	validation_0-merror:0.029284
[491]	validation_0-merror:0.029292
[492]	validation_0-merror:0.029318
[493]	validation_0-merror:0.029339
[494]	validation_0-merror:0.029331
[495]	validation_0-merror:0.029335
[496]	validation_0-merror:0.029339
[497]	validation_0-merror:0.029318
[498]	validation_0-merror:0.029318
[499]	validation_0-m

  if diff:
  lower_bound = contaminations - z*np.sqrt(contaminations*(1 - contaminations)/df_test[ab_name + '^DP'].values)
  upper_bound = contaminations + z*np.sqrt(contaminations*(1 - contaminations)/df_test[ab_name + '^DP'].values)


chd
Training logistic regression




Training XGB
[0]	validation_0-merror:0.032968
Will train until validation_0-merror hasn't improved in 20 rounds.
[1]	validation_0-merror:0.032144
[2]	validation_0-merror:0.031206
[3]	validation_0-merror:0.03026
[4]	validation_0-merror:0.030078
[5]	validation_0-merror:0.029977
[6]	validation_0-merror:0.029427
[7]	validation_0-merror:0.029528
[8]	validation_0-merror:0.029359
[9]	validation_0-merror:0.028671
[10]	validation_0-merror:0.028077
[11]	validation_0-merror:0.027798
[12]	validation_0-merror:0.02758
[13]	validation_0-merror:0.027273
[14]	validation_0-merror:0.027022
[15]	validation_0-merror:0.026897
[16]	validation_0-merror:0.026569
[17]	validation_0-merror:0.026323
[18]	validation_0-merror:0.026121
[19]	validation_0-merror:0.025995
[20]	validation_0-merror:0.02564
[21]	validation_0-merror:0.025381
[22]	validation_0-merror:0.025033
[23]	validation_0-merror:0.024843
[24]	validation_0-merror:0.024637
[25]	validation_0-merror:0.02433
[26]	validation_0-merror:0.024237
[27]	validation_

Serialize the results dictionary and save it to `results_1vA.pickle`.

In [None]:
import pickle

# Persist the per-trio results dictionary so later analysis can reload it
# without re-training the models.
with open("results_1vA.pickle", "wb") as pickle_file:
    pickle.dump(results_cum, pickle_file)