In [1]:
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from tqdm import tqdm
from sklearn.metrics import accuracy_score

from preprocessing import *
from recalibrator import Recalibrator
from utils import match
from confidence_intervals import confidence_intervals

In [2]:
# Identifiers of the trios processed by this notebook. Each identifier is
# also used to build the per-trio CSV file name ('<trio>.csv').
trios = [
    "ajt",
    "chd",
    "corpas",
    "yri",
]

# # One-off pre-processing step: enable it on the first run to write the
# # per-trio CSV files, then disable it again so they are not recomputed.

# for trio in trios:
#     data_dir = '../data/' + trio + '/'
#     df = load_suffixes(data_dir)
#     df.to_csv(trio + '.csv')

In [3]:

# Leave-one-trio-out evaluation: every trio is held out once as the test set
# while the recalibrator is trained on the CSV files of all remaining trios.
# results_cum maps the held-out trio name to a dict with the keys
# 'y_test', 'preds_naive', 'idx', 'preds_lr', 'preds_xgb' and 'preds_ci'.
results_cum = {}

for test in trios:
    results = {}

    # Build the training frame with a single concat. DataFrame.append is
    # deprecated (removed in pandas 2.0) and re-copies the accumulated data
    # on every call when used inside a loop.
    df_train = pd.concat(
        [pd.read_csv(train + '.csv') for train in trios if train != test]
    )
    df_test = pd.read_csv(test + '.csv')

    # Columns selected by match("GT", pos=-1) -- presumably the genotype
    # ('...^GT') columns; verify against utils.match.
    gt_cols = list(filter(match("GT", pos=-1), df_train.columns.values))
    to_drop = list(set(['#CHROM', 'POS', 'Unnamed: 0', 'Unnamed: 1'] + gt_cols))

    train_features = df_train.drop(to_drop + ["justchild^GT"], axis=1)
    test_features = df_test.drop(to_drop + ["justchild^GT"], axis=1)

    # .values discards the column labels, so the feature columns of the test
    # frame must be identical to, and ordered like, those of the training
    # frame before both are converted to plain arrays.
    if set(test_features.columns) != set(train_features.columns):
        raise ValueError(
            "Feature columns of test trio '%s' do not match the training columns"
            % test
        )
    test_features = test_features[list(train_features.columns)]

    X_train = train_features.values
    y_train = df_train['justchild^GT'].values
    X_test = test_features.values
    y_test = df_test['justchild^GT'].values

    contaminations = df_test['contamination'].values
    contamination_values = sorted(np.unique(contaminations))

    # Per-contamination-level views of the test set; idx holds the boolean
    # row mask for each level so predictions can be sliced the same way.
    X_tests = {}
    y_tests = {}
    idx = {}

    for contamination in contamination_values:
        idx[contamination] = contaminations == contamination
        X_tests[contamination] = X_test[idx[contamination]]
        y_tests[contamination] = y_test[idx[contamination]]

    results['y_test'] = y_test
    # The 'abortus^GT' column of the test frame is stored as the naive prediction.
    results['preds_naive'] = df_test['abortus^GT'].values
    results['idx'] = idx

    r = Recalibrator()
    print(test)
    r.train(X_train, y_train)

    results['preds_lr'] = r.predict_lr(X_test)
    results['preds_xgb'] = r.predict_xgb(X_test)
    results['preds_ci'] = confidence_intervals(df_test)

    results_cum[test] = results


ajt
Training logistic regression




Training XGB
[0]	validation_0-merror:0.054629
Will train until validation_0-merror hasn't improved in 20 rounds.
[1]	validation_0-merror:0.052079
[2]	validation_0-merror:0.051673
[3]	validation_0-merror:0.051002
[4]	validation_0-merror:0.04958
[5]	validation_0-merror:0.049152
[6]	validation_0-merror:0.047837
[7]	validation_0-merror:0.046786
[8]	validation_0-merror:0.046094
[9]	validation_0-merror:0.04579
[10]	validation_0-merror:0.044693
[11]	validation_0-merror:0.0443
[12]	validation_0-merror:0.043578
[13]	validation_0-merror:0.043176
[14]	validation_0-merror:0.042766
[15]	validation_0-merror:0.042364
[16]	validation_0-merror:0.041924
[17]	validation_0-merror:0.041625
[18]	validation_0-merror:0.041121
[19]	validation_0-merror:0.040758
[20]	validation_0-merror:0.040446
[21]	validation_0-merror:0.040113
[22]	validation_0-merror:0.039763
[23]	validation_0-merror:0.03937
[24]	validation_0-merror:0.039126
[25]	validation_0-merror:0.038844
[26]	validation_0-merror:0.038575
[27]	validation_0

[236]	validation_0-merror:0.030809
[237]	validation_0-merror:0.030779
[238]	validation_0-merror:0.030762
[239]	validation_0-merror:0.030753
[240]	validation_0-merror:0.030766
[241]	validation_0-merror:0.03077
[242]	validation_0-merror:0.030753
[243]	validation_0-merror:0.030741
[244]	validation_0-merror:0.030762
[245]	validation_0-merror:0.030758
[246]	validation_0-merror:0.030749
[247]	validation_0-merror:0.030732
[248]	validation_0-merror:0.030715
[249]	validation_0-merror:0.030689
[250]	validation_0-merror:0.030664
[251]	validation_0-merror:0.030659
[252]	validation_0-merror:0.030655
[253]	validation_0-merror:0.030655
[254]	validation_0-merror:0.030664
[255]	validation_0-merror:0.030625
[256]	validation_0-merror:0.030612
[257]	validation_0-merror:0.030574
[258]	validation_0-merror:0.030561
[259]	validation_0-merror:0.030565
[260]	validation_0-merror:0.030557
[261]	validation_0-merror:0.030557
[262]	validation_0-merror:0.030548
[263]	validation_0-merror:0.030544
[264]	validation_0-me

[472]	validation_0-merror:0.029472
[473]	validation_0-merror:0.029459
[474]	validation_0-merror:0.02948
[475]	validation_0-merror:0.029463
[476]	validation_0-merror:0.029485
[477]	validation_0-merror:0.029472
[478]	validation_0-merror:0.029442
[479]	validation_0-merror:0.029412
[480]	validation_0-merror:0.029395
[481]	validation_0-merror:0.029408
[482]	validation_0-merror:0.029403
[483]	validation_0-merror:0.029395
[484]	validation_0-merror:0.029403
[485]	validation_0-merror:0.029403
[486]	validation_0-merror:0.029442
[487]	validation_0-merror:0.029399
[488]	validation_0-merror:0.029361
[489]	validation_0-merror:0.029378
[490]	validation_0-merror:0.029378
[491]	validation_0-merror:0.029352
[492]	validation_0-merror:0.029318
[493]	validation_0-merror:0.029318
[494]	validation_0-merror:0.029352
[495]	validation_0-merror:0.029348
[496]	validation_0-merror:0.029339
[497]	validation_0-merror:0.029314
[498]	validation_0-merror:0.029297
[499]	validation_0-merror:0.029301
[500]	validation_0-me

  if diff:
  lower_bound = contaminations - z*np.sqrt(contaminations*(1 - contaminations)/df_test[sample_name + '^DP'].values)
  upper_bound = contaminations + z*np.sqrt(contaminations*(1 - contaminations)/df_test[sample_name + '^DP'].values)


chd
Training logistic regression
Training XGB
[0]	validation_0-merror:0.032968
Will train until validation_0-merror hasn't improved in 20 rounds.
[1]	validation_0-merror:0.032144
[2]	validation_0-merror:0.031206
[3]	validation_0-merror:0.03026
[4]	validation_0-merror:0.030078
[5]	validation_0-merror:0.029977
[6]	validation_0-merror:0.029427
[7]	validation_0-merror:0.029528
[8]	validation_0-merror:0.029359
[9]	validation_0-merror:0.028671
[10]	validation_0-merror:0.028085
[11]	validation_0-merror:0.027802
[12]	validation_0-merror:0.027584
[13]	validation_0-merror:0.027273
[14]	validation_0-merror:0.027022
[15]	validation_0-merror:0.026897
[16]	validation_0-merror:0.026569
[17]	validation_0-merror:0.026323
[18]	validation_0-merror:0.026121
[19]	validation_0-merror:0.025995
[20]	validation_0-merror:0.02564
[21]	validation_0-merror:0.025381
[22]	validation_0-merror:0.025033
[23]	validation_0-merror:0.024843
[24]	validation_0-merror:0.024637
[25]	validation_0-merror:0.02433
[26]	validation_

[236]	validation_0-merror:0.01667
[237]	validation_0-merror:0.016662
[238]	validation_0-merror:0.016646
[239]	validation_0-merror:0.016634
[240]	validation_0-merror:0.016622
[241]	validation_0-merror:0.016626
[242]	validation_0-merror:0.016626
[243]	validation_0-merror:0.016609
[244]	validation_0-merror:0.016593
[245]	validation_0-merror:0.016593
[246]	validation_0-merror:0.016597
[247]	validation_0-merror:0.016593
[248]	validation_0-merror:0.016585
[249]	validation_0-merror:0.016569
[250]	validation_0-merror:0.016561
[251]	validation_0-merror:0.016545
[252]	validation_0-merror:0.016516
[253]	validation_0-merror:0.016504
[254]	validation_0-merror:0.016508
[255]	validation_0-merror:0.016516
[256]	validation_0-merror:0.016496
[257]	validation_0-merror:0.016492
[258]	validation_0-merror:0.016476
[259]	validation_0-merror:0.01646
[260]	validation_0-merror:0.016436
[261]	validation_0-merror:0.01644
[262]	validation_0-merror:0.016452
[263]	validation_0-merror:0.01644
[264]	validation_0-merro

[471]	validation_0-merror:0.015579
[472]	validation_0-merror:0.015587
[473]	validation_0-merror:0.015575
[474]	validation_0-merror:0.015571
[475]	validation_0-merror:0.015554
[476]	validation_0-merror:0.015583
[477]	validation_0-merror:0.015583
[478]	validation_0-merror:0.015567
[479]	validation_0-merror:0.015563
[480]	validation_0-merror:0.015563
[481]	validation_0-merror:0.015563
[482]	validation_0-merror:0.015554
[483]	validation_0-merror:0.015538
[484]	validation_0-merror:0.015542
[485]	validation_0-merror:0.015542
[486]	validation_0-merror:0.015498
[487]	validation_0-merror:0.015506
[488]	validation_0-merror:0.015506
[489]	validation_0-merror:0.015514
[490]	validation_0-merror:0.015502
[491]	validation_0-merror:0.015502
[492]	validation_0-merror:0.015498
[493]	validation_0-merror:0.015486
[494]	validation_0-merror:0.015478
[495]	validation_0-merror:0.015474
[496]	validation_0-merror:0.015474
[497]	validation_0-merror:0.015453
[498]	validation_0-merror:0.015445
[499]	validation_0-m

  if diff:


corpas
Training logistic regression
Training XGB
[0]	validation_0-merror:0.04543
Will train until validation_0-merror hasn't improved in 20 rounds.
[1]	validation_0-merror:0.043704
[2]	validation_0-merror:0.04239
[3]	validation_0-merror:0.042042
[4]	validation_0-merror:0.041717
[5]	validation_0-merror:0.04062
[6]	validation_0-merror:0.040436
[7]	validation_0-merror:0.039627
[8]	validation_0-merror:0.039086
[9]	validation_0-merror:0.038333
[10]	validation_0-merror:0.038025
[11]	validation_0-merror:0.037424
[12]	validation_0-merror:0.036883
[13]	validation_0-merror:0.036467
[14]	validation_0-merror:0.035986
[15]	validation_0-merror:0.035389
[16]	validation_0-merror:0.034989
[17]	validation_0-merror:0.034648
[18]	validation_0-merror:0.034424
[19]	validation_0-merror:0.034051
[20]	validation_0-merror:0.033643
[21]	validation_0-merror:0.03325
[22]	validation_0-merror:0.03293
[23]	validation_0-merror:0.032481
[24]	validation_0-merror:0.032101
[25]	validation_0-merror:0.031716
[26]	validation

[235]	validation_0-merror:0.022957
[236]	validation_0-merror:0.022933
[237]	validation_0-merror:0.022929
[238]	validation_0-merror:0.022937
[239]	validation_0-merror:0.022933
[240]	validation_0-merror:0.022953
[241]	validation_0-merror:0.022917
[242]	validation_0-merror:0.022921
[243]	validation_0-merror:0.022901
[244]	validation_0-merror:0.022893
[245]	validation_0-merror:0.022877
[246]	validation_0-merror:0.022897
[247]	validation_0-merror:0.022889
[248]	validation_0-merror:0.022857
[249]	validation_0-merror:0.022853
[250]	validation_0-merror:0.022865
[251]	validation_0-merror:0.022845
[252]	validation_0-merror:0.022841
[253]	validation_0-merror:0.022833
[254]	validation_0-merror:0.022825
[255]	validation_0-merror:0.022805
[256]	validation_0-merror:0.022817
[257]	validation_0-merror:0.022801
[258]	validation_0-merror:0.022805
[259]	validation_0-merror:0.022801
[260]	validation_0-merror:0.022797
[261]	validation_0-merror:0.022769
[262]	validation_0-merror:0.022789
[263]	validation_0-m

[471]	validation_0-merror:0.021684
[472]	validation_0-merror:0.021672
[473]	validation_0-merror:0.021652
[474]	validation_0-merror:0.021676
[475]	validation_0-merror:0.021696
[476]	validation_0-merror:0.021684
[477]	validation_0-merror:0.02166
[478]	validation_0-merror:0.02164
[479]	validation_0-merror:0.021616
[480]	validation_0-merror:0.02164
[481]	validation_0-merror:0.021612
[482]	validation_0-merror:0.021584
[483]	validation_0-merror:0.021576
[484]	validation_0-merror:0.021596
[485]	validation_0-merror:0.02158
[486]	validation_0-merror:0.02158
[487]	validation_0-merror:0.02158
[488]	validation_0-merror:0.021576
[489]	validation_0-merror:0.021564
[490]	validation_0-merror:0.02158
[491]	validation_0-merror:0.021576
[492]	validation_0-merror:0.021551
[493]	validation_0-merror:0.021568
[494]	validation_0-merror:0.021568
[495]	validation_0-merror:0.021568
[496]	validation_0-merror:0.021551
[497]	validation_0-merror:0.021539
[498]	validation_0-merror:0.021507
[499]	validation_0-merror:0

  if diff:


yri
Training logistic regression
Training XGB
[0]	validation_0-merror:0.056071
Will train until validation_0-merror hasn't improved in 20 rounds.
[1]	validation_0-merror:0.055192
[2]	validation_0-merror:0.054605
[3]	validation_0-merror:0.05455
[4]	validation_0-merror:0.053959
[5]	validation_0-merror:0.053009
[6]	validation_0-merror:0.05266
[7]	validation_0-merror:0.049911
[8]	validation_0-merror:0.048924
[9]	validation_0-merror:0.047291
[10]	validation_0-merror:0.046449
[11]	validation_0-merror:0.046286
[12]	validation_0-merror:0.045765
[13]	validation_0-merror:0.044723
[14]	validation_0-merror:0.043769
[15]	validation_0-merror:0.043076
[16]	validation_0-merror:0.042517
[17]	validation_0-merror:0.042127
[18]	validation_0-merror:0.041717
[19]	validation_0-merror:0.041363
[20]	validation_0-merror:0.041084
[21]	validation_0-merror:0.040903
[22]	validation_0-merror:0.040735
[23]	validation_0-merror:0.040103
[24]	validation_0-merror:0.039782
[25]	validation_0-merror:0.039409
[26]	validation

[235]	validation_0-merror:0.03008
[236]	validation_0-merror:0.030071
[237]	validation_0-merror:0.030085
[238]	validation_0-merror:0.03008
[239]	validation_0-merror:0.030104
[240]	validation_0-merror:0.030127
[241]	validation_0-merror:0.030136
[242]	validation_0-merror:0.030155
[243]	validation_0-merror:0.030104
[244]	validation_0-merror:0.030085
[245]	validation_0-merror:0.030118
[246]	validation_0-merror:0.030085
[247]	validation_0-merror:0.030076
[248]	validation_0-merror:0.030066
[249]	validation_0-merror:0.030104
[250]	validation_0-merror:0.030071
[251]	validation_0-merror:0.030015
[252]	validation_0-merror:0.030001
[253]	validation_0-merror:0.030025
[254]	validation_0-merror:0.030052
[255]	validation_0-merror:0.030057
[256]	validation_0-merror:0.030048
[257]	validation_0-merror:0.030052
[258]	validation_0-merror:0.030025
[259]	validation_0-merror:0.030015
[260]	validation_0-merror:0.029983
[261]	validation_0-merror:0.030011
[262]	validation_0-merror:0.029997
[263]	validation_0-mer

[470]	validation_0-merror:0.028759
[471]	validation_0-merror:0.028787
[472]	validation_0-merror:0.028787
[473]	validation_0-merror:0.028787
[474]	validation_0-merror:0.028782
[475]	validation_0-merror:0.028778
[476]	validation_0-merror:0.028796
[477]	validation_0-merror:0.028759
[478]	validation_0-merror:0.028787
[479]	validation_0-merror:0.028778
[480]	validation_0-merror:0.028754
[481]	validation_0-merror:0.028754
[482]	validation_0-merror:0.028731
[483]	validation_0-merror:0.02874
[484]	validation_0-merror:0.028731
[485]	validation_0-merror:0.028712
[486]	validation_0-merror:0.028712
[487]	validation_0-merror:0.028717
[488]	validation_0-merror:0.028703
[489]	validation_0-merror:0.028699
[490]	validation_0-merror:0.028699
[491]	validation_0-merror:0.02868
[492]	validation_0-merror:0.028685
[493]	validation_0-merror:0.028712
[494]	validation_0-merror:0.028712
[495]	validation_0-merror:0.028722
[496]	validation_0-merror:0.028722
[497]	validation_0-merror:0.02875
[498]	validation_0-merr

[705]	validation_0-merror:0.028094
[706]	validation_0-merror:0.028084
[707]	validation_0-merror:0.028089
[708]	validation_0-merror:0.028089
[709]	validation_0-merror:0.028084
[710]	validation_0-merror:0.028052
[711]	validation_0-merror:0.02807
[712]	validation_0-merror:0.028075
[713]	validation_0-merror:0.02808
[714]	validation_0-merror:0.028075
[715]	validation_0-merror:0.028084
[716]	validation_0-merror:0.02808
[717]	validation_0-merror:0.028108
[718]	validation_0-merror:0.028098
[719]	validation_0-merror:0.02808
[720]	validation_0-merror:0.028084
[721]	validation_0-merror:0.028052
[722]	validation_0-merror:0.028038
[723]	validation_0-merror:0.028052
[724]	validation_0-merror:0.028047
[725]	validation_0-merror:0.028038
[726]	validation_0-merror:0.028042
[727]	validation_0-merror:0.028015
[728]	validation_0-merror:0.02801
[729]	validation_0-merror:0.028038
[730]	validation_0-merror:0.02801
[731]	validation_0-merror:0.02801
[732]	validation_0-merror:0.028015
[733]	validation_0-merror:0

  if diff:


In [None]:
import pickle

# Serialize the accumulated per-trio results dictionary to disk.
with open("results_1vA.pickle", "wb") as out_file:
    pickle.dump(results_cum, out_file)