In [1]:
import pandas as pd
import numpy as np

import sys
sys.path.insert(0, '../python')
from preprocessing import load_suffixes, prepare_input
from recalibrator import Recalibrator
from utils import match

# Seed NumPy's global random number generator so that anything drawing from
# it produces the same numbers on every run of the notebook.
np.random.seed(64)

In [2]:
import os

# Names of the trios; each one has a data directory '../data/<trio>/' and is
# cached as '<trio>.csv' in the working directory.
trios = ["ajt", "chd", "corpas", "yri"]

# Pre-processing. Loading a trio is slow, so the result is cached as a CSV
# and the trio is skipped when that CSV already exists. Delete '<trio>.csv'
# to force it to be recomputed (e.g. after the underlying data changed).
for trio in trios:
    csv_path = trio + '.csv'
    if os.path.exists(csv_path):
        continue

    data_dir = '../data/' + trio + '/'
    df = load_suffixes(data_dir)
    df.to_csv(csv_path, index=False)

100%|██████████| 12/12 [01:51<00:00,  9.06s/it]
100%|██████████| 12/12 [01:36<00:00,  8.37s/it]
100%|██████████| 12/12 [01:35<00:00,  7.98s/it]
100%|██████████| 12/12 [02:19<00:00, 11.49s/it]


For each trio, construct a training set from the remaining trios, train the models on it, and save the predictions for the held-out trio into a dictionary.

In [3]:

# Only every TRAIN_SUBSAMPLE_STEP-th row of the pooled training data is kept.
TRAIN_SUBSAMPLE_STEP = 100

# Maps each held-out trio name to the dictionary of results computed for it.
results_cum = {}

for test in trios:
    results = {}

    # Pool the CSVs of all other trios with a single concat. DataFrame.append
    # no longer exists in pandas >= 2.0, and calling it in a loop re-copied
    # the accumulated frame on every iteration.
    df_train = pd.concat(
        [pd.read_csv(train + '.csv') for train in trios if train != test]
    )

    df_train = df_train[::TRAIN_SUBSAMPLE_STEP]
    df_test = pd.read_csv(test + '.csv')

    X_train = prepare_input(df_train, target_cols=['justchild^GT'])
    y_train = df_train['justchild^GT'].values
    X_test = prepare_input(df_test, target_cols=['justchild^GT'])
    y_test = df_test['justchild^GT'].values

    contaminations = df_test['contamination'].values
    contamination_values = sorted(np.unique(contaminations))

    # Per-contamination views of the test set: a boolean row mask plus the
    # matching slices of the inputs and targets.
    X_tests = {}
    y_tests = {}
    idx = {}

    for contamination in contamination_values:
        idx[contamination] = contaminations == contamination
        X_tests[contamination] = X_test[idx[contamination]]
        y_tests[contamination] = y_test[idx[contamination]]

    results['dp'] = df_test['abortus^DP'].values
    results['y_test'] = y_test
    # The "naive" prediction is the 'abortus^GT' column taken as-is.
    results['preds_naive'] = df_test['abortus^GT'].values
    results['idx'] = idx

    r = Recalibrator()
    print(test)
    r.train(X_train, y_train)

    # Predictions of each model exposed by the trained recalibrator.
    results['preds_meta'] = r.model_meta.predict(X_test)
    results['probs'] = r.model_meta.predict_proba(X_test)
    results['preds_lr'] = r.predict_lr(X_test)
    results['preds_xgb'] = r.predict_xgb(X_test)
#     results['preds_ci'] = r.model_ci.predict(X_test)
    results['preds_mle'] = r.model_mle.predict(X_test)

    results_cum[test] = results


ajt
Training logistic regression




Training XGB
[0]	validation_0-merror:0.064502
Will train until validation_0-merror hasn't improved in 20 rounds.
[1]	validation_0-merror:0.05724
[2]	validation_0-merror:0.058095
[3]	validation_0-merror:0.058522
[4]	validation_0-merror:0.058949
[5]	validation_0-merror:0.057668
[6]	validation_0-merror:0.055532
[7]	validation_0-merror:0.052969
[8]	validation_0-merror:0.052969
[9]	validation_0-merror:0.052542
[10]	validation_0-merror:0.052114
[11]	validation_0-merror:0.050406
[12]	validation_0-merror:0.048697
[13]	validation_0-merror:0.049124
[14]	validation_0-merror:0.049124
[15]	validation_0-merror:0.04827
[16]	validation_0-merror:0.048697
[17]	validation_0-merror:0.047843
[18]	validation_0-merror:0.046134
[19]	validation_0-merror:0.045707
[20]	validation_0-merror:0.04528
[21]	validation_0-merror:0.045707
[22]	validation_0-merror:0.044425
[23]	validation_0-merror:0.044425
[24]	validation_0-merror:0.043144
[25]	validation_0-merror:0.043998
[26]	validation_0-merror:0.041862
[27]	validation



chd
Training logistic regression




Training XGB
[0]	validation_0-merror:0.047292
Will train until validation_0-merror hasn't improved in 20 rounds.
[1]	validation_0-merror:0.046079
[2]	validation_0-merror:0.037187
[3]	validation_0-merror:0.040016
[4]	validation_0-merror:0.036783
[5]	validation_0-merror:0.034762
[6]	validation_0-merror:0.034762
[7]	validation_0-merror:0.035974
[8]	validation_0-merror:0.03557
[9]	validation_0-merror:0.034762
[10]	validation_0-merror:0.034357
[11]	validation_0-merror:0.034762
[12]	validation_0-merror:0.034357
[13]	validation_0-merror:0.033549
[14]	validation_0-merror:0.033549
[15]	validation_0-merror:0.033549
[16]	validation_0-merror:0.03274
[17]	validation_0-merror:0.031124
[18]	validation_0-merror:0.031528
[19]	validation_0-merror:0.031528
[20]	validation_0-merror:0.031528
[21]	validation_0-merror:0.030315
[22]	validation_0-merror:0.029911
[23]	validation_0-merror:0.029911
[24]	validation_0-merror:0.029103
[25]	validation_0-merror:0.028698
[26]	validation_0-merror:0.029911
[27]	validatio



corpas
Training logistic regression




Training XGB
[0]	validation_0-merror:0.053264
Will train until validation_0-merror hasn't improved in 20 rounds.
[1]	validation_0-merror:0.054065
[2]	validation_0-merror:0.052062
[3]	validation_0-merror:0.048458
[4]	validation_0-merror:0.049259
[5]	validation_0-merror:0.047257
[6]	validation_0-merror:0.047657
[7]	validation_0-merror:0.046456
[8]	validation_0-merror:0.046856
[9]	validation_0-merror:0.046456
[10]	validation_0-merror:0.047257
[11]	validation_0-merror:0.045655
[12]	validation_0-merror:0.044453
[13]	validation_0-merror:0.044053
[14]	validation_0-merror:0.044053
[15]	validation_0-merror:0.043652
[16]	validation_0-merror:0.043652
[17]	validation_0-merror:0.042851
[18]	validation_0-merror:0.04165
[19]	validation_0-merror:0.041249
[20]	validation_0-merror:0.039648
[21]	validation_0-merror:0.039247
[22]	validation_0-merror:0.038446
[23]	validation_0-merror:0.038847
[24]	validation_0-merror:0.039247
[25]	validation_0-merror:0.038446
[26]	validation_0-merror:0.037645
[27]	validati



yri
Training logistic regression




Training XGB
[0]	validation_0-merror:0.051628
Will train until validation_0-merror hasn't improved in 20 rounds.
[1]	validation_0-merror:0.046047
[2]	validation_0-merror:0.049767
[3]	validation_0-merror:0.049767
[4]	validation_0-merror:0.047907
[5]	validation_0-merror:0.048372
[6]	validation_0-merror:0.047442
[7]	validation_0-merror:0.046977
[8]	validation_0-merror:0.046977
[9]	validation_0-merror:0.045581
[10]	validation_0-merror:0.044651
[11]	validation_0-merror:0.044651
[12]	validation_0-merror:0.045116
[13]	validation_0-merror:0.044186
[14]	validation_0-merror:0.044651
[15]	validation_0-merror:0.044651
[16]	validation_0-merror:0.045116
[17]	validation_0-merror:0.044186
[18]	validation_0-merror:0.043721
[19]	validation_0-merror:0.043256
[20]	validation_0-merror:0.042791
[21]	validation_0-merror:0.043256
[22]	validation_0-merror:0.042791
[23]	validation_0-merror:0.043721
[24]	validation_0-merror:0.043721
[25]	validation_0-merror:0.042791
[26]	validation_0-merror:0.043721
[27]	validat



Serialize the dictionary of per-trio results and save it to disk.

In [4]:
import pickle

# Write the per-trio results dictionary to a pickle file in the working
# directory; the file is opened in binary mode as pickle requires.
with open("results_1vA.pickle", mode="wb") as pickle_file:
    pickle.dump(results_cum, pickle_file)