In [1]:
import pandas as pd
import numpy as np

import sys
sys.path.insert(0, '../python')
from preprocessing import load_suffixes, prepare_input
from recalibrator import Recalibrator
from utils import match
from confidence_intervals import confidence_intervals

# Seed NumPy's global random number generator with a fixed value so that
# anything drawing from it behaves the same way on every run.
SEED = 64
np.random.seed(SEED)

In [2]:
# Identifiers of the trios processed by this notebook. The cells below read
# one '<trio>.csv' file per identifier from the working directory.
trios = [
    "ajt",
    "chd",
    "corpas",
    "yri",
]

# One-off pre-processing step. Uncomment it for the first run to build the
# per-trio CSV files, then comment it out again so they are not recomputed.

# for trio in trios:
#     data_dir = '../data/' + trio + '/'
#     df = load_suffixes(data_dir)
#     df.to_csv(trio + '.csv', index=False)

For each trio, build a training set from the remaining trios, train the recalibrator on it, and save the predictions for the held-out trio into a dictionary keyed by trio name.

In [3]:

# Stride used to thin out the concatenated training rows (every Nth row is
# kept). Presumably chosen to keep training time manageable — TODO confirm.
TRAIN_SUBSAMPLE_STEP = 100

# Column holding the label that the models are trained to predict.
TARGET_COL = 'justchild^GT'

# Maps each held-out trio name to the dictionary of arrays produced for it.
results_cum = {}

for test in trios:
    results = {}

    # Leave-one-trio-out: the training frame is every trio except `test`.
    # The frames are read once and combined with a single pd.concat call;
    # DataFrame.append was removed in pandas 2.0 and re-copied the growing
    # frame on every iteration.
    df_train = pd.concat(
        [pd.read_csv(train + '.csv') for train in trios if train != test]
    )

    # Positional slice: keep rows 0, N, 2N, ... of the combined frame.
    df_train = df_train[::TRAIN_SUBSAMPLE_STEP]
    df_test = pd.read_csv(test + '.csv')

    X_train = prepare_input(df_train, target_cols=[TARGET_COL])
    y_train = df_train[TARGET_COL].values
    X_test = prepare_input(df_test, target_cols=[TARGET_COL])
    y_test = df_test[TARGET_COL].values

    # Distinct contamination levels present in the test trio, ascending.
    contaminations = df_test['contamination'].values
    contamination_values = sorted(np.unique(contaminations))

    # Per-contamination boolean row masks and the matching test subsets.
    # Only `idx` is stored in `results`; X_tests / y_tests remain available
    # as notebook-level variables for interactive inspection.
    X_tests = {}
    y_tests = {}
    idx = {}

    for contamination in contamination_values:
        idx[contamination] = contaminations == contamination
        X_tests[contamination] = X_test[idx[contamination]]
        y_tests[contamination] = y_test[idx[contamination]]

    # Raw columns from the test frame, stored next to the model predictions.
    # NOTE(review): 'abortus^GT' is saved under 'preds_naive', i.e. it looks
    # like the uncorrected call is used as the baseline — confirm.
    results['dp'] = df_test['abortus^DP'].values
    results['y_test'] = y_test
    results['preds_naive'] = df_test['abortus^GT'].values
    results['idx'] = idx

    r = Recalibrator()
    print(test)  # progress marker ahead of the training log output
    r.train(X_train, y_train)

    # Predictions on the full held-out trio from each model exposed by the
    # trained Recalibrator.
    results['preds_meta'] = r.model_meta.predict(X_test)
    results['preds_lr'] = r.predict_lr(X_test)
    results['preds_xgb'] = r.predict_xgb(X_test)
    results['preds_ci'] = r.model_ci.predict(X_test)

    results_cum[test] = results


ajt
Training logistic regression
Training XGB
[0]	validation_0-merror:0.058095
Will train until validation_0-merror hasn't improved in 20 rounds.
[1]	validation_0-merror:0.050833
[2]	validation_0-merror:0.046561
[3]	validation_0-merror:0.049551
[4]	validation_0-merror:0.049551
[5]	validation_0-merror:0.050406
[6]	validation_0-merror:0.049551
[7]	validation_0-merror:0.049551
[8]	validation_0-merror:0.047416
[9]	validation_0-merror:0.046988
[10]	validation_0-merror:0.04528
[11]	validation_0-merror:0.044853
[12]	validation_0-merror:0.044425
[13]	validation_0-merror:0.044425
[14]	validation_0-merror:0.044425
[15]	validation_0-merror:0.044425
[16]	validation_0-merror:0.043998
[17]	validation_0-merror:0.044425
[18]	validation_0-merror:0.043571
[19]	validation_0-merror:0.043998
[20]	validation_0-merror:0.043998
[21]	validation_0-merror:0.042717
[22]	validation_0-merror:0.041435
[23]	validation_0-merror:0.041435
[24]	validation_0-merror:0.041435
[25]	validation_0-merror:0.040581
[26]	validatio

  lower_bound = contaminations - self.z*np.sqrt(contaminations*(1 - contaminations)/dps)
  upper_bound = contaminations + self.z*np.sqrt(contaminations*(1 - contaminations)/dps)
  idx_0 = idx_hetero & (ab_ad0/ab_ad1 > mo_ad0/mo_ad1)
  idx_0 = idx_hetero & (ab_ad0/ab_ad1 > mo_ad0/mo_ad1)
  idx_0 = idx_hetero & (ab_ad0/ab_ad1 > mo_ad0/mo_ad1)
  mo_share = (2*ab_ad1/dps)
  idx_0_confirmed = (mo_share > lower_bound) & (mo_share < upper_bound) & idx_0
  idx_0_confirmed = (mo_share > lower_bound) & (mo_share < upper_bound) & idx_0
  idx_1 = idx_hetero & (ab_ad0/ab_ad1 < mo_ad0/mo_ad1)
  idx_1 = idx_hetero & (ab_ad0/ab_ad1 < mo_ad0/mo_ad1)
  idx_1 = idx_hetero & (ab_ad0/ab_ad1 < mo_ad0/mo_ad1)
  mo_share = (2*ab_ad0/dps)
  idx_1_confirmed = (mo_share > lower_bound) & (mo_share < upper_bound) & idx_1
  idx_1_confirmed = (mo_share > lower_bound) & (mo_share < upper_bound) & idx_1
  if diff:
  if diff:


chd
Training logistic regression
Training XGB
[0]	validation_0-merror:0.045271
Will train until validation_0-merror hasn't improved in 20 rounds.
[1]	validation_0-merror:0.038804
[2]	validation_0-merror:0.038804
[3]	validation_0-merror:0.04042
[4]	validation_0-merror:0.038399
[5]	validation_0-merror:0.037187
[6]	validation_0-merror:0.037591
[7]	validation_0-merror:0.037591
[8]	validation_0-merror:0.037187
[9]	validation_0-merror:0.036783
[10]	validation_0-merror:0.036783
[11]	validation_0-merror:0.037187
[12]	validation_0-merror:0.036783
[13]	validation_0-merror:0.036378
[14]	validation_0-merror:0.035166
[15]	validation_0-merror:0.034357
[16]	validation_0-merror:0.034357
[17]	validation_0-merror:0.033145
[18]	validation_0-merror:0.033145
[19]	validation_0-merror:0.03274
[20]	validation_0-merror:0.032336
[21]	validation_0-merror:0.031932
[22]	validation_0-merror:0.030719
[23]	validation_0-merror:0.031124
[24]	validation_0-merror:0.030315
[25]	validation_0-merror:0.030315
[26]	validation

  idx_0 = idx_hetero & (ab_ad0/ab_ad1 > mo_ad0/mo_ad1)
  idx_1 = idx_hetero & (ab_ad0/ab_ad1 < mo_ad0/mo_ad1)
  if diff:
  if diff:


corpas
Training logistic regression
Training XGB
[0]	validation_0-merror:0.062475
Will train until validation_0-merror hasn't improved in 20 rounds.
[1]	validation_0-merror:0.05847
[2]	validation_0-merror:0.062875
[3]	validation_0-merror:0.060072
[4]	validation_0-merror:0.059672
[5]	validation_0-merror:0.05807
[6]	validation_0-merror:0.057669
[7]	validation_0-merror:0.054065
[8]	validation_0-merror:0.052062
[9]	validation_0-merror:0.051262
[10]	validation_0-merror:0.050461
[11]	validation_0-merror:0.048058
[12]	validation_0-merror:0.048859
[13]	validation_0-merror:0.04966
[14]	validation_0-merror:0.048458
[15]	validation_0-merror:0.048058
[16]	validation_0-merror:0.047257
[17]	validation_0-merror:0.046456
[18]	validation_0-merror:0.045655
[19]	validation_0-merror:0.046055
[20]	validation_0-merror:0.045254
[21]	validation_0-merror:0.044453
[22]	validation_0-merror:0.044053
[23]	validation_0-merror:0.044453
[24]	validation_0-merror:0.043252
[25]	validation_0-merror:0.042451
[26]	validati

  idx_0 = idx_hetero & (ab_ad0/ab_ad1 > mo_ad0/mo_ad1)
  idx_0 = idx_hetero & (ab_ad0/ab_ad1 > mo_ad0/mo_ad1)
  idx_0 = idx_hetero & (ab_ad0/ab_ad1 > mo_ad0/mo_ad1)
  idx_1 = idx_hetero & (ab_ad0/ab_ad1 < mo_ad0/mo_ad1)
  idx_1 = idx_hetero & (ab_ad0/ab_ad1 < mo_ad0/mo_ad1)
  idx_1 = idx_hetero & (ab_ad0/ab_ad1 < mo_ad0/mo_ad1)
  if diff:
  if diff:


yri
Training logistic regression
Training XGB
[0]	validation_0-merror:0.070698
Will train until validation_0-merror hasn't improved in 20 rounds.
[1]	validation_0-merror:0.066977
[2]	validation_0-merror:0.060465
[3]	validation_0-merror:0.058605
[4]	validation_0-merror:0.056279
[5]	validation_0-merror:0.057674
[6]	validation_0-merror:0.05907
[7]	validation_0-merror:0.057674
[8]	validation_0-merror:0.057674
[9]	validation_0-merror:0.057209
[10]	validation_0-merror:0.056744
[11]	validation_0-merror:0.055814
[12]	validation_0-merror:0.055349
[13]	validation_0-merror:0.054884
[14]	validation_0-merror:0.053488
[15]	validation_0-merror:0.054419
[16]	validation_0-merror:0.053488
[17]	validation_0-merror:0.052093
[18]	validation_0-merror:0.051628
[19]	validation_0-merror:0.051163
[20]	validation_0-merror:0.053023
[21]	validation_0-merror:0.053023
[22]	validation_0-merror:0.053023
[23]	validation_0-merror:0.052558
[24]	validation_0-merror:0.051628
[25]	validation_0-merror:0.050698
[26]	validatio

  lower_bound = contaminations - self.z*np.sqrt(contaminations*(1 - contaminations)/dps)
  upper_bound = contaminations + self.z*np.sqrt(contaminations*(1 - contaminations)/dps)
  idx_0 = idx_hetero & (ab_ad0/ab_ad1 > mo_ad0/mo_ad1)
  idx_0 = idx_hetero & (ab_ad0/ab_ad1 > mo_ad0/mo_ad1)
  idx_0 = idx_hetero & (ab_ad0/ab_ad1 > mo_ad0/mo_ad1)
  mo_share = (2*ab_ad1/dps)
  idx_0_confirmed = (mo_share > lower_bound) & (mo_share < upper_bound) & idx_0
  idx_0_confirmed = (mo_share > lower_bound) & (mo_share < upper_bound) & idx_0
  idx_1 = idx_hetero & (ab_ad0/ab_ad1 < mo_ad0/mo_ad1)
  idx_1 = idx_hetero & (ab_ad0/ab_ad1 < mo_ad0/mo_ad1)
  idx_1 = idx_hetero & (ab_ad0/ab_ad1 < mo_ad0/mo_ad1)
  mo_share = (2*ab_ad0/dps)
  idx_1_confirmed = (mo_share > lower_bound) & (mo_share < upper_bound) & idx_1
  idx_1_confirmed = (mo_share > lower_bound) & (mo_share < upper_bound) & idx_1
  if diff:
  if diff:


Serialize the results dictionary with pickle and save it to `results_1vA.pickle`.

In [4]:
import pickle

RESULTS_PATH = "results_1vA.pickle"

# Persist the accumulated per-trio results dictionary in binary pickle format.
with open(RESULTS_PATH, "wb") as out_file:
    pickle.dump(results_cum, out_file)