In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score

from preprocessing import load_suffixes, prepare_input
from recalibrator import Recalibrator
from utils import match
from confidence_intervals import confidence_intervals

In [2]:
trios = ["ajt", "chd", "corpas", "yri"]

# Pre-processing: build one CSV per trio from the raw files under
# ../data/<trio>/ and write it to <trio>.csv in the working directory.
#
# This step is slow (roughly one to two minutes per trio in the recorded
# run), so a trio whose CSV already exists is skipped instead of relying on
# manually commenting this loop in and out. Delete <trio>.csv to force that
# trio to be rebuilt, e.g. after the raw data or load_suffixes changes.
for trio in trios:
    csv_path = trio + '.csv'
    if os.path.exists(csv_path):
        continue

    data_dir = '../data/' + trio + '/'
    df = load_suffixes(data_dir)
    df.to_csv(csv_path, index=False)

For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  df = df.convert_objects(convert_numeric=True)
100%|██████████| 12/12 [01:24<00:00,  7.07s/it]
100%|██████████| 12/12 [01:14<00:00,  6.24s/it]
100%|██████████| 12/12 [01:07<00:00,  5.64s/it]
100%|██████████| 12/12 [01:42<00:00,  8.55s/it]


For each trio, construct a training set from the remaining trios and save the predictions into a dictionary.

In [3]:

# Leave-one-trio-out evaluation: each trio in turn is used as the test set
# while the remaining trios together form the training set.
#
# results_cum maps trio name -> dict with the keys
#   'y_test'      : values of the 'justchild^GT' column of the test trio
#   'preds_naive' : values of the 'abortus^GT' column of the test trio
#   'idx'         : dict {contamination value -> boolean mask over test rows}
#   'preds_lr'    : output of Recalibrator.predict_lr on the test features
#   'preds_xgb'   : output of Recalibrator.predict_xgb on the test features
#   'preds_ci'    : output of confidence_intervals(df_test)
results_cum = {}

for test in trios:
    results = {}

    # Read every other trio and concatenate them in a single call.
    # DataFrame.append (used here previously) was deprecated in pandas 1.4
    # and removed in 2.0, and it copied the growing frame on every iteration.
    # As before, each frame keeps its own row index (no ignore_index).
    df_train = pd.concat(
        [pd.read_csv(train + '.csv') for train in trios if train != test]
    )

    df_test = pd.read_csv(test + '.csv')

    X_train = prepare_input(df_train, target_cols=['justchild^GT'])
    y_train = df_train['justchild^GT'].values
    X_test = prepare_input(df_test, target_cols=['justchild^GT'])
    y_test = df_test['justchild^GT'].values

    # Distinct contamination levels present in the test trio, ascending.
    contaminations = df_test['contamination'].values
    contamination_values = sorted(np.unique(contaminations))

    # Per-contamination views of the test set.
    # NOTE(review): assumes X_test supports boolean-mask indexing (e.g. a
    # numpy array) -- confirm against prepare_input.
    X_tests = {}
    y_tests = {}
    idx = {}

    for contamination in contamination_values:
        mask = contaminations == contamination
        idx[contamination] = mask
        X_tests[contamination] = X_test[mask]
        y_tests[contamination] = y_test[mask]

    results['y_test'] = y_test
    results['preds_naive'] = df_test['abortus^GT'].values
    results['idx'] = idx

    r = Recalibrator()
    print(test)  # label the training log that follows with the held-out trio
    r.train(X_train, y_train)

    results['preds_lr'] = r.predict_lr(X_test)
    results['preds_xgb'] = r.predict_xgb(X_test)
    results['preds_ci'] = confidence_intervals(df_test)

    results_cum[test] = results


ajt
Training logistic regression




Training XGB
[0]	validation_0-merror:0.057969
Will train until validation_0-merror hasn't improved in 20 rounds.
[1]	validation_0-merror:0.053868
[2]	validation_0-merror:0.052928
[3]	validation_0-merror:0.052629
[4]	validation_0-merror:0.052757
[5]	validation_0-merror:0.051177
[6]	validation_0-merror:0.051006
[7]	validation_0-merror:0.049212
[8]	validation_0-merror:0.047119
[9]	validation_0-merror:0.046734
[10]	validation_0-merror:0.045923
[11]	validation_0-merror:0.045282
[12]	validation_0-merror:0.044556
[13]	validation_0-merror:0.043829
[14]	validation_0-merror:0.043915
[15]	validation_0-merror:0.043274
[16]	validation_0-merror:0.042889
[17]	validation_0-merror:0.042505
[18]	validation_0-merror:0.041992
[19]	validation_0-merror:0.04148
[20]	validation_0-merror:0.041309
[21]	validation_0-merror:0.040967
[22]	validation_0-merror:0.041181
[23]	validation_0-merror:0.040754
[24]	validation_0-merror:0.040198
[25]	validation_0-merror:0.039728
[26]	validation_0-merror:0.039643
[27]	validati

  if diff:
  lower_bound = contaminations - z*np.sqrt(contaminations*(1 - contaminations)/df_test[sample_name + '^DP'].values)
  upper_bound = contaminations + z*np.sqrt(contaminations*(1 - contaminations)/df_test[sample_name + '^DP'].values)


chd
Training logistic regression




Training XGB
[0]	validation_0-merror:0.033308
Will train until validation_0-merror hasn't improved in 20 rounds.
[1]	validation_0-merror:0.033833
[2]	validation_0-merror:0.031529
[3]	validation_0-merror:0.029993
[4]	validation_0-merror:0.030317
[5]	validation_0-merror:0.029346
[6]	validation_0-merror:0.029063
[7]	validation_0-merror:0.029063
[8]	validation_0-merror:0.02874
[9]	validation_0-merror:0.028376
[10]	validation_0-merror:0.028498
[11]	validation_0-merror:0.028417
[12]	validation_0-merror:0.028174
[13]	validation_0-merror:0.027366
[14]	validation_0-merror:0.027042
[15]	validation_0-merror:0.027083
[16]	validation_0-merror:0.026921
[17]	validation_0-merror:0.026719
[18]	validation_0-merror:0.026557
[19]	validation_0-merror:0.026193
[20]	validation_0-merror:0.025911
[21]	validation_0-merror:0.025547
[22]	validation_0-merror:0.025506
[23]	validation_0-merror:0.025183
[24]	validation_0-merror:0.024819
[25]	validation_0-merror:0.02494
[26]	validation_0-merror:0.024738
[27]	validatio

  if diff:


corpas
Training logistic regression




Training XGB
[0]	validation_0-merror:0.046498
Will train until validation_0-merror hasn't improved in 20 rounds.
[1]	validation_0-merror:0.046898
[2]	validation_0-merror:0.047058
[3]	validation_0-merror:0.044856
[4]	validation_0-merror:0.044655
[5]	validation_0-merror:0.043334
[6]	validation_0-merror:0.043414
[7]	validation_0-merror:0.042172
[8]	validation_0-merror:0.041812
[9]	validation_0-merror:0.041852
[10]	validation_0-merror:0.040811
[11]	validation_0-merror:0.04049
[12]	validation_0-merror:0.039489
[13]	validation_0-merror:0.038808
[14]	validation_0-merror:0.037847
[15]	validation_0-merror:0.037286
[16]	validation_0-merror:0.036926
[17]	validation_0-merror:0.036605
[18]	validation_0-merror:0.036405
[19]	validation_0-merror:0.035764
[20]	validation_0-merror:0.035484
[21]	validation_0-merror:0.035043
[22]	validation_0-merror:0.034763
[23]	validation_0-merror:0.034523
[24]	validation_0-merror:0.034242
[25]	validation_0-merror:0.033522
[26]	validation_0-merror:0.033121
[27]	validati

  if diff:


yri
Training logistic regression




Training XGB
[0]	validation_0-merror:0.059554
Will train until validation_0-merror hasn't improved in 20 rounds.
[1]	validation_0-merror:0.058438
[2]	validation_0-merror:0.057647
[3]	validation_0-merror:0.056949
[4]	validation_0-merror:0.055693
[5]	validation_0-merror:0.054902
[6]	validation_0-merror:0.05425
[7]	validation_0-merror:0.052947
[8]	validation_0-merror:0.051877
[9]	validation_0-merror:0.050761
[10]	validation_0-merror:0.049504
[11]	validation_0-merror:0.047271
[12]	validation_0-merror:0.047178
[13]	validation_0-merror:0.046062
[14]	validation_0-merror:0.046155
[15]	validation_0-merror:0.045736
[16]	validation_0-merror:0.045131
[17]	validation_0-merror:0.044759
[18]	validation_0-merror:0.044154
[19]	validation_0-merror:0.043782
[20]	validation_0-merror:0.043363
[21]	validation_0-merror:0.043177
[22]	validation_0-merror:0.042758
[23]	validation_0-merror:0.042293
[24]	validation_0-merror:0.042619
[25]	validation_0-merror:0.042293
[26]	validation_0-merror:0.042014
[27]	validati

  if diff:
  lower_bound = contaminations - z*np.sqrt(contaminations*(1 - contaminations)/df_test[sample_name + '^DP'].values)
  upper_bound = contaminations + z*np.sqrt(contaminations*(1 - contaminations)/df_test[sample_name + '^DP'].values)


In [4]:
import pickle

# Serialize the per-trio results dictionary to results_1vA.pickle in the
# current working directory (binary mode, default pickle protocol).
with open("results_1vA.pickle", "wb") as out_file:
    pickle.dump(results_cum, out_file)