In [6]:
import pandas as pd
import numpy as np

import sys
sys.path.insert(0, '../python')
from preprocessing import load_suffixes, prepare_input
from recalibrator import Recalibrator
from utils import match
from confidence_intervals import confidence_intervals

In [7]:
# Identifiers of the trios processed by this notebook. The cells below read
# one '<trio>.csv' file per identifier from the working directory.
trios = [
    "ajt",
    "chd",
    "corpas",
    "yri",
]

# One-time pre-processing. Enable the loop below on the first run to build the
# per-trio CSV files, then disable it again so they are not recomputed on
# every execution.

# for trio in trios:
#     data_dir = '../data/' + trio + '/'
#     df = load_suffixes(data_dir)
#     df.to_csv(trio + '.csv', index=False)

For each trio, construct a training set from the remaining trios, predict on the held-out trio, and save the labels and predictions into a dictionary keyed by the held-out trio's name.

In [None]:

# Leave-one-trio-out evaluation: every trio is held out once as the test set
# while the recalibrator is trained on the concatenation of the other trios.
# results_cum maps the held-out trio name -> dict with the test labels, the
# per-contamination row masks and the predictions of each method.
results_cum = {}

for test in trios:
    results = {}

    # Build the training frame with a single concat. DataFrame.append was
    # removed in pandas 2.0 and re-copied the accumulated frame on every call.
    # Row indexes are kept as read from each CSV, as the chained append() did.
    df_train = pd.concat(
        [pd.read_csv(train + '.csv') for train in trios if train != test]
    )
    df_test = pd.read_csv(test + '.csv')

    # 'justchild^GT' is the target column: it is handed to prepare_input as
    # target_cols and used directly as the label vector.
    X_train = prepare_input(df_train, target_cols=['justchild^GT'])
    y_train = df_train['justchild^GT'].values
    X_test = prepare_input(df_test, target_cols=['justchild^GT'])
    y_test = df_test['justchild^GT'].values

    contaminations = df_test['contamination'].values
    # np.unique already returns the distinct values in sorted order.
    contamination_values = np.unique(contaminations)

    # Per-contamination views of the test set. Only the boolean masks (idx)
    # are stored in results; X_tests / y_tests stay available in the notebook
    # namespace for the most recently processed trio.
    X_tests = {}
    y_tests = {}
    idx = {}

    for contamination in contamination_values:
        mask = contaminations == contamination
        idx[contamination] = mask
        X_tests[contamination] = X_test[mask]
        y_tests[contamination] = y_test[mask]

    results['y_test'] = y_test
    # Baseline: the 'abortus^GT' column of the test frame taken as-is.
    results['preds_naive'] = df_test['abortus^GT'].values
    results['idx'] = idx

    # A fresh Recalibrator per fold so no state carries over between trios.
    r = Recalibrator()
    print(test)
    r.train(X_train, y_train)

    results['preds_lr'] = r.predict_lr(X_test)
    results['preds_xgb'] = r.predict_xgb(X_test)
    results['preds_ci'] = confidence_intervals(df_test)

    results_cum[test] = results


ajt




Training logistic regression




Training XGB
[0]	validation_0-merror:0.054629
Will train until validation_0-merror hasn't improved in 20 rounds.
[1]	validation_0-merror:0.052079
[2]	validation_0-merror:0.051673
[3]	validation_0-merror:0.051002
[4]	validation_0-merror:0.04958
[5]	validation_0-merror:0.049152
[6]	validation_0-merror:0.047837
[7]	validation_0-merror:0.046786
[8]	validation_0-merror:0.046094
[9]	validation_0-merror:0.04579
[10]	validation_0-merror:0.044693
[11]	validation_0-merror:0.0443
[12]	validation_0-merror:0.043578
[13]	validation_0-merror:0.043176
[14]	validation_0-merror:0.042766
[15]	validation_0-merror:0.042364
[16]	validation_0-merror:0.041924
[17]	validation_0-merror:0.041625
[18]	validation_0-merror:0.041121
[19]	validation_0-merror:0.040758
[20]	validation_0-merror:0.040446
[21]	validation_0-merror:0.040113
[22]	validation_0-merror:0.039763
[23]	validation_0-merror:0.03937
[24]	validation_0-merror:0.039126
[25]	validation_0-merror:0.038844
[26]	validation_0-merror:0.038575
[27]	validation_0

Serialize and save the dictionary

In [None]:
import pickle

# Persist the accumulated per-trio results as one binary pickle file in the
# current working directory.
with open("results_1vA.pickle", "wb") as results_file:
    pickle.dump(results_cum, results_file)