In [1]:
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from tqdm import tqdm
from sklearn.metrics import accuracy_score

from preprocessing import *
from recalibrator import Recalibrator
from utils import match
from confidence_intervals import confidence_intervals

In [2]:
# Identifiers of the trios used throughout the notebook; each one maps to
# a raw data directory '../data/<trio>/' and a cached '<trio>.csv'.
trios = ["ajt", "chd", "corpas", "yri"]

# Pre-processing. Each trio's raw data is converted to a CSV once; trios
# whose CSV already exists are skipped, so this cell is safe to re-run and
# no longer has to be commented in and out by hand.
for trio in trios:
    csv_path = trio + '.csv'
    if os.path.exists(csv_path):
        continue
    data_dir = '../data/' + trio + '/'
    # NOTE(review): load_suffixes is presumably provided by
    # `from preprocessing import *` -- confirm.
    df = load_suffixes(data_dir)
    df.to_csv(csv_path)

In [3]:

# Leave-one-trio-out evaluation: for every trio, fit the models on the
# other trios and predict on the held-out one. Per-trio outputs are
# collected in results_cum, keyed by the held-out trio name.
results_cum = {}

for test in trios:
    results = {}

    # Build the training frame from every trio except the held-out one.
    # A single pd.concat replaces repeated DataFrame.append calls, which
    # were removed in pandas 2.0 and re-copied the frame on every pass.
    df_train = pd.concat(
        [pd.read_csv(train + '.csv') for train in trios if train != test]
    )
    df_test = pd.read_csv(test + '.csv')

    # Columns excluded from the feature matrix: positional/index columns
    # plus everything selected by match("GT", pos=-1).
    # NOTE(review): presumably match() picks the genotype ("...^GT")
    # columns -- confirm against utils.match.
    gt_cols = list(filter(match("GT", pos=-1), df_train.columns.values))
    to_drop = list(set(['#CHROM', 'POS', 'Unnamed: 0', 'Unnamed: 1'] + gt_cols))

    # 'justchild^GT' is the prediction target, so it is always removed
    # from the features.
    feature_drop = to_drop + ["justchild^GT"]
    X_train = df_train.drop(feature_drop, axis=1).values
    y_train = df_train['justchild^GT'].values
    X_test = df_test.drop(feature_drop, axis=1).values
    y_test = df_test['justchild^GT'].values

    # Boolean row masks (and the matching test subsets) for each distinct
    # value of the 'contamination' column of the held-out trio.
    contaminations = df_test['contamination'].values
    contamination_values = sorted(np.unique(contaminations))

    X_tests = {}
    y_tests = {}
    idx = {}

    for contamination in contamination_values:
        idx[contamination] = contaminations == contamination
        X_tests[contamination] = X_test[idx[contamination]]
        y_tests[contamination] = y_test[idx[contamination]]

    results['y_test'] = y_test
    # Baseline: the 'abortus^GT' column taken as-is as the prediction.
    results['preds_naive'] = df_test['abortus^GT'].values
    results['idx'] = idx

    r = Recalibrator()
    print(test)
    print("Training logistic regression")
    r.model_lr.fit(X_train, y_train)
    print("Training XGBoost")
    # NOTE(review): early stopping is validated on the held-out test fold,
    # so the number of boosting rounds is selected using test labels and
    # the reported XGBoost scores are optimistically biased. Consider
    # carving a validation split out of the training trios instead.
    r.model_xgb.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=20)
    results['preds_lr'] = r.model_lr.predict(X_test)
    results['preds_xgb'] = r.model_xgb.predict(X_test)
    results['preds_ci'] = confidence_intervals(df_test)

    results_cum[test] = results


damn
son
ajt
Training logistic regression
Training XGBoost
[0]	validation_0-merror:0.037462
Will train until validation_0-merror hasn't improved in 20 rounds.
[1]	validation_0-merror:0.036514
[2]	validation_0-merror:0.034745
[3]	validation_0-merror:0.035958
[4]	validation_0-merror:0.035259
[5]	validation_0-merror:0.034905
[6]	validation_0-merror:0.034678
[7]	validation_0-merror:0.033374
[8]	validation_0-merror:0.032225
[9]	validation_0-merror:0.031824
[10]	validation_0-merror:0.031339
[11]	validation_0-merror:0.030291
[12]	validation_0-merror:0.029919
[13]	validation_0-merror:0.028775
[14]	validation_0-merror:0.026813
[15]	validation_0-merror:0.027515
[16]	validation_0-merror:0.025758
[17]	validation_0-merror:0.025268
[18]	validation_0-merror:0.024909
[19]	validation_0-merror:0.024665
[20]	validation_0-merror:0.024234
[21]	validation_0-merror:0.023496
[22]	validation_0-merror:0.0231
[23]	validation_0-merror:0.022687
[24]	validation_0-merror:0.02249
[25]	validation_0-merror:0.022423
[26

  if diff:
  lower_bound = contaminations - z*np.sqrt(contaminations*(1 - contaminations)/df_test[sample_name + '^DP'].values)
  upper_bound = contaminations + z*np.sqrt(contaminations*(1 - contaminations)/df_test[sample_name + '^DP'].values)


damn
son
chd
Training logistic regression
Training XGBoost
[0]	validation_0-merror:0.074899
Will train until validation_0-merror hasn't improved in 20 rounds.
[1]	validation_0-merror:0.076585
[2]	validation_0-merror:0.074657
[3]	validation_0-merror:0.073992
[4]	validation_0-merror:0.070179
[5]	validation_0-merror:0.070008
[6]	validation_0-merror:0.069894
[7]	validation_0-merror:0.069311
[8]	validation_0-merror:0.069379
[9]	validation_0-merror:0.068661
[10]	validation_0-merror:0.067552
[11]	validation_0-merror:0.067148
[12]	validation_0-merror:0.066884
[13]	validation_0-merror:0.066342
[14]	validation_0-merror:0.065957
[15]	validation_0-merror:0.065354
[16]	validation_0-merror:0.065189
[17]	validation_0-merror:0.064909
[18]	validation_0-merror:0.0645
[19]	validation_0-merror:0.064291
[20]	validation_0-merror:0.064015
[21]	validation_0-merror:0.063835
[22]	validation_0-merror:0.063556
[23]	validation_0-merror:0.063282
[24]	validation_0-merror:0.063041
[25]	validation_0-merror:0.0628
[26]

[235]	validation_0-merror:0.05602
[236]	validation_0-merror:0.056
[237]	validation_0-merror:0.056014
[238]	validation_0-merror:0.056023
[239]	validation_0-merror:0.056032
[240]	validation_0-merror:0.056056
[241]	validation_0-merror:0.056059
[242]	validation_0-merror:0.056026
[243]	validation_0-merror:0.056032
[244]	validation_0-merror:0.056032
[245]	validation_0-merror:0.056029
[246]	validation_0-merror:0.055985
[247]	validation_0-merror:0.056009
[248]	validation_0-merror:0.056026
[249]	validation_0-merror:0.056029
[250]	validation_0-merror:0.056056
[251]	validation_0-merror:0.056061
[252]	validation_0-merror:0.056064
[253]	validation_0-merror:0.05607
[254]	validation_0-merror:0.056067
[255]	validation_0-merror:0.056064
[256]	validation_0-merror:0.05605
[257]	validation_0-merror:0.056053
[258]	validation_0-merror:0.056082
[259]	validation_0-merror:0.056082
[260]	validation_0-merror:0.056091
[261]	validation_0-merror:0.056088
[262]	validation_0-merror:0.056109
[263]	validation_0-merror:

  if diff:


damn
son
corpas
Training logistic regression
Training XGBoost
[0]	validation_0-merror:0.063667
Will train until validation_0-merror hasn't improved in 20 rounds.
[1]	validation_0-merror:0.060473
[2]	validation_0-merror:0.060293
[3]	validation_0-merror:0.059014
[4]	validation_0-merror:0.058752
[5]	validation_0-merror:0.05701
[6]	validation_0-merror:0.056264
[7]	validation_0-merror:0.055396
[8]	validation_0-merror:0.05349
[9]	validation_0-merror:0.053734
[10]	validation_0-merror:0.052366
[11]	validation_0-merror:0.051776
[12]	validation_0-merror:0.05068
[13]	validation_0-merror:0.050019
[14]	validation_0-merror:0.049014
[15]	validation_0-merror:0.048344
[16]	validation_0-merror:0.047921
[17]	validation_0-merror:0.04705
[18]	validation_0-merror:0.046547
[19]	validation_0-merror:0.046359
[20]	validation_0-merror:0.045634
[21]	validation_0-merror:0.044967
[22]	validation_0-merror:0.044367
[23]	validation_0-merror:0.043734
[24]	validation_0-merror:0.043073
[25]	validation_0-merror:0.042756
[

  if diff:


damn
son
yri
Training logistic regression
Training XGBoost
[0]	validation_0-merror:0.064491
Will train until validation_0-merror hasn't improved in 20 rounds.
[1]	validation_0-merror:0.062706
[2]	validation_0-merror:0.062318
[3]	validation_0-merror:0.06055
[4]	validation_0-merror:0.059001
[5]	validation_0-merror:0.058642
[6]	validation_0-merror:0.058449
[7]	validation_0-merror:0.05359
[8]	validation_0-merror:0.051836
[9]	validation_0-merror:0.050304
[10]	validation_0-merror:0.048739
[11]	validation_0-merror:0.046979
[12]	validation_0-merror:0.046083
[13]	validation_0-merror:0.04349
[14]	validation_0-merror:0.041348
[15]	validation_0-merror:0.039691
[16]	validation_0-merror:0.038705
[17]	validation_0-merror:0.036742
[18]	validation_0-merror:0.035867
[19]	validation_0-merror:0.035226
[20]	validation_0-merror:0.033912
[21]	validation_0-merror:0.03352
[22]	validation_0-merror:0.03296
[23]	validation_0-merror:0.032323
[24]	validation_0-merror:0.031726
[25]	validation_0-merror:0.031204
[26]	

  if diff:


In [4]:
import pickle

# Serialize the accumulated per-trio results dictionary to disk.
results_path = "results_1vA_new.pickle"
with open(results_path, mode="wb") as out_file:
    pickle.dump(results_cum, out_file)