In [1]:
import pandas as pd
import numpy as np

from crossval import cvpreds_df_enhancer_folds, cvpreds_df_chunk_folds, ChunkCV
from models import DeepSeaSNP
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score
from sklearn.preprocessing.label import LabelBinarizer#



In [2]:
df = pd.read_csv('data/cagi5_df.csv')

In [10]:
cvdf_enh = cvpreds_df_enhancer_folds(df, DeepSeaSNP, model_kwargs={'classifier': 'lr',
                                                               'feattypes':['diff']})


binarizer = LabelBinarizer()
ybin = binarizer.fit_transform(cvdf_enh['class'])
print(roc_auc_score(ybin, binarizer.transform(cvdf_enh['cv_prediction']), average=None))

# lr diff: 0.6135, 0.542, 0.6000
# lr absdiff: 0.571, 0.587, 0.517
# gblinear diff: 0.612, 0.57, 0.58
print(binarizer.classes_)

binarizer = LabelBinarizer()
ybin = binarizer.fit_transform(cvdf_enh[cvdf_enh['base_element']=='F9']['class'])
print(roc_auc_score(ybin, binarizer.transform(cvdf_enh[cvdf_enh['base_element']=='F9']['cv_prediction']), average=None))

[ 0.61350038  0.54231944  0.60066257]
[-1  0  1]
[ 0.72258772  0.55522239  0.65946844]


In [5]:
cv_chunk = ChunkCV(df, DeepSeaSNP, model_kwargs={'classifier': 'lr',
                                                   'feattypes':['diff']})
cvdf_chunk = cv_chunk.get_cv_preds()
binarizer = LabelBinarizer()

ybin = binarizer.fit_transform(cvdf_chunk['class'])
print(binarizer.classes_)
print(roc_auc_score(ybin, binarizer.transform(cvdf_chunk['cv_prediction']), average=None))

# Note that by comparision with the cv setup above, the avg size of the training set is smaller

[1, 5, 0, 9, 2, 6, 8, 7, 10, 3, 4] 1 2 [[1, 5, 4], [0, 9], [2, 6], [8, 7], [10, 3]]
[-1  0  1]
[ 0.57094481  0.61399832  0.52084386]


In [6]:
cv_chunk.fold_dict

{'release_F9': [[3], [1], [2], [4], [0]],
 'release_GP1BB': [[2, 5], [0], [3], [1], [4]],
 'release_HBB': [[2], [0], [1], [], []],
 'release_HBG1': [[1], [2], [0], [3], []],
 'release_HNF4A': [[1], [0], [2], [3], []],
 'release_IRF4': [[4, 1], [0, 7], [3, 2], [5], [6]],
 'release_IRF6': [[2, 6], [8, 0], [1, 7], [3, 5], [4]],
 'release_LDLR': [[0], [4], [1], [3], [2]],
 'release_MSMB': [[2, 4], [0, 1], [7, 5], [6, 8], [3]],
 'release_MYCrs6983267': [[8, 1], [7, 0], [4, 2], [3, 5], [6]],
 'release_PKLR': [[6, 2], [5, 0], [1, 4], [3], [7]],
 'release_SORT1': [[6, 1], [8, 5], [2, 7], [0, 3], [4]],
 'release_TERT-GBM': [[4], [0], [1], [3], [2]],
 'release_TERT-HEK293T': [[2], [4], [0], [1], [3]],
 'release_ZFAND3': [[1, 5, 4], [0, 9], [2, 6], [8, 7], [10, 3]]}

In [9]:
binarizer = LabelBinarizer()
ybin = binarizer.fit_transform(cvdf_chunk[cvdf_chunk['regulatory_element']=='release_F9']['class'])
print(roc_auc_score(ybin, binarizer.transform(cvdf_chunk[cvdf_chunk['regulatory_element']=='release_F9']['cv_prediction']), average=None))

[ 0.72916667  0.54522739  0.63565891]


In [6]:
cvdf_chunk[cvdf_chunk['cv_prediction'].isnull()]

Unnamed: 0,#Chrom,Pos,Ref,Alt,Value,Confidence,class,regulatory_element,cv_prediction,is_break,chunk_length,is_start,chunk_id,is_train
4159,6,37775274,C,G,0.02,0.00,0,release_ZFAND3,,start,,True,0,True
4160,6,37775274,C,T,0.35,0.02,0,release_ZFAND3,,no,,False,0,True
4161,6,37775275,G,A,-0.09,0.02,0,release_ZFAND3,,no,,False,0,True
4162,6,37775275,G,C,0.00,0.00,0,release_ZFAND3,,no,,False,0,True
4163,6,37775275,G,T,-0.09,0.01,0,release_ZFAND3,,no,,False,0,True
4164,6,37775276,T,A,-0.01,0.00,0,release_ZFAND3,,no,,False,0,True
4165,6,37775276,T,C,0.00,0.00,0,release_ZFAND3,,no,,False,0,True
4166,6,37775276,T,G,-0.03,0.07,0,release_ZFAND3,,no,,False,0,True
4167,6,37775277,T,A,0.08,0.03,0,release_ZFAND3,,no,,False,0,True
4168,6,37775277,T,C,0.08,0.03,0,release_ZFAND3,,no,,False,0,True
