In [1]:
import pandas as pd
import numpy as np

from models import DeepSeaSNP, Conservation, SNPContext, MixedModel, EnhancerOneHot
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score
# LabelBinarizer is taken from the public package namespace: the private
# `sklearn.preprocessing.label` module path was deprecated in scikit-learn 0.22
# and removed in 0.24, while the public import works on old and new releases alike.
from sklearn.preprocessing import LabelBinarizer

from cagi5_utils import get_breakpoint_df
from crossval import cvpreds_df_enhancer_folds, cvpreds_df_chunk_folds, ChunkCV,\
                     df_cv_split

In [2]:
# Read the CAGI5 variant table; every later cell works from this frame.
df = pd.read_csv(filepath_or_buffer='data/cagi5_df.csv')

In [3]:
# Preview the first five rows of the variant table.
df.head(n=5)

Unnamed: 0,#Chrom,Pos,Ref,Alt,Value,Confidence,class,regulatory_element,phastCon,phyloP,GerpN,GerpRS
0,X,138612669,T,A,-0.17,0.07,0,release_F9,0.006,0.47,3.93,1.49
1,X,138612669,T,C,-0.26,0.24,-1,release_F9,0.006,0.47,3.93,1.49
2,X,138612669,T,G,0.34,0.05,0,release_F9,0.006,0.47,3.93,1.49
3,X,138612670,A,C,0.0,0.0,0,release_F9,0.008,0.47,3.93,0.337
4,X,138612670,A,G,0.22,0.2,1,release_F9,0.008,0.47,3.93,0.337


* TODO: set up some way of fixing the folds over which CV is done
* combine features from multiple layers with keras models
* try the gifford ensembl model

In [10]:
# Cross-validated predictions using the enhancer-fold helper, DeepSeaSNP model.
cvdf_enh = cvpreds_df_enhancer_folds(
    df,
    DeepSeaSNP,
    model_kwargs=dict(classifier='lr', feattypes=['diff']),
)

# Per-class scores over all rows: true and predicted labels are both turned
# into indicator columns by the same fitted binarizer.
binarizer = LabelBinarizer()
ybin = binarizer.fit_transform(cvdf_enh['class'])
ybin_pred = binarizer.transform(cvdf_enh['cv_prediction'])
print(roc_auc_score(ybin, ybin_pred, average=None))

# Results recorded from earlier runs (one value per class):
#   lr diff:       0.6135, 0.542, 0.6000
#   lr absdiff:    0.571, 0.587, 0.517
#   gblinear diff: 0.612, 0.57, 0.58
print(binarizer.classes_)

# Same scores restricted to the rows whose base element is F9.
f9_rows = cvdf_enh[cvdf_enh['base_element'] == 'F9']
binarizer = LabelBinarizer()
ybin = binarizer.fit_transform(f9_rows['class'])
ybin_pred = binarizer.transform(f9_rows['cv_prediction'])
print(roc_auc_score(ybin, ybin_pred, average=None))

[ 0.61350038  0.54231944  0.60066257]
[-1  0  1]
[ 0.72258772  0.55522239  0.65946844]


In [4]:
# Number of cross-validation folds requested from df_cv_split.
nfolds = 5
# Breakpoint frame built from the variant table by the cagi5_utils helper
# (presumably marks where contiguous chunks start — confirm against get_breakpoint_df).
breakpoint_df = get_breakpoint_df(df)
# Fold assignment passed to the ChunkCV runs below via `fold_dict=fold_dict`.
# NOTE(review): the fold assignments displayed in this notebook differ between
# recorded runs, so the split is presumably random and unseeded — confirm, and
# consider fixing a seed so results are reproducible.
fold_dict = df_cv_split(breakpoint_df, nfolds)

In [5]:
# Display the fold assignment: one entry per regulatory element, each holding
# one list of integer ids per fold (presumably chunk ids — confirm).
fold_dict

{'release_F9': [[3], [1], [2], [0], [4]],
 'release_GP1BB': [[5, 0], [4], [3], [2], [1]],
 'release_HBB': [[1], [2], [0], [], []],
 'release_HBG1': [[2], [0], [1], [3], []],
 'release_HNF4A': [[0], [3], [2], [1], []],
 'release_IRF4': [[7, 5], [4, 3], [0, 2], [6], [1]],
 'release_IRF6': [[1, 7], [2, 6], [4, 5], [3, 0], [8]],
 'release_LDLR': [[2], [0], [3], [1], [4]],
 'release_MSMB': [[4, 2], [0, 5], [7, 6], [1, 3], [8]],
 'release_MYCrs6983267': [[0, 1], [4, 7], [5, 8], [3, 6], [2]],
 'release_PKLR': [[7, 5], [3, 2], [0, 1], [4], [6]],
 'release_SORT1': [[1, 3], [6, 4], [0, 7], [8, 5], [2]],
 'release_TERT-GBM': [[2], [3], [4], [0], [1]],
 'release_TERT-HEK293T': [[1], [2], [0], [3], [4]],
 'release_ZFAND3': [[10, 8, 6], [2, 5], [0, 4], [7, 3], [9, 1]]}

In [6]:
# Chunk-based CV of DeepSeaSNP (classifier 'lr', 'diff' features) on the folds in `fold_dict`.
cv_chunk = ChunkCV(
    df,
    DeepSeaSNP,
    model_kwargs=dict(classifier='lr', feattypes=['diff']),
    fold_dict=fold_dict,
)
cvdf_chunk = cv_chunk.get_cv_preds()

# Fit the binarizer on the true classes, then score the binarized CV predictions per class.
binarizer = LabelBinarizer()
ybin = binarizer.fit_transform(cvdf_chunk['class'])
print(binarizer.classes_)
ybin_pred = binarizer.transform(cvdf_chunk['cv_prediction'])
print(roc_auc_score(ybin, ybin_pred, average=None))

# Note: compared with the CV setup above, the average training-set size is smaller here.

[-1  0  1]
[ 0.61678638  0.5421782   0.62033399]


In [7]:
# Preview the first rows of the CV prediction frame instead of rendering the
# whole table, which bloats the saved notebook output.
cvdf_chunk.head(10)

Unnamed: 0,#Chrom,Pos,Ref,Alt,Value,Confidence,class,regulatory_element,phastCon,phyloP,GerpN,GerpRS,cv_prediction,is_break,chunk_length,is_start,chunk_id,is_train
0,X,138612669,T,A,-0.17,0.07,0,release_F9,0.006,0.470,3.93,1.4900,-1.0,start,,True,0,True
1,X,138612669,T,C,-0.26,0.24,-1,release_F9,0.006,0.470,3.93,1.4900,-1.0,no,,False,0,True
2,X,138612669,T,G,0.34,0.05,0,release_F9,0.006,0.470,3.93,1.4900,-1.0,no,,False,0,True
3,X,138612670,A,C,0.00,0.00,0,release_F9,0.008,0.470,3.93,0.3370,-1.0,no,,False,0,True
4,X,138612670,A,G,0.22,0.20,1,release_F9,0.008,0.470,3.93,0.3370,-1.0,no,,False,0,True
5,X,138612670,A,T,0.12,0.03,0,release_F9,0.008,0.470,3.93,0.3370,-1.0,no,,False,0,True
6,X,138612671,T,A,0.06,0.02,0,release_F9,0.009,-0.404,3.93,-0.8070,1.0,no,,False,0,True
7,X,138612671,T,C,0.26,0.21,1,release_F9,0.009,-0.404,3.93,-0.8070,-1.0,no,,False,0,True
8,X,138612671,T,G,0.08,0.01,0,release_F9,0.009,-0.404,3.93,-0.8070,-1.0,no,,False,0,True
9,X,138612672,C,A,-0.13,0.05,0,release_F9,0.011,-0.251,3.93,0.6860,-1.0,no,,False,0,True


In [None]:
# Chunk-based CV of DeepSeaSNP (classifier 'xgb', 'diff' features) on the folds in `fold_dict`.
cv_chunk = ChunkCV(
    df,
    DeepSeaSNP,
    model_kwargs=dict(classifier='xgb', feattypes=['diff']),
    fold_dict=fold_dict,
)
cvdf_chunk = cv_chunk.get_cv_preds()

# Fit the binarizer on the true classes, then score the binarized CV predictions per class.
binarizer = LabelBinarizer()
ybin = binarizer.fit_transform(cvdf_chunk['class'])
print(binarizer.classes_)
ybin_pred = binarizer.transform(cvdf_chunk['cv_prediction'])
print(roc_auc_score(ybin, ybin_pred, average=None))

# Note: compared with the CV setup above, the average training-set size is smaller here.

  if diff:


In [4]:
# Chunk-based CV of the EnhancerOneHot model (classifier 'lr') on the folds in `fold_dict`.
cv_chunk = ChunkCV(
    df,
    EnhancerOneHot,
    model_kwargs=dict(classifier='lr'),
    fold_dict=fold_dict,
)
cvdf_chunk = cv_chunk.get_cv_preds()

# Fit the binarizer on the true classes, then score the binarized CV predictions per class.
binarizer = LabelBinarizer()
ybin = binarizer.fit_transform(cvdf_chunk['class'])
print(binarizer.classes_)
ybin_pred = binarizer.transform(cvdf_chunk['cv_prediction'])
print(roc_auc_score(ybin, ybin_pred, average=None))

# Note: compared with the CV setup above, the average training-set size is smaller here.

[-1  0  1]
[0.50709792 0.63668881 0.51966204]


In [None]:
# Chunk-based CV of a MixedModel combining DeepSeaSNP ('diff' features) with
# EnhancerOneHot (no extra kwargs), classifier 'xgb'.
mixed_kwargs = dict(
    models=[DeepSeaSNP, EnhancerOneHot],
    classifier='xgb',
    model_kwargs=[dict(feattypes=['diff']), dict()],
)
cv_chunk = ChunkCV(df, MixedModel, model_kwargs=mixed_kwargs, fold_dict=fold_dict)
cvdf_chunk = cv_chunk.get_cv_preds()

# Fit the binarizer on the true classes, then score the binarized CV predictions per class.
binarizer = LabelBinarizer()
ybin = binarizer.fit_transform(cvdf_chunk['class'])
print(binarizer.classes_)
ybin_pred = binarizer.transform(cvdf_chunk['cv_prediction'])
print(roc_auc_score(ybin, ybin_pred, average=None))

# Note: compared with the CV setup above, the average training-set size is smaller here.

[<class 'models.DeepSeaSNP'>, <class 'models.EnhancerOneHot'>]


  if diff:
  if diff:
  if diff:
  if diff:


[-1  0  1]
[0.68941863 0.67380134 0.65154412]


  if diff:


In [18]:
# Chunk-based CV of the SNPContext model (classifier 'xgb', context_size 2).
cv_chunk = ChunkCV(
    df,
    SNPContext,
    model_kwargs=dict(classifier='xgb', context_size=2),
    fold_dict=fold_dict,
)
cvdf_chunk = cv_chunk.get_cv_preds()

# Fit the binarizer on the true classes, then score the binarized CV predictions per class.
binarizer = LabelBinarizer()
ybin = binarizer.fit_transform(cvdf_chunk['class'])
print(binarizer.classes_)
ybin_pred = binarizer.transform(cvdf_chunk['cv_prediction'])
print(roc_auc_score(ybin, ybin_pred, average=None))

# Note: compared with the CV setup above, the average training-set size is smaller here.

  if diff:
  if diff:
  if diff:
  if diff:


[-1  0  1]
[0.49041311 0.51201328 0.55197833]


  if diff:


In [19]:
# Chunk-based CV of the SNPContext model (classifier 'xgb', context_size 4).
cv_chunk = ChunkCV(
    df,
    SNPContext,
    model_kwargs=dict(classifier='xgb', context_size=4),
    fold_dict=fold_dict,
)
cvdf_chunk = cv_chunk.get_cv_preds()

# Fit the binarizer on the true classes, then score the binarized CV predictions per class.
binarizer = LabelBinarizer()
ybin = binarizer.fit_transform(cvdf_chunk['class'])
print(binarizer.classes_)
ybin_pred = binarizer.transform(cvdf_chunk['cv_prediction'])
print(roc_auc_score(ybin, ybin_pred, average=None))

# Note: compared with the CV setup above, the average training-set size is smaller here.

  if diff:
  if diff:
  if diff:
  if diff:


[-1  0  1]
[0.48944876 0.47887845 0.51813505]


  if diff:


In [15]:
# Chunk-based CV of the Conservation model (classifier 'lr') on the folds in `fold_dict`.
cv_chunk = ChunkCV(
    df,
    Conservation,
    model_kwargs=dict(classifier='lr'),
    fold_dict=fold_dict,
)
cvdf_chunk = cv_chunk.get_cv_preds()

# Fit the binarizer on the true classes, then score the binarized CV predictions per class.
binarizer = LabelBinarizer()
ybin = binarizer.fit_transform(cvdf_chunk['class'])
print(binarizer.classes_)
ybin_pred = binarizer.transform(cvdf_chunk['cv_prediction'])
print(roc_auc_score(ybin, ybin_pred, average=None))

# Note: compared with the CV setup above, the average training-set size is smaller here.

[-1  0  1]
[0.51816882 0.5431898  0.51130111]


In [17]:
# Chunk-based CV of a MixedModel combining Conservation (GerpRS + phyloP scores),
# SNPContext (context_size 2) and DeepSeaSNP ('diff' features), classifier 'xgb'.
#
# `feattypes` is passed as a list, matching every other DeepSeaSNP configuration
# in this notebook. A bare string would make any `x in feattypes` check a
# substring test rather than a membership test.
# NOTE(review): the recorded output for this cell was produced with the string
# form 'diff'; confirm DeepSeaSNP treats ['diff'] identically.
cv_chunk = ChunkCV(df, MixedModel, model_kwargs={'models': [Conservation, SNPContext, DeepSeaSNP],
                                                 'model_kwargs': [{'scores': ['GerpRS', 'phyloP']},
                                                                  {'context_size': 2},
                                                                  {'feattypes': ['diff']}],
                                                 'classifier': 'xgb'},
                   fold_dict=fold_dict)
cvdf_chunk = cv_chunk.get_cv_preds()

# Fit the binarizer on the true classes, then score the binarized CV predictions per class.
binarizer = LabelBinarizer()
ybin = binarizer.fit_transform(cvdf_chunk['class'])
print(binarizer.classes_)
print(roc_auc_score(ybin, binarizer.transform(cvdf_chunk['cv_prediction']), average=None))

# Note: compared with the CV setup above, the average training-set size is smaller here.

[<class 'models.Conservation'>, <class 'models.SNPContext'>, <class 'models.DeepSeaSNP'>]


  if diff:
  if diff:
  if diff:
  if diff:


[-1  0  1]
[0.57120829 0.5682547  0.58844915]


  if diff:


In [16]:
# Chunk-based CV of a MixedModel combining Conservation (GerpRS + phyloP scores)
# and DeepSeaSNP ('diff' features), classifier 'xgb'.
#
# `feattypes` is passed as a list, matching every other DeepSeaSNP configuration
# in this notebook. A bare string would make any `x in feattypes` check a
# substring test rather than a membership test.
# NOTE(review): the recorded output for this cell was produced with the string
# form 'diff'; confirm DeepSeaSNP treats ['diff'] identically.
cv_chunk = ChunkCV(df, MixedModel, model_kwargs={'models': [Conservation, DeepSeaSNP],
                                                 'model_kwargs': [{'scores': ['GerpRS', 'phyloP']},
                                                                  {'feattypes': ['diff']}],
                                                 'classifier': 'xgb'},
                   fold_dict=fold_dict)
cvdf_chunk = cv_chunk.get_cv_preds()

# Fit the binarizer on the true classes, then score the binarized CV predictions per class.
binarizer = LabelBinarizer()
ybin = binarizer.fit_transform(cvdf_chunk['class'])
print(binarizer.classes_)
print(roc_auc_score(ybin, binarizer.transform(cvdf_chunk['cv_prediction']), average=None))


[<class 'models.Conservation'>, <class 'models.DeepSeaSNP'>]


  if diff:
  if diff:
  if diff:
  if diff:


[-1  0  1]
[0.67589589 0.6625439  0.62515695]


  if diff:


In [28]:
# Chunk-based CV of a MixedModel combining SNPContext (context_size 2, 'median'
# raw aggregation) and DeepSeaSNP ('diff' features), classifier 'xgb'.
#
# `feattypes` is passed as a list, matching every other DeepSeaSNP configuration
# in this notebook. A bare string would make any `x in feattypes` check a
# substring test rather than a membership test.
# NOTE(review): the recorded output for this cell was produced with the string
# form 'diff'; confirm DeepSeaSNP treats ['diff'] identically.
cv_chunk = ChunkCV(df, MixedModel, model_kwargs={'models': [SNPContext, DeepSeaSNP],
                                                 'model_kwargs': [{'context_size': 2,
                                                                   'raw_aggs': ['median']},
                                                                  {'feattypes': ['diff']}],
                                                 'classifier': 'xgb'},
                   fold_dict=fold_dict)
cvdf_chunk = cv_chunk.get_cv_preds()

# Fit the binarizer on the true classes, then score the binarized CV predictions per class.
binarizer = LabelBinarizer()
ybin = binarizer.fit_transform(cvdf_chunk['class'])
print(binarizer.classes_)
print(roc_auc_score(ybin, binarizer.transform(cvdf_chunk['cv_prediction']), average=None))

# Note: compared with the CV setup above, the average training-set size is smaller here.

[<class 'models.SNPContext'>, <class 'models.DeepSeaSNP'>]
[-1  0  1]
[ 0.65167265  0.6335013   0.60240654]


In [6]:
# Display the fold assignment held by the current ChunkCV instance.
# NOTE(review): the recorded output differs from the `fold_dict` displayed
# earlier in the notebook, presumably because it comes from a different kernel
# session with a fresh (unseeded) split — confirm.
cv_chunk.fold_dict

{'release_F9': [[3], [1], [2], [4], [0]],
 'release_GP1BB': [[2, 5], [0], [3], [1], [4]],
 'release_HBB': [[2], [0], [1], [], []],
 'release_HBG1': [[1], [2], [0], [3], []],
 'release_HNF4A': [[1], [0], [2], [3], []],
 'release_IRF4': [[4, 1], [0, 7], [3, 2], [5], [6]],
 'release_IRF6': [[2, 6], [8, 0], [1, 7], [3, 5], [4]],
 'release_LDLR': [[0], [4], [1], [3], [2]],
 'release_MSMB': [[2, 4], [0, 1], [7, 5], [6, 8], [3]],
 'release_MYCrs6983267': [[8, 1], [7, 0], [4, 2], [3, 5], [6]],
 'release_PKLR': [[6, 2], [5, 0], [1, 4], [3], [7]],
 'release_SORT1': [[6, 1], [8, 5], [2, 7], [0, 3], [4]],
 'release_TERT-GBM': [[4], [0], [1], [3], [2]],
 'release_TERT-HEK293T': [[2], [4], [0], [1], [3]],
 'release_ZFAND3': [[1, 5, 4], [0, 9], [2, 6], [8, 7], [10, 3]]}

In [9]:
# Per-class scores restricted to the rows of the release_F9 regulatory element.
f9_rows = cvdf_chunk[cvdf_chunk['regulatory_element'] == 'release_F9']
binarizer = LabelBinarizer()
ybin = binarizer.fit_transform(f9_rows['class'])
ybin_pred = binarizer.transform(f9_rows['cv_prediction'])
print(roc_auc_score(ybin, ybin_pred, average=None))

[ 0.72916667  0.54522739  0.63565891]


In [6]:
# Show the rows for which no cross-validated prediction is present.
missing_pred = cvdf_chunk['cv_prediction'].isnull()
cvdf_chunk[missing_pred]

Unnamed: 0,#Chrom,Pos,Ref,Alt,Value,Confidence,class,regulatory_element,cv_prediction,is_break,chunk_length,is_start,chunk_id,is_train
4159,6,37775274,C,G,0.02,0.00,0,release_ZFAND3,,start,,True,0,True
4160,6,37775274,C,T,0.35,0.02,0,release_ZFAND3,,no,,False,0,True
4161,6,37775275,G,A,-0.09,0.02,0,release_ZFAND3,,no,,False,0,True
4162,6,37775275,G,C,0.00,0.00,0,release_ZFAND3,,no,,False,0,True
4163,6,37775275,G,T,-0.09,0.01,0,release_ZFAND3,,no,,False,0,True
4164,6,37775276,T,A,-0.01,0.00,0,release_ZFAND3,,no,,False,0,True
4165,6,37775276,T,C,0.00,0.00,0,release_ZFAND3,,no,,False,0,True
4166,6,37775276,T,G,-0.03,0.07,0,release_ZFAND3,,no,,False,0,True
4167,6,37775277,T,A,0.08,0.03,0,release_ZFAND3,,no,,False,0,True
4168,6,37775277,T,C,0.08,0.03,0,release_ZFAND3,,no,,False,0,True
