In [1]:
import pandas as pd
import numpy as np

from models import DeepSeaSNP, Conservation, SNPContext, MixedModel, EnhancerOneHot
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score
# Import from the public package path: the private `sklearn.preprocessing.label`
# module was deprecated in scikit-learn 0.22 and later removed, whereas
# `sklearn.preprocessing.LabelBinarizer` is the documented import location.
from sklearn.preprocessing import LabelBinarizer

from cagi5_utils import get_breakpoint_df
from crossval import ChunkCV, df_cv_split, cvpreds_df_enhancer_folds
# from crossval import cvpreds_df_chunk_folds

## Load data

Read the training data with added conservation information.

In [22]:
# Training data (with conservation scores added), read from a path relative to
# the notebook's working directory.
df = pd.read_csv('data/cagi5_df.csv')
# Preview the first rows.
df.head()

Unnamed: 0,#Chrom,Pos,Ref,Alt,Value,Confidence,class,regulatory_element,phastCon,phyloP,GerpN,GerpRS
0,X,138612669,T,A,-0.17,0.07,0,release_F9,0.006,0.47,3.93,1.49
1,X,138612669,T,C,-0.26,0.24,-1,release_F9,0.006,0.47,3.93,1.49
2,X,138612669,T,G,0.34,0.05,0,release_F9,0.006,0.47,3.93,1.49
3,X,138612670,A,C,0.0,0.0,0,release_F9,0.008,0.47,3.93,0.337
4,X,138612670,A,G,0.22,0.2,1,release_F9,0.008,0.47,3.93,0.337


TODO:
* set up some way of fixing the folds over which cv is done
* combine features from multiple layers with keras models
* try the gifford ensembl model

Fit a logistic regression model to the training data for each base element independently.

In [23]:
# Cross-validated predictions from DeepSeaSNP with a logistic-regression ('lr')
# classifier on the 'diff' features, produced by cvpreds_df_enhancer_folds.
enhancer_model_kwargs = {'classifier': 'lr', 'feattypes': ['diff']}
cvdf_enh = cvpreds_df_enhancer_folds(df, DeepSeaSNP, model_kwargs=enhancer_model_kwargs)

# Spot-check six randomly chosen rows of the prediction frame.
cvdf_enh.sample(6)

Unnamed: 0,#Chrom,Pos,Ref,Alt,Value,Confidence,class,regulatory_element,phastCon,phyloP,GerpN,GerpRS,cv_prediction,base_element
378,22,19710947,C,G,-0.05,0.01,0,release_GP1BB,0.002,0.557,3.15,2.06,-1.0,GP1BB
803,11,5271260,A,C,-0.07,0.0,0,release_HBG1,0.593,0.255,2.27,-1.93,1.0,HBG1
980,20,42984379,T,A,-0.01,0.0,0,release_HNF4A,0.576,0.53,3.39,-1.3,1.0,HNF4A
3806,5,1295152,A,C,0.18,0.02,0,release_TERT-GBM,0.0,-0.711,2.07,-4.15,-1.0,TERT
936,20,42984348,G,C,-0.02,0.01,0,release_HNF4A,0.78,0.651,5.16,4.13,-1.0,HNF4A
4093,5,1295277,G,A,-0.1,0.04,0,release_TERT-HEK293T,0.011,0.281,1.16,1.16,-1.0,TERT


Binarize the labels and calculate a one-vs-rest AUC score for each class (-1, 0, 1)

In [4]:
# Per-class (one-vs-rest) AUC over all rows; scores come back in the order of
# binarizer.classes_, which is printed below.
binarizer = LabelBinarizer()
ybin = binarizer.fit_transform(cvdf_enh['class'])
pred_bin = binarizer.transform(cvdf_enh['cv_prediction'])
print(roc_auc_score(ybin, pred_bin, average=None))

# Scores noted from runs with different settings (classifier / feature type):
# lr diff: 0.6135, 0.542, 0.6000
# lr absdiff: 0.571, 0.587, 0.517
# gblinear diff: 0.612, 0.57, 0.58
print(binarizer.classes_)

# Same evaluation restricted to the F9 base element, with a freshly fitted binarizer.
f9_rows = cvdf_enh[cvdf_enh['base_element'] == 'F9']
binarizer = LabelBinarizer()
ybin = binarizer.fit_transform(f9_rows['class'])
f9_pred_bin = binarizer.transform(f9_rows['cv_prediction'])
print(roc_auc_score(ybin, f9_pred_bin, average=None))

[0.61350038 0.54231944 0.60066257]
[-1  0  1]
[0.72258772 0.55522239 0.65946844]


## Create the cross-validation folds

Calculate the breakpoints inside each regulatory element.

In [36]:
breakpoint_df = get_breakpoint_df(df)

# Look at breakpoints.
# Only the final expression of a cell is rendered, so the F9 view has to be
# displayed explicitly; a bare expression in the middle of a cell is evaluated
# and then discarded.
display(breakpoint_df[breakpoint_df['regulatory_element'] == 'release_F9'].head())

# Rows flagged as a break (is_break is something other than 'no').
breakpoint_df[breakpoint_df['is_break'] != 'no'].head(15)

Unnamed: 0,#Chrom,Pos,Ref,Alt,Value,Confidence,class,regulatory_element,phastCon,phyloP,GerpN,GerpRS,cv_prediction,base_element,is_break,chunk_length,is_start,chunk_id
0,X,138612669,T,A,-0.17,0.07,0,release_F9,0.006,0.47,3.93,1.49,-1.0,F9,start,,True,0.0
47,X,138612684,G,T,-0.02,0.0,0,release_F9,0.048,-0.233,3.45,-2.01,-1.0,F9,end,16.0,False,0.0
48,X,138612701,T,A,-0.36,0.37,-1,release_F9,0.009,0.47,3.27,1.92,-1.0,F9,start,,True,1.0
95,X,138612716,G,T,-0.15,0.04,0,release_F9,0.055,0.581,3.32,0.431,-1.0,F9,end,16.0,False,1.0
96,X,138612765,C,A,0.07,0.01,0,release_F9,0.42,-0.304,4.22,0.299,0.0,F9,start,,True,2.0
143,X,138612780,T,G,0.18,0.02,0,release_F9,0.525,0.47,4.69,4.69,1.0,F9,end,16.0,False,2.0
144,X,138612877,G,A,-0.09,0.05,0,release_F9,0.864,0.581,5.08,5.08,1.0,F9,start,,True,3.0
190,X,138612892,C,T,0.12,0.01,0,release_F9,0.508,0.581,5.08,1.79,-1.0,F9,end,16.0,False,3.0
191,X,138612909,C,A,0.05,0.02,0,release_F9,0.453,0.581,5.08,0.0736,-1.0,F9,start,,True,4.0
235,X,138612924,A,G,-0.01,0.0,0,release_F9,0.614,0.47,4.55,0.671,-1.0,F9,end,16.0,False,4.0


Split the training data into folds respecting the breakpoints.

In [27]:
# Number of folds each regulatory element's chunks are distributed over.
nfolds = 5
# NOTE(review): the chunk-to-fold assignment does not look deterministic — the
# fold_dict reported by `cv_chunk.fold_dict` further down differs from the one
# displayed here. Presumably df_cv_split shuffles without a fixed seed; confirm
# and fix the seed (or persist fold_dict) so CV scores are comparable across runs.
fold_dict = df_cv_split(breakpoint_df, nfolds)
fold_dict

{'release_F9': [[3], [4], [2], [0], [1]],
 'release_GP1BB': [[5, 1], [2], [0], [4], [3]],
 'release_HBB': [[0], [1], [2], [], []],
 'release_HBG1': [[0], [1], [3], [2], []],
 'release_HNF4A': [[2], [1], [0], [3], []],
 'release_IRF4': [[6, 3], [0, 1], [2, 7], [5], [4]],
 'release_IRF6': [[0, 8], [6, 3], [1, 2], [7, 5], [4]],
 'release_LDLR': [[1], [3], [0], [2], [4]],
 'release_MSMB': [[2, 8], [0, 7], [1, 5], [4, 6], [3]],
 'release_MYCrs6983267': [[4, 3], [2, 5], [8, 0], [7, 1], [6]],
 'release_PKLR': [[5, 6], [0, 1], [2, 4], [7], [3]],
 'release_SORT1': [[5, 0], [4, 6], [2, 7], [8, 3], [1]],
 'release_TERT-GBM': [[0], [1], [3], [4], [2]],
 'release_TERT-HEK293T': [[1], [2], [0], [3], [4]],
 'release_ZFAND3': [[10, 6, 5], [7, 4], [1, 9], [0, 2], [8, 3]]}

In [7]:
# ChunkCV run: DeepSeaSNP with a logistic-regression ('lr') classifier on the
# 'diff' features, using the precomputed fold_dict.
cv_chunk = ChunkCV(
    df,
    DeepSeaSNP,
    model_kwargs={'classifier': 'lr', 'feattypes': ['diff']},
    fold_dict=fold_dict,
)
cvdf_chunk = cv_chunk.get_cv_preds()

# Binarize true and predicted classes, then report one AUC per class
# (in the order of binarizer.classes_).
binarizer = LabelBinarizer()
ybin = binarizer.fit_transform(cvdf_chunk['class'])
print(binarizer.classes_)
pred_bin = binarizer.transform(cvdf_chunk['cv_prediction'])
print(roc_auc_score(ybin, pred_bin, average=None))

# Note that, by comparison with the CV setup above, the average size of the training set is smaller

[-1  0  1]
[0.62158833 0.5531188  0.61359885]


In [8]:
# Preview only: rendering the whole prediction frame bloats the notebook output.
cvdf_chunk.head(10)

Unnamed: 0,#Chrom,Pos,Ref,Alt,Value,Confidence,class,regulatory_element,phastCon,phyloP,GerpN,GerpRS,cv_prediction,base_element,is_break,chunk_length,is_start,chunk_id,is_train
0,X,138612669,T,A,-0.17,0.07,0,release_F9,0.006,0.470,3.93,1.4900,-1.0,F9,start,,True,0,True
1,X,138612669,T,C,-0.26,0.24,-1,release_F9,0.006,0.470,3.93,1.4900,-1.0,F9,no,,False,0,True
2,X,138612669,T,G,0.34,0.05,0,release_F9,0.006,0.470,3.93,1.4900,-1.0,F9,no,,False,0,True
3,X,138612670,A,C,0.00,0.00,0,release_F9,0.008,0.470,3.93,0.3370,-1.0,F9,no,,False,0,True
4,X,138612670,A,G,0.22,0.20,1,release_F9,0.008,0.470,3.93,0.3370,-1.0,F9,no,,False,0,True
5,X,138612670,A,T,0.12,0.03,0,release_F9,0.008,0.470,3.93,0.3370,-1.0,F9,no,,False,0,True
6,X,138612671,T,A,0.06,0.02,0,release_F9,0.009,-0.404,3.93,-0.8070,1.0,F9,no,,False,0,True
7,X,138612671,T,C,0.26,0.21,1,release_F9,0.009,-0.404,3.93,-0.8070,-1.0,F9,no,,False,0,True
8,X,138612671,T,G,0.08,0.01,0,release_F9,0.009,-0.404,3.93,-0.8070,-1.0,F9,no,,False,0,True
9,X,138612672,C,A,-0.13,0.05,0,release_F9,0.011,-0.251,3.93,0.6860,-1.0,F9,no,,False,0,True


In [9]:
# ChunkCV run: DeepSeaSNP on the 'diff' features with the 'xgb' classifier.
cv_chunk = ChunkCV(
    df,
    DeepSeaSNP,
    model_kwargs={'classifier': 'xgb', 'feattypes': ['diff']},
    fold_dict=fold_dict,
)
cvdf_chunk = cv_chunk.get_cv_preds()

# One AUC per class, in the order of binarizer.classes_.
binarizer = LabelBinarizer()
ybin = binarizer.fit_transform(cvdf_chunk['class'])
print(binarizer.classes_)
pred_bin = binarizer.transform(cvdf_chunk['cv_prediction'])
print(roc_auc_score(ybin, pred_bin, average=None))

# Note that, by comparison with the CV setup above, the average size of the training set is smaller

  if diff:
  if diff:
  if diff:
  if diff:


[-1  0  1]
[0.67151189 0.65343182 0.63498297]


  if diff:


In [10]:
# ChunkCV run: EnhancerOneHot model with the 'lr' classifier.
cv_chunk = ChunkCV(
    df,
    EnhancerOneHot,
    model_kwargs={'classifier': 'lr'},
    fold_dict=fold_dict,
)
cvdf_chunk = cv_chunk.get_cv_preds()

# One AUC per class, in the order of binarizer.classes_.
binarizer = LabelBinarizer()
ybin = binarizer.fit_transform(cvdf_chunk['class'])
print(binarizer.classes_)
pred_bin = binarizer.transform(cvdf_chunk['cv_prediction'])
print(roc_auc_score(ybin, pred_bin, average=None))

# Note that, by comparison with the CV setup above, the average size of the training set is smaller

  enhancers = df['regulatory_element'].astype('category', categories=self.enh_names)


[-1  0  1]
[0.52227546 0.62640861 0.51259548]


In [11]:
# ChunkCV run: MixedModel combining DeepSeaSNP ('diff' features) and
# EnhancerOneHot (no extra kwargs) under a single 'xgb' classifier.
# Each entry of 'model_kwargs' lines up positionally with an entry of 'models'.
cv_chunk = ChunkCV(
    df,
    MixedModel,
    model_kwargs={
        'models': [DeepSeaSNP, EnhancerOneHot],
        'classifier': 'xgb',
        'model_kwargs': [{'feattypes': ['diff']}, {}],
    },
    fold_dict=fold_dict,
)
cvdf_chunk = cv_chunk.get_cv_preds()

# One AUC per class, in the order of binarizer.classes_.
binarizer = LabelBinarizer()
ybin = binarizer.fit_transform(cvdf_chunk['class'])
print(binarizer.classes_)
pred_bin = binarizer.transform(cvdf_chunk['cv_prediction'])
print(roc_auc_score(ybin, pred_bin, average=None))

# Note that, by comparison with the CV setup above, the average size of the training set is smaller

[<class 'models.DeepSeaSNP'>, <class 'models.EnhancerOneHot'>]


  enhancers = df['regulatory_element'].astype('category', categories=self.enh_names)
  enhancers = df['regulatory_element'].astype('category', categories=self.enh_names)
  if diff:
  enhancers = df['regulatory_element'].astype('category', categories=self.enh_names)
  if diff:
  enhancers = df['regulatory_element'].astype('category', categories=self.enh_names)
  if diff:
  enhancers = df['regulatory_element'].astype('category', categories=self.enh_names)
  if diff:


[-1  0  1]
[0.6852266  0.65448542 0.63170888]


  enhancers = df['regulatory_element'].astype('category', categories=self.enh_names)
  if diff:


In [12]:
# ChunkCV run: SNPContext model with context_size=2 and the 'xgb' classifier.
cv_chunk = ChunkCV(
    df,
    SNPContext,
    model_kwargs={'classifier': 'xgb', 'context_size': 2},
    fold_dict=fold_dict,
)
cvdf_chunk = cv_chunk.get_cv_preds()

# One AUC per class, in the order of binarizer.classes_.
binarizer = LabelBinarizer()
ybin = binarizer.fit_transform(cvdf_chunk['class'])
print(binarizer.classes_)
pred_bin = binarizer.transform(cvdf_chunk['cv_prediction'])
print(roc_auc_score(ybin, pred_bin, average=None))

# Note that, by comparison with the CV setup above, the average size of the training set is smaller

  if diff:
  if diff:
  if diff:
  if diff:


[-1  0  1]
[0.49464575 0.51020767 0.50562773]


  if diff:


In [13]:
# ChunkCV run: SNPContext model with a wider context (context_size=4) and the
# 'xgb' classifier.
cv_chunk = ChunkCV(
    df,
    SNPContext,
    model_kwargs={'classifier': 'xgb', 'context_size': 4},
    fold_dict=fold_dict,
)
cvdf_chunk = cv_chunk.get_cv_preds()

# One AUC per class, in the order of binarizer.classes_.
binarizer = LabelBinarizer()
ybin = binarizer.fit_transform(cvdf_chunk['class'])
print(binarizer.classes_)
pred_bin = binarizer.transform(cvdf_chunk['cv_prediction'])
print(roc_auc_score(ybin, pred_bin, average=None))

# Note that, by comparison with the CV setup above, the average size of the training set is smaller

  if diff:
  if diff:
  if diff:
  if diff:


[-1  0  1]
[0.50694934 0.54008246 0.50755304]


  if diff:


In [14]:
# ChunkCV run: Conservation model with the 'lr' classifier.
cv_chunk = ChunkCV(
    df,
    Conservation,
    model_kwargs={'classifier': 'lr'},
    fold_dict=fold_dict,
)
cvdf_chunk = cv_chunk.get_cv_preds()

# One AUC per class, in the order of binarizer.classes_.
binarizer = LabelBinarizer()
ybin = binarizer.fit_transform(cvdf_chunk['class'])
print(binarizer.classes_)
pred_bin = binarizer.transform(cvdf_chunk['cv_prediction'])
print(roc_auc_score(ybin, pred_bin, average=None))

# Note that, by comparison with the CV setup above, the average size of the training set is smaller

[-1  0  1]
[0.54613803 0.53437548 0.49431912]


In [15]:
# ChunkCV run: MixedModel combining Conservation, SNPContext and DeepSeaSNP
# under a single 'xgb' classifier. Each entry of 'model_kwargs' lines up
# positionally with an entry of 'models'.
#
# 'feattypes' is passed as a list, matching the other DeepSeaSNP runs in this
# notebook. A bare string is ambiguous for any membership test on it
# (e.g. 'diff' in 'absdiff' is True) and would be split into characters if iterated.
cv_chunk = ChunkCV(
    df,
    MixedModel,
    model_kwargs={
        'models': [Conservation, SNPContext, DeepSeaSNP],
        'model_kwargs': [
            {'scores': ['GerpRS', 'phyloP']},
            {'context_size': 2},
            {'feattypes': ['diff']},
        ],
        'classifier': 'xgb',
    },
    fold_dict=fold_dict,
)
cvdf_chunk = cv_chunk.get_cv_preds()

# One AUC per class, in the order of binarizer.classes_.
binarizer = LabelBinarizer()
ybin = binarizer.fit_transform(cvdf_chunk['class'])
print(binarizer.classes_)
print(roc_auc_score(ybin, binarizer.transform(cvdf_chunk['cv_prediction']), average=None))

# Note that, by comparison with the CV setup above, the average size of the training set is smaller

[<class 'models.Conservation'>, <class 'models.SNPContext'>, <class 'models.DeepSeaSNP'>]


  if diff:
  if diff:
  if diff:
  if diff:


[-1  0  1]
[0.59636776 0.58535654 0.57957422]


  if diff:


In [16]:
# ChunkCV run: MixedModel combining Conservation (GerpRS and phyloP scores) and
# DeepSeaSNP under a single 'xgb' classifier. Each entry of 'model_kwargs'
# lines up positionally with an entry of 'models'.
#
# 'feattypes' is passed as a list, matching the other DeepSeaSNP runs in this
# notebook. A bare string is ambiguous for any membership test on it
# (e.g. 'diff' in 'absdiff' is True) and would be split into characters if iterated.
cv_chunk = ChunkCV(
    df,
    MixedModel,
    model_kwargs={
        'models': [Conservation, DeepSeaSNP],
        'model_kwargs': [
            {'scores': ['GerpRS', 'phyloP']},
            {'feattypes': ['diff']},
        ],
        'classifier': 'xgb',
    },
    fold_dict=fold_dict,
)
cvdf_chunk = cv_chunk.get_cv_preds()

# One AUC per class, in the order of binarizer.classes_.
binarizer = LabelBinarizer()
ybin = binarizer.fit_transform(cvdf_chunk['class'])
print(binarizer.classes_)
print(roc_auc_score(ybin, binarizer.transform(cvdf_chunk['cv_prediction']), average=None))


[<class 'models.Conservation'>, <class 'models.DeepSeaSNP'>]


  if diff:
  if diff:
  if diff:
  if diff:


[-1  0  1]
[0.68459967 0.65837532 0.63453087]


  if diff:


In [17]:
# ChunkCV run: MixedModel combining SNPContext (context_size=2, 'median' raw
# aggregation) and DeepSeaSNP under a single 'xgb' classifier. Each entry of
# 'model_kwargs' lines up positionally with an entry of 'models'.
#
# 'feattypes' is passed as a list, matching the other DeepSeaSNP runs in this
# notebook. A bare string is ambiguous for any membership test on it
# (e.g. 'diff' in 'absdiff' is True) and would be split into characters if iterated.
cv_chunk = ChunkCV(
    df,
    MixedModel,
    model_kwargs={
        'models': [SNPContext, DeepSeaSNP],
        'model_kwargs': [
            {'context_size': 2, 'raw_aggs': ['median']},
            {'feattypes': ['diff']},
        ],
        'classifier': 'xgb',
    },
    fold_dict=fold_dict,
)
cvdf_chunk = cv_chunk.get_cv_preds()

# One AUC per class, in the order of binarizer.classes_.
binarizer = LabelBinarizer()
ybin = binarizer.fit_transform(cvdf_chunk['class'])
print(binarizer.classes_)
print(roc_auc_score(ybin, binarizer.transform(cvdf_chunk['cv_prediction']), average=None))

# Note that, by comparison with the CV setup above, the average size of the training set is smaller

[<class 'models.SNPContext'>, <class 'models.DeepSeaSNP'>]


  if diff:
  if diff:
  if diff:
  if diff:


[-1  0  1]
[0.65299663 0.62625973 0.58506188]


  if diff:


In [18]:
# Inspect the fold_dict attribute held by the most recent ChunkCV instance.
cv_chunk.fold_dict

{'release_F9': [[1], [0], [4], [3], [2]],
 'release_GP1BB': [[0, 4], [5], [3], [1], [2]],
 'release_HBB': [[1], [0], [2], [], []],
 'release_HBG1': [[2], [3], [1], [0], []],
 'release_HNF4A': [[2], [3], [0], [1], []],
 'release_IRF4': [[2, 6], [5, 4], [0, 3], [7], [1]],
 'release_IRF6': [[3, 5], [8, 2], [1, 6], [7, 4], [0]],
 'release_LDLR': [[2], [3], [0], [4], [1]],
 'release_MSMB': [[5, 8], [4, 0], [6, 1], [7, 3], [2]],
 'release_MYCrs6983267': [[3, 7], [2, 1], [0, 6], [5, 8], [4]],
 'release_PKLR': [[0, 4], [7, 6], [1, 3], [5], [2]],
 'release_SORT1': [[7, 6], [3, 4], [0, 1], [2, 8], [5]],
 'release_TERT-GBM': [[4], [3], [2], [1], [0]],
 'release_TERT-HEK293T': [[2], [1], [0], [3], [4]],
 'release_ZFAND3': [[9, 7, 3], [2, 6], [1, 0], [4, 10], [8, 5]]}

In [19]:
binarizer = LabelBinarizer()
ybin = binarizer.fit_transform(cvdf_chunk[cvdf_chunk['regulatory_element']=='release_F9']['class'])
print(roc_auc_score(ybin, binarizer.transform(cvdf_chunk[cvdf_chunk['regulatory_element']=='release_F9']['cv_prediction']), average=None))

[0.67214912 0.55822089 0.48017719]


In [20]:
# Sanity check: list any rows whose cv_prediction is null
# (an empty frame means every row received a prediction).
cvdf_chunk[cvdf_chunk['cv_prediction'].isnull()]

Unnamed: 0,#Chrom,Pos,Ref,Alt,Value,Confidence,class,regulatory_element,phastCon,phyloP,GerpN,GerpRS,cv_prediction,base_element,is_break,chunk_length,is_start,chunk_id,is_train


In [21]:
# Preview the first rows of the latest CV prediction frame.
cvdf_chunk.head()

Unnamed: 0,#Chrom,Pos,Ref,Alt,Value,Confidence,class,regulatory_element,phastCon,phyloP,GerpN,GerpRS,cv_prediction,base_element,is_break,chunk_length,is_start,chunk_id,is_train
0,X,138612669,T,A,-0.17,0.07,0,release_F9,0.006,0.47,3.93,1.49,0.0,F9,start,,True,0,True
1,X,138612669,T,C,-0.26,0.24,-1,release_F9,0.006,0.47,3.93,1.49,-1.0,F9,no,,False,0,True
2,X,138612669,T,G,0.34,0.05,0,release_F9,0.006,0.47,3.93,1.49,0.0,F9,no,,False,0,True
3,X,138612670,A,C,0.0,0.0,0,release_F9,0.008,0.47,3.93,0.337,0.0,F9,no,,False,0,True
4,X,138612670,A,G,0.22,0.2,1,release_F9,0.008,0.47,3.93,0.337,0.0,F9,no,,False,0,True
