In [1]:
import numpy as np
import pandas as pd
import time
from matplotlib import pyplot as plt
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve,auc


In [2]:
#NOTE(review): outfile_name is not referenced by any cell visible in this notebook - confirm it is still needed
outfile_name = 'validation'
#number of bootstrap resamples drawn by the pooled bootstrap loop (for i in range(iterations))
iterations = 1000

In [3]:
#show the working directory; the relative input/output paths used in this notebook resolve against it
#NOTE(review): the saved output of this cell records an absolute, user-specific path - consider clearing it before sharing
import os
os.getcwd()

'/fh/fast/ha_g/user/adoebley/projects/griffin_revisions_1/MBC_validation/validation_analysis'

In [4]:
#site set label; it is embedded in every input and output file name below
site_group = '5e-4_qval'

#parameters
#training features and the saved C values both live under the MBC analysis directory
training_dir = '../../MBC/ATAC_nucleosome_profiling/analysis/'
in_file = training_dir+'merged_data/'+site_group+'_reformatted.txt'
cval_file = training_dir+'logreg_results/'+site_group+'_logreg_results/'+site_group+'.c_values.txt'

#validation samples are read relative to the current working directory
test_data_file = 'merged_data/'+site_group+'_validation_reformatted.txt'


In [5]:
#import training data
data = pd.read_csv(in_file, sep='\t')
data = data.set_index('sample')

#get features and exclude all other columns
#a column is a feature when its name starts with one of the three feature prefixes
features = data.columns[
    (data.columns.str.startswith('central_cov'))
    | (data.columns.str.startswith('mean_cov'))
    | (data.columns.str.startswith('amplitude'))
]

print('Features',len(features))

data = data.sort_index()
status_col = 'revisions_ER_status_binary'
#encode ER status as 1 ('+') / 0 ('-'); any other value is passed through unchanged.
#pd.to_numeric makes the numeric dtype explicit instead of relying on Series.replace
#silently downcasting an object column (deprecated in pandas 2.2), and it raises
#immediately if an unexpected non-numeric label is present.
status_codes = {'+':1,'-':0}
data['status'] = pd.to_numeric(data[status_col].map(lambda value: status_codes.get(value, value)))

print('Total samples:',len(data))

#scale data
#the scaler is fitted on the training features only; the same fitted object is reused for the validation data
scaler = StandardScaler()
data[features] = scaler.fit_transform(data[features])
#sanity check: every scaled feature mean should be ~0
data[features].mean()


Features 12
Total samples: 254


central_coverage_ER_neg_heme.5e-4_qval       -1.423184e-15
mean_coverage_ER_neg_heme.5e-4_qval           6.273197e-15
amplitude_ER_neg_heme.5e-4_qval              -3.601668e-16
central_coverage_ER_pos_heme.5e-4_qval        1.328771e-15
mean_coverage_ER_pos_heme.5e-4_qval           8.437695e-15
amplitude_ER_pos_heme.5e-4_qval               3.846442e-17
central_coverage_ER_pos_specific.5e-4_qval   -1.307790e-15
mean_coverage_ER_pos_specific.5e-4_qval      -8.946474e-15
amplitude_ER_pos_specific.5e-4_qval           5.594825e-17
central_coverage_ER_neg_specific.5e-4_qval    1.531583e-15
mean_coverage_ER_neg_specific.5e-4_qval       3.601668e-15
amplitude_ER_neg_specific.5e-4_qval           1.328771e-16
dtype: float64

In [6]:
#import the validation samples, indexed by sample name
test_data = pd.read_csv(test_data_file, sep='\t').set_index('sample')

#test data
print('test samples',len(test_data))

test_data = test_data.sort_index()

#apply the scaler that was fitted on the training features (transform only, no re-fit),
#so the validation feature means shown below are not expected to be exactly 0
scaled_features = scaler.transform(test_data[features])
test_data[features] = scaled_features
test_data[features].mean()

test samples 144


central_coverage_ER_neg_heme.5e-4_qval        0.369014
mean_coverage_ER_neg_heme.5e-4_qval           0.344394
amplitude_ER_neg_heme.5e-4_qval               0.082355
central_coverage_ER_pos_heme.5e-4_qval        0.294134
mean_coverage_ER_pos_heme.5e-4_qval           0.400497
amplitude_ER_pos_heme.5e-4_qval              -0.239409
central_coverage_ER_pos_specific.5e-4_qval    0.309752
mean_coverage_ER_pos_specific.5e-4_qval       0.415392
amplitude_ER_pos_specific.5e-4_qval          -0.395851
central_coverage_ER_neg_specific.5e-4_qval    0.418454
mean_coverage_ER_neg_specific.5e-4_qval       0.387354
amplitude_ER_neg_specific.5e-4_qval          -0.522310
dtype: float64

In [7]:
#read the saved C values (file has no header row) and keep the most frequent one
cvals = pd.read_csv(cval_file, sep='\t', header=None)
c_modes = cvals.mode()
#first row / first column of the mode table
best_c = c_modes.values[0][0]
print('best_c',best_c)

best_c 0.1


In [8]:
#number of training samples per encoded status value (1 = '+', 0 = '-')
data['status'].value_counts()

1    133
0    121
Name: status, dtype: int64

In [9]:
#fit a single logistic regression on the full training dataset using the selected C
model = LogisticRegression(C=best_c, class_weight='balanced', max_iter=500)
model.fit(data[features], data['status'])

#predict the training data itself (this is the resubstitution accuracy, not a held-out estimate)
pred = model.predict(data[features])
prob = model.predict_proba(data[features])

data['pred'] = pred
#second column of predict_proba is kept as the probability
data['prob'] = prob[:,1]

n_correct = (data['status'] == data['pred']).sum()
print('training accuracy',np.round(n_correct/len(data),3))

training accuracy 0.858


In [10]:
#predict the validation samples with the model fitted on the training data
pred = model.predict(test_data[features])
prob = model.predict_proba(test_data[features])

test_data['prediction'] = pred
#second column of predict_proba is kept as the probability
test_data['probability'] = prob[:,1]

#1 when the predicted label matches the recorded status, otherwise 0
#NOTE(review): 'status' is read from the validation file as-is (it is not re-encoded in the visible cells) - confirm it is already coded 1/0
is_correct = test_data['prediction'] == test_data['status']
test_data['accuracy'] = is_correct.astype(int)


In [11]:
#group test data by tumor fraction
#three ranges: >=0.1 (labelled ">0.1_TFx"), [0.05, 0.1) and <0.05; each later np.where only overwrites rows in its own range
#NOTE(review): a row with a missing tumor_fraction matches no range and keeps the fallback value from the first np.where - confirm none exist
tumor_fraction = test_data['tumor_fraction']
test_data['tfx_group'] = np.where(tumor_fraction>=0.1,">0.1_TFx",tumor_fraction)
test_data['tfx_group'] = np.where((tumor_fraction>=0.05) & (tumor_fraction<0.1),"0.05-0.1_TFx",test_data['tfx_group'])
test_data['tfx_group'] = np.where((tumor_fraction<0.05),"<0.05_TFx",test_data['tfx_group'])

#get only the ULP
test_data = test_data[test_data['dataset'].isin(['MBC_Stover','BRCA_Ghana_ULP','MBC_Dawson_downsampled'])]
print('exporting:')
print(test_data['dataset'].value_counts())

#export results for all samples
#create the output directory first so to_csv does not fail on a fresh checkout
results_dir = 'validation_results/'
os.makedirs(results_dir, exist_ok=True)
test_data.to_csv(results_dir+site_group+'.probabilities.txt',sep='\t')

#exclude second timepoints in the Stover Dataset for further benchmarking
#(rows from the other datasets are kept regardless of their timepoint value)
keep_for_benchmark = (test_data['timepoint_within_pt_ctDNAonly']==1) | ~(test_data['dataset']=='MBC_Stover')
test_data = test_data[keep_for_benchmark].copy()

print('for bootstrapping:')
test_data['dataset'].value_counts()

exporting:
MBC_Stover                103
MBC_Dawson_downsampled     27
BRCA_Ghana_ULP             14
Name: dataset, dtype: int64
for bootstrapping:


MBC_Stover                30
MBC_Dawson_downsampled    27
BRCA_Ghana_ULP            14
Name: dataset, dtype: int64

In [12]:
#number of benchmarking samples in every dataset / tumor-fraction group combination
group_cols = ['dataset','tfx_group']
test_data.sort_values(by = group_cols)[group_cols].value_counts(sort=False)

dataset                 tfx_group   
BRCA_Ghana_ULP          0.05-0.1_TFx     4
                        <0.05_TFx        7
                        >0.1_TFx         3
MBC_Dawson_downsampled  0.05-0.1_TFx     2
                        <0.05_TFx       13
                        >0.1_TFx        12
MBC_Stover              0.05-0.1_TFx     6
                        <0.05_TFx       15
                        >0.1_TFx         9
dtype: int64

In [13]:
#get bootstrap values for each dataset within the validation data
#
#result: dataset_accuracies has one row per bootstrap iteration and one column per
#(dataset, group) pair, where group is 'All samples' or a tfx_group label.
#Only accuracy is kept: the per-dataset AUC table was already disabled, so the
#roc_curve/auc calls that fed it are no longer computed here.

#columns carried into every resample
bootstrap_cols = ['probability','status','tfx_group','dataset','accuracy']

#one dict per iteration: (dataset, group) -> mean accuracy.
#Collecting plain dicts and building the frame once replaces the row-by-row
#DataFrame.append calls (quadratic, and removed in pandas 2.0).
accuracy_records = []

#use the shared `iterations` setting so this loop stays in step with the pooled bootstrap
for i in range(iterations):
    if i %100==0:
        print(i)
    current_accuracies = {}
    for dataset,df1 in test_data.groupby('dataset'):
        #resample this dataset with replacement; the seed depends only on the iteration, so reruns reproduce
        current_sample = df1.sample(len(df1), replace = True, random_state = i+100)[bootstrap_cols]

        #accuracy over the whole resampled dataset
        current_accuracies[(dataset,'All samples')] = current_sample['accuracy'].mean()

        #accuracy within each tumor-fraction group; a group absent from this resample is simply missing (NaN in the table)
        for group,df2 in current_sample.groupby('tfx_group'):
            current_accuracies[(dataset,group)] = df2['accuracy'].mean()

    accuracy_records.append(current_accuracies)

dataset_accuracies = pd.DataFrame(accuracy_records)
if len(dataset_accuracies.columns) > 0:
    #expose the (dataset, group) keys as a two-level column index and fix the column order
    dataset_accuracies.columns = pd.MultiIndex.from_tuples(list(dataset_accuracies.columns), names = ['dataset','group'])
    dataset_accuracies = dataset_accuracies.sort_index(axis=1)


0




100
200
300
400
500
600
700
800
900


In [14]:
#compare the bootstrap median accuracy with the accuracy measured directly on the (non-resampled) samples
display_df = dataset_accuracies.median().to_frame('bootstrap_median')

#direct accuracy per dataset / tumor-fraction group (aligned to display_df on the two index levels)
display_df['mean_accuracy'] = test_data.groupby(['dataset','tfx_group'])['accuracy'].mean()

#direct accuracy per dataset, labelled 'All samples' so it lines up with the matching bootstrap rows
overall = test_data.groupby('dataset')[['accuracy']].mean()
overall['tfx_group'] = 'All samples'
display_df['mean_accuracy_overall'] = overall.set_index('tfx_group', append=True)['accuracy']
display_df

Unnamed: 0_level_0,Unnamed: 1_level_0,bootstrap_median,mean_accuracy,mean_accuracy_overall
dataset,group,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BRCA_Ghana_ULP,0.05-0.1_TFx,1.0,1.0,
BRCA_Ghana_ULP,<0.05_TFx,0.571429,0.571429,
BRCA_Ghana_ULP,>0.1_TFx,1.0,1.0,
BRCA_Ghana_ULP,All samples,0.785714,,0.785714
MBC_Dawson_downsampled,0.05-0.1_TFx,1.0,1.0,
MBC_Dawson_downsampled,<0.05_TFx,0.7,0.692308,
MBC_Dawson_downsampled,>0.1_TFx,1.0,1.0,
MBC_Dawson_downsampled,All samples,0.851852,,0.851852
MBC_Stover,0.05-0.1_TFx,0.666667,0.666667,
MBC_Stover,<0.05_TFx,0.4,0.4,


In [15]:
#get bootstrap values for all validation data
#
#result: AUCs and accuracies each have one row per bootstrap iteration and one column per group
#('All samples', '>0.05_TFx', then every tfx_group label present in the resample).

def _auc_and_accuracy(sample_df):
    """Return (AUC, accuracy) for one set of resampled rows.

    AUC is the area under the ROC curve of 'probability' against 'status';
    accuracy is the mean of the 0/1 'accuracy' column.
    """
    fpr, tpr, _ = roc_curve(sample_df['status'].values,sample_df['probability'])
    return auc(fpr,tpr), sample_df['accuracy'].mean()

#one dict per iteration (group -> metric); the frames are built once after the loop
#instead of calling DataFrame.append row by row (quadratic, and removed in pandas 2.0)
auc_records = []
accuracy_records = []

for i in range(iterations):
    #bootstrap the validation samples with replacement; the seed depends only on the iteration, so reruns reproduce
    current_sample = test_data.sample(len(test_data), replace = True, random_state = i+100)[['probability','status','tfx_group','accuracy']]

    #subsets to score, in the order their columns appear in the output tables
    subsets = [('All samples',current_sample)]
    #>0.05 is the union of the two higher tumor-fraction groups
    subsets.append(('>0.05_TFx',current_sample[current_sample['tfx_group'].isin(['>0.1_TFx','0.05-0.1_TFx'])]))
    #then each tfx group on its own
    subsets.extend(current_sample.groupby('tfx_group'))

    current_AUCs = {}
    current_accuracies = {}
    for group,df in subsets:
        current_AUCs[group], current_accuracies[group] = _auc_and_accuracy(df)

    auc_records.append(current_AUCs)
    accuracy_records.append(current_accuracies)

AUCs = pd.DataFrame(auc_records)
accuracies = pd.DataFrame(accuracy_records)




In [16]:
#median accuracy of each group across the bootstrap iterations
accuracies.median().to_frame('bootstrap_median')

Unnamed: 0,bootstrap_median
All samples,0.732394
>0.05_TFx,0.918919
0.05-0.1_TFx,0.846154
<0.05_TFx,0.542857
>0.1_TFx,0.96


In [17]:
#median AUC of each group across the bootstrap iterations
display(AUCs.median().to_frame('bootstrap_median'))

Unnamed: 0,bootstrap_median
All samples,0.753128
>0.05_TFx,0.962441
0.05-0.1_TFx,0.9
<0.05_TFx,0.388489
>0.1_TFx,0.984127


In [18]:

#write the raw bootstrap tables (one row per bootstrap iteration) as tab-separated text, without the row index
results_prefix = 'validation_results/'+site_group
for suffix,table in [('.dataset_accuracies.txt',dataset_accuracies),
                     ('.AUCs.txt',AUCs),
                     ('.accuracies.txt',accuracies)]:
    table.to_csv(results_prefix+suffix, sep='\t', index=False)


In [19]:
#median AUC per group across the bootstrap iterations, shown as a plain Series
AUCs.median()

All samples     0.753128
>0.05_TFx       0.962441
0.05-0.1_TFx    0.900000
<0.05_TFx       0.388489
>0.1_TFx        0.984127
dtype: float64

In [20]:
#export AUC CIs
#per group: bootstrap median plus the 2.5% and 97.5% quantiles of the bootstrap AUCs.
#The median row and the group column are named explicitly rather than by renaming the
#labels pandas auto-generates for an unnamed Series / unnamed index.
AUC_CIs = pd.DataFrame([AUCs.median().rename('median'), AUCs.quantile(.025), AUCs.quantile(.975)]).T
AUC_CIs = AUC_CIs.rename_axis('group').reset_index()
AUC_CIs.to_csv('validation_results/'+site_group+'.AUC_CI.txt', sep='\t', float_format = '%.5f', index = False)

AUC_CIs

Unnamed: 0,group,median,0.025,0.975
0,All samples,0.753128,0.624761,0.855087
1,>0.05_TFx,0.962441,0.877175,1.0
2,0.05-0.1_TFx,0.9,0.595741,1.0
3,<0.05_TFx,0.388489,0.1875,0.607578
4,>0.1_TFx,0.984127,0.890869,1.0


In [21]:
#export accuracy CIs
#per group: bootstrap median plus the 2.5% and 97.5% quantiles of the bootstrap accuracies.
#The median row and the group column are named explicitly rather than by renaming the
#labels pandas auto-generates for an unnamed Series / unnamed index.
accuracy_CIs = pd.DataFrame([accuracies.median().rename('median'), accuracies.quantile(.025), accuracies.quantile(.975)]).T
accuracy_CIs = accuracy_CIs.rename_axis('group').reset_index()
accuracy_CIs.to_csv('validation_results/'+site_group+'.accuracy_CI.txt', sep='\t', float_format = '%.5f', index=False)

accuracy_CIs


Unnamed: 0,group,median,0.025,0.975
0,All samples,0.732394,0.619718,0.830986
1,>0.05_TFx,0.918919,0.823529,1.0
2,0.05-0.1_TFx,0.846154,0.6,1.0
3,<0.05_TFx,0.542857,0.378294,0.702703
4,>0.1_TFx,0.96,0.863636,1.0
