In [1]:
import os
import sys
import time

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

# from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedKFold
# from sklearn.model_selection import LeaveOneOut
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve,auc

In [2]:
#load the merged feature table and key it by sample name
in_file = '../../merge/Ulz_downsampled_cancer_detection_results_merged.txt'
feature_type = 'Ulz_downsampled'

#non-feature columns that travel with every sample
metadata_columns = ['bam_name','sample_type','cancer_present']

data = pd.read_csv(in_file, sep='\t').set_index('sample')

#'status' is the classification label; it is a copy of 'cancer_present'
data = data.assign(status=data['cancer_present'])

#restrict to the samples listed in the delfi training sample file
#(one sample name per line, no header row)
training_samples = pd.read_csv("../../../../metadata/cancer_detection/delfi_training_samples.txt", header=None)
is_training_sample = data.index.isin(training_samples[0])
data = data.loc[is_training_sample].copy()

#report how many samples remain and how they split by sample type
print(len(data))
print(data['sample_type'].value_counts())


423
Healthy              215
Breast_Cancer         54
Pancreatic_Cancer     34
Ovarian_Cancer        28
Gastric_cancer        27
Colorectal_Cancer     27
Bile_Duct_Cancer      25
Lung_Cancer           12
Duodenal_Cancer        1
Name: sample_type, dtype: int64


In [3]:
#preview the first five rows of the filtered table
data.head(n=5)

Unnamed: 0_level_0,ADNP,AEBP2,AhR,Androgen,AP-2&#945;,AP-2&#947;,AP-4,ARID1A,ARID1B,ARID2,...,ZSCAN16,ZSCAN22,ZSCAN2,ZSCAN5A,ZSCAN5D,ZXDC,bam_name,sample_type,cancer_present,status
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Healthy_CGPLH640,0.811035,0.809477,0.645685,0.524515,0.698147,0.639548,0.691884,0.670617,1.231582,0.725045,...,0.698984,0.696669,0.920359,0.66765,0.744383,0.640529,PGDX18259P_WGS.sorted_processed,Healthy,0,0
Breast_Cancer_CGPLBR24,0.740001,0.790616,0.651644,0.628981,1.180467,0.596563,1.632324,0.765319,0.656969,0.727409,...,0.676794,1.124909,0.620888,1.171612,0.701792,0.847923,PGDX2750P_WGS_X1.sorted_processed,Breast_Cancer,1,1
Colorectal_Cancer_CGCRC292,1.033225,1.052757,0.728701,0.811694,0.6715,0.598978,0.847551,1.041598,0.780689,1.391952,...,0.882978,0.579957,0.608522,0.854119,0.88918,0.747744,PGDX5882P_WGS_processed_downsamp,Colorectal_Cancer,1,1
Colorectal_Cancer_CGCRC341,0.731452,0.55739,0.422682,0.568415,0.624778,1.209577,0.659978,0.767851,1.226222,0.576228,...,0.667641,0.753801,0.769049,0.572226,0.75257,0.536912,PGDX8828P_WGS.sorted_processed,Colorectal_Cancer,1,1
Healthy_CGPLH324,0.498629,0.831827,0.552264,0.60949,0.646907,0.715652,0.869012,1.035716,0.745268,1.526194,...,0.892366,1.096187,1.352853,0.604555,0.773505,0.663729,PGDX18251P_WGS.sorted_processed,Healthy,0,0


In [4]:
#every column that is neither the label nor a metadata column is a feature
non_feature_columns = ['status']+metadata_columns
features = data.columns.drop(non_feature_columns)

#these two should be the same
#NOTE(review): the recorded output for this feature set is 504 vs 0 (no column
#name contains 'fft_10_magnitude') - confirm whether this check applies here
n_fft_magnitude_columns = len(features[features.str.contains('fft_10_magnitude')])
print(len(features))
print(n_fft_magnitude_columns*3)

504
0


In [5]:
#standardize every feature column, overwriting the values in `data`
#(the scaler is fit on all samples currently in `data`)
scaler = StandardScaler().fit(data[features])
scaled_feature_values = scaler.transform(data[features])
data[features] = scaled_feature_values

#display the per-feature means as a check on the scaling
data[features].mean()

ADNP         -4.393649e-16
AEBP2         5.249281e-18
AhR           1.002088e-15
Androgen      4.755849e-16
AP-2&#945;   -4.262417e-16
                  ...     
ZSCAN22       2.055094e-16
ZSCAN2        1.057730e-16
ZSCAN5A      -6.955298e-17
ZSCAN5D      -9.159996e-17
ZXDC         -4.606244e-16
Length: 504, dtype: float64

In [None]:
#Bootstrap validation with out-of-bag testing (not leave-one-out):
#each iteration draws len(data) samples with replacement as the training set,
#tunes C with 10-fold stratified cross validation on that training set, and
#predicts the samples that were not drawn.
#NOTE(review): the features were standardized on the full dataset in an earlier
#cell, so the out-of-bag samples contributed to the scaling statistics.
hyperparameters = {'C': [0.00001, 0.0001, 0.001,0.01,0.1,1,10,100]}

start_time = time.time()
n_iter = 1000

#per-iteration results are gathered in plain containers and turned into
#DataFrames once after the loop; inserting one column per iteration into a
#DataFrame fragments it and triggers pandas' PerformanceWarning
probability_by_iter = {}
coefs_by_iter = {}
c_vals = []

# Loop for each iteration
for i in range(n_iter):
    #progress report (iteration number, elapsed seconds) every 50 iterations
    if i%50==0:
        print(i, time.time()-start_time)
        sys.stdout.flush()

    #bootstrap a training set with replacement (seeded per iteration)
    training = data.sample(len(data), replace = True, random_state = i+100)

    #the test set is all samples that aren't seen in the training data
    test = data[~(data.index.isin(training.index))]

    #10 fold cross validation on the training set
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state = i+100)

    model = LogisticRegression(class_weight='balanced', max_iter=500)
    search = GridSearchCV(estimator=model, param_grid=hyperparameters, cv=cv, n_jobs = 1)
    search.fit(training[features], training['status'])
    best_C = search.best_params_['C']

    #GridSearchCV defaults to refit=True, so best_estimator_ is already a model
    #with the best C trained on the full training set; no second fit is needed
    model = search.best_estimator_

    #predicted class probabilities for the out-of-bag samples
    prob = model.predict_proba(test[features])

    #save results; column 1 of predict_proba is taken as the probability of status == 1
    probability_by_iter[i] = pd.Series(prob[:,1], index=test.index)
    c_vals.append(best_C)
    coefs_by_iter[i] = pd.Series(model.coef_[0], index = features)

#one column per iteration; samples that were in an iteration's training set are NaN
probabilities = pd.DataFrame(probability_by_iter, index=data.index)
coefs = pd.DataFrame(coefs_by_iter, index=features)

#attach the metadata and true label to each sample's row of probabilities
probabilities = probabilities.merge(data[metadata_columns+['status']], left_index=True, right_index=True)

0 0.000949859619140625
50 44.1033148765564
100 84.3633279800415
150 127.02383995056152
200 163.27746176719666
250 200.87864303588867
300 241.55170893669128
350 280.24127078056335
400 319.27598810195923
450 356.4936830997467
500 394.9091649055481
550 432.34111189842224
600 470.2382769584656
650 507.86152505874634
700 545.7978928089142
750 584.3272180557251


In [1]:
#write the bootstrap results as tab-separated files under <feature_type>_results/
results_dir = feature_type+'_results'

#to_csv does not create missing directories, so make sure the folder exists
os.makedirs(results_dir, exist_ok=True)

probabilities.to_csv(results_dir+'/probabilities.txt', sep='\t')
#one selected C value per line, without header or index
pd.Series(c_vals).to_csv(results_dir+'/c_values.txt', sep='\t', header = False, index=False)
coefs.to_csv(results_dir+'/coefs.txt', sep='\t')

NameError: name 'probabilities' is not defined

In [None]:
#histogram of the C values selected across iterations; the values are sorted
#numerically first and then converted to strings before plotting
c_value_labels = list(map(str, sorted(c_vals)))
plt.hist(c_value_labels)
plt.savefig(feature_type+'_results/cvals.pdf')