In [1]:
import numpy as np
import pandas as pd
import time
from matplotlib import pyplot as plt
%matplotlib inline

# from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedKFold
# from sklearn.model_selection import LeaveOneOut
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve,auc

from sklearn.decomposition import PCA


In [2]:
import os
# Show the kernel's working directory; the relative '../../' input paths used in the next cell are resolved against it.
os.getcwd()

'/fh/fast/ha_g/user/adoebley/projects/griffin_revisions_1/figures/cancer_detection_supplement'

In [3]:
# Site-set label; it is printed and then spliced into both input file names.
site_group = '30000-sites'
print(site_group)

# Input locations, relative to the notebook's working directory.
# NOTE(review): the directory component '30000-sites_logreg_results' is a fixed literal and does not
# follow site_group -- confirm this is intended before pointing the notebook at another site set.
in_file = f'../../delfi_data_cancer_detection/number_of_sites_analysis/merged_data/{site_group}_reformatted.txt'
cval_file = f'../../delfi_data_cancer_detection/number_of_sites_analysis/logreg_PCA_results/30000-sites_logreg_results/{site_group}.c_values.txt'

# Load the tab-separated table, index it by the 'sample' column and order the rows by that index.
data = pd.read_csv(in_file, sep='\t').set_index('sample').sort_index()

# A column counts as a feature when its name starts with one of these prefixes; all other columns are excluded.
feature_prefixes = ['central_cov', 'mean_cov', 'amplitude']
is_feature = np.zeros(len(data.columns), dtype=bool)
for prefix in feature_prefixes:
    is_feature = is_feature | data.columns.str.startswith(prefix)
features = data.columns[is_feature]

print('Features',len(features))
print('Total samples:',len(data))

# Standardize every feature column in place (StandardScaler defaults: centre to zero mean, scale to unit variance).
# fit_transform learns the per-column statistics and applies them in one call, which is equivalent to
# calling fit() and then transform() on the same frame.
# A bare `data[features].mean()` used to follow here; as a mid-cell expression its value was
# discarded (never displayed or stored), so it has been removed.
scaler = StandardScaler()
data[features] = scaler.fit_transform(data[features])

# Fit a PCA with as many components as the data allows: min(number of features, number of samples).
# The randomized SVD solver is given a fixed random_state so the decomposition is repeatable.
n_components = min(len(features), len(data))
pca = PCA(
    n_components=n_components,
    svd_solver='randomized',
    random_state=100,
)
PCs = pca.fit_transform(data[features])

# One column per component, named PC_0 ... PC_<n_components - 1>, sharing the row index of `data`.
pc_names = [f'PC_{m}' for m in range(n_components)]
principal_components = pd.DataFrame(PCs, index=data.index, columns=pc_names)

# Find the smallest number of leading principal components whose explained variance sums to at least 80%.
# j is a component COUNT (the slice [:j] holds the first j ratios), so the range runs up to and
# including len(explained_variance_ratio_). Stopping at len - 1 would never test the complete set:
# if only all components together reach the threshold, the loop would fall through with one
# component too few and silently select a set explaining less than 80% of the variance.
variance_threshold = 0.8
for j in range(len(pca.explained_variance_ratio_) + 1):
    current_sum = pca.explained_variance_ratio_[:j].sum()
    if current_sum >= variance_threshold:
        break

print('number of components:',j)
# Names of the selected components: PC_0 ... PC_<j - 1>.
pca_features = ['PC_'+str(m) for m in np.arange(0,j)]

# Replace the feature table with the three metadata columns plus every principal component,
# matched row-by-row on the shared sample index.
metadata_columns = ['status', 'Stage', 'tumor_fraction']
data = pd.merge(
    data[metadata_columns],
    principal_components,
    left_index=True,
    right_index=True,
)

# Read the header-less, tab-separated table of C values and take the most frequent entry:
# the first row / first column of DataFrame.mode().
cvals = pd.read_csv(cval_file, sep='\t', header=None)
c_modes = cvals.mode().to_numpy()
best_c = c_modes[0][0]
print('best_c',best_c)

# Fit a class-balanced logistic regression on the selected principal components, using the chosen C.
model = LogisticRegression(C=best_c, class_weight='balanced', max_iter=500)
training_matrix = data[pca_features]
model.fit(training_matrix, data['status'])

# Score the very same samples the model was just fitted on (no held-out data is involved here),
# so the accuracy printed below is a training accuracy.
pred = model.predict(training_matrix)
prob = model.predict_proba(training_matrix)

# Store the predicted label and the probability from the second predict_proba column.
data['pred'] = pred
data['prob'] = prob[:, 1]

n_correct = (data['status'] == data['pred']).sum()
print('training accuracy',n_correct/len(data))


30000-sites
Features 810
Total samples: 423
number of components: 14
best_c 0.1
training accuracy 0.9054373522458629


In [4]:
# Table of PCA loadings: one row per component (PC_0 ... PC_<n_components - 1>), one column per original feature.
component_names = [f'PC_{m}' for m in range(n_components)]
pca_component_contributions = pd.DataFrame(
    pca.components_,
    index=component_names,
    columns=features,
)

In [5]:
# Logistic-regression coefficient of each selected component, plus its magnitude for ranking.
model_coefs = pd.DataFrame({'coef': model.coef_[0]}, index=pca_features)
model_coefs['abs_val'] = model_coefs['coef'].abs()

In [6]:
# Rank the components by absolute coefficient (largest first), remember the top one,
# and display the five strongest.
ranked_coefs = model_coefs.sort_values(by='abs_val', ascending=False)
top_index = ranked_coefs.index[0]
ranked_coefs.head(5)


Unnamed: 0,coef,abs_val
PC_9,-0.634571,0.634571
PC_6,0.506097,0.506097
PC_8,0.42154,0.42154
PC_11,0.267721,0.267721
PC_1,0.257018,0.257018


In [7]:
# Loadings of the top-ranked component: one row per original feature, sorted ascending by loading.
top_component_row = pca_component_contributions.loc[[top_index]]
top_component_values = top_component_row.T.sort_values(by=top_index)

In [8]:
# Write the sorted loadings of the top component to a tab-separated file.
contributions_path = 'files/S7_top_DELFI_coef_PCA_contributions.tsv'
top_component_values.to_csv(contributions_path, sep='\t')

In [9]:
# Display the per-feature loadings of the top component (rows are sorted ascending by loading).
top_component_values

Unnamed: 0,PC_9
mean_coverage_ZNF384.hg38.30000,-0.143468
central_coverage_ZNF384.hg38.30000,-0.132288
central_coverage_ATF4.hg38.30000,-0.119196
central_coverage_ATF7.hg38.30000,-0.097499
amplitude_MAF.hg38.30000,-0.096553
...,...
central_coverage_GLIS1.hg38.30000,0.109664
mean_coverage_XBP1.hg38.30000,0.121813
mean_coverage_GLIS3.hg38.30000,0.141731
amplitude_PHOX2B.hg38.30000,0.175269
