In [1]:
import numpy as np
import pandas as pd
import time
from matplotlib import pyplot as plt
%matplotlib inline

# from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedKFold
# from sklearn.model_selection import LeaveOneOut
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve,auc

from sklearn.decomposition import PCA


In [2]:
#path to your data here

#must be a tab-separated text file with a column named 'sample' containing the unique sample name, and one column for each of the 810 features output by Griffin for the TFBS 30,000 site analysis
#additional columns are optional and will be retained in the output but not used for anything else

# example features:
# central_coverage_AHR.hg38.30000
# mean_coverage_AHR.hg38.30000
# amplitude_AHR.hg38.30000

#test_data_file: input table of the samples to classify
#outfile_name: tab-separated file that the results are written to
test_data_file = '../../lung_validation_cancer_detection/number_of_sites_analysis/merged_data/30000-sites_validation_reformatted.txt'
outfile_name = 'cancer_detection_results.tsv'


In [3]:
#paths to the training inputs (the files themselves are read in later cells)
#in_file: tab-separated training data table
#cval_file: tab-separated file of C values; its most frequent value is used as the logistic regression C
in_file = '../../lung_validation_cancer_detection/number_of_sites_analysis/merged_data/30000-sites_LUCAS_reformatted.txt'
cval_file = '../../lung_validation_cancer_detection/number_of_sites_analysis/logreg_PCA_results/30000-sites_logreg_results/30000-sites.c_values.txt'


In [4]:
#load the training table, indexed by sample name and ordered by that index
data = pd.read_csv(in_file, sep='\t').set_index('sample').sort_index()

#select the Griffin feature columns by prefix; every other column is treated as metadata
column_names = data.columns
is_griffin_feature = (
    column_names.str.startswith('central_cov')
    | column_names.str.startswith('mean_cov')
    | column_names.str.startswith('amplitude')
)
griffin_features = column_names[is_griffin_feature]

print('Features',len(griffin_features))
print('Total samples:',len(data))

#standardize every feature with statistics learned from the training samples only
scaler = StandardScaler().fit(data[griffin_features])
data[griffin_features] = scaler.transform(data[griffin_features])

#show the per-feature means after scaling as a sanity check
data[griffin_features].mean()

Features 810
Total samples: 287


central_coverage_AHR.hg38.30000     -1.708273e-15
central_coverage_AR.hg38.30000      -8.705386e-15
central_coverage_ARNT.hg38.30000     3.682691e-15
central_coverage_ARNTL.hg38.30000    5.567362e-15
central_coverage_ASCL1.hg38.30000   -4.951517e-15
                                         ...     
mean_coverage_ZNF467.hg38.30000      3.922840e-14
mean_coverage_ZNF554.hg38.30000     -4.439035e-14
mean_coverage_ZNF580.hg38.30000     -4.097381e-15
mean_coverage_ZNF770.hg38.30000      4.622551e-14
mean_coverage_ZSCAN16.hg38.30000     4.857129e-15
Length: 810, dtype: float64

In [5]:
#import test data
test_data = pd.read_csv(test_data_file, sep='\t')
test_data = test_data.set_index('sample')

test_data = test_data.sort_index()

#scale data with the scaler that was fit on the training set (it is deliberately not refit here)
test_data[griffin_features] = scaler.transform(test_data[griffin_features])

print(len(test_data))

#'status' is an optional column in user-supplied data, so only summarize it when it is present
#(the expression evaluates to None when it is absent, which the notebook does not display)
test_data['status'].value_counts() if 'status' in test_data.columns else None

431


0    385
1     46
Name: status, dtype: int64

In [6]:
#work on a copy of the loaded test table
#non-feature columns are kept so that they are carried through to the output file, as the input instructions promise;
#they are never passed to the model, and the feature columns are dropped later, before results are written
test_data = test_data.copy()

In [7]:
#perform PCA on the training set
#the number of components is capped by both the number of features and the number of samples
n_components = min(len(griffin_features), len(data))
pca = PCA(n_components=n_components, svd_solver='randomized', random_state = 100)
PCs = pca.fit_transform(data[griffin_features])
principal_components = pd.DataFrame(data = PCs, columns = ['PC_'+str(m) for m in np.arange(n_components)], index = data.index)

#find the smallest number of leading principal components (j) that make up at least 80% of the variance
#the range runs through len(...) inclusive so that j can equal the full component count when every component is needed;
#stopping at len(...)-1 would silently select one component too few in that case
for j in range(len(pca.explained_variance_ratio_) + 1):
    current_sum = pca.explained_variance_ratio_[:j].sum()
    if current_sum>=0.8:
        break

print('number of components:',j)
pca_features = ['PC_'+str(m) for m in np.arange(0,j)]
#replace the raw feature columns with the selected principal components; non-feature columns (e.g. 'status') are kept
data = data.drop(columns = griffin_features).merge(principal_components[pca_features], left_index = True, right_index = True)

#import cvalue: the file has no header row; the most frequent value in its first column is used as C
cvals = pd.read_csv(cval_file,sep='\t', header = None)
best_c = cvals.mode().values[0][0]
print('best_c',best_c)

#train a model on the full training dataset
model = LogisticRegression(class_weight='balanced', max_iter=500, C=best_c)
model.fit(data[pca_features], data['status'])

#predict the training data (these are predictions on the samples the model was fit on, not held-out samples)
pred = model.predict(data[pca_features])
prob = model.predict_proba(data[pca_features])

data['pred']= pred
#second column of predict_proba, i.e. the class listed second in model.classes_
data['prob'] = prob[:,1]

print('training accuracy',sum(data['status'] == data['pred'])/len(data))

number of components: 35
best_c 0.01
training accuracy 0.7804878048780488


In [8]:
#project the scaled test features onto the principal components learned from the training set
pc_names = ['PC_{}'.format(m) for m in range(n_components)]
test_PCs = pca.transform(test_data[griffin_features])
test_principal_components = pd.DataFrame(test_PCs, index=test_data.index, columns=pc_names)

#swap the raw feature columns for the selected principal components
test_without_features = test_data.drop(columns=griffin_features)
selected_test_PCs = test_principal_components[pca_features]
test_data = test_without_features.merge(selected_test_PCs, left_index=True, right_index=True)

#classify every test sample; the stored probability is the second column of predict_proba
pred = model.predict(test_data[pca_features])
prob = model.predict_proba(test_data[pca_features])
test_data = test_data.assign(prediction=pred, probability=prob[:, 1])

In [9]:
#the principal component columns were only needed for prediction, so remove them before saving
test_data = test_data.drop(pca_features, axis='columns')

#write the remaining columns (indexed by sample) as a tab-separated file
test_data.to_csv(path_or_buf=outfile_name, sep='\t')


In [10]:
# #output model in pickle format in case that is helpful to future users

# import pickle
# with open('cancer_detection_scaler.pkl', 'wb') as f:
#     pickle.dump(scaler, f)

# with open('cancer_detection_PCA.pkl', 'wb') as f:
#     pickle.dump(pca, f)
    
# with open('cancer_detection_model.pkl', 'wb') as f:
#     pickle.dump(model, f)
    
