In [1]:
import numpy as np
import pandas as pd
import time
from matplotlib import pyplot as plt
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve,auc


In [2]:
#set the paths for your own data here

#test_data_file must be a tab separated text file containing:
#  - a column named 'sample' holding the unique sample name
#  - one column for each of the 12 features output by Griffin for the ATAC analysis (listed below)
#any additional columns are optional; they are retained in the output but not used for anything else

#features:
# central_coverage_ER_neg_heme.5e-4_qval
# mean_coverage_ER_neg_heme.5e-4_qval
# amplitude_ER_neg_heme.5e-4_qval
# central_coverage_ER_pos_heme.5e-4_qval
# mean_coverage_ER_pos_heme.5e-4_qval
# amplitude_ER_pos_heme.5e-4_qval
# central_coverage_ER_pos_specific.5e-4_qval
# mean_coverage_ER_pos_specific.5e-4_qval
# amplitude_ER_pos_specific.5e-4_qval
# central_coverage_ER_neg_specific.5e-4_qval
# mean_coverage_ER_neg_specific.5e-4_qval
# amplitude_ER_neg_specific.5e-4_qval

#input: samples to classify
test_data_file = "../../MBC_validation/validation_analysis/merged_data/5e-4_qval_validation_reformatted.txt"
#output: tab separated table of predictions
outfile_name = "ER_status_results.tsv"

In [3]:
#paths to the training inputs (nothing is loaded in this cell)
#in_file: training feature table
in_file = "../../MBC/ATAC_nucleosome_profiling/analysis/merged_data/5e-4_qval_reformatted.txt"
#cval_file: table of C values; the most frequent one is used to train the final model
cval_file = "../../MBC/ATAC_nucleosome_profiling/analysis/logreg_results/5e-4_qval_logreg_results/5e-4_qval.c_values.txt"

In [4]:
#import training data, indexed by the unique sample name
data = pd.read_csv(in_file, sep='\t')
data = data.set_index('sample')

#get features and exclude all other columns
#(feature columns are recognized purely by their name prefix)
features = data.columns[(data.columns.str.startswith('central_cov')) | (data.columns.str.startswith('mean_cov')) | (data.columns.str.startswith('amplitude'))]
# features = data.columns[(data.columns.str.startswith('amplitude'))]

print('Features',len(features))

data = data.sort_index()
status_col = 'revisions_ER_status_binary'

#encode the labels as numbers: '+' -> 1, '-' -> 0. Any other value is passed
#through unchanged (as the chained replace() calls did) and then has to be
#numeric, otherwise pd.to_numeric raises a ValueError here.
#The encoding is explicit because replace() on an object column only produced
#an integer column through implicit downcasting, which pandas has deprecated;
#an object-dtype label column is not accepted by the classifier.
status_codes = {'+': 1, '-': 0}
data['status'] = pd.to_numeric(data[status_col].map(lambda label: status_codes.get(label, label)))

print('Total samples:',len(data))

#scale data
#the scaler is fit on the training data only and is reused unchanged for the test data
scaler = StandardScaler()
data[features] = scaler.fit_transform(data[features])
#sanity check: every scaled training feature should have a mean of ~0
data[features].mean()


Features 12
Total samples: 254


central_coverage_ER_neg_heme.5e-4_qval       -1.423184e-15
mean_coverage_ER_neg_heme.5e-4_qval           6.273197e-15
amplitude_ER_neg_heme.5e-4_qval              -3.601668e-16
central_coverage_ER_pos_heme.5e-4_qval        1.328771e-15
mean_coverage_ER_pos_heme.5e-4_qval           8.437695e-15
amplitude_ER_pos_heme.5e-4_qval               3.846442e-17
central_coverage_ER_pos_specific.5e-4_qval   -1.307790e-15
mean_coverage_ER_pos_specific.5e-4_qval      -8.946474e-15
amplitude_ER_pos_specific.5e-4_qval           5.594825e-17
central_coverage_ER_neg_specific.5e-4_qval    1.531583e-15
mean_coverage_ER_neg_specific.5e-4_qval       3.601668e-15
amplitude_ER_neg_specific.5e-4_qval           1.328771e-16
dtype: float64

In [5]:
#load the samples to classify, keyed and ordered by sample name
test_data = pd.read_csv(test_data_file, sep='\t').set_index('sample').sort_index()
print('test samples',len(test_data))

#standardize with the scaler that was fit on the training data (it is not refit here),
#so the feature means displayed below are not expected to be exactly zero
test_data[features] = scaler.transform(test_data[features])
test_data[features].mean()

test samples 144


central_coverage_ER_neg_heme.5e-4_qval        0.369014
mean_coverage_ER_neg_heme.5e-4_qval           0.344394
amplitude_ER_neg_heme.5e-4_qval               0.082355
central_coverage_ER_pos_heme.5e-4_qval        0.294134
mean_coverage_ER_pos_heme.5e-4_qval           0.400497
amplitude_ER_pos_heme.5e-4_qval              -0.239409
central_coverage_ER_pos_specific.5e-4_qval    0.309752
mean_coverage_ER_pos_specific.5e-4_qval       0.415392
amplitude_ER_pos_specific.5e-4_qval          -0.395851
central_coverage_ER_neg_specific.5e-4_qval    0.418454
mean_coverage_ER_neg_specific.5e-4_qval       0.387354
amplitude_ER_neg_specific.5e-4_qval          -0.522310
dtype: float64

In [6]:
#read the table of C values (no header row) and use the most frequent one;
#if several values tie, the first one returned by mode() is taken
cvals = pd.read_csv(cval_file, sep='\t', header=None)
best_c = cvals.mode().to_numpy()[0, 0]
print('best_c', best_c)

best_c 0.1


In [7]:
#fit the final classifier on the complete training set using the selected C
model = LogisticRegression(C=best_c, class_weight='balanced', max_iter=500)
model.fit(data[features], data['status'])

#predict the training data itself (these are the same samples the model was fit on,
#so the accuracy below is a training accuracy, not a held-out estimate)
pred = model.predict(data[features])
prob = model.predict_proba(data[features])

data['pred'] = pred
#second predict_proba column; presumably the probability of status == 1 ('+'), assuming the labels are 0/1
data['prob'] = prob[:, 1]

#fraction of training samples whose predicted status matches the true status
print('training accuracy', np.round((data['status'] == data['pred']).mean(), 3))

training accuracy 0.858


In [8]:
#apply the trained model to the (already scaled) test samples
test_features = test_data[features]
pred = model.predict(test_features)
prob = model.predict_proba(test_features)

#store the predicted class and the second predict_proba column next to each sample
test_data['prediction'] = pred
test_data['probability'] = prob[:, 1]


In [9]:
#drop the scaled feature columns so the output holds only the original extra
#columns plus 'prediction' and 'probability', then write it as a tab separated file.
#errors='ignore' keeps this cell re-runnable: on a second run the feature columns
#are already gone and drop() would otherwise raise a KeyError.
test_data = test_data.drop(columns=features, errors='ignore')
test_data.to_csv(outfile_name, sep='\t')


In [10]:
# #output model in pickle format in case that is helpful to future users
# import pickle
# with open('ER_status_scaler.pkl', 'wb') as f:
#     pickle.dump(scaler, f)
    
# with open('ER_status_model.pkl', 'wb') as f:
#     pickle.dump(model, f)