In [130]:
# Cohort codes kept for reference; not used by any cell visible in this notebook.
# NOTE(review): presumably the cohorts whose set_size == 10 runs are regenerated
# into rerun_10/ and patched back into the combined table — TODO confirm.
patch10 = ['BRCA', 'KIRCKICH', 'KIRP', 'LUAD', 'PAAD', 'PRAD', 'TGCT', 'THCA']

In [46]:
# Final format, sample count script
import time
start_time = time.time()

from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import f1_score
import pandas as pd
import math
import statistics

import argparse
# Command-line interface used when this cell is exported as a batch script.
# Parsing is disabled while developing in the notebook (see the commented lines).
parser = argparse.ArgumentParser(description = 'Cancer and sample size input')
parser.add_argument('cancer_name', type = str, help='Input cancer name')
parser.add_argument('sample_size', type = int, help='Input sample size')
# args = parser.parse_args() # * Turn off for devel

date = '2021-06-18' # * script created

set_size = 10 # list(range(10,101,10))
# set_size = args.sample_size # * devel off, e.g. sbatch array_250.sh call w/ 10-250:10 range

# Best-model table; the row of the selected cohort drives everything below.
three_col = pd.read_csv('../sk_grid_best_models.tsv', sep = '\t')

# cancer_row = three_col[three_col.Cancer == args.cancer_name] # * off for devel
cancer_row = three_col[three_col['Cancer'] == 'THCA']
idx = cancer_row.index[0]

# Process UCSC best model file format: fields 1-3 of the underscore-separated
# model name (third column) identify the feature file.
model_fields = three_col.iloc[idx,2].split('_')
feat_read = '_'.join([model_fields[1], model_fields[2], model_fields[3]])
feat_shard = pd.read_csv('../features/'+feat_read+'.tsv',sep='\t')

# The first column names the cohort, which in turn names the cross-validation fold file.
cohort = three_col.iloc[idx,0]
rpt_fld_file = pd.read_csv('../cross_val/'+cohort+'_CVfolds_5FOLD_v12_20210228.tsv',
                           sep='\t')
# Hard-coded classifier configurations (26 entries), each wrapped in a one-element list.
# Entries are selected by position with `idx`, the row index of the chosen cohort in
# `three_col`, so the order here presumably has to mirror the row order of
# sk_grid_best_models.tsv — TODO confirm before reordering or editing either side.
clf_list = [
    [SGDClassifier(alpha=0.001,loss='perceptron',penalty='l1')],
    [SVC(C=0.2,kernel='linear')],
    [SVC(C=1,kernel='linear')],
    [SVC(C=2,kernel='linear')],
    [SVC(C=1.5,kernel='linear')],
    [SVC(C=1,kernel='linear')],
    [SVC(C=1.5,kernel='rbf')],
    [SVC(C=2,kernel='linear')],
    [SGDClassifier(alpha=0.0001,loss='squared_hinge',penalty='l1')],
    [SVC(C=0.2,kernel='linear')],
    [SVC(C=2,kernel='linear')],
    [SVC(C=0.5,kernel='linear')],
    [SVC(C=0.2,kernel='linear')],
    [SVC(C=1.5,kernel='linear')],
    [SVC(C=0.2,kernel='linear')],
    [SVC(C=0.2,kernel='linear')],
    [PassiveAggressiveClassifier(C=0.8)],
    [PassiveAggressiveClassifier(C=1.0)],
    [PassiveAggressiveClassifier(C=0.4)],
    [SVC(C=0.2,kernel='linear')],
    [SGDClassifier(alpha=0.1,loss='hinge',penalty='l2')],
    [SVC(C=2,kernel='poly')],
    [SVC(C=1.5,kernel='rbf')],
    [SVC(C=1.5,kernel='linear')],
    [SVC(C=0.2,kernel='rbf')],
    [SVC(C=0.2,kernel='linear')],
]

# The trailing [0] unwraps the one-element list holding the estimator.
clf = clf_list[idx][0] # Initialize classifier for cohort

def extrct_nms(shrd_pth):
    """Split a feature-file path into its cancer code and method/data-type tag.

    Only the final '/'-separated component of `shrd_pth` is inspected. The
    cancer code is that component's text before the first underscore; the tag
    joins the second and third underscore-separated fields of the component
    with everything from its first '.' onward removed.

    Returns a tuple ``(cncr, mthd_dtyp)``. Raises IndexError when the file
    name has fewer than three underscore-separated fields.
    """
    file_name = shrd_pth.rsplit('/', 1)[-1]
    cancer_code = file_name.split('_')[0]
    stem_fields = file_name.partition('.')[0].split('_')
    method_and_dtype = '_'.join([stem_fields[1], stem_fields[2]])
    return (cancer_code, method_and_dtype)

(cncr,mthd_dtyp) = extrct_nms('./features/'+feat_read+'.tsv') # get column name components

rate_scores = [] # Storage list for scores at different sample counts
errors = []
sample_counts = []

# sample_counts.append(set_size) # arg parse from list in shell file
hdr_lst=['Sample_ID', 'Repeat', 'Fold', 'Test', 'Label']
# Column that will hold this classifier's predictions ("the model" name).
unique_name=str(clf)+'|'+mthd_dtyp+'|'+date+'|c'
# list.append() mutates in place and returns None, so its return value must not
# be used as the header; copy the extended list instead.
hdr_lst.append(unique_name)
results_frame_header = list(hdr_lst)
results_storeDF=pd.DataFrame(columns=results_frame_header) # Empty data frame with columns for this classifier

# for set_size in set_list: # * Devel loop, replaced with array arg from shell script
# Nothing to subsample when the requested set is not smaller than the cohort.
# SystemExit is raised directly (exit status 0, as before): exit()/quit() are
# interactive helpers installed by the site module and are not meant for scripts.
if set_size >= len(feat_shard):
    raise SystemExit(0)

def _move_last_train_row_to_test(train_df, test_df):
    """Move the last training row into the test set so the test set is not empty.

    Both frames must carry the 'rpt_fld' column (0 = train, 1 = test). The
    inputs are left unmodified; new (train, test) frames are returned.
    """
    moved_row = pd.DataFrame(train_df[-1:].values, index=[0], columns=train_df.columns)
    moved_row.loc[0, 'rpt_fld'] = 1 # re-flag the moved sample as a test sample
    train_df = train_df.drop(train_df.tail(1).index)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    return train_df, pd.concat([test_df, moved_row])


loop_start_time = time.time()
for _ in range(100):
    sample_counts.append(set_size)

    # NOTE(review): no random_state is passed, so the draws differ between runs.
    shard_subset = feat_shard.sample(set_size)

    rpt_scores = [] # per-fold scores, used for the error estimate
    for rx_fx in rpt_fld_file.iloc[:,2:12]:

        feat_shard['rpt_fld'] = rpt_fld_file[rx_fx] #label molecular shard for train/test split, will overwrite
        labeled_subset = feat_shard.iloc[shard_subset.index,:]

        trainDF = labeled_subset[labeled_subset['rpt_fld']==0]
        testDF = labeled_subset[labeled_subset['rpt_fld']==1]
        if len(testDF) == 0: # Error one, no samples in test set
            trainDF, testDF = _move_last_train_row_to_test(trainDF, testDF)

        while trainDF.Labels.nunique() == 1: # Error two, only one subtype in training set
            # The new draw also replaces shard_subset for the remaining folds.
            shard_subset = feat_shard.sample(set_size)
            trainDF = shard_subset[shard_subset['rpt_fld']==0]
            testDF = shard_subset[shard_subset['rpt_fld']==1]

        if len(testDF) == 0: # Error 1.2, no samples in test again after resampling
            trainDF, testDF = _move_last_train_row_to_test(trainDF, testDF)

        # Moving a row out of the training set can leave a single subtype behind.
        if trainDF.Labels.nunique() == 1:
            print('continue')
            continue

        X_trn=trainDF.drop(columns=[cncr,'Labels','rpt_fld']) # raw feature columns remain
        x_tst=testDF.drop(columns=[cncr,'Labels','rpt_fld'])
        y_trn=trainDF['Labels'].str.split('_',expand=True)[1].astype(int) # Strip TCGA code off number
        y_tst=testDF['Labels'].str.split('_',expand=True)[1].astype(int)

        repeat_fold = rx_fx.split(':') # fold column names look like '<repeat>:<fold>'

        # Build storage frames with sample info; the prediction column is attached below.
        rsltsTRN=pd.DataFrame(data={'Sample_ID':trainDF[cncr],
                                    'Repeat':repeat_fold[0],
                                    'Fold':repeat_fold[1],
                                    'Test':trainDF.rpt_fld,
                                    'Label':trainDF.Labels})
        rsltsTRN.reset_index(inplace=True,drop=True)

        rsltsTST=pd.DataFrame(data={'Sample_ID':testDF[cncr],
                                    'Repeat':repeat_fold[0],
                                    'Fold':repeat_fold[1],
                                    'Test':testDF.rpt_fld,
                                    'Label':testDF.Labels})
        rsltsTST.reset_index(inplace=True,drop=True)

        clf.fit(X_trn, y_trn)

        y_prd_trn = clf.predict(X_trn) # Predict on training sample feature values
        y_prd_tst = clf.predict(x_tst) # Predict on testing sample feature values

        rpt_scores.append(f1_score(y_tst,y_prd_tst,average='weighted')) # Error bars data

        # Reattach the cancer code to the predicted subtype numbers. The whole
        # number is kept: taking only the first character would collapse
        # subtypes >= 10 onto their leading digit.
        rsltsTRN[unique_name]=cncr+'_'+pd.Series(y_prd_trn.astype(str))
        rsltsTST[unique_name]=cncr+'_'+pd.Series(y_prd_tst.astype(str))

        results_storeDF=pd.concat([results_storeDF,rsltsTRN,rsltsTST]) # Stack up the prediction results for this repeat fold

    # statistics.stdev needs at least two fold scores and raises StatisticsError otherwise.
    errors.append(statistics.stdev(rpt_scores))

    # results_storeDF is never cleared, so this F1 is cumulative over every
    # iteration so far; the last entry of rate_scores covers all iterations.
    test_set=results_storeDF[results_storeDF.Test==1]
    y_true=[int(label.split('_')[1]) for label in test_set.Label]
    y_pred=[int(label.split('_')[1]) for label in test_set[unique_name]]

    score=f1_score(y_true,y_pred,average='weighted')
    rate_scores.append(round(score, 3))

row_df = pd.DataFrame([[cncr, sample_counts, rate_scores, errors]], columns=['Cancer', 'Sample_counts', 'Rate_scores', 'Error'])
# write to out_file
loop_end_time = time.time()
file_time =  loop_end_time - loop_start_time
row_df.to_csv('rerun_10/'+ # write results file for this classfier, cohort, sample step combo, date
             cncr+'_'+date+'_runtime:'+str(round(file_time))+'_set.'+str(set_size)+'_reimport.tsv',
                      index=None, sep='\t')

In [10]:
import glob

In [2]:
import pandas as pd

### Data files to upload to exacloud
results/2021-06-03_archive  
`results/2021-06-03_big_1` ("big1") and `results/substep`  
LIHCCHOL

In [None]:
# Sample count experiment results integration

In [None]:
# Per variance_check.ipynb

In [32]:
import statistics

In [88]:
# Put all the small cancer result files into a table
import ast  # literal_eval parses the stringified lists without executing file contents

# NOTE(review): absolute, user-specific path; the other cells read from
# '../smoothing/results/...'. Confirm the working directory before making it relative.
small_cohort_paths = sorted(glob.glob('/Users/karlberb/work/tmp/v12/sample_size_experiment/smoothing/results/2021-06-03_archive/*.tsv'))
result_cols = ['Cancer', 'Sample_counts', 'Rate_scores', 'Error']

# Read every per-step result file and stack them once (no concat inside the loop).
small_frames = [pd.read_csv(file, sep = '\t') for file in small_cohort_paths]
table = pd.concat(small_frames) if small_frames else pd.DataFrame(columns=result_cols)
table.reset_index(inplace = True, drop = True)

# Order each cohort's rows by sample count (first element of the stored list).
cnt_ordr = [ast.literal_eval(count_lst)[0] for count_lst in table.Sample_counts]
table['Ordr'] = cnt_ordr
table = table.sort_values(['Cancer', 'Ordr'])
table.reset_index(inplace = True, drop = True)

# Collapse each cohort to one row: per step keep the sample count, the last
# rate score of the run, and the mean of the run's per-iteration errors.
cohort_rows = []
for cncr in table.Cancer.unique():
    block = table[table['Cancer'] == cncr]
    samp_counts = []
    rate_scores = []
    error = []
    for counts_str, scores_str, errors_str in zip(block.Sample_counts, block.Rate_scores, block.Error):
        samp_counts.append(ast.literal_eval(counts_str)[-1])
        rate_scores.append(ast.literal_eval(scores_str)[-1])
        error.append(round(statistics.mean(ast.literal_eval(errors_str)),3)) # Might need to average this
    cohort_rows.append([cncr, samp_counts, rate_scores, error])
small_cohorts = pd.DataFrame(cohort_rows, columns=result_cols)

In [89]:
small_cohorts

Unnamed: 0,Cancer,Sample_counts,Rate_scores,Error
0,ACC,"[10, 20, 30, 40, 50, 60, 70]","[0.616, 0.702, 0.751, 0.789, 0.81, 0.827, 0.836]","[0.36, 0.246, 0.189, 0.146, 0.124, 0.102, 0.088]"
1,CESC,"[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110,...","[0.711, 0.801, 0.847, 0.864, 0.877, 0.885, 0.8...","[0.327, 0.215, 0.155, 0.135, 0.105, 0.101, 0.0..."
2,ESCC,"[10, 20, 30, 40, 50, 60, 70, 80]","[0.734, 0.856, 0.892, 0.923, 0.934, 0.948, 0.9...","[0.318, 0.187, 0.121, 0.089, 0.075, 0.057, 0.0..."
3,GEA,"[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110,...","[0.458, 0.535, 0.609, 0.626, 0.657, 0.685, 0.6...","[0.332, 0.276, 0.223, 0.19, 0.166, 0.138, 0.12..."
4,KIRCKICH,"[20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120...","[0.903, 0.917, 0.916, 0.934, 0.94, 0.935, 0.94...","[0.148, 0.114, 0.106, 0.081, 0.073, 0.076, 0.0..."
5,KIRP,"[20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120...","[0.706, 0.731, 0.761, 0.766, 0.787, 0.796, 0.8...","[0.241, 0.186, 0.15, 0.138, 0.125, 0.115, 0.10..."
6,LGGGBM,"[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110,...","[0.567, 0.672, 0.689, 0.723, 0.745, 0.767, 0.7...","[0.343, 0.249, 0.206, 0.165, 0.142, 0.124, 0.1..."
7,LUAD,"[20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120...","[0.552, 0.601, 0.654, 0.677, 0.68, 0.694, 0.70...","[0.268, 0.209, 0.179, 0.158, 0.138, 0.126, 0.1..."
8,MESO,"[10, 20, 30, 40, 50, 60, 70]","[0.565, 0.7, 0.742, 0.777, 0.81, 0.833, 0.849]","[0.345, 0.244, 0.182, 0.144, 0.117, 0.095, 0.08]"
9,OV,"[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110,...","[0.532, 0.598, 0.659, 0.701, 0.73, 0.753, 0.76...","[0.357, 0.273, 0.203, 0.169, 0.151, 0.136, 0.1..."


In [None]:
# Big cohorts

In [86]:
import ast  # literal_eval parses the stringified lists without executing file contents

big_6 = sorted(glob.glob('../smoothing/results/2021-06-03_big_1/*.tsv'))
result_cols = ['Cancer', 'Sample_counts', 'Rate_scores', 'Error']

# Read every per-step result file and stack them once (no concat inside the loop).
big_frames = [pd.read_csv(tsv, sep = '\t') for tsv in big_6]
table = pd.concat(big_frames) if big_frames else pd.DataFrame(columns=result_cols)
table.reset_index(inplace = True, drop = True)

# Order each cohort's rows by sample count (first element of the stored list).
cnt_ordr = [ast.literal_eval(count_lst)[0] for count_lst in table.Sample_counts]
table['Ordr'] = cnt_ordr
table = table.sort_values(['Cancer', 'Ordr'])
table.reset_index(inplace = True, drop = True)

# Collapse each cohort to one row: per step keep the sample count, the last
# rate score of the run, and the mean of the run's per-iteration errors.
cohort_rows = []
for cncr in table.Cancer.unique():
    block = table[table['Cancer'] == cncr]
    samp_counts = []
    rate_scores = []
    error = []
    for counts_str, scores_str, errors_str in zip(block.Sample_counts, block.Rate_scores, block.Error):
        samp_counts.append(ast.literal_eval(counts_str)[-1])
        rate_scores.append(ast.literal_eval(scores_str)[-1])
        error.append(round(statistics.mean(ast.literal_eval(errors_str)),3))
    cohort_rows.append([cncr, samp_counts, rate_scores, error])
big_cohorts = pd.DataFrame(cohort_rows, columns=result_cols)

In [87]:
big_cohorts

Unnamed: 0,Cancer,Sample_counts,Rate_scores,Error
0,BLCA,"[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110,...","[0.454, 0.571, 0.632, 0.66, 0.687, 0.71, 0.727...","[0.358, 0.266, 0.217, 0.174, 0.151, 0.134, 0.1..."
1,BRCA,"[20, 30, 40, 50, 60, 70, 80, 90, 100]","[0.613, 0.681, 0.715, 0.744, 0.768, 0.771, 0.7...","[0.27, 0.214, 0.175, 0.146, 0.129, 0.114, 0.10..."
2,COADREAD,"[10, 20, 30, 40]","[0.375, 0.505, 0.588, 0.63]","[0.331, 0.271, 0.213, 0.18]"
3,HNSC,"[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110]","[0.326, 0.427, 0.48, 0.533, 0.58, 0.603, 0.629...","[0.318, 0.273, 0.217, 0.189, 0.173, 0.152, 0.1..."
4,LIHCCHOL,"[10, 20, 30, 40, 50, 60]","[0.381, 0.501, 0.589, 0.625, 0.665, 0.689]","[0.348, 0.279, 0.217, 0.179, 0.166, 0.143]"
5,LUSC,"[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110,...","[0.431, 0.499, 0.555, 0.6, 0.621, 0.644, 0.653...","[0.336, 0.265, 0.209, 0.182, 0.159, 0.143, 0.1..."
6,SARC,"[10, 20, 30, 40, 50]","[0.769, 0.85, 0.876, 0.886, 0.904]","[0.278, 0.173, 0.135, 0.115, 0.094]"


In [97]:
import ast  # literal_eval parses the stringified lists without executing file contents

subs = sorted(glob.glob('../smoothing/results/substep/*.tsv'))
result_cols = ['Cancer', 'Sample_counts', 'Rate_scores', 'Error']

# Read every result file and stack them once (no concat inside the loop).
sub_frames = [pd.read_csv(tsv, sep = '\t') for tsv in subs]
subtab = pd.concat(sub_frames) if sub_frames else pd.DataFrame(columns=result_cols)
subtab.reset_index(inplace = True, drop = True)

# Order each cohort's rows by sample count (first element of the stored list).
cnt_ordr = [ast.literal_eval(count_lst)[0] for count_lst in subtab.Sample_counts]
subtab['Ordr'] = cnt_ordr
subtab = subtab.sort_values(['Cancer', 'Ordr'])
subtab.reset_index(inplace = True, drop = True)

# A sample count may appear in several files here, so each step is averaged:
# the mean of the files' last rate scores and the mean of their mean errors.
upper_rows = []
for cncr in subtab.Cancer.unique():
    block = subtab[subtab['Cancer'] == cncr]
    mean_list = []
    rate_list = []
    error_list = []
    for rate in block.Ordr.unique():
        rate_list.append(rate)
        sub_block = block[block['Ordr'] == rate]

        av_list = [float(ast.literal_eval(scores_str)[-1]) for scores_str in sub_block.Rate_scores]
        mean_list.append(round(statistics.mean(av_list),3))

        mn_err_list = [statistics.mean(ast.literal_eval(errors_str)) for errors_str in sub_block.Error]
        error_list.append(round(statistics.mean(mn_err_list),3))

    upper_rows.append([cncr, rate_list, mean_list, error_list])
big_cohorts_upper = pd.DataFrame(upper_rows, columns=result_cols)

In [98]:
big_cohorts_upper

Unnamed: 0,Cancer,Sample_counts,Rate_scores,Error
0,BLCA,"[190, 200, 210, 220, 230, 240, 250]","[0.777, 0.784, 0.786, 0.784, 0.788, 0.782, 0.791]","[0.073, 0.069, 0.065, 0.064, 0.063, 0.062, 0.06]"
1,BRCA,"[110, 120, 130, 140, 150, 160, 170, 180, 190, ...","[0.823, 0.827, 0.836, 0.84, 0.839, 0.844, 0.84...","[0.08, 0.078, 0.07, 0.066, 0.066, 0.064, 0.062..."
2,COADREAD,"[50, 60, 70, 80, 90, 100, 110, 120, 130, 140, ...","[0.654, 0.675, 0.694, 0.705, 0.71, 0.719, 0.72...","[0.155, 0.14, 0.122, 0.118, 0.106, 0.099, 0.09..."
3,HNSC,"[120, 130, 140, 150, 160, 170, 180, 190, 200, ...","[0.727, 0.731, 0.741, 0.755, 0.756, 0.76, 0.76...","[0.096, 0.09, 0.088, 0.083, 0.083, 0.075, 0.07..."
4,LUSC,"[130, 140, 150, 160, 170, 180, 190, 200, 210, ...","[0.708, 0.719, 0.722, 0.728, 0.73, 0.732, 0.73...","[0.084, 0.087, 0.082, 0.078, 0.076, 0.071, 0.0..."
5,SARC,"[60, 70, 80, 90, 100, 110, 120, 130, 140, 150,...","[0.916, 0.921, 0.922, 0.933, 0.935, 0.937, 0.9...","[0.081, 0.076, 0.068, 0.062, 0.061, 0.056, 0.0..."


In [90]:
# Add LIHCCHOL to big_cohorts
import ast  # literal_eval parses the stringified lists without executing file contents

cncr = 'LIHCCHOL'
lc = sorted(glob.glob('./LIHCCHOL/*.tsv'))
result_cols = ['Cancer', 'Sample_counts', 'Rate_scores', 'Error']

# Read every result file and stack them once (no concat inside the loop).
# Rows are not filtered by cohort: every file in the folder is treated as LIHCCHOL.
lc_frames = [pd.read_csv(tsv, sep = '\t') for tsv in lc]
table = pd.concat(lc_frames) if lc_frames else pd.DataFrame(columns=result_cols)
table.reset_index(inplace = True, drop = True)

# Order rows by sample count (first element of the stored list).
cnt_ordr = [ast.literal_eval(count_lst)[0] for count_lst in table.Sample_counts]
table['Ordr'] = cnt_ordr
table = table.sort_values(['Cancer', 'Ordr'])
table.reset_index(inplace = True, drop = True)

step_list = []
score_list = []
error_list = []

# A sample count may appear in several files, so each step is averaged:
# the mean of the files' last rate scores and the mean of their mean errors.
for step in table.Ordr.unique():
    step_list.append(step)
    block = table[table['Ordr'] == step]

    sub_scores = [float(ast.literal_eval(scores_str)[-1]) for scores_str in block.Rate_scores]
    sub_err = [statistics.mean(ast.literal_eval(errors_str)) for errors_str in block.Error]
    score_list.append(round(statistics.mean(sub_scores),3))
    error_list.append(round(statistics.mean(sub_err),3))

lihcchol = pd.DataFrame([[cncr, step_list, score_list, error_list]], columns=result_cols)

In [91]:
lihcchol

Unnamed: 0,Cancer,Sample_counts,Rate_scores,Error
0,LIHCCHOL,"[70, 80, 90, 100, 110, 120, 130, 140, 150, 160...","[0.696, 0.708, 0.724, 0.732, 0.736, 0.737, 0.7...","[0.129, 0.114, 0.105, 0.101, 0.094, 0.087, 0.0..."


In [99]:
# Add the LIHCCHOL summary to the upper-range table and restore cohort order.
# Rows already present for the same cohort are dropped first, so re-running this
# cell replaces the LIHCCHOL row instead of appending a duplicate (a duplicate
# would shift every later row when the lower and upper tables are combined).
already_present = big_cohorts_upper['Cancer'].isin(lihcchol['Cancer'])
big_cohorts_upper = (
    pd.concat([big_cohorts_upper[~already_present], lihcchol])
    .sort_values('Cancer')
    .reset_index(drop = True)
)

In [100]:
big_cohorts_upper

Unnamed: 0,Cancer,Sample_counts,Rate_scores,Error
0,BLCA,"[190, 200, 210, 220, 230, 240, 250]","[0.777, 0.784, 0.786, 0.784, 0.788, 0.782, 0.791]","[0.073, 0.069, 0.065, 0.064, 0.063, 0.062, 0.06]"
1,BRCA,"[110, 120, 130, 140, 150, 160, 170, 180, 190, ...","[0.823, 0.827, 0.836, 0.84, 0.839, 0.844, 0.84...","[0.08, 0.078, 0.07, 0.066, 0.066, 0.064, 0.062..."
2,COADREAD,"[50, 60, 70, 80, 90, 100, 110, 120, 130, 140, ...","[0.654, 0.675, 0.694, 0.705, 0.71, 0.719, 0.72...","[0.155, 0.14, 0.122, 0.118, 0.106, 0.099, 0.09..."
3,HNSC,"[120, 130, 140, 150, 160, 170, 180, 190, 200, ...","[0.727, 0.731, 0.741, 0.755, 0.756, 0.76, 0.76...","[0.096, 0.09, 0.088, 0.083, 0.083, 0.075, 0.07..."
4,LIHCCHOL,"[70, 80, 90, 100, 110, 120, 130, 140, 150, 160...","[0.696, 0.708, 0.724, 0.732, 0.736, 0.737, 0.7...","[0.129, 0.114, 0.105, 0.101, 0.094, 0.087, 0.0..."
5,LUSC,"[130, 140, 150, 160, 170, 180, 190, 200, 210, ...","[0.708, 0.719, 0.722, 0.728, 0.73, 0.732, 0.73...","[0.084, 0.087, 0.082, 0.078, 0.076, 0.071, 0.0..."
6,SARC,"[60, 70, 80, 90, 100, 110, 120, 130, 140, 150,...","[0.916, 0.921, 0.922, 0.933, 0.935, 0.937, 0.9...","[0.081, 0.076, 0.068, 0.062, 0.061, 0.056, 0.0..."


In [121]:
# Combine big cohort lower and upper
result_cols = ['Cancer', 'Sample_counts', 'Rate_scores', 'Error']

# Pair the two tables on the cohort name rather than on row position, so a
# missing, duplicated or re-ordered row cannot silently join the wrong cohorts.
upper_by_cancer = big_cohorts_upper.set_index('Cancer')
if not upper_by_cancer.index.is_unique:
    raise ValueError('big_cohorts_upper has duplicate Cancer rows; cannot align by cohort')

combined_rows = []
for cohort, lower_counts, lower_scores, lower_errors in zip(
        big_cohorts['Cancer'], big_cohorts['Sample_counts'],
        big_cohorts['Rate_scores'], big_cohorts['Error']):
    upper = upper_by_cancer.loc[cohort] # KeyError if the cohort has no upper-range row
    # Cells hold Python lists, so '+' appends the upper range after the lower range.
    combined_rows.append([cohort,
                          lower_counts + upper['Sample_counts'],
                          lower_scores + upper['Rate_scores'],
                          lower_errors + upper['Error']])

big_combined = pd.DataFrame(combined_rows, columns=result_cols)

In [120]:
big_combined

Unnamed: 0,Cancer,Sample_counts,Rate_scores,Error
0,BLCA,"[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110,...","[0.454, 0.571, 0.632, 0.66, 0.687, 0.71, 0.727...","[0.358, 0.266, 0.217, 0.174, 0.151, 0.134, 0.1..."
1,BRCA,"[20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120...","[0.613, 0.681, 0.715, 0.744, 0.768, 0.771, 0.7...","[0.27, 0.214, 0.175, 0.146, 0.129, 0.114, 0.10..."
2,COADREAD,"[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110,...","[0.375, 0.505, 0.588, 0.63, 0.654, 0.675, 0.69...","[0.331, 0.271, 0.213, 0.18, 0.155, 0.14, 0.122..."
3,HNSC,"[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110,...","[0.326, 0.427, 0.48, 0.533, 0.58, 0.603, 0.629...","[0.318, 0.273, 0.217, 0.189, 0.173, 0.152, 0.1..."
4,LIHCCHOL,"[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110,...","[0.381, 0.501, 0.589, 0.625, 0.665, 0.689, 0.6...","[0.348, 0.279, 0.217, 0.179, 0.166, 0.143, 0.1..."
5,LUSC,"[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110,...","[0.431, 0.499, 0.555, 0.6, 0.621, 0.644, 0.653...","[0.336, 0.265, 0.209, 0.182, 0.159, 0.143, 0.1..."
6,SARC,"[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110,...","[0.769, 0.85, 0.876, 0.886, 0.904, 0.916, 0.92...","[0.278, 0.173, 0.135, 0.115, 0.094, 0.081, 0.0..."


In [164]:
# Stack the big- and small-cohort summaries into one table ordered by cohort name.
cohorts26 = (
    pd.concat([big_combined,small_cohorts])
    .sort_values('Cancer')
    .reset_index(drop = True)
)

In [167]:
cohorts26

Unnamed: 0,Cancer,Sample_counts,Rate_scores,Error
0,ACC,"[10, 20, 30, 40, 50, 60, 70]","[0.616, 0.702, 0.751, 0.789, 0.81, 0.827, 0.836]","[0.36, 0.246, 0.189, 0.146, 0.124, 0.102, 0.088]"
1,BLCA,"[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110,...","[0.454, 0.571, 0.632, 0.66, 0.687, 0.71, 0.727...","[0.358, 0.266, 0.217, 0.174, 0.151, 0.134, 0.1..."
2,BRCA,"[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110,...","[0.514, 0.613, 0.681, 0.715, 0.744, 0.768, 0.7...","[0.34, 0.27, 0.214, 0.175, 0.146, 0.129, 0.114..."
3,CESC,"[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110,...","[0.711, 0.801, 0.847, 0.864, 0.877, 0.885, 0.8...","[0.327, 0.215, 0.155, 0.135, 0.105, 0.101, 0.0..."
4,COADREAD,"[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110,...","[0.375, 0.505, 0.588, 0.63, 0.654, 0.675, 0.69...","[0.331, 0.271, 0.213, 0.18, 0.155, 0.14, 0.122..."
5,ESCC,"[10, 20, 30, 40, 50, 60, 70, 80]","[0.734, 0.856, 0.892, 0.923, 0.934, 0.948, 0.9...","[0.318, 0.187, 0.121, 0.089, 0.075, 0.057, 0.0..."
6,GEA,"[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110,...","[0.458, 0.535, 0.609, 0.626, 0.657, 0.685, 0.6...","[0.332, 0.276, 0.223, 0.19, 0.166, 0.138, 0.12..."
7,HNSC,"[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110,...","[0.326, 0.427, 0.48, 0.533, 0.58, 0.603, 0.629...","[0.318, 0.273, 0.217, 0.189, 0.173, 0.152, 0.1..."
8,KIRCKICH,"[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110,...","[0.867, 0.903, 0.917, 0.916, 0.934, 0.94, 0.93...","[0.224, 0.148, 0.114, 0.106, 0.081, 0.073, 0.0..."
9,KIRP,"[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110,...","[0.66, 0.706, 0.731, 0.761, 0.766, 0.787, 0.79...","[0.326, 0.241, 0.186, 0.15, 0.138, 0.125, 0.11..."


In [140]:
import json

In [166]:
# Add back the sampling rates of 10 that were dropped
# Each rerun file contributes one step: its sample count, the last rate score
# and the mean error, prepended to the matching cohort's lists in cohorts26.
rerun_paths = sorted(glob.glob('rerun_10/*.tsv'))
for tsv in rerun_paths:
    df = pd.read_csv(tsv, sep = '\t')
    cohort = df.iloc[0,0]
    count = [json.loads(df.iloc[0,1])[0]]
    score = [json.loads(df.iloc[0,2])[-1]]
    error = [round(statistics.mean(json.loads(df.iloc[0,3])),3)]
    idx = cohorts26.index[cohorts26['Cancer'] == cohort]
    row = idx[0] # IndexError if the cohort is not in cohorts26
    # Skip cohorts that already contain this step, so re-running the cell
    # does not prepend the same value a second time.
    if count[0] in cohorts26.iloc[row,1]:
        continue
    cohorts26.iloc[row,1] = count + cohorts26.iloc[row,1]
    cohorts26.iloc[row,2] = score + cohorts26.iloc[row,2]
    cohorts26.iloc[row,3] = error + cohorts26.iloc[row,3]

In [168]:
# Write the combined table as a tab-separated file; index = False leaves the row index out.
cohorts26.to_csv('Sample_count_response.tsv', sep = '\t', index = False)

In [169]:
# Read the written file back, presumably as a round-trip check of the export.
# NOTE(review): list-valued cells likely come back as strings and need parsing
# (other cells here parse such columns with eval/json.loads) — confirm.
test = pd.read_csv('Sample_count_response.tsv', sep = '\t')

In [170]:
test

Unnamed: 0,Cancer,Sample_counts,Rate_scores,Error
0,ACC,"[10, 20, 30, 40, 50, 60, 70]","[0.616, 0.702, 0.751, 0.789, 0.81, 0.827, 0.836]","[0.36, 0.246, 0.189, 0.146, 0.124, 0.102, 0.088]"
1,BLCA,"[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110,...","[0.454, 0.571, 0.632, 0.66, 0.687, 0.71, 0.727...","[0.358, 0.266, 0.217, 0.174, 0.151, 0.134, 0.1..."
2,BRCA,"[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110,...","[0.514, 0.613, 0.681, 0.715, 0.744, 0.768, 0.7...","[0.34, 0.27, 0.214, 0.175, 0.146, 0.129, 0.114..."
3,CESC,"[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110,...","[0.711, 0.801, 0.847, 0.864, 0.877, 0.885, 0.8...","[0.327, 0.215, 0.155, 0.135, 0.105, 0.101, 0.0..."
4,COADREAD,"[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110,...","[0.375, 0.505, 0.588, 0.63, 0.654, 0.675, 0.69...","[0.331, 0.271, 0.213, 0.18, 0.155, 0.14, 0.122..."
5,ESCC,"[10, 20, 30, 40, 50, 60, 70, 80]","[0.734, 0.856, 0.892, 0.923, 0.934, 0.948, 0.9...","[0.318, 0.187, 0.121, 0.089, 0.075, 0.057, 0.0..."
6,GEA,"[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110,...","[0.458, 0.535, 0.609, 0.626, 0.657, 0.685, 0.6...","[0.332, 0.276, 0.223, 0.19, 0.166, 0.138, 0.12..."
7,HNSC,"[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110,...","[0.326, 0.427, 0.48, 0.533, 0.58, 0.603, 0.629...","[0.318, 0.273, 0.217, 0.189, 0.173, 0.152, 0.1..."
8,KIRCKICH,"[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110,...","[0.867, 0.903, 0.917, 0.916, 0.934, 0.94, 0.93...","[0.224, 0.148, 0.114, 0.106, 0.081, 0.073, 0.0..."
9,KIRP,"[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110,...","[0.66, 0.706, 0.731, 0.761, 0.766, 0.787, 0.79...","[0.326, 0.241, 0.186, 0.15, 0.138, 0.125, 0.11..."
