In [18]:
import ast
import collections
import math
import os

import matplotlib
import numpy
import pandas
import scipy
import seaborn
import sklearn
import sklearn.linear_model
import sklearn.utils


%matplotlib inline
from matplotlib import pyplot

# Constants
# Size associated with each sequencing library type, keyed by assay name.
# NOTE(review): presumably base pairs covered by whole-genome (WGS) and
# whole-exome (WES) sequencing - confirm the source of these numbers.
library_sizes = dict(
    WGS=3002000000,
    WES=50160183,
)

# Affinity cut-off for MHC binding.
# NOTE(review): presumably nM, with lower values meaning stronger binding - confirm.
mhc_binding_threshold_affinity = 500

# Render matplotlib figures inline in the notebook, as PNG images.
%matplotlib inline
%config InlineBackend.figure_format = 'png'

# Saved figures are written at 800 dpi; the commented-out line below is a
# 72 dpi alternative.
matplotlib.rc("savefig", dpi=800)
#matplotlib.rc("savefig", dpi=72)
# Do not use LaTeX to render figure text.
matplotlib.rc('text', usetex=False)
#reload(c)

# Show at most 50 rows and 50 columns when a DataFrame is displayed.
pandas.set_option('display.max_rows', 50)
pandas.set_option('display.max_columns', 50)

def print_full(x):
    """Print ``x`` with every row shown, bypassing ``display.max_rows``.

    The row limit is raised to ``len(x)`` only while printing. The value that
    was configured before the call is restored afterwards, even if printing
    raises.

    Parameters
    ----------
    x : object with ``len()`` and a printable representation
        Typically a pandas Series or DataFrame.
    """
    # option_context restores the *previous* setting on exit, whereas
    # reset_option would fall back to the pandas default and silently discard
    # any limit configured earlier in the session.
    with pandas.option_context('display.max_rows', len(x)):
        print(x)
    
def bootstrap(values, statistic=numpy.mean, samples=5000, random_state=None):
    """Bootstrap a percentile interval for ``statistic`` over ``values``.

    Missing values are dropped first. The remaining observations are resampled
    with ``sklearn.utils.resample`` ``samples`` times, ``statistic`` is
    evaluated on each resample, and the 5th and 95th percentiles of those
    replicates are returned (a 90% percentile interval).

    Parameters
    ----------
    values : iterable
        Observations; anything accepted by ``pandas.Series``.
    statistic : callable
        Function applied to each resample.
    samples : int
        Number of bootstrap resamples.
    random_state : None, int or numpy.random.RandomState
        Seed or generator for reproducible intervals. ``None`` (the default)
        keeps the previous behaviour of drawing from numpy's global state.

    Returns
    -------
    tuple
        ``(lower, upper)``; ``(nan, nan)`` when fewer than two non-missing
        observations are available.
    """
    observed = pandas.Series(values).dropna()
    if len(observed) <= 1:
        return (numpy.nan, numpy.nan)
    # Build one generator up front: passing the same integer seed to every
    # resample call would produce the identical resample each time.
    if random_state is not None and not isinstance(random_state, numpy.random.RandomState):
        random_state = numpy.random.RandomState(random_state)
    replicates = [
        statistic(sklearn.utils.resample(observed, random_state=random_state))
        for _ in range(samples)
    ]
    return (numpy.percentile(replicates, 5), numpy.percentile(replicates, 95))

def round_to_n(x, n):
    """Round ``x`` to ``n`` significant figures.

    Zero, NaN and infinities are returned unchanged, and negative numbers are
    rounded on their magnitude. ``math.log10`` is undefined for all of these,
    so they would otherwise raise.

    Parameters
    ----------
    x : number
        Value to round.
    n : int
        Number of significant figures to keep.
    """
    if x == 0 or math.isnan(x) or math.isinf(x):
        return x
    # Position of the leading digit relative to the decimal point.
    magnitude = int(math.floor(math.log10(abs(x))))
    return round(x, (n - 1) - magnitude)

def mean_with_errorbars(values, decimals=0, plusminus=False, function=numpy.mean):
    """Format ``function(values)`` together with its bootstrap interval.

    Parameters
    ----------
    values : iterable
        Observations, passed both to ``function`` and to ``bootstrap``.
    decimals : int
        Number of decimal places in the formatted numbers. With ``0`` the
        interval bounds are additionally rounded to two significant figures
        and the half-width to one.
    plusminus : bool
        If true, return ``"<value> $\\pm$ <half-width>"``; otherwise return
        ``"<value> (<lower>-<upper>)"``.
    function : callable
        Statistic to report.

    Returns
    -------
    str
        The formatted statistic. Only the bare statistic is returned when the
        bootstrap interval is undefined (NaN).
    """
    pattern = "%%0.%df" % decimals
    bars = bootstrap(values, statistic=function)
    center = function(values)
    # NaN never compares equal to itself, so a membership test such as
    # `numpy.nan in bars` only matches the identical numpy.nan object.
    # Inspect the values so any NaN bound is caught.
    if any(numpy.isnan(bound) for bound in bars):
        return pattern % center
    diff = (bars[1] - bars[0]) / 2
    if decimals == 0:
        bars = (round_to_n(bars[0], 2), round_to_n(bars[1], 2))
        diff = round_to_n(diff, 1)
    if plusminus:
        return (pattern + " $\\pm$ " + pattern) % (center, diff)
    return (pattern + " (" + pattern + "-" + pattern + ")") % ((center,) + bars)

def median_with_errorbars(values, decimals=0, plusminus=False, function=numpy.median):
    """Format a statistic of ``values`` with its bootstrap interval.

    Delegates to ``mean_with_errorbars`` with all arguments forwarded; the only
    difference is that the default statistic is ``numpy.median``.
    """
    return mean_with_errorbars(
        values,
        decimals=decimals,
        plusminus=plusminus,
        function=function,
    )


# Mutations

In [2]:
# Load the annotated mutation table. Each non-empty ``binding_peptides`` cell
# is parsed from the text of a Python literal (the displayed rows show dicts of
# peptide -> affinity); empty cells become an empty dict.
# SECURITY: the original used eval() here, which executes arbitrary code found
# in the CSV. ast.literal_eval accepts only Python literals.
mutations = pandas.read_csv(
    "../data/derived/annotated_mutations.with_mhc_binders.csv.bz2",
    converters={'binding_peptides': lambda x: ast.literal_eval(x) if x else {}})

# Missing ref/alt alleles are represented as empty strings.
mutations["ref"] = mutations.ref.fillna("")
mutations["alt"] = mutations.alt.fillna("")

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Preview the first rows only instead of rendering the whole table.
mutations.head(10)

Unnamed: 0,source_id,donor,genome,contig,interbase_start,interbase_end,ref,alt,sources,effect,gene,context_5_prime,context_3_prime,context_mutation,binding_affinity,binding_allele,dna_alt_reads,dna_ref_reads,dna_total_reads,rna_alt_reads,rna_ref_reads,rna_total_reads,context_mutation_3p5p,binding_peptides,num_binders,indel,snv,mnv,confident,vaf,any_alt_reads,mutation_id,site_id,unique_to_treated
0,AOCS-001-1-7,AOCS-001,GRCh37,14,24774226.0,24774227.0,C,A,annotated_mutations,silent,NOP9,ACCTTCCTAAAGCGG,GAGAGGCTTGGGAAC,C>A,,,17.0,28.0,45.0,185.0,244.0,429.0,G(C>A)G,{},0,False,True,False,True,0.377777,True,14:24774226 C>A,14:24774226,False
1,AOCS-001-1-7,AOCS-001,GRCh37,X,19968971.0,19968972.0,T,C,annotated_mutations,p.I548M,CXorf23,GTCATTTGGATCTAT,ATTTTGATCAGAGTC,T>C,94.33,B*41:01,16.0,51.0,67.0,0.0,77.0,77.0,T(T>C)A,"{'QTLIKMIDPN': 28168.48, 'SEQTLIKMID': 7669.07...",5,False,True,False,True,0.238806,True,X:19968971 T>C,X:19968971,False
2,AOCS-001-1-7,AOCS-001,GRCh37,8,35406835.0,35406836.0,G,A,annotated_mutations,p.E44K,UNC5D,CTGATGGGATGGATT,GGGAAGGGCTTCGCC,C>T,80.84,A*11:01,14.0,39.0,53.0,0.0,0.0,0.0,T(C>T)G,"{'DNGEALPKS': 44344.5, 'PKSIPSAPGTL': 38251.22...",4,False,True,False,True,0.264150,True,8:35406835 G>A,8:35406835,False
3,AOCS-001-1-7,AOCS-001,GRCh37,7,47872830.0,47872831.0,A,G,annotated_mutations,p.L2065P,HUS1 PKD1L1,CTGCATCAGCCATTC,CTCTGGGAGTGGCAG,T>C,343.29,A*11:01,16.0,31.0,47.0,0.0,1.0,1.0,C(T>C)C,"{'PSGSGRAQ': 44743.34, 'KQPASAIPSGS': 29391.39...",2,False,True,False,True,0.340425,True,7:47872830 A>G,7:47872830,False
4,AOCS-001-1-7,AOCS-001,GRCh37,17,17721666.0,17721667.0,G,C,annotated_mutations,p.R394G,SREBF1,AAATCTGCTGTCTTG,GCAAGGCCATCGACT,C>G,43.42,C*15:02,12.0,22.0,34.0,200.0,54.0,254.0,G(C>G)G,"{'LNKSAVLG': 37950.2, 'SAVLGKAIDY': 8402.54, '...",2,False,True,False,True,0.352940,True,17:17721666 G>C,17:17721666,False
5,AOCS-001-1-7,AOCS-001,GRCh37,7,148169063.0,148169064.0,G,T,annotated_mutations,intergenic,,CAAAGAAGGGCCACC,AATCTAAACCAGCAC,C>A,,,19.0,39.0,58.0,0.0,0.0,0.0,C(C>A)A,{},0,False,True,False,True,0.327586,True,7:148169063 G>T,7:148169063,False
6,AOCS-001-1-7,AOCS-001,GRCh37,1,107152783.0,107152784.0,G,A,annotated_mutations,intergenic,,TCCACTGTGTTTGTT,ATCCCTTACAAAAGA,C>T,,,14.0,69.0,83.0,0.0,0.0,0.0,T(C>T)A,{},0,False,True,False,True,0.168674,True,1:107152783 G>A,1:107152783,False
7,AOCS-001-1-7,AOCS-001,GRCh37,2,142133268.0,142133269.0,T,A,annotated_mutations,intronic,LRP1B,GCATTAAATTCCCAA,ATTGCACTTTGGTCC,T>A,,,9.0,28.0,37.0,0.0,0.0,0.0,A(T>A)A,{},0,False,True,False,True,0.243243,True,2:142133268 T>A,2:142133268,False
8,AOCS-001-1-7,AOCS-001,GRCh37,2,143687576.0,143687577.0,G,A,annotated_mutations,intronic,KYNU,GCCCTACAAAGCCTT,TGTTGTGAGGGAAAA,C>T,,,11.0,55.0,66.0,0.0,0.0,0.0,T(C>T)T,{},0,False,True,False,True,0.166666,True,2:143687576 G>A,2:143687576,False
9,AOCS-001-1-7,AOCS-001,GRCh37,1,144017729.0,144017730.0,G,T,annotated_mutations,non-coding-transcript,SRGAP2B,TGTAAGTGCACCAAT,GACACTCTGTATCTA,C>A,,,26.0,158.0,184.0,0.0,0.0,0.0,T(C>A)G,{},0,False,True,False,True,0.141304,True,1:144017729 G>T,1:144017729,False


In [4]:
# Build the per-mutation table that a later cell exports as Additional File 2.
prepared = mutations.copy()

# Keep only the peptides whose affinity is below the binder threshold and
# serialise them as space-separated "PEPTIDE:affinity" pairs.
prepared["binding_peptides"] = prepared.binding_peptides.map(
    lambda d: " ".join(
        "%s:%0.1f" % (k, v)
        for (k, v) in d.items()
        if v < mhc_binding_threshold_affinity))

# Blank out empty strings and the literal text "None". Missing (NaN) gene
# values are truthy and therefore pass through unchanged.
prepared["gene"] = prepared.gene.map(lambda d: d if d and d != "None" else "")

# NOTE(review): presumably converts the 0-based interbase start into a 1-based
# position - confirm.
prepared["position"] = prepared.interbase_start + 1

# Columns written to the export, in output order.
cols = [
    "source_id",
    "donor",
    "contig",
    "position",
    "ref",
    "alt",
    "gene",
    "effect",
    "dna_alt_reads",
    "dna_ref_reads",
    "dna_total_reads",
    "rna_alt_reads",
    "rna_ref_reads",
    "rna_total_reads",
    "context_mutation_3p5p",
    "binding_peptides",
    "unique_to_treated",
]
prepared = prepared[cols]


In [5]:
# Keyword arguments for DataFrame.to_csv: omit the index column and write
# floating-point values with no decimal places.
to_csv_args = dict(index=False, float_format="%0.0f")

In [6]:
#prepared.iloc[0:10000].to_csv("/tmp/test_csv.csv", **to_csv_args)
#len(prepared.iloc[0:10000].to_csv(**to_csv_args)) / 1024 / 1024

In [7]:
# Write the prepared mutation table as Additional File 2 using the shared CSV options.
prepared.to_csv("../additional-files/Additional File 2.csv", **to_csv_args)
# Shell escape: create a zip archive of the CSV at maximum compression (-9).
!zip -9 ../additional-files/Additional\ File\ 2.csv.zip ../additional-files/Additional\ File\ 2.csv

  adding: ../additional-files/Additional File 2.csv (deflated 82%)


In [8]:
#len(mutations.to_csv(index=False)) / 1024 / 1024

# Sources

In [19]:
# Per-source table with signature counts, indexed by source_id.
sources_with_signature_counts = pandas.read_csv(
    "../data/derived/sources.extended.with_signature_counts.csv", index_col="source_id"
)

In [20]:
# List the available column names of the sources table.
sources_with_signature_counts.columns

Index(['Unnamed: 0', 'RNA_ID', 'RNA biospecimen', 'DNA_id', 'DNA_biospecimen',
       'specimen_type_description', 'CollectionPoint', 'SpecimenType',
       'RNA data file', 'DNA_sample_string', 'icgc_sample_id',
       'project_code_x', 'submitted_sample_id', 'icgc_specimen_id',
       'submitted_specimen_id_x', 'icgc_donor_id_x', 'submitted_donor_id_x',
       'analyzed_sample_interval', 'percentage_cellularity_x',
       'level_of_cellularity_x', 'study', 'project_code_y',
       'study_specimen_involved_in', 'submitted_specimen_id_y',
       'icgc_donor_id_y', 'submitted_donor_id_y', 'specimen_type',
       'specimen_type_other', 'specimen_interval',
       'specimen_donor_treatment_type', 'specimen_donor_treatment_type_other',
       'specimen_processing', 'specimen_storage', 'tumour_confirmed',
       'tumour_histological_type', 'tumour_grading_system', 'tumour_grade',
       'tumour_stage_system', 'tumour_stage', 'percentage_cellularity_y',
       'level_of_cellularity_y', 'coho

In [21]:
# Preview the first rows only instead of rendering the whole table.
sources_with_signature_counts.head(10)

Unnamed: 0_level_0,Unnamed: 0,RNA_ID,RNA biospecimen,DNA_id,DNA_biospecimen,specimen_type_description,CollectionPoint,SpecimenType,RNA data file,DNA_sample_string,icgc_sample_id,project_code_x,submitted_sample_id,icgc_specimen_id,submitted_specimen_id_x,icgc_donor_id_x,submitted_donor_id_x,analyzed_sample_interval,percentage_cellularity_x,level_of_cellularity_x,study,project_code_y,study_specimen_involved_in,submitted_specimen_id_y,icgc_donor_id_y,...,Cisplatin C Elegans expressed neoantigens,Signature 1 mutations,Signature 1 neoantigens,Signature 1 expressed neoantigens,Signature 3 mutations,Signature 3 neoantigens,Signature 3 expressed neoantigens,Signature 8 mutations,Signature 8 neoantigens,Signature 8 expressed neoantigens,residual mutations,residual neoantigens,residual expressed neoantigens,other snv mutations,other snv neoantigens,other snv expressed neoantigens,snv mutations,snv neoantigens,snv expressed neoantigens,mnv mutations,mnv neoantigens,mnv expressed neoantigens,indel mutations,indel neoantigens,indel expressed neoantigens
source_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
AOCS-001-1-7,0,ICGCDBDE20130916001,AOCS-001-2-0,AOCS_001_ICGC_DBPC_20130205_002,AOCS-001-1-7,primary tumour,Primary,Tumour,130906_D81P8DQ1_0153_C2704ACXX.nopd.AOCS_001_I...,ICGCDBPC20130205002,SA505244,OV-AU,AOCS-001-1-7,SP101515,AOCS-001-1,DO46325,AOCS-001,1895.0,73.0,61-80%,PCAWG,OV-AU,PCAWG,AOCS-001-1,DO46325,...,0.000000,0.082770,0.082788,0.084069,0.406673,0.381319,0.463164,0.164367,0.117748,0.158940,-8.112947e-16,-4.284097e-16,-4.737010e-16,0.335747,0.270605,0.293826,5686.0,104.0,71.0,0.010268,0.016393,0.000000,0.014925,0.131148,0.000000
AOCS-004-1-5,1,ICGCDBDE20130916003,AOCS-004-2-9,AOCS_004_ICGC_DBPC_20130205_004,AOCS-004-1-5,primary tumour,Primary,Tumour,130906_D81P8DQ1_0157_C270DACXX.nopd.AOCS_004_I...,ICGCDBPC20130205004,SA505270,OV-AU,AOCS-004-1-5,SP101519,AOCS-004-1,DO46327,AOCS-004,1350.0,77.0,61-80%,PCAWG,OV-AU,PCAWG,AOCS-004-1,DO46327,...,0.005686,0.059897,0.115793,0.087667,0.336741,0.230991,0.295138,0.125975,0.096336,0.077589,-2.734846e-16,-1.306726e-16,-2.472593e-16,0.435291,0.362958,0.519262,6587.0,48.0,20.0,0.008726,0.000000,0.000000,0.029851,0.172414,0.000000
AOCS-005-1-8,2,ICGCDBDE20130916004,AOCS-005-2-1,AOCS_005_ICGC_DBPC_20130205_006,AOCS-005-1-8,primary tumour,Primary,Tumour,130906_D81P8DQ1_0157_C270DACXX.nopd.AOCS_005_I...,ICGCDBPC20130205006,SA505282,OV-AU,AOCS-005-1-8,SP101521,AOCS-005-1,DO46328,AOCS-005,2128.0,90.0,>81%,PCAWG,OV-AU,PCAWG,AOCS-005-1,DO46328,...,0.000000,0.139909,0.200246,0.253668,0.162710,0.194616,0.194922,0.112511,0.097643,0.119831,-1.127126e-15,-1.221715e-15,-1.233339e-15,0.525860,0.456709,0.373072,5582.0,60.0,36.0,0.005167,0.000000,0.000000,0.033898,0.000000,0.000000
AOCS-034-1-0,3,ICGCDBLG2010050402TR,AOCS-034-2-4,AOCS_034_ICGC_DBPC_20130205_009,AOCS-034-1-0,primary tumour,Primary,Tumour,131206_EXTERN_0045_AC2KJBACXX.nopd.AOCS_034_IC...,ICGCDBPC20130205009,SA505297,OV-AU,AOCS-034-1-0,SP101523,AOCS-034-1,DO46329,AOCS-034,1953.0,84.0,>81%,PCAWG,OV-AU,PCAWG,AOCS-034-1,DO46329,...,0.000000,0.094381,0.083276,0.100540,0.571580,0.539165,0.551358,0.072747,0.072635,0.058862,-6.510237e-16,-3.797080e-16,-2.487279e-16,0.249783,0.223347,0.270322,6858.0,154.0,58.0,0.007520,0.005988,0.016949,0.051020,0.071856,0.000000
AOCS-034-3-8,4,ICGCDBDE20131122024,AOCS-034-4-1,AOCS_034_ICGC_DBPC_20130205_007,AOCS-034-3-8,recurrent ascitic fluid,Recurrence,Ascites,131206_EXTERN_0045_AC2KJBACXX.nopd.AOCS_034_IC...,ICGCDBPC20130205007,SA505303,OV-AU,AOCS-034-3-8,SP101524,AOCS-034-13,DO46329,AOCS-034,110.0,99.0,>81%,,OV-AU,,AOCS-034-13,DO46329,...,0.007264,0.031874,0.037570,0.043445,0.453869,0.401323,0.374783,0.069791,0.051184,0.041759,-6.973714e-16,-3.120123e-16,-3.544233e-16,0.371331,0.434420,0.497789,13091.0,259.0,123.0,0.010501,0.003731,0.008065,0.045752,0.029851,0.000000
AOCS-055-1-7,5,ICGCDBDE20130916005,AOCS-055-2-0,AOCS_055_ICGC_DBPC_20130205_011,AOCS-055-1-7,primary tumour,Primary,Tumour,130906_D81P8DQ1_0153_C2704ACXX.nopd.AOCS_055_I...,ICGCDBPC20130205011,SA505316,OV-AU,AOCS-055-1-7,SP101526,AOCS-055-1,DO46330,AOCS-055,2150.0,82.0,>81%,PCAWG,OV-AU,PCAWG,AOCS-055-1,DO46330,...,0.000000,0.098631,0.155399,0.171134,0.282689,0.200177,0.208865,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.593488,0.512771,0.607863,14583.0,152.0,91.0,0.004165,0.000000,0.000000,0.013423,0.116279,0.000000
AOCS-056-1-X,6,ICGCDBLG2010062229TR,AOCS-056-2-3,AOCS_056_ICGC_DBPC_20130205_013,AOCS-056-1-X,primary tumour,Primary,Tumour,131206_EXTERN_0043_AC29RDACXX.nopd.AOCS_056_IC...,ICGCDBPC20130205013,SA505328,OV-AU,AOCS-056-1-X,SP101528,AOCS-056-1,DO46331,AOCS-056,2108.0,94.0,>81%,PCAWG,OV-AU,PCAWG,AOCS-056-1,DO46331,...,0.000000,0.110473,0.100189,0.141529,0.475701,0.280480,0.381540,0.234141,0.124448,0.278015,-1.086564e-15,-6.091906e-16,-8.699209e-16,0.135128,0.066452,0.130894,6909.0,89.0,16.0,0.006467,0.000000,0.000000,0.054054,0.406667,0.000000
AOCS-057-1-2,7,EXTERNAOCS20140414001,AOCS-057-2-6,AOCS_057_ICGC_DBPC_20130205_015,AOCS-057-1-2,primary tumour,Primary,Tumour,140414_EXTERN_0192_C42UFACXX.nopd.AOCS_057_EXT...,ICGCDBPC20130205015,SA505339,OV-AU,AOCS-057-1-2,SP101530,AOCS-057-1,DO46332,AOCS-057,2068.0,68.0,61-80%,PCAWG,OV-AU,PCAWG,AOCS-057-1,DO46332,...,0.006157,0.083628,0.083737,0.099534,0.369786,0.274669,0.334298,0.187544,0.113296,0.124381,-8.160455e-16,-3.351928e-16,-4.682772e-16,0.320688,0.326946,0.343093,5942.0,107.0,56.0,0.010649,0.069231,0.081967,0.067797,0.107692,0.000000
AOCS-058-1-5,8,ICGCDBLG2010062235TR,AOCS-058-2-9,AOCS_058_ICGC_DBPC_20130205_017,AOCS-058-1-5,primary tumour,Primary,Tumour,131206_EXTERN_0045_AC2KJBACXX.nopd.AOCS_058_IC...,ICGCDBPC20130205017,SA505351,OV-AU,AOCS-058-1-5,SP101532,AOCS-058-1,DO46333,AOCS-058,1015.0,90.0,>81%,PCAWG,OV-AU,PCAWG,AOCS-058-1,DO46333,...,0.011013,0.093247,0.112358,0.177247,0.422409,0.307823,0.382957,0.241276,0.156013,0.220804,-2.705551e-16,-1.814646e-16,-8.207437e-17,0.203750,0.149196,0.171458,8868.0,111.0,31.0,0.010258,0.026846,0.031250,0.084906,0.228188,0.000000
AOCS-059-1-8,9,ICGCDBDE20130916006,AOCS-059-2-1,AOCS_059_ICGC_DBPC_20130205_019,AOCS-059-1-8,primary tumour,Primary,Tumour,130906_D81P8DQ1_0157_C270DACXX.nopd.AOCS_059_I...,ICGCDBPC20130205019,SA505363,OV-AU,AOCS-059-1-8,SP101536,AOCS-059-1,DO46334,AOCS-059,1792.0,74.0,61-80%,PCAWG,OV-AU,PCAWG,AOCS-059-1,DO46334,...,0.000000,0.298429,0.508585,0.408583,0.090112,0.060780,0.067953,0.116793,0.093690,0.059082,1.533953e-16,2.608459e-16,3.439014e-16,0.410997,0.286037,0.399824,3672.0,41.0,24.0,0.003797,0.000000,0.000000,0.023810,0.000000,0.000000


In [22]:
# Show every field of the first source as a dict (column name -> value).
sources_with_signature_counts.iloc[0].to_dict()

{'Cisplatin C Elegans expressed neoantigens': 0.0,
 'Cisplatin C Elegans mutations': 0.0,
 'Cisplatin C Elegans neoantigens': 0.0,
 'Cisplatin Gallus gallus expressed neoantigens': 0.0,
 'Cisplatin Gallus gallus mutations': 0.0,
 'Cisplatin Gallus gallus neoantigens': 0.0,
 'CollectionPoint': 'Primary',
 'Cyclophosphamide Gallus gallus expressed neoantigens': 0.0,
 'Cyclophosphamide Gallus gallus mutations': 0.0,
 'Cyclophosphamide Gallus gallus neoantigens': 0.0,
 'DNA_biospecimen': 'AOCS-001-1-7',
 'DNA_id': 'AOCS_001_ICGC_DBPC_20130205_002',
 'DNA_sample_string': 'ICGCDBPC20130205002',
 'Etoposide Gallus gallus expressed neoantigens': 0.0,
 'Etoposide Gallus gallus mutations': 0.0,
 'Etoposide Gallus gallus neoantigens': 0.0,
 'RNA biospecimen': 'AOCS-001-2-0',
 'RNA data file': '130906_D81P8DQ1_0153_C2704ACXX.nopd.AOCS_001_ICGCDBDE20130916001',
 'RNA_ID': 'ICGCDBDE20130916001',
 'Signature 1 expressed neoantigens': 0.084069181561445577,
 'Signature 1 mutations': 0.08277024326060845

In [12]:
# Treatment table indexed by source_id, one column per treatment (the
# displayed rows hold True/False values).
treatments_matrix = pandas.read_csv(
    "../data/derived/treatments_matrix.csv",
    index_col="source_id",
)
# Preview the first rows only instead of rendering the whole table.
treatments_matrix.head(10)

Unnamed: 0_level_0,carboplatin,paclitaxel,liposomal doxorubicin,gemcitabine,cyclophosphamide,topotecan,cisplatin,olaparib,docetaxel,bevacizumab,etoposide,nab-paclitaxel,farletuzumab/placebo
source_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AOCS-001-1-7,False,False,False,False,False,False,False,False,False,False,False,False,False
AOCS-004-1-5,False,False,False,False,False,False,False,False,False,False,False,False,False
AOCS-005-1-8,False,False,False,False,False,False,False,False,False,False,False,False,False
AOCS-034-1-0,False,False,False,False,False,False,False,False,False,False,False,False,False
AOCS-034-3-8,True,True,True,True,False,True,False,False,False,False,False,False,False
AOCS-055-1-7,False,False,False,False,False,False,False,False,False,False,False,False,False
AOCS-056-1-X,False,False,False,False,False,False,False,False,False,False,False,False,False
AOCS-057-1-2,False,False,False,False,False,False,False,False,False,False,False,False,False
AOCS-058-1-5,False,False,False,False,False,False,False,False,False,False,False,False,False
AOCS-059-1-8,False,False,False,False,False,False,False,False,False,False,False,False,False


In [23]:
# Build the per-source table exported as Additional File 1.
prepared_sources = sources_with_signature_counts.copy()
del prepared_sources["Unnamed: 0"]
# Output columns, collected in the order the original columns are visited.
cols = []
# Iterate over a snapshot of the column names: the loop body adds and deletes
# columns on the frame itself.
for c in list(prepared_sources.columns):
    # Skip names deleted earlier in this loop (the "_y" half of a merged pair).
    if c not in prepared_sources.columns:
        continue
    # The "group" column is left out of the export.
    if c in ["group"]:
        continue
    # Collapse a "<name>_x" / "<name>_y" pair into a single "<name>" column.
    # NOTE(review): these look like pandas merge suffixes - confirm.
    if c.endswith("_x") and (c[:-2] + "_y") in set(prepared_sources.columns):
        # Print both halves when they differ; the "_x" values are kept either way.
        if not (prepared_sources[c].equals(prepared_sources[(c[:-2] + "_y")])):
            print(c)
            print(prepared_sources[c])
            print(prepared_sources[(c[:-2] + "_y")])
        prepared_sources[c[:-2]] = prepared_sources[c]
        del prepared_sources[c]
        del prepared_sources[c[:-2] + "_y"]
        c = c[:-2]
    # Export only the file name of each "bam_path*" column. The original path
    # column is not added to `cols`, so the selection below drops it.
    if c.startswith("bam_path"):
        new_c = c.replace("path", "filename")
        prepared_sources[new_c] = prepared_sources[c].map(os.path.basename)
        c = new_c
    
    cols.append(c)

prepared_sources = prepared_sources[cols]
# Relabel the three known treatment codes; any other value passes through unchanged.
prepared_sources["specific_treatment"] = prepared_sources["specific_treatment"].map(
    lambda v: {
        "treatment naive": "primary/untreated",
        "AMCT": "relapse/treated",
        "NACT": "primary/treated"}.get(v,v))

# Append one column per treatment. Both frames are indexed by source_id, so
# the assignment aligns rows on that index.
for c in treatments_matrix.columns:
    prepared_sources["%s" % c] = treatments_matrix[c]

# Rename "RNA_ID" to "RNA_id"; all other column names are kept as they are.
prepared_sources.columns = [{"RNA_ID": "RNA_id"}.get(x, x) for x in prepared_sources.columns]
prepared_sources.to_csv("../additional-files/Additional File 1.csv")

# HLA types

In [25]:
# Export the HLA types per donor as Additional File 3.
hla = pandas.read_csv("../data/external/hla_types.csv", index_col="donor")
# Exclude donor AOCS-002.
# NOTE(review): the reason for the exclusion is not visible in this notebook.
# `.loc` replaces the deprecated `.ix` indexer, which was removed in pandas 1.0;
# with a boolean mask the two select the same rows.
hla = hla.loc[hla.index != "AOCS-002"]
hla.to_csv("../additional-files/Additional File 3.csv")

# Shared peptides

In [15]:
# Export the shared-peptide table as Additional File 6 without its index column.
shared = pandas.read_csv("../data/derived/shared_peptides.csv")
# Remove the "indices" column (fails if the column is absent), then fix the
# order of the exported columns.
shared = shared.drop("indices", axis=1)
shared = shared.loc[:, ["donors", "peptide", "genes", "effects"]]
shared.to_csv(
    "../additional-files/Additional File 6.csv",
    index=False,
)

# All signatures

In [16]:
# Re-export the main signatures table as Additional File 4, writing floats
# with six decimal places. The first CSV column is used as the index.
sigs = pandas.read_csv(
    "../data/derived/main_signatures.csv",
    index_col=0,
)
sigs.to_csv(
    "../additional-files/Additional File 4.csv",
    float_format="%0.6f",
)

# deconstructSigs results

In [17]:
# Export the cleaned deconstructSigs output as Additional File 5.
drs = pandas.read_csv("../data/derived/deconstructsigs_output.cleaned.csv")
# Relabel "new" as "unique to treated"; values other than "all"/"new" become NaN.
drs["kind"] = drs["kind"].map({"all": "all", "new": "unique to treated"})
# Signature columns are taken by position: everything except the first two and
# the last three columns.
# NOTE(review): this depends on the column order of the input file - confirm.
sig_columns = drs.columns[2:-3].tolist()
drs = drs[["source_id", "treated", "kind"] + sig_columns].sort_values("source_id")
drs.to_csv(
    "../additional-files/Additional File 5.csv",
    float_format="%0.6f",
    index=False,
)
