In [1]:
import pandas, seaborn, scipy, numpy, matplotlib, collections, sklearn, math, seaborn
import sklearn.linear_model
import os


%matplotlib inline
from matplotlib import pyplot

# Constants
# Library size per sequencing type.
# NOTE(review): presumably the number of bases covered by whole-genome (WGS)
# and whole-exome (WES) sequencing - confirm the units with the analysis code.
library_sizes = dict(WGS=3002000000, WES=50160183)
# Affinity cutoff for calling a peptide an MHC binder.
# NOTE(review): presumably in nM, lower meaning tighter binding - confirm.
mhc_binding_threshold_affinity = 500

# Render figures inline as PNG images. The %matplotlib magic repeats the one
# already issued next to the imports at the top of this cell.
%matplotlib inline
%config InlineBackend.figure_format = 'png'

# Figures written with savefig use 800 dpi; the 72 dpi alternative is kept
# below, disabled.
matplotlib.rc("savefig", dpi=800)
#matplotlib.rc("savefig", dpi=72)
# Do not route figure text through LaTeX.
matplotlib.rc('text', usetex=False)
#reload(c)

# Cap DataFrame display at 50 rows and 50 columns.
pandas.set_option('display.max_rows', 50)
pandas.set_option('display.max_columns', 50)

def print_full(x):
    """Print a pandas object without row truncation.

    ``display.max_rows`` is raised to ``len(x)`` only while printing. The
    previous value of the option is put back afterwards, also when printing
    raises, so a limit configured elsewhere in the notebook is not replaced
    by the pandas default.

    Parameters
    ----------
    x : sized, printable object (typically a Series or DataFrame)
    """
    with pandas.option_context('display.max_rows', len(x)):
        print(x)
    
def bootstrap(values, statistic=numpy.mean, samples=5000, random_state=None):
    """Bootstrap interval (5th to 95th percentile) of a statistic.

    Missing values are dropped first. The remaining values are resampled with
    ``sklearn.utils.resample`` ``samples`` times, and ``statistic`` is
    evaluated on each resample.

    Parameters
    ----------
    values : iterable of numbers
    statistic : callable applied to each resample (default ``numpy.mean``)
    samples : number of bootstrap resamples
    random_state : None, int or numpy RandomState, optional
        Seed or generator for the resampling. ``None`` (the default) keeps
        the previous behaviour of drawing from numpy's global random state;
        pass an int to make the interval reproducible.

    Returns
    -------
    (low, high) : tuple
        5th and 95th percentile of the bootstrapped statistic, or
        ``(nan, nan)`` when fewer than two non-missing values are available.
    """
    observed = pandas.Series(values).dropna()
    if len(observed) <= 1:
        return (numpy.nan, numpy.nan)
    # One generator shared by all resamples: passing the same int seed to every
    # resample call would produce the identical resample each time.
    rng = sklearn.utils.check_random_state(random_state)
    estimates = [
        statistic(sklearn.utils.resample(observed, random_state=rng))
        for _ in range(samples)
    ]
    return (numpy.percentile(estimates, 5), numpy.percentile(estimates, 95))

def round_to_n(x, n):
    """Round ``x`` to ``n`` significant digits.

    Zero is returned unchanged and negative numbers are rounded on their
    magnitude; both previously raised ``ValueError`` from ``math.log10``.

    Parameters
    ----------
    x : number to round
    n : number of significant digits to keep

    Returns
    -------
    ``x`` rounded with the built-in ``round``.
    """
    if x == 0:
        # log10(0) is undefined, and zero has nothing to round.
        return x
    # Position of the leading digit: 0 for 1-9, 1 for 10-99, -1 for 0.1-0.9.
    exponent = int(math.floor(math.log10(abs(x))))
    return round(x, (n - 1) - exponent)

def mean_with_errorbars(values, decimals=0, plusminus=False, function=numpy.mean):
    """Format a point estimate together with its bootstrap interval.

    Parameters
    ----------
    values : iterable of numbers
    decimals : digits after the decimal point in the formatted numbers. With
        0, the interval bounds are rounded to two significant digits and the
        half-width to one significant digit before formatting.
    plusminus : if True, return "estimate $\\pm$ half-width"; otherwise return
        "estimate (low-high)".
    function : statistic to report and bootstrap (default ``numpy.mean``)

    Returns
    -------
    str
        Only the formatted estimate when no bootstrap interval is available
        (``bootstrap`` returned NaN bounds).
    """
    def _round_significant(number, digits):
        # round_to_n takes log10 of its argument, so hand it a positive
        # magnitude and restore the sign afterwards; zero is left untouched.
        if number == 0:
            return number
        magnitude = round_to_n(abs(number), digits)
        return magnitude if number > 0 else -magnitude

    pattern = "%%0.%df" % decimals
    center = function(values)
    bars = bootstrap(values, statistic=function)
    # `numpy.nan in bars` only matches the very same nan object (nan != nan),
    # so a NaN produced by a computation would be missed; test each bound.
    if any(numpy.isnan(bound) for bound in bars):
        return pattern % center
    diff = (bars[1] - bars[0]) / 2
    if decimals == 0:
        bars = (_round_significant(bars[0], 2), _round_significant(bars[1], 2))
        # A zero-width interval (e.g. all values identical) stays at 0.
        diff = _round_significant(diff, 1)
    if plusminus:
        return (pattern + " $\\pm$ " + pattern) % (center, diff)
    return (pattern + " (" + pattern + "-" + pattern + ")") % ((center,) + bars)

def median_with_errorbars(values, decimals=0, plusminus=False, function=numpy.median):
    """Same as mean_with_errorbars, with the median as the default statistic."""
    return mean_with_errorbars(
        values,
        decimals=decimals,
        plusminus=plusminus,
        function=function,
    )




# Mutations

In [2]:
# Load the annotated mutation table. The binding_peptides cells are turned
# from text into Python objects; empty cells become an empty dict.
# NOTE(review): the converter calls eval() on file contents, which executes
# arbitrary expressions. That is only acceptable while this CSV is trusted;
# if the cells are plain dict literals, ast.literal_eval would parse them
# without that risk - confirm before switching.
mutations = pandas.read_csv("../data/derived/annotated_mutations.with_mhc_binders.csv.bz2",
                            converters={'binding_peptides': lambda x: eval(x) if x else {}})
#mutations["indel"] = mutations.ref.str.len() != mutations.alt.str.len()
#mutations["interesting"] = (mutations.context_mutation_3p5p == "C(C>A)C").astype(float)
#mutations["interesting"] = (mutations.context_mutation == "C>A").astype(float)

#mutations["interesting"].mean()

# Missing ref/alt alleles become empty strings.
mutations["ref"] = mutations.ref.fillna("")
mutations["alt"] = mutations.alt.fillna("")

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Show the loaded table; how much is rendered is limited by the
# display.max_rows / display.max_columns options set in the first cell.
mutations

Unnamed: 0,source_id,donor,genome,contig,interbase_start,interbase_end,ref,alt,sources,effect,gene,context_5_prime,context_3_prime,context_mutation,binding_affinity,binding_allele,dna_alt_reads,dna_ref_reads,dna_total_reads,rna_alt_reads,rna_ref_reads,rna_total_reads,context_mutation_3p5p,binding_peptides,num_binders,indel,snv,mnv,confident,vaf,any_alt_reads,mutation_id,site_id,unique_to_treated
0,AOCS-001-1-7,AOCS-001,GRCh37,14,24774226.0,24774227.0,C,A,annotated_mutations,silent,NOP9,ACCTTCCTAAAGCGG,GAGAGGCTTGGGAAC,C>A,,,17.0,28.0,45.0,185.0,244.0,429.0,G(C>A)G,{},0,False,True,False,True,0.377777,True,14:24774226 C>A,14:24774226,False
1,AOCS-001-1-7,AOCS-001,GRCh37,X,19968971.0,19968972.0,T,C,annotated_mutations,p.I548M,CXorf23,GTCATTTGGATCTAT,ATTTTGATCAGAGTC,T>C,94.33,B*41:01,16.0,51.0,67.0,0.0,77.0,77.0,T(T>C)A,"{'QTLIKMIDPN': 28168.48, 'SEQTLIKMID': 7669.07...",5,False,True,False,True,0.238806,True,X:19968971 T>C,X:19968971,False
2,AOCS-001-1-7,AOCS-001,GRCh37,8,35406835.0,35406836.0,G,A,annotated_mutations,p.E44K,UNC5D,CTGATGGGATGGATT,GGGAAGGGCTTCGCC,C>T,80.84,A*11:01,14.0,39.0,53.0,0.0,0.0,0.0,T(C>T)G,"{'DNGEALPKS': 44344.5, 'PKSIPSAPGTL': 38251.22...",4,False,True,False,True,0.264150,True,8:35406835 G>A,8:35406835,False
3,AOCS-001-1-7,AOCS-001,GRCh37,7,47872830.0,47872831.0,A,G,annotated_mutations,p.L2065P,HUS1 PKD1L1,CTGCATCAGCCATTC,CTCTGGGAGTGGCAG,T>C,343.29,A*11:01,16.0,31.0,47.0,0.0,1.0,1.0,C(T>C)C,"{'PSGSGRAQ': 44743.34, 'KQPASAIPSGS': 29391.39...",2,False,True,False,True,0.340425,True,7:47872830 A>G,7:47872830,False
4,AOCS-001-1-7,AOCS-001,GRCh37,17,17721666.0,17721667.0,G,C,annotated_mutations,p.R394G,SREBF1,AAATCTGCTGTCTTG,GCAAGGCCATCGACT,C>G,43.42,C*15:02,12.0,22.0,34.0,200.0,54.0,254.0,G(C>G)G,"{'LNKSAVLG': 37950.2, 'SAVLGKAIDY': 8402.54, '...",2,False,True,False,True,0.352940,True,17:17721666 G>C,17:17721666,False
5,AOCS-001-1-7,AOCS-001,GRCh37,7,148169063.0,148169064.0,G,T,annotated_mutations,intergenic,,CAAAGAAGGGCCACC,AATCTAAACCAGCAC,C>A,,,19.0,39.0,58.0,0.0,0.0,0.0,C(C>A)A,{},0,False,True,False,True,0.327586,True,7:148169063 G>T,7:148169063,False
6,AOCS-001-1-7,AOCS-001,GRCh37,1,107152783.0,107152784.0,G,A,annotated_mutations,intergenic,,TCCACTGTGTTTGTT,ATCCCTTACAAAAGA,C>T,,,14.0,69.0,83.0,0.0,0.0,0.0,T(C>T)A,{},0,False,True,False,True,0.168674,True,1:107152783 G>A,1:107152783,False
7,AOCS-001-1-7,AOCS-001,GRCh37,2,142133268.0,142133269.0,T,A,annotated_mutations,intronic,LRP1B,GCATTAAATTCCCAA,ATTGCACTTTGGTCC,T>A,,,9.0,28.0,37.0,0.0,0.0,0.0,A(T>A)A,{},0,False,True,False,True,0.243243,True,2:142133268 T>A,2:142133268,False
8,AOCS-001-1-7,AOCS-001,GRCh37,2,143687576.0,143687577.0,G,A,annotated_mutations,intronic,KYNU,GCCCTACAAAGCCTT,TGTTGTGAGGGAAAA,C>T,,,11.0,55.0,66.0,0.0,0.0,0.0,T(C>T)T,{},0,False,True,False,True,0.166666,True,2:143687576 G>A,2:143687576,False
9,AOCS-001-1-7,AOCS-001,GRCh37,1,144017729.0,144017730.0,G,T,annotated_mutations,non-coding-transcript,SRGAP2B,TGTAAGTGCACCAAT,GACACTCTGTATCTA,C>A,,,26.0,158.0,184.0,0.0,0.0,0.0,T(C>A)G,{},0,False,True,False,True,0.141304,True,1:144017729 G>T,1:144017729,False


In [4]:
# Build the per-mutation table that is exported as Additional File 2.
prepared = mutations.copy()

# Keep only the peptides whose predicted affinity is below the binder cutoff
# and render them as space-separated "PEPTIDE:affinity" tokens. The cutoff is
# the notebook-level constant rather than a repeated literal 500.
prepared["binding_peptides"] = prepared.binding_peptides.map(
    lambda d: " ".join(
        "%s:%0.1f" % (k, v)
        for (k, v) in d.items()
        if v < mhc_binding_threshold_affinity))

# Blank out missing gene names. NaN is truthy, so it needs its own check to
# end up as "" like the empty and "None" cases.
prepared["gene"] = prepared.gene.map(
    lambda d: "" if (pandas.isnull(d) or not d or d == "None") else d)

# Position is reported as interbase_start + 1.
prepared["position"] = prepared.interbase_start + 1

# Columns of the exported file, in output order.
cols = [
    "source_id",
    "donor",
    "contig",
    "position",
    "ref",
    "alt",
    "gene",
    "effect",
    "dna_alt_reads",
    "dna_ref_reads",
    "dna_total_reads",
    "rna_alt_reads",
    "rna_ref_reads",
    "rna_total_reads",
    "context_mutation_3p5p",
    "binding_peptides",
    "unique_to_treated",
]
prepared = prepared[cols]



In [5]:
# Shared DataFrame.to_csv options: leave out the index and write floats
# without decimals.
to_csv_args = dict(index=False, float_format="%0.0f")

In [6]:
#prepared.iloc[0:10000].to_csv("/tmp/test_csv.csv", **to_csv_args)
#len(prepared.iloc[0:10000].to_csv(**to_csv_args)) / 1024 / 1024

In [7]:
# Write the prepared mutation table as Additional File 2 using the shared
# CSV options (no index, floats without decimals).
prepared.to_csv("../additional-files/Additional File 2.csv", **to_csv_args)
# Zip the CSV at the highest compression level (-9); the archive is written
# next to the CSV.
!zip -9 ../additional-files/Additional\ File\ 2.csv.zip ../additional-files/Additional\ File\ 2.csv

  adding: ../additional-files/Additional File 2.csv (deflated 82%)


In [8]:
#len(mutations.to_csv(index=False)) / 1024 / 1024

# Sources

In [19]:
# Load the extended sources table (with signature counts), indexed by
# source_id.
sources_with_signature_counts = pandas.read_csv(
    "../data/derived/sources.extended.with_signature_counts.csv",
    index_col="source_id")

In [28]:
# interval_days for the sources whose specific_treatment is "AMCT".
# .loc replaces the .ix indexer, which no longer exists in current pandas;
# for a boolean row mask both select the same rows.
sources_with_signature_counts.loc[
    sources_with_signature_counts.specific_treatment == "AMCT",
    "interval_days"]

source_id
AOCS-034-3-8     1597
AOCS-064-3-3     1301
AOCS-065-3-6      420
AOCS-086-3-2     2009
AOCS-088-3-8     2712
AOCS-091-3-0     1206
AOCS-092-3-3     1370
AOCS-093-3-6     1361
AOCS-094-6-X      758
AOCS-095-3-1     1161
AOCS-117-3-3     1755
AOCS-119-3-9     1296
AOCS-120-3-6     2291
AOCS-134-3-9     1479
AOCS-135-3-1      485
AOCS-135-8-X      999
AOCS-137-3-7      923
AOCS-138-3-X     1327
AOCS-139-12-5     920
AOCS-139-19-0     920
AOCS-139-6-3      920
AOCS-141-3-2     1464
AOCS-141-8-0     1743
AOCS-142-3-5      943
AOCS-150-3-1      761
AOCS-150-8-X      956
AOCS-155-3-5     2897
AOCS-167-13-9     937
AOCS-167-16-X     937
AOCS-167-3-2      937
Name: interval_days, dtype: int64

In [18]:
# Sorted, de-duplicated donor ids of the treated_paired sources, shown as a
# comma-separated string with the "AOCS-" prefix stripped.
# .loc replaces the .ix indexer, which no longer exists in current pandas.
", ".join(
    sorted(set(
        sources_with_signature_counts
        .loc[sources_with_signature_counts.treated_paired]
        .submitted_donor_id_x
        .tolist()))).replace("AOCS-", "")

'034, 064, 065, 086, 088, 091, 092, 093, 094, 095, 137, 139'

In [22]:
# Show the first source row as a dict to inspect the available fields.
sources_with_signature_counts.iloc[0].to_dict()

{'Cisplatin C Elegans expressed neoantigens': 0.0,
 'Cisplatin C Elegans mutations': 0.0,
 'Cisplatin C Elegans neoantigens': 0.0,
 'Cisplatin Gallus gallus expressed neoantigens': 0.0,
 'Cisplatin Gallus gallus mutations': 0.0,
 'Cisplatin Gallus gallus neoantigens': 0.0,
 'CollectionPoint': 'Primary',
 'Cyclophosphamide Gallus gallus expressed neoantigens': 0.0,
 'Cyclophosphamide Gallus gallus mutations': 0.0,
 'Cyclophosphamide Gallus gallus neoantigens': 0.0,
 'DNA_biospecimen': 'AOCS-001-1-7',
 'DNA_id': 'AOCS_001_ICGC_DBPC_20130205_002',
 'DNA_sample_string': 'ICGCDBPC20130205002',
 'Etoposide Gallus gallus expressed neoantigens': 0.0,
 'Etoposide Gallus gallus mutations': 0.0,
 'Etoposide Gallus gallus neoantigens': 0.0,
 'RNA biospecimen': 'AOCS-001-2-0',
 'RNA data file': '130906_D81P8DQ1_0153_C2704ACXX.nopd.AOCS_001_ICGCDBDE20130916001',
 'RNA_ID': 'ICGCDBDE20130916001',
 'Signature 1 expressed neoantigens': 0.084069181561445577,
 'Signature 1 mutations': 0.08277024326060845

In [12]:
# Load the treatments matrix, indexed by source_id, and display it.
# NOTE(review): judging by the rendered output this is one boolean column per
# drug - confirm against the file.
treatments_matrix = pandas.read_csv("../data/derived/treatments_matrix.csv", index_col="source_id")
treatments_matrix

Unnamed: 0_level_0,carboplatin,paclitaxel,liposomal doxorubicin,gemcitabine,cyclophosphamide,topotecan,cisplatin,olaparib,docetaxel,bevacizumab,etoposide,nab-paclitaxel,farletuzumab/placebo
source_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AOCS-001-1-7,False,False,False,False,False,False,False,False,False,False,False,False,False
AOCS-004-1-5,False,False,False,False,False,False,False,False,False,False,False,False,False
AOCS-005-1-8,False,False,False,False,False,False,False,False,False,False,False,False,False
AOCS-034-1-0,False,False,False,False,False,False,False,False,False,False,False,False,False
AOCS-034-3-8,True,True,True,True,False,True,False,False,False,False,False,False,False
AOCS-055-1-7,False,False,False,False,False,False,False,False,False,False,False,False,False
AOCS-056-1-X,False,False,False,False,False,False,False,False,False,False,False,False,False
AOCS-057-1-2,False,False,False,False,False,False,False,False,False,False,False,False,False
AOCS-058-1-5,False,False,False,False,False,False,False,False,False,False,False,False,False
AOCS-059-1-8,False,False,False,False,False,False,False,False,False,False,False,False,False


In [23]:
# Build the per-source table that is exported as Additional File 1.
prepared_sources = sources_with_signature_counts.copy()
del prepared_sources["Unnamed: 0"]
cols = []  # names of the columns to keep, in output order
# Iterate over a snapshot of the column names: the loop body adds and deletes
# columns on the frame while it runs.
for c in list(prepared_sources.columns):
    # Skip names that were deleted earlier in this loop (the "_y" half of a
    # collapsed pair).
    if c not in prepared_sources.columns:
        continue
    # "group" is left out of the export.
    if c in ["group"]:
        continue
    # Collapse a "<name>_x" / "<name>_y" pair into a single "<name>" column
    # holding the "_x" values. Pairs whose contents differ are printed so the
    # discrepancy can be reviewed; the "_x" values are kept either way.
    # NOTE(review): the suffixes look like leftovers of a pandas merge -
    # confirm upstream.
    if c.endswith("_x") and (c[:-2] + "_y") in set(prepared_sources.columns):
        if not (prepared_sources[c].equals(prepared_sources[(c[:-2] + "_y")])):
            print(c)
            print(prepared_sources[c])
            print(prepared_sources[(c[:-2] + "_y")])
        prepared_sources[c[:-2]] = prepared_sources[c]
        del prepared_sources[c]
        del prepared_sources[c[:-2] + "_y"]
        c = c[:-2]
    # For bam_path* columns, add a "filename" column holding only the basename
    # and record that new name instead, so the path column itself is not part
    # of the final selection.
    if c.startswith("bam_path"):
        new_c = c.replace("path", "filename")
        prepared_sources[new_c] = prepared_sources[c].map(os.path.basename)
        c = new_c
    
    cols.append(c)

# Keep only the collected columns, in the collected order.
prepared_sources = prepared_sources[cols]
# Relabel the three known treatment codes; any other value passes through
# unchanged.
prepared_sources["specific_treatment"] = prepared_sources["specific_treatment"].map(
    lambda v: {
        "treatment naive": "primary/untreated",
        "AMCT": "relapse/treated",
        "NACT": "primary/treated"}.get(v,v))

# Add one column per treatments_matrix column. Column assignment aligns on the
# index; both tables were read with index_col="source_id".
for c in treatments_matrix.columns:
    prepared_sources["%s" % c] = treatments_matrix[c]

# Rename RNA_ID to RNA_id; every other column name is kept as is.
prepared_sources.columns = [{"RNA_ID": "RNA_id"}.get(x, x) for x in prepared_sources.columns]
prepared_sources.to_csv("../additional-files/Additional File 1.csv")

# HLA types

In [25]:
# Export the HLA types as Additional File 3, leaving out donor AOCS-002.
hla = pandas.read_csv("../data/external/hla_types.csv", index_col="donor")
# .loc replaces the .ix indexer, which no longer exists in current pandas;
# for a boolean row mask both select the same rows.
hla = hla.loc[hla.index != "AOCS-002"]
hla.to_csv("../additional-files/Additional File 3.csv")

# Shared peptides

In [15]:
# Export the shared peptides as Additional File 6, without the row index.
shared = pandas.read_csv("../data/derived/shared_peptides.csv")
# The "indices" column is removed explicitly; the selection below would not
# include it either, but this line additionally requires the column to exist.
del shared["indices"]
# Fix the set and order of the exported columns.
shared = shared[["donors", "peptide", "genes", "effects"]]
shared.to_csv("../additional-files/Additional File 6.csv", index=False)

# All signatures

In [16]:
# Re-export the main signatures table as Additional File 4. The first CSV
# column is used as the index and floats are written with six decimals.
sigs = pandas.read_csv("../data/derived/main_signatures.csv", index_col=0)
sigs.to_csv("../additional-files/Additional File 4.csv", float_format="%0.6f")

# Deconstructsigs results

In [17]:
# Export the cleaned deconstructSigs output as Additional File 5.
drs = pandas.read_csv("../data/derived/deconstructsigs_output.cleaned.csv")
# Relabel kind: "all" stays "all", "new" becomes "unique to treated". Any
# other value would become NaN, because Series.map with a dict leaves
# unmatched keys unmapped.
drs["kind"] = drs.kind.map({"all": "all", "new": "unique to treated"})
# Signature columns are picked by position: everything except the first two
# and the last three columns.
# NOTE(review): this depends on the column order of the input file - confirm
# that it still matches.
sig_columns = list(drs.columns)[2:-3]
# Put the identifying columns first, then the signatures; sort by source_id.
drs = drs[["source_id", "treated", "kind"] + sig_columns].sort_values("source_id")
drs.to_csv("../additional-files/Additional File 5.csv", float_format="%0.6f", index=False)
