In [1]:
import pandas, seaborn, scipy, numpy, matplotlib, collections, sklearn, math

%matplotlib inline
from matplotlib import pyplot

# Constants
# Size associated with each sequencing library type, keyed by library name.
# NOTE(review): presumably the number of bases covered by whole-genome (WGS)
# and whole-exome (WES) sequencing, used to normalise mutation counts --
# confirm the source and units of these figures.
library_sizes = {
    'WGS': 3002000000,
    'WES': 50160183,
}
# Cut-off applied to predicted MHC binding affinity.
# NOTE(review): presumably in nM, with lower values meaning stronger binding --
# confirm against the code that consumes it.
mhc_binding_threshold_affinity = 500



%matplotlib inline
%config InlineBackend.figure_format = 'png'

matplotlib.rc("savefig", dpi=800)
#matplotlib.rc("savefig", dpi=72)
#matplotlib.rc('text', usetex=False)
#reload(c)

pandas.set_option('display.max_rows', 500)
pandas.set_option('display.max_columns', 500)

def print_full(x):
    """Print ``x`` with pandas' row limit raised to ``len(x)``.

    The ``display.max_rows`` option is changed only for the duration of the
    print and is then put back to whatever value it had before the call
    (not to the pandas default), even if printing raises.

    Parameters
    ----------
    x : object with a length, typically a pandas Series or DataFrame.
    """
    with pandas.option_context('display.max_rows', len(x)):
        print(x)
    
def bootstrap(values, statistic=numpy.mean, samples=5000, random_state=None):
    """Bootstrap a percentile interval for ``statistic`` over ``values``.

    Missing values are dropped, then ``samples`` resamples of the same size
    are drawn with replacement and ``statistic`` is evaluated on each one.

    Parameters
    ----------
    values : sequence or pandas.Series
        Observations; NaN/None entries are ignored.
    statistic : callable
        Function applied to each resampled pandas Series.
    samples : int
        Number of bootstrap resamples.
    random_state : None, int or numpy.random.RandomState
        Seed or generator used for resampling. ``None`` (the default) draws
        from numpy's global random state, so results vary between calls;
        pass a value to make the interval reproducible.

    Returns
    -------
    tuple
        ``(5th percentile, 95th percentile)`` of the resampled statistics,
        or ``(nan, nan)`` when fewer than two non-missing values remain.
    """
    observed = pandas.Series(values).dropna()
    if len(observed) <= 1:
        return (numpy.nan, numpy.nan)
    # A single generator is shared by all resamples so that a seed yields
    # `samples` different draws rather than the same draw repeated.
    if random_state is None or isinstance(random_state, numpy.random.RandomState):
        rng = random_state
    else:
        rng = numpy.random.RandomState(random_state)
    replicates = [
        statistic(observed.sample(n=len(observed), replace=True, random_state=rng))
        for _ in range(samples)
    ]
    return (numpy.percentile(replicates, 5), numpy.percentile(replicates, 95))

def round_to_n(x, n):
    """Round ``x`` to ``n`` significant figures.

    Zero and non-finite values (NaN, +/-inf) have no order of magnitude and
    are returned unchanged; negative values are rounded by magnitude.
    """
    if x == 0 or not numpy.isfinite(x):
        return x
    # Position of the leading digit: 0 for 1-9, 1 for 10-99, -1 for 0.1-0.9, ...
    magnitude = int(math.floor(math.log10(abs(x))))
    return round(x, (n - 1) - magnitude)

def mean_with_errorbars(values, decimals=0):
    """Format the mean of ``values`` with its bootstrap interval.

    Parameters
    ----------
    values : sequence or pandas.Series
        Observations; missing entries are ignored for both the mean and the
        interval so the two are computed from the same data.
    decimals : int
        Number of decimal places in the output. When 0, the interval bounds
        are additionally rounded to two significant figures.

    Returns
    -------
    str
        ``"mean (low - high)"``, or just ``"mean"`` when ``bootstrap`` cannot
        produce an interval (it returns NaN bounds).
    """
    pattern = "%%0.%df" % decimals
    # bootstrap() drops missing values, so drop them for the mean as well.
    mean = pandas.Series(values).dropna().mean()
    bars = bootstrap(values)
    # NaN never compares equal to itself, so test with isnan rather than `in`.
    if numpy.isnan(bars).any():
        return pattern % mean
    if decimals == 0:
        # A bound of exactly zero has no significant figures to round to.
        bars = tuple(round_to_n(bar, 2) if bar != 0 else bar for bar in bars)
    return (pattern + " (" + pattern + " - " + pattern + ")") % ((mean,) + tuple(bars))




In [2]:
# Load the source table (indexed by source_id) and keep only the AOCS cohort.
# `.loc` is used for the boolean mask because `.ix` no longer exists in pandas.
sources = pandas.read_csv("../data/sources.csv", index_col="source_id")
sources = sources.loc[sources.cohort == "AOCS"]


In [3]:
# Read the signature table (first CSV column becomes the row index) and
# rewrite square brackets in the column labels as parentheses,
# e.g. "A[C>A]A" -> "A(C>A)A".
all_signatures = pandas.read_csv(
    "../data/all_signatures_for_deconstructsigs_with_chicken.csv",
    index_col=0,
).rename(columns=lambda label: label.replace("[", "(").replace("]", ")"))
all_signatures

Unnamed: 0,A(C>A)A,A(C>A)C,A(C>A)G,A(C>A)T,A(C>G)A,A(C>G)C,A(C>G)G,A(C>G)T,A(C>T)A,A(C>T)C,A(C>T)G,A(C>T)T,A(T>A)A,A(T>A)C,A(T>A)G,A(T>A)T,A(T>C)A,A(T>C)C,A(T>C)G,A(T>C)T,A(T>G)A,A(T>G)C,A(T>G)G,A(T>G)T,C(C>A)A,C(C>A)C,C(C>A)G,C(C>A)T,C(C>G)A,C(C>G)C,C(C>G)G,C(C>G)T,C(C>T)A,C(C>T)C,C(C>T)G,C(C>T)T,C(T>A)A,C(T>A)C,C(T>A)G,C(T>A)T,C(T>C)A,C(T>C)C,C(T>C)G,C(T>C)T,C(T>G)A,C(T>G)C,C(T>G)G,C(T>G)T,G(C>A)A,G(C>A)C,G(C>A)G,G(C>A)T,G(C>G)A,G(C>G)C,G(C>G)G,G(C>G)T,G(C>T)A,G(C>T)C,G(C>T)G,G(C>T)T,G(T>A)A,G(T>A)C,G(T>A)G,G(T>A)T,G(T>C)A,G(T>C)C,G(T>C)G,G(T>C)T,G(T>G)A,G(T>G)C,G(T>G)G,G(T>G)T,T(C>A)A,T(C>A)C,T(C>A)G,T(C>A)T,T(C>G)A,T(C>G)C,T(C>G)G,T(C>G)T,T(C>T)A,T(C>T)C,T(C>T)G,T(C>T)T,T(T>A)A,T(T>A)C,T(T>A)G,T(T>A)T,T(T>C)A,T(T>C)C,T(T>C)G,T(T>C)T,T(T>G)A,T(T>G)C,T(T>G)G,T(T>G)T
Signature 1,0.011098,0.009149,0.00149007,0.006234,0.001801,0.002581,0.000593,0.002964,0.029515,0.014323,0.171647,0.012624,0.00402152,0.002371,0.002811,0.008361,0.013916,0.006275,0.010138,0.009256,0.001588,0.001784,0.001386,0.003159,0.006596,0.007342,0.000893,0.007187,0.001285,0.000702,0.000506,0.001382,0.020896,0.018502,0.095577,0.017113,0.001183,0.001903,0.001488,0.002179344,0.004177,0.005253,0.007013,0.006713813,0.000303,0.002099,0.0016,0.002759,0.008233,0.005758,0.000616,0.004459,0.000602,0.002393,2.48534e-07,0.00089,0.024944,0.027161,0.103571,0.01769,0.000689,0.000552,0.0012,0.002107,0.011248,0.007,0.004977593,0.010667,9.9e-05,0.0002023656,0.001188,0.000801,0.01225,0.011162,0.002275,0.015259,0.001875,0.002067,0.000304897,0.003152,0.014492,0.017681,0.076002,0.013762,0.0056,0.001999,0.00109,0.003981,0.008074,0.004857,0.008325,0.006257,0.001397554,0.001292,0.002031077,0.00403
Signature 2,0.000683,0.000619,9.927896e-05,0.000324,0.000263,0.00027,0.000219,0.000611,0.007442,0.002726,0.003322,0.003327,1.32377e-07,0.000113,0.000533,0.000149,0.001304,0.000426,0.000575,0.001488,3.4e-05,2.5e-05,0.000273,0.000218,0.000677,0.000214,7e-06,0.000416,2.8e-05,0.00028,1.9e-05,0.000313,0.01502,0.003517,0.004979,0.008957,0.000155,0.000464,0.00023,0.0005748856,0.000547,0.000392,0.000362,0.0005609001,0.000114,2.2e-05,0.000228,6.7e-05,0.000352,0.000134,0.000178,0.000123,4.5e-05,1.5e-05,4.066589e-05,0.000268,0.006391,0.001996,0.000303,0.003266,0.000115,0.000294,8.9e-05,0.000216,5e-06,0.000186,4.95044e-07,0.000579,9.6e-05,4.700238e-05,0.00011,8.6e-05,0.015127,0.006532,0.001656,0.012395,0.037242,1.9e-05,0.001625465,0.06688,0.419941,0.081972,0.04772,0.228675,8.1e-05,5e-06,6.7e-05,0.000276,0.000102,0.00047,0.000192,0.000585,7.173695e-05,1.4e-05,0.0002066152,2.4e-05
Signature 3,0.022172,0.017872,0.00213834,0.016265,0.024003,0.01216,0.005275,0.023278,0.017872,0.008896,0.003573,0.014798,0.008428564,0.007373,0.007357,0.008754,0.013036,0.009186,0.011717,0.016979,0.002351,0.001464,0.009054,0.007031,0.018782,0.01576,0.001963,0.014723,0.016833,0.013531,0.004176,0.024046,0.014395,0.008545,0.003518,0.016076,0.007571,0.012725,0.011509,0.01645618,0.007895,0.014431,0.008423,0.01193243,0.001974,0.005824,0.010465,0.008724,0.009697,0.010843,0.000929,0.012215,0.011917,0.009824,0.001671054,0.017914,0.016127,0.008209,0.001213,0.010612,0.004435,0.005615,0.00807,0.008679,0.00685,0.006261,0.006098763,0.007509,0.004144,0.004501985,0.016391,0.007067,0.011653,0.016607,0.001357,0.016328,0.016041,0.02015,0.002527911,0.032674,0.00888,0.01353,0.001705,0.010304,0.007133,0.009103,0.006566,0.014712,0.009115,0.010954,0.006113,0.010774,0.005427184,0.00616,0.01107653,0.013001
Signature 4,0.0365,0.0309,0.0183,0.0243,0.0097,0.0054,0.0031,0.0054,0.012,0.0075,0.0028,0.0059,0.0048,0.0039,0.01,0.003,0.0084,0.002,0.0081,0.0036,0.0,0.0002,0.0015,0.0002,0.0461,0.0614,0.0088,0.0432,0.0105,0.0097,0.0063,0.0094,0.021,0.0144,0.0076,0.0201,0.0075,0.0111,0.0342,0.0115,0.0052,0.0026,0.01,0.0054,0.0,0.0013,0.0046,0.0012,0.0376,0.0399,0.0227,0.0258,0.007,0.0091,0.0062,0.006,0.0087,0.008,0.0023,0.0082,0.0069,0.0052,0.0133,0.0045,0.0061,0.0016,0.0042,0.0024,0.0,0.0,0.0018,0.0002,0.033,0.0538,0.0104,0.037,0.0032,0.0105,0.0031,0.005,0.0035,0.007,0.0011,0.0077,0.0045,0.0046,0.0082,0.0045,0.0028,0.0016,0.0036,0.0022,0.0,0.0003,0.003,0.0011
Signature 5,0.014942,0.008961,0.002207846,0.009207,0.011671,0.007292,0.002304,0.011696,0.021839,0.012756,0.01676,0.016478,0.008902904,0.007399,0.011508,0.011194,0.035367,0.013771,0.028449,0.027303,0.003462,0.002247,0.00549,0.003821,0.009675,0.004952,0.002801,0.011013,0.007538,0.007633,0.002614,0.009417,0.022769,0.017509,0.012862,0.02043,0.005043,0.006209,0.010508,0.009784641,0.014239,0.01249,0.018742,0.01905408,0.002042,0.003479,0.007147,0.011487,0.011892,0.009248,0.002809,0.010301,0.005559,0.005389,0.00110059,0.006041,0.020038,0.018022,0.013194,0.019503,0.00674,0.004022,0.006814,0.005101,0.016295,0.009575,0.01410393,0.015667,0.001628,0.0003277349,0.005949,0.003307,0.014774,0.012043,0.003902,0.018243,0.002681,0.007924,0.001319076,0.006645,0.010998,0.020645,0.007534,0.011787,0.009206,0.006835,0.007144,0.010241,0.01706,0.014196,0.012597,0.017375,0.005202874,0.005132,0.006055254,0.01337
Signature 6,0.0017,0.0028,0.0005,0.0019,0.0013,0.0012,0.0,0.0018,0.0312,0.0163,0.0908,0.0149,0.0006,0.0033,0.0,0.0053,0.0075,0.0056,0.0217,0.0023,0.0,0.0017,0.0007,0.0029,0.0101,0.0241,0.0091,0.0571,0.0,0.0,0.0,0.0002,0.0085,0.0099,0.0901,0.0087,0.0001,0.0026,0.0008,0.0011,0.0062,0.004,0.027,0.0033,0.0001,0.004,0.005,0.0086,0.0024,0.0058,0.0021,0.0087,0.0,0.003,0.0,0.0017,0.0653,0.0773,0.1339,0.0524,0.0,0.0028,0.0006,0.0021,0.0122,0.0059,0.0115,0.0042,0.0,0.0016,0.001,0.0035,0.0017,0.0029,0.0011,0.0058,0.0,0.0002,0.0,0.0001,0.0074,0.0067,0.0391,0.0047,0.0002,0.0008,0.0,0.0007,0.0059,0.0035,0.0106,0.0029,0.0009,0.0019,0.0011,0.0072
Signature 7,0.0004,0.0005,0.0,0.0004,0.0,0.0,0.0,0.0001,0.0,0.0197,0.0001,0.0043,0.001,0.0008,0.0009,0.0035,0.0005,0.0001,0.0007,0.0011,0.0,0.0,0.0009,0.0,0.0012,0.0006,0.0,0.0013,0.0001,0.0004,0.0006,0.0003,0.0754,0.1007,0.0208,0.0788,0.0,0.0014,0.0007,0.0018,0.001,0.0008,0.001,0.0045,0.0,0.0008,0.0009,0.0013,0.0003,0.0001,0.0,0.0001,0.0,0.0004,0.0,0.0,0.0,0.021,0.0002,0.0161,0.0,0.0,0.0006,0.001,0.0002,0.0001,0.0003,0.0081,0.0,0.0,0.0017,0.0009,0.001,0.002,0.0002,0.0013,0.0002,0.001,0.0003,0.0007,0.1202,0.2887,0.0992,0.0844,0.001,0.0015,0.001,0.005,0.0023,0.0018,0.0019,0.0024,0.0,0.001,0.001,0.0014
Signature 8,0.036718,0.033246,0.002525311,0.033599,0.008357,0.004306,0.000584,0.008635,0.018067,0.00565,0.019265,0.020806,0.01336508,0.012431,0.014037,0.02411,0.016207,0.007788,0.009853,0.021829,0.00422,0.000794,0.006866,0.00294,0.031724,0.025505,0.00116,0.028791,0.006619,0.006078,0.000656,0.007805,0.004866,0.00398,0.008339,0.018844,0.012024,0.017881,0.016357,0.0262478,0.005174,0.010937,0.005658,0.01112039,0.002424,0.002096,0.006604,0.004867,0.023682,0.015822,0.000851,0.021061,0.003725,0.003047,0.0003212128,0.005776,0.002261,0.001617,0.011607,0.006285,0.008139,0.00791,0.009057,0.017812,0.005721,0.005181,0.005040707,0.008488,0.000975,0.0005248216,0.006088,0.005427,0.027032,0.01809,0.001695,0.038141,0.004118,0.0038,2.565302e-05,0.006248,0.007181,0.004061,0.005535,0.012209,0.01514,0.012094,0.008363,0.027653,0.004901,0.006081,0.001712,0.010003,0.001743221,0.00255,0.006030395,0.007224
Signature 9,0.012,0.0067,0.0005,0.0068,0.0048,0.0023,0.0,0.0038,0.0093,0.0056,0.0125,0.0076,0.0121,0.0042,0.0068,0.0185,0.0252,0.0105,0.0176,0.0247,0.0335,0.0051,0.0052,0.0193,0.0098,0.0057,0.0,0.0091,0.0018,0.002,0.0,0.0039,0.0098,0.0069,0.0076,0.0097,0.0055,0.0049,0.0051,0.0069,0.0154,0.0118,0.0154,0.0275,0.0216,0.0064,0.0126,0.0509,0.0118,0.0092,0.0,0.0085,0.0011,0.0029,0.0,0.0044,0.0062,0.0069,0.0088,0.0101,0.003,0.0029,0.0028,0.0036,0.0091,0.0097,0.0094,0.0126,0.0072,0.0006,0.005,0.0185,0.0222,0.0043,0.0,0.0322,0.0033,0.0025,0.0,0.0049,0.005,0.0084,0.0047,0.0096,0.0215,0.0027,0.014,0.0139,0.0156,0.0137,0.0098,0.0309,0.0502,0.0081,0.0088,0.0545
Signature 10,0.0007,0.001,0.0003,0.0092,0.0005,0.0003,0.0,0.0002,0.0,0.0032,0.0126,0.0047,0.0,0.0002,0.0,0.0012,0.0031,0.0052,0.0026,0.004,0.0,0.004,0.0015,0.0137,0.0031,0.0009,0.0007,0.016,0.0,0.0,0.0,0.0,0.0012,0.0024,0.0109,0.0043,0.0,0.0001,0.0003,0.0009,0.0005,0.0019,0.0015,0.002,0.0022,0.0018,0.0037,0.0182,0.0014,0.0022,0.0002,0.0088,0.0,0.0002,0.0,0.0004,0.0,0.0134,0.027,0.0152,0.0,0.0001,0.0,0.0004,0.0064,0.0101,0.0054,0.0124,0.0,0.002,0.0009,0.003,0.0374,0.0103,0.0031,0.3083,0.0,0.0,0.0,0.0001,0.0037,0.0211,0.2141,0.0392,0.0,0.0008,0.0,0.0028,0.003,0.0097,0.0065,0.0099,0.005,0.0092,0.0022,0.0633


In [7]:
# Load the deconvolution output, indexed by (source_id, kind).
deconvolution_df = pandas.read_csv(
    "../data/deconstructsigs_output_with_chicken.cleaned.csv",
    index_col=["source_id", "kind"],
)
# Move every column whose name ends in ".1" to the same name without the
# suffix (replacing any column already stored under that name).
for col in deconvolution_df.columns:
    if not col.endswith(".1"):
        continue
    deconvolution_df[col[:-2]] = deconvolution_df.pop(col)
deconvolution_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Signature 1,Signature 2,Signature 3,Signature 4,Signature 5,Signature 6,Signature 8,Signature 9,Signature 10,Signature 12,Signature 13,Signature 14,Signature 16,Signature 18,Signature 19,Signature 22,Signature 23,Signature 25,Signature 26,Signature 29,Chicken cisplatin,Chicken cyclophosphamide,Chicken etoposide,treated,kind,source_id
source_id,kind,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
etoposide,check,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.924169,False,check,etoposide
cyclophosphamide,check,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.965401,0.0,False,check,cyclophosphamide
cisplatin,check,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.938257,0.0,0.0,False,check,cisplatin
AOCS-166-1/SP102133,all,0.0,0.0,0.0,0.0,0.0,0.063701,0.0,0.250905,0.0,0.514761,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.094825,0.0,0.0,0.0,0.0,False,all,AOCS-166-1/SP102133
AOCS-092-13/SP101630,new,0.0,0.0,0.158093,0.138944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.093589,0.107358,0.146737,0.0,0.0,0.0,0.07562,0.156505,0.0,True,new,AOCS-092-13/SP101630
AOCS-119-13/SP101732,all,0.0,0.0,0.243012,0.0,0.0,0.0,0.474244,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.067429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,all,AOCS-119-13/SP101732
AOCS-135-14/SP101860,all,0.0,0.0,0.279925,0.0,0.124117,0.0,0.373382,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,all,AOCS-135-14/SP101860
AOCS-135-13/SP101855,all,0.0,0.0,0.280556,0.0,0.106046,0.0,0.392249,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,all,AOCS-135-13/SP101855
AOCS-088-1/SP101610,all,0.0,0.0,0.339112,0.0,0.0,0.0,0.432023,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,all,AOCS-088-1/SP101610
AOCS-150-13/SP102010,all,0.0,0.0,0.349613,0.0,0.0,0.0,0.247126,0.0,0.0,0.0,0.0,0.0,0.061296,0.0,0.088898,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,all,AOCS-150-13/SP102010


In [31]:
# Rows whose kind is "all", with the bookkeeping columns removed so that only
# the per-signature columns remain. `.loc` replaces the removed `.ix` indexer,
# and `drop` returns a new frame so `deconvolution_df` is left untouched.
full_signatures = (
    deconvolution_df.loc[deconvolution_df.kind == "all"]
    .drop(["treated", "kind", "source_id"], axis=1)
)

# Index by source_id alone (drop the "kind" level).
full_signatures.index = full_signatures.index.droplevel(1)

# Untreated sources, in the order they appear in `sources`. `.ix` silently
# returned all-NaN rows for ids without a deconvolution (removed again by
# dropna); `.loc` raises for unknown labels, so those ids are filtered first.
untreated_source_ids = [
    s for s in sources.loc[~sources.treated].index if s in full_signatures.index
]
untreated_full_signatures = full_signatures.loc[untreated_source_ids].dropna()
untreated_full_signatures

Unnamed: 0_level_0,Signature 1,Signature 2,Signature 3,Signature 4,Signature 5,Signature 6,Signature 8,Signature 9,Signature 10,Signature 12,Signature 13,Signature 14,Signature 16,Signature 18,Signature 19,Signature 22,Signature 23,Signature 25,Signature 26,Signature 29,Chicken cisplatin,Chicken cyclophosphamide,Chicken etoposide
source_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
AOCS-001-1/SP101515,0.081648,0.0,0.426233,0.0,0.0,0.0,0.151482,0.0,0.0,0.0,0.0,0.0,0.142125,0.0,0,0,0,0,0.0,0,0,0.0,0.0
AOCS-002-1/SP101517,0.21174,0.0,0.077292,0.0,0.0,0.0,0.237462,0.0,0.0,0.0,0.0,0.0,0.281761,0.0,0,0,0,0,0.0,0,0,0.0,0.065608
AOCS-004-1/SP101519,0.0,0.0,0.356411,0.0,0.102398,0.0,0.152115,0.0,0.0,0.0,0.0,0.0,0.078702,0.0,0,0,0,0,0.0,0,0,0.0,0.0
AOCS-005-1/SP101521,0.133929,0.0,0.188933,0.0,0.248625,0.0,0.111107,0.069122,0.0,0.0,0.080032,0.0,0.0,0.0,0,0,0,0,0.0,0,0,0.0,0.0
AOCS-034-1/SP101523,0.084477,0.0,0.63285,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.089499,0.0,0,0,0,0,0.0,0,0,0.0,0.0
AOCS-055-1/SP101526,0.099431,0.071504,0.292034,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.308129,0.0,0.095993,0.0,0,0,0,0,0.0,0,0,0.0,0.0
AOCS-056-1/SP101528,0.106632,0.0,0.490868,0.0,0.0,0.0,0.2338,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0,0.0,0.0
AOCS-057-1/SP101530,0.071457,0.0,0.373355,0.0,0.194589,0.0,0.197702,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0,0.0,0.0
AOCS-058-1/SP101532,0.081298,0.0,0.426675,0.068352,0.0,0.0,0.25354,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0,0.0,0.0
AOCS-059-1/SP101536,0.303181,0.0,0.122236,0.0,0.103763,0.0,0.098221,0.0,0.0,0.0,0.0,0.0,0.185605,0.0,0,0,0,0,0.0,0,0,0.0,0.0


In [34]:
# Rows whose kind is "new", with the bookkeeping columns removed so that only
# the per-signature columns remain. `.loc` replaces the removed `.ix` indexer,
# and `drop` builds a new frame instead of deleting columns from a slice of
# `deconvolution_df`.
new_signatures = (
    deconvolution_df.loc[deconvolution_df.kind == "new"]
    .drop(["treated", "kind", "source_id"], axis=1)
)

# Index by source_id alone (drop the "kind" level).
new_signatures.index = new_signatures.index.droplevel(1)
new_signatures

Unnamed: 0_level_0,Signature 1,Signature 2,Signature 3,Signature 4,Signature 5,Signature 6,Signature 8,Signature 9,Signature 10,Signature 12,Signature 13,Signature 14,Signature 16,Signature 18,Signature 19,Signature 22,Signature 23,Signature 25,Signature 26,Signature 29,Chicken cisplatin,Chicken cyclophosphamide,Chicken etoposide
source_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
AOCS-092-13/SP101630,0,0,0.158093,0.138944,0,0,0.0,0.0,0,0.0,0.0,0,0.0,0,0.093589,0.107358,0.146737,0,0,0.0,0.07562,0.156505,0
AOCS-034-13/SP101524,0,0,0.378011,0.0,0,0,0.121494,0.0,0,0.0,0.174752,0,0.0,0,0.0,0.0,0.124724,0,0,0.0,0.0,0.0,0
AOCS-088-13/SP101612,0,0,0.39208,0.0,0,0,0.307939,0.0,0,0.0,0.078823,0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.074157,0
AOCS-095-13/SP101650,0,0,0.423744,0.0,0,0,0.0,0.0,0,0.0,0.0,0,0.0,0,0.10502,0.070635,0.0,0,0,0.0,0.066062,0.127777,0
AOCS-086-13/SP101606,0,0,0.441879,0.0,0,0,0.0,0.0,0,0.0,0.0,0,0.0,0,0.0,0.0,0.108629,0,0,0.091503,0.0,0.122227,0
AOCS-093-13/SP101638,0,0,0.461958,0.0,0,0,0.0,0.0,0,0.0,0.0,0,0.0,0,0.0,0.061477,0.0,0,0,0.0,0.0,0.066529,0
AOCS-139-4/SP101906,0,0,0.471886,0.0,0,0,0.178903,0.0,0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0
AOCS-139-17/SP101901,0,0,0.474986,0.0,0,0,0.128605,0.0,0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.116274,0
AOCS-139-16/SP101896,0,0,0.478232,0.0,0,0,0.166595,0.0,0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0
AOCS-064-13/SP101554,0,0,0.520287,0.0,0,0,0.088195,0.0,0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.144016,0


In [8]:
# Load the mutation table (a bz2-compressed CSV, judging by the extension).
# NOTE(review): the recorded cell output ends with a line from pandas' parser,
# which suggests a warning was raised while reading (likely a DtypeWarning for
# mixed-type columns) -- consider passing an explicit `dtype=` once the
# intended column types are confirmed.
mutations = pandas.read_csv("../data/mutations.csv.bz2")


  data = self._reader.read(nrows)


In [35]:
# SNVs from sources that have a row in `untreated_full_signatures`.
# (`.loc` replaces the removed `.ix` indexer for boolean-mask selection.)
primary_mutations_with_signatures = mutations.loc[
    mutations.snv & mutations.source_id.isin(untreated_full_signatures.index)
]
# SNVs flagged `unique_to_treated` from sources that have a row in
# `new_signatures`.
new_mutations_with_signatures = mutations.loc[
    mutations.snv
    & mutations.source_id.isin(new_signatures.index)
    & mutations.unique_to_treated
]

def add_signature_columns(mutations_df, signature_participations, all_signatures):
    """Add, for every mutation, the share attributed to each signature.

    For each row of ``mutations_df`` the weight of a signature is
    ``all_signatures.loc[signature, row.context_mutation_3p5p]`` multiplied by
    ``signature_participations.loc[row.source_id, signature]``; the weights of
    a row are then divided by their sum.

    Parameters
    ----------
    mutations_df : pandas.DataFrame
        Must contain ``source_id`` and ``context_mutation_3p5p`` columns.
    signature_participations : pandas.DataFrame
        One row per source id (index), one column per signature.
    all_signatures : pandas.DataFrame
        One row per signature (index), one column per mutation context.

    Returns
    -------
    pandas.DataFrame
        A copy of ``mutations_df`` with its index reset (the old index is
        kept as a column) and one column per signature added or overwritten.
        Rows whose weights sum to zero get NaN shares.

    Raises
    ------
    KeyError
        If a mutation refers to a context, source id or signature that is
        not present in the lookup tables.
    ValueError
        If any signature weight is NaN.
    """
    mutations_df = mutations_df.copy().reset_index()
    signature_names = signature_participations.columns
    contexts = mutations_df.context_mutation_3p5p.values
    source_ids = mutations_df.source_id.values

    # Positions of each mutation's context / source in the lookup tables
    # (-1 marks a label that is absent).
    context_positions = all_signatures.columns.get_indexer(contexts)
    source_positions = signature_participations.index.get_indexer(source_ids)
    for (positions, labels, table) in [
            (context_positions, contexts, "all_signatures columns"),
            (source_positions, source_ids, "signature_participations index")]:
        absent = positions < 0
        if absent.any():
            raise KeyError("%r not found in %s" % (labels[absent][0], table))

    # (signatures x contexts) -> one row per mutation, one column per signature.
    signature_matrix = numpy.asarray(
        all_signatures.loc[signature_names].values, dtype=float)
    context_probabilities = signature_matrix[:, context_positions].T
    participations = numpy.asarray(
        signature_participations.values, dtype=float)[source_positions]
    unnormalized = context_probabilities * participations

    nan_rows = numpy.isnan(unnormalized).any(axis=1)
    if nan_rows.any():
        first = int(numpy.argmax(nan_rows))
        raise ValueError(
            "NaN signature weight for mutation row %d (source_id=%r, context=%r): %r"
            % (first, source_ids[first], contexts[first], unnormalized[first]))

    normalized = unnormalized / unnormalized.sum(axis=1, keepdims=True)
    for (position, signature) in enumerate(signature_names):
        mutations_df[signature] = normalized[:, position]
    return mutations_df


# Attach the per-signature columns to the mutations of untreated sources,
# using each source's row of `untreated_full_signatures`.
primary_mutations_with_signatures = add_signature_columns(
    mutations_df=primary_mutations_with_signatures,
    signature_participations=untreated_full_signatures,
    all_signatures=all_signatures,
)
primary_mutations_with_signatures

# Same for the `unique_to_treated` mutations, using `new_signatures`.
new_mutations_with_signatures = add_signature_columns(
    mutations_df=new_mutations_with_signatures,
    signature_participations=new_signatures,
    all_signatures=all_signatures,
)
new_mutations_with_signatures
            

Unnamed: 0,index,source_id,donor,called,genome,contig,interbase_start,interbase_end,ref,alt,effect,gene,context_5_prime,context_3_prime,context_mutation,dna_alt_reads,dna_ref_reads,dna_total_reads,rna_alt_reads,rna_ref_reads,rna_total_reads,vaf,any_alt_reads,snv,mutation_id,site_id,context_A,context_C,context_T,context_G,context_GC,unique_to_treated,context_mutation_5p,context_mutation_3p,context_mutation_3p5p,sources,binding_affinity,binding_allele,num_alt,num_ref,total_depth,binder,binding_peptides,num_binders,Signature 1,Signature 2,Signature 3,Signature 4,Signature 5,Signature 6,Signature 8,Signature 9,Signature 10,Signature 12,Signature 13,Signature 14,Signature 16,Signature 18,Signature 19,Signature 22,Signature 23,Signature 25,Signature 26,Signature 29,Chicken cisplatin,Chicken cyclophosphamide,Chicken etoposide
0,0,AOCS-139-4/SP101906,AOCS-139,1,GRCh37,18,61407797,61407798,C,T,intergenic,,CCCCTCATCTCCCAG,CTGAACCACGTCCTT,C>T,0,47,47,,,,0.000000,False,True,18:61407797 C>T,18:61407797,0.166667,0.500000,0.233333,0.100000,0.600000,True,G(C>T),(C>T)C,G(C>T)C,mutations_with_mnvs mutations_with_mnvs mutati...,,,,,,False,{},0,0,0,0.930521,0,0,0,0.069479,0.000000,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0
1,1,AOCS-139-16/SP101896,AOCS-139,0,GRCh37,18,61407797,61407798,C,T,intergenic,,CCCCTCATCTCCCAG,CTGAACCACGTCCTT,C>T,5,19,24,,,,0.208332,True,True,18:61407797 C>T,18:61407797,0.166667,0.500000,0.233333,0.100000,0.600000,True,G(C>T),(C>T)C,G(C>T)C,mutations_with_mnvs mutations_with_mnvs mutati...,,,,,,False,{},0,0,0,0.935798,0,0,0,0.064202,0.000000,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0
2,2,AOCS-139-4/SP101906,AOCS-139,1,GRCh37,6,63262672,63262673,C,A,intergenic,,GACCACACAGGAACC,GAGAATTTACATGCA,C>A,0,69,69,,,,0.000000,False,True,6:63262672 C>A,6:63262672,0.400000,0.266667,0.133333,0.200000,0.466667,True,C(C>A),(C>A)G,C(C>A)G,mutations_with_mnvs mutations_with_mnvs mutati...,,,,,,False,{},0,0,0,0.817047,0,0,0,0.182953,0.000000,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0
3,3,AOCS-139-16/SP101896,AOCS-139,0,GRCh37,6,63262672,63262673,C,A,intergenic,,GACCACACAGGAACC,GAGAATTTACATGCA,C>A,9,40,49,,,,0.183673,True,True,6:63262672 C>A,6:63262672,0.400000,0.266667,0.133333,0.200000,0.466667,True,C(C>A),(C>A)G,C(C>A)G,mutations_with_mnvs mutations_with_mnvs mutati...,,,,,,False,{},0,0,0,0.829361,0,0,0,0.170639,0.000000,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0
4,4,AOCS-139-4/SP101906,AOCS-139,1,GRCh37,18,63487173,63487174,C,G,intronic,CDH7,TGTTCCTTTGCTTCC,TAAAGTGCGAAAGAT,C>G,0,31,31,,,,0.000000,False,True,18:63487173 C>G,18:63487173,0.233333,0.200000,0.366667,0.200000,0.400000,True,C(C>G),(C>G)T,C(C>G)T,mutations_with_mnvs mutations_with_mnvs mutati...,,,,,,False,{},0,0,0,0.890427,0,0,0,0.109573,0.000000,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0
5,5,AOCS-139-16/SP101896,AOCS-139,0,GRCh37,18,63487173,63487174,C,G,intronic,CDH7,TGTTCCTTTGCTTCC,TAAAGTGCGAAAGAT,C>G,6,17,23,,,,0.260868,True,True,18:63487173 C>G,18:63487173,0.233333,0.200000,0.366667,0.200000,0.400000,True,C(C>G),(C>G)T,C(C>G)T,mutations_with_mnvs mutations_with_mnvs mutati...,,,,,,False,{},0,0,0,0.898416,0,0,0,0.101584,0.000000,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0
6,6,AOCS-139-4/SP101906,AOCS-139,1,GRCh37,18,68110672,68110673,A,T,intergenic,,TCCTTGTCGTCTCGT,TCTAGCCTGCAAACC,T>A,5,27,32,,,,0.156250,True,True,18:68110672 A>T,18:68110672,0.133333,0.366667,0.333333,0.166667,0.533333,True,T(T>A),(T>A)T,T(T>A)T,mutations_with_mnvs mutations_with_mnvs mutati...,,,,,,False,{},0,0,0,0.583904,0,0,0,0.416096,0.000000,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0
7,7,AOCS-139-16/SP101896,AOCS-139,0,GRCh37,18,68110672,68110673,A,T,intergenic,,TCCTTGTCGTCTCGT,TCTAGCCTGCAAACC,T>A,11,33,44,,,,0.249999,True,True,18:68110672 A>T,18:68110672,0.133333,0.366667,0.333333,0.166667,0.533333,True,T(T>A),(T>A)T,T(T>A)T,mutations_with_mnvs mutations_with_mnvs mutati...,,,,,,False,{},0,0,0,0.604310,0,0,0,0.395690,0.000000,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0
8,8,AOCS-139-16/SP101896,AOCS-139,1,GRCh37,18,68385066,68385067,C,G,intergenic,,TCCAGACAGACCACA,AGATTCACAGCCGAA,C>G,12,27,39,,,,0.307692,True,True,18:68385066 C>G,18:68385066,0.400000,0.333333,0.100000,0.166667,0.500000,True,A(C>G),(C>G)A,A(C>G)A,mutations_with_mnvs mutations_with_mnvs mutati...,,,,,,False,{},0,0,0,0.891834,0,0,0,0.108166,0.000000,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0
9,9,AOCS-139-4/SP101906,AOCS-139,1,GRCh37,18,68385066,68385067,C,G,intergenic,,TCCAGACAGACCACA,AGATTCACAGCCGAA,C>G,9,43,52,,,,0.173077,True,True,18:68385066 C>G,18:68385066,0.400000,0.333333,0.100000,0.166667,0.500000,True,A(C>G),(C>G)A,A(C>G)A,mutations_with_mnvs mutations_with_mnvs mutati...,,,,,,False,{},0,0,0,0.883394,0,0,0,0.116606,0.000000,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0
