In [29]:
import pandas as pd

# Get trinucleotide channels

In [30]:
## 96-trinucleotide channels
catalog_96channels = []
catalog_96channels_format = []
for snv_type in ['C>A', 'C>G', 'C>T', 'T>A', 'T>C', 'T>G']: 
    ref, alt = snv_type.split('>')
    for nuc5 in ['A', 'C', 'G', 'T']: 
        for nuc3 in ['A', 'C', 'G', 'T']: 
            trinucleotide = nuc5 + ref + nuc3
            catalog_96channels.append(f'{trinucleotide}>{alt}')
            catalog_96channels_format.append(f'{nuc5}[{snv_type}]{nuc3}')

In [31]:
catalog_96channels[0]

'ACA>A'

In [32]:
catalog_96channels_format[0]

'A[C>A]A'

# Get signatures

In [33]:
sigs_hg37 = 'cos_sigs2.txt'
sigs_hg37_df = pd.read_csv(sigs_hg37, sep='\t', header=0)

#transpose df and reformat columns
sigs_hg37_transposed = sigs_hg37_df.transpose()
new_header = sigs_hg37_transposed.iloc[0] #grab the first row for the header
sigs_hg37_transposed = sigs_hg37_transposed[1:] #take the data less the header row
sigs_hg37_transposed.columns = new_header #set the header row as the df header

sigs_hg37_transposed.index.name = 'Type'
sigs_hg37_transposed.reset_index(inplace=True)

# beware of the fact that the sorting of contexts might not be the canonical
sigs_hg37_transposed['canonical_index'] = sigs_hg37_transposed['Type'].apply(lambda r: catalog_96channels_format.index(r))
sigs_hg37_transposed = sigs_hg37_transposed.sort_values(by='canonical_index', axis=0)

In [34]:
sigs_hg37_transposed.head() #up to here so far

Type,Type.1,SBS1,SBS5,SBS17a,SBS17b,SBS18,SBS31,SBS35,canonical_index
0,A[C>A]A,0.000886,0.011998,0.00207,0.000608,0.051534,0.009535,0.008827,0
1,A[C>A]C,0.00228,0.009438,0.000918,0.000129,0.01581,0.01849,0.046184,1
2,A[C>A]G,0.000177,0.00185,4.8e-05,5.8e-05,0.002432,0.001659,0.00139,2
3,A[C>A]T,0.00128,0.006609,6.2e-05,0.000456,0.021414,0.006277,0.021593,3
24,C[C>A]A,0.000312,0.007429,0.000295,0.000271,0.074049,0.010694,0.02989,4


# Deconstructsigs weights per sample and signature

Here you need to add the output from deconstructsigs.

This is a toy example that we are going to create manually. 

We will assume there is only one sample and the fitting has been carried out using only three signatures: SBS1, SBS2 and SBS3

In [7]:
#ignore this block, this is the example
signatures_under_analysis = ['SBS1', 'SBS2', 'SBS3']
weights = pd.DataFrame([['sample_1', 0.5, 0.10, 0.05]])
weights.columns = ['sample'] + signatures_under_analysis
weights

Unnamed: 0,sample,SBS1,SBS2,SBS3
0,sample_1,0.5,0.1,0.05


In [35]:
#adding the actual deconstructsigs output (signature weights)
signatures_under_analysis = ['SBS1', 'SBS5', 'SBS17a', 'SBS17b', 'SBS18', 'SBS31', 'SBS35']
weights_data = "DeconstructSigsWeightsData.txt"
weights_df = pd.read_csv(weights_data, sep='\t', header=0)
weights_df.head()

Unnamed: 0,sample,SBS1,SBS5,SBS17a,SBS17b,SBS18,SBS31,SBS35
0,5FU-PATIENT1-N-CLONE1,0.304925,0.376417,0,0.0,0.157465,0.062864,0.098329
1,5FU-PATIENT1-N-CLONE2,0.264959,0.426528,0,0.0,0.13469,0.0,0.123513
2,5FU-PATIENT10-LIVN-CLONE1,0.0,0.751814,0,0.0,0.065335,0.0,0.1048
3,5FU-PATIENT10-LIVN-CLONE4,0.0,0.750526,0,0.0,0.078278,0.0,0.117703
4,5FU-PATIENT11-LIVN-CLONE4,0.0,0.829973,0,0.0,0.0,0.0,0.081447


In [39]:
#"weights" from the example is replaced with weights_df

probs_per_channel = []

# Iterate over samples
for sample in weights_df['sample'].tolist():
    print(sample)
    sample_weights = weights_df.loc[weights_df['sample'] == sample]
    
    # Iterate over trinucleoate channels
    for channel in catalog_96channels_format: 
        #print(channel)
        probs_per_sig = []
        
        # Reference trinucleotide change probabilities across signatures
        # This is a vector of the mutational probabilities of a channel according to each of the signatures
        ref_probs_channel = sigs_hg37_transposed.loc[sigs_hg37_transposed['Type'] == channel].copy()
        print(ref_probs_channel)
        
        for sig in signatures_under_analysis: 
            # reference probability per signature times the signature exposure (weight) in the sample
            probs_per_sig += [ref_probs_channel[sig].iloc[0] * sample_weights[sig].iloc[0]]
        print(channel, probs_per_sig)
        
        # Normalise mutational probabilities in the channel (they must add up to 1)
        total = sum(probs_per_sig)
        probs_per_sig_norm = [i/total for i in probs_per_sig]
        
        # Save data
        probs_per_channel.append(pd.DataFrame([[sample, channel] + probs_per_sig_norm]))
    
results = pd.concat(probs_per_channel)
results.columns = ['sample', 'mutation_type'] + signatures_under_analysis

5FU-PATIENT1-N-CLONE1
Type     Type      SBS1      SBS5   SBS17a    SBS17b     SBS18     SBS31  \
0     A[C>A]A  0.000886  0.011998  0.00207  0.000608  0.051534  0.009535   

Type     SBS35  canonical_index  
0     0.008827                0  
A[C>A]A [0.0002701636591777367, 0.004516096268237602, 0.0, 0.0, 0.008114767898720777, 0.0005994073279142661, 0.0008679490312445863]
Type     Type     SBS1      SBS5    SBS17a    SBS17b    SBS18    SBS31  \
1     A[C>A]C  0.00228  0.009438  0.000918  0.000129  0.01581  0.01849   

Type     SBS35  canonical_index  
1     0.046184                1  
A[C>A]C [0.0006953527756289016, 0.003552662397680247, 0.0, 0.0, 0.002489579198474763, 0.0011623726446074777, 0.004541250614370496]
Type     Type      SBS1     SBS5    SBS17a    SBS17b     SBS18     SBS31  \
2     A[C>A]G  0.000177  0.00185  0.000048  0.000058  0.002432  0.001659   

Type    SBS35  canonical_index  
2     0.00139                2  
A[C>A]G [5.3971746810902265e-05, 0.0006962315080199636, 0.

Type     Type     SBS1      SBS5   SBS17a   SBS17b     SBS18     SBS31  \
87    T[T>A]T  0.00225  0.008748  0.00198  0.00131  0.007105  0.008875   

Type     SBS35  canonical_index  
87    0.005408               63  
T[T>A]T [0.0, 0.006565788580087755, 0.0, 0.0, 0.0005561365996373725, 0.0, 0.0006365526754090975]
Type     Type     SBS1      SBS5    SBS17a    SBS17b     SBS18     SBS31  \
16    A[T>C]A  0.00109  0.046191  0.000097  0.000097  0.002021  0.020389   

Type     SBS35  canonical_index  
16    0.002039               64  
A[T>C]A [0.0, 0.03466736520391524, 0.0, 0.0, 0.00015822477287774894, 0.0, 0.00024003096970642268]
Type     Type      SBS1      SBS5    SBS17a    SBS17b     SBS18     SBS31  \
17    A[T>C]C  0.003041  0.013397  0.012702  0.000252  0.002091  0.007396   

Type     SBS35  canonical_index  
17    0.000821               65  
A[T>C]C [0.0, 0.010055036976031762, 0.0, 0.0, 0.00016370780801157916, 0.0, 9.663409468908276e-05]
Type     Type      SBS1      SBS5    SBS17a SB

Type     Type      SBS1      SBS5    SBS17a    SBS17b     SBS18     SBS31  \
53    G[C>G]C  0.000164  0.005079  0.000835  0.000056  0.001341  0.007506   

Type     SBS35  canonical_index  
53    0.021393               25  
G[C>G]C [2.8240742169997314e-05, 0.0025820696234209934, 0.0, 0.0, 0.00013521255270711272, 0.0, 0.0036686127755171023]
Type     Type      SBS1     SBS5    SBS17a SBS17b     SBS18     SBS31  \
54    G[C>G]G  0.000166  0.00151  0.000024    0.0  0.000639  0.000457   

Type     SBS35  canonical_index  
54    0.000591               26  
G[C>G]G [2.8585141464753375e-05, 0.0007675049471192323, 0.0, 0.0, 6.443586058706555e-05, 0.0, 0.00010135045032293206]
Type     Type SBS1      SBS5    SBS17a SBS17b     SBS18     SBS31     SBS35  \
55    G[C>G]T  0.0  0.006719  0.000787    0.0  0.001231  0.003358  0.008737   

Type  canonical_index  
55                 27  
G[C>G]T [3.8400521365301226e-17, 0.0034156511553915503, 0.0, 0.0, 0.0001241130471569727, 0.0, 0.0014983026353002553]
Ty

Type     Type      SBS1      SBS5   SBS17a   SBS17b     SBS18     SBS31  \
44    C[T>G]A  0.000036  0.002589  0.00031  0.00205  0.000111  0.001969   

Type     SBS35  canonical_index  
44    0.003709               84  
C[T>G]A [0.0, 0.002056786344297603, 0.0, 0.0, 0.0, 0.0, 0.0003688862641503623]
Type     Type      SBS1      SBS5   SBS17a    SBS17b     SBS18     SBS31  \
45    C[T>G]C  0.000212  0.004609  0.00104  0.031398  0.001991  0.001429   

Type     SBS35  canonical_index  
45    0.000518               85  
C[T>G]C [0.0, 0.0036609208676494013, 0.0, 0.0, 0.0, 0.0, 5.152267841067659e-05]
Type     Type      SBS1      SBS5    SBS17a    SBS17b     SBS18     SBS31  \
46    C[T>G]G  0.000128  0.006129  0.000002  0.013699  0.002932  0.001499   

Type     SBS35  canonical_index  
46    0.006358               86  
C[T>G]G [0.0, 0.004867992390171547, 0.0, 0.0, 0.0, 0.0, 0.0006323764244107268]
Type     Type      SBS1      SBS5    SBS17a    SBS17b     SBS18     SBS31  \
47    C[T>G]T  0.00017

Type     Type      SBS1      SBS5   SBS17a    SBS17b    SBS18     SBS31  \
58    G[C>T]G  0.218039  0.012298  0.00161  0.000855  0.01481  0.001969   

Type     SBS35  canonical_index  
58    0.002529               42  
G[C>T]G [0.01642310771715774, 0.009670041300550112, 0.0, 0.0, 0.0, 0.0, 0.00021484151990125205]
Type     Type      SBS1      SBS5    SBS17a SBS17b     SBS18     SBS31  \
59    G[C>T]T  0.000038  0.018596  0.000066    0.0  0.006694  0.005697   

Type     SBS35  canonical_index  
59    0.006028               43  
G[C>T]T [2.8923644010884044e-06, 0.014622990070098194, 0.0, 0.0, 0.0, 0.0, 0.0005120530314729578]
Type     Type     SBS1      SBS5    SBS17a    SBS17b     SBS18     SBS31  \
80    T[C>T]A  0.00111  0.020296  0.000532  0.000108  0.011407  0.007476   

Type     SBS35  canonical_index  
80    0.002959               44  
T[C>T]A [8.362224690091519e-05, 0.01595949984334496, 0.0, 0.0, 0.0, 0.0, 0.00025135604791253857]
Type     Type      SBS1      SBS5    SBS17a    SBS17

Type     Type     SBS1      SBS5   SBS17a    SBS17b     SBS18     SBS31  \
4     A[C>G]A  0.00186  0.010098  0.00101  0.000146  0.001731  0.008316   

Type     SBS35  canonical_index  
4     0.003579               16  
A[C>G]A [0.0, 0.008398151665424841, 0.0, 0.0, 0.00013341847058461753, 0.0, 0.0]
Type     Type     SBS1      SBS5    SBS17a    SBS17b     SBS18     SBS31  \
5     A[C>G]C  0.00122  0.005699  0.000569  0.000043  0.002592  0.003158   

Type     SBS35  canonical_index  
5     0.004548               17  
A[C>G]C [0.0, 0.004739550939893227, 0.0, 0.0, 0.0001997421639474894, 0.0, 0.0]
Type     Type      SBS1     SBS5    SBS17a SBS17b     SBS18     SBS31  \
6     A[C>G]G  0.000115  0.00172  0.000152    0.0  0.001921  0.002998   

Type     SBS35  canonical_index  
6     0.000346               18  
A[C>G]G [0.0, 0.0014301802836169037, 0.0, 0.0, 0.0001480713759987473, 0.0, 0.0]
Type     Type     SBS1      SBS5   SBS17a    SBS17b     SBS18     SBS31  \
7     A[C>G]T  0.00114  0.01009

Type     Type SBS1      SBS5    SBS17a    SBS17b     SBS18     SBS31  \
27    C[C>A]T  0.0  0.006489  0.000137  0.000182  0.036324  0.027885   

Type     SBS35  canonical_index  
27    0.082471                7  
C[C>A]T [7.255925350870696e-17, 0.0033584826603683146, 0.0, 0.0, 0.0031699618353378495, 0.0, 0.0]
Type     Type     SBS1      SBS5   SBS17a    SBS17b     SBS18     SBS31  \
48    G[C>A]A  0.00158  0.010198  0.00178  0.000998  0.109072  0.013093   

Type     SBS35  canonical_index  
48    0.009867                8  
G[C>A]A [0.0005141880589001769, 0.005278354874538801, 0.0, 0.0, 0.009518618118334763, 0.0, 0.0]
Type     Type      SBS1      SBS5   SBS17a    SBS17b     SBS18     SBS31  \
49    G[C>A]C  0.000339  0.007648  0.00128  0.000003  0.017311  0.021489   

Type     SBS35  canonical_index  
49    0.050183                9  
G[C>A]C [0.00011030308044597156, 0.0039587661559041, 0.0, 0.0, 0.0015107531893222249, 0.0, 0.0]
Type     Type      SBS1     SBS5    SBS17a    SBS17b     

Type     Type      SBS1      SBS5    SBS17a SBS17b     SBS18     SBS31  \
59    G[C>T]T  0.000038  0.018596  0.000066    0.0  0.006694  0.005697   

Type     SBS35  canonical_index  
59    0.006028               43  
G[C>T]T [1.3625833286382323e-05, 0.008341469637816032, 0.0, 0.0, 0.0008952396012497734, 0.0, 0.0]
Type     Type     SBS1      SBS5    SBS17a    SBS17b     SBS18     SBS31  \
80    T[C>T]A  0.00111  0.020296  0.000532  0.000108  0.011407  0.007476   

Type     SBS35  canonical_index  
80    0.002959               44  
T[C>T]A [0.0003939416467979635, 0.009103861983071, 0.0, 0.0, 0.0015255204837027593, 0.0, 0.0]
Type     Type      SBS1      SBS5    SBS17a    SBS17b     SBS18     SBS31  \
81    T[C>T]C  0.000037  0.022995  0.000091  0.000604  0.011407  0.024587   

Type     SBS35  canonical_index  
81    0.012596               45  
T[C>T]C [1.323550993703283e-05, 0.010314720413770065, 0.0, 0.0, 0.0015255204837027593, 0.0, 0.0]
Type     Type     SBS1      SBS5    SBS17a    SBS1

Type     Type     SBS1      SBS5    SBS17a   SBS17b     SBS18     SBS31  \
42    C[T>C]G  0.00036  0.020796  0.087311  0.00392  0.001621  0.004458   

Type     SBS35  canonical_index  
42    0.001559               70  
C[T>C]G [0.00010444437891964577, 0.011510239994021839, 0.0, 0.0, 0.00018486833491142087, 0.0, 0.0]
Type     Type      SBS1      SBS5    SBS17a    SBS17b     SBS18     SBS31  \
43    C[T>C]T  0.000043  0.014897  0.426054  0.018299  0.002562  0.013793   

Type     SBS35  canonical_index  
43    0.003179               71  
C[T>C]T [1.2359251505491414e-05, 0.008245316306562606, 0.0, 0.0, 0.0002921376198862435, 0.0, 0.0]
Type     Type     SBS1      SBS5    SBS17a   SBS17b     SBS18     SBS31  \
64    G[T>C]A  0.00105  0.014097  0.000123  0.00006  0.000708  0.004528   

Type     SBS35  canonical_index  
64    0.000565               72  
G[T>C]A [0.00030468340144474196, 0.0078026147896189816, 0.0, 0.0, 8.074122944933482e-05, 0.0, 0.0]
Type     Type    SBS1      SBS5    SBS17a  

Type     Type     SBS1      SBS5    SBS17a   SBS17b     SBS18     SBS31  \
64    G[T>C]A  0.00105  0.014097  0.000123  0.00006  0.000708  0.004528   

Type     SBS35  canonical_index  
64    0.000565               72  
G[T>C]A [0.00022463306892080164, 0.009662542159212044, 0.0, 0.0, 6.3127920949555e-05, 0.0, 0.0]
Type     Type    SBS1      SBS5    SBS17a  SBS17b     SBS18     SBS31  \
65    G[T>C]C  0.0019  0.007938  0.014902  0.0015  0.002261  0.006567   

Type     SBS35  canonical_index  
65    0.002029               73  
G[T>C]C [0.0004064789782893215, 0.00544117583701272, 0.0, 0.0, 0.00020164252745271942, 0.0, 0.0]
Type     Type     SBS1      SBS5   SBS17a   SBS17b     SBS18     SBS31  \
66    G[T>C]G  0.00117  0.014997  0.00357  0.00101  0.003983  0.005127   

Type     SBS35  canonical_index  
66    0.000191               74  
G[T>C]G [0.00025030557855053624, 0.010279300125624065, 0.0, 0.0, 0.000355104987497675, 0.0, 0.0]
Type     Type      SBS1      SBS5    SBS17a   SBS17b     SB

Type     Type      SBS1     SBS5    SBS17a SBS17b     SBS18     SBS31  \
30    C[C>G]G  0.000352  0.00249  0.000232    0.0  0.003582  0.000854   

Type    SBS35  canonical_index  
30    0.00087               22  
C[C>G]G [6.50597205738605e-05, 0.0015470378254424974, 0.0, 0.0, 0.0004491680534248667, 0.0, 0.0]
Type     Type SBS1      SBS5    SBS17a    SBS17b     SBS18     SBS31  \
31    C[C>G]T  0.0  0.009528  0.000039  0.000193  0.003792  0.003418   

Type     SBS35  canonical_index  
31    0.016194               23  
C[C>G]T [4.1216811613553675e-17, 0.005920992159223695, 0.0, 0.0, 0.0004755158896271501, 0.0, 0.0]
Type     Type     SBS1      SBS5    SBS17a   SBS17b     SBS18     SBS31  \
52    G[C>G]A  0.00001  0.004679  0.000878  0.00006  0.000635  0.001829   

Type     SBS35  canonical_index  
52    0.003379               24  
G[C>G]A [1.772507728134438e-06, 0.0029076855514340912, 0.0, 0.0, 7.961851730029762e-05, 0.0, 0.0]
Type     Type      SBS1      SBS5    SBS17a    SBS17b     SBS1

Type     Type      SBS1      SBS5   SBS17a   SBS17b     SBS18     SBS31  \
94    T[T>G]G  0.000583  0.006939  0.00092  0.00478  0.001451  0.005347   

Type     SBS35  canonical_index  
94    0.004818               94  
T[T>G]G [0.00011502796637667058, 0.004584607029868876, 0.0, 0.0, 0.00012399127917151533, 0.0, 0.0]
Type     Type SBS1      SBS5    SBS17a    SBS17b     SBS18     SBS31  \
95    T[T>G]T  0.0  0.013497  0.004581  0.121992  0.005163  0.007296   

Type     SBS35  canonical_index  
95    0.003619               95  
T[T>G]T [4.3998690397937465e-17, 0.008918184364373768, 0.0, 0.0, 0.00044123798561632866, 0.0, 0.0]
STE1207A
Type     Type      SBS1      SBS5   SBS17a    SBS17b     SBS18     SBS31  \
0     A[C>A]A  0.000886  0.011998  0.00207  0.000608  0.051534  0.009535   

Type     SBS35  canonical_index  
0     0.008827                0  
A[C>A]A [0.00012813549986254988, 0.008944407807405574, 0.0, 0.0, 0.0034510227147450164, 0.0, 0.0]
Type     Type     SBS1      SBS5    SBS17a

In [40]:
results.head()

Unnamed: 0,sample,mutation_type,SBS1,SBS5,SBS17a,SBS17b,SBS18,SBS31,SBS35
0,5FU-PATIENT1-N-CLONE1,A[C>A]A,0.018803,0.314308,0.0,0.0,0.564766,0.041717,0.060407
0,5FU-PATIENT1-N-CLONE1,A[C>A]C,0.055891,0.285556,0.0,0.0,0.200107,0.093429,0.365017
0,5FU-PATIENT1-N-CLONE1,A[C>A]G,0.03928,0.50671,0.0,0.0,0.278664,0.075908,0.099438
0,5FU-PATIENT1-N-CLONE1,A[C>A]T,0.044524,0.283725,0.0,0.0,0.384589,0.045004,0.242159
0,5FU-PATIENT1-N-CLONE1,C[C>A]A,0.005238,0.153953,0.0,0.0,0.641977,0.037015,0.161817


# Identify the reference 96-based trinucleotide of a mutation
This code queries the hg19 reference genome to get the trinucleotide sequence overlapping a mutated position

Note: by consensus the trinucleotides are only represented for pyrimidines; this is, the A[A>C]A trinucleotide includes mutations in A[C>A]A and T[G>T]T.

In [62]:
from bgreference import hg19

In [63]:
def rev_comp(seq):
    """Compute reverse complementary of a sequence"""
    comp_nucleotides = {
        'A': 'T',
        'C': 'G',
        'G': 'C',
        'T': 'A'
    }

    return ''.join(list(map(lambda x: comp_nucleotides[x], seq[::-1])))

In [60]:
len(input_mutations)

409359

In [61]:
# Subset substitutions
nucleotides = ['A', 'C', 'G', 'T']
input_mutations = input_mutations.loc[(input_mutations['ref'].isin(nucleotides)) & (input_mutations['alt'].isin(nucleotides))].copy()

len(input_mutations)

306220

In [95]:
# Create a new dataframe with the updated mutation_type format
# Pandas dataframes can be created line by line (each line is an individual dataframe)
# These lines can be saved in a list of lines
# You then need to concatenate the df in the list of lines and add a header again
# This chunk of code takes a while to run
lines = []

for _, row in input_mutations.iterrows():
    
    chromosome = row['chr']
    start = row['pos'] - 1    # trinucleotide = the position and the 5' and 3' nucleotides, so we need to query one position before the mutation
    ref = row['ref']
    alt = row['alt']
    sample = row['sample']
    
    # Get sequence
    trinucleotide = hg19(chromosome, start, size=3)
    
    # If reference nucleotide is pyrimidine-based, write as: 
    if ref in ['C', 'T']: 
        mut_type = f'{trinucleotide[0]}[{ref}>{alt}]{trinucleotide[2]}'
    
    # If reference nucleotide is purine-based, write as: 
    if ref in ['A', 'G']: 
        # Get the reverse complementary of the sequences
        trinucleotide = rev_comp(trinucleotide)
        ref = rev_comp(ref)
        alt = rev_comp(alt)
        mut_type = f'{trinucleotide[0]}[{ref}>{alt}]{trinucleotide[2]}'
        
    # Add to lines
    lines += [pd.DataFrame([[chromosome, start+1, ref, alt, sample, mut_type]])]

In [96]:
# Concat lines into the new dataframe
new_input_mutations = pd.concat(lines)

# Add header
new_input_mutations.columns = input_mutations.columns
new_input_mutations.head()

Unnamed: 0,chr,pos,ref,alt,sample,mutation_type
0,chr1,1541060,C,T,5FU-PATIENT1-N-CLONE1,A[C>T]C
0,chr1,1937385,C,A,5FU-PATIENT1-N-CLONE1,T[C>A]T
0,chr1,2612326,C,T,5FU-PATIENT1-N-CLONE1,C[C>T]G
0,chr1,3510207,C,T,5FU-PATIENT1-N-CLONE1,C[C>T]A
0,chr1,3510227,C,A,5FU-PATIENT1-N-CLONE1,G[C>A]C


# Assign specific mutations to a signature

Here you need to read the file containing individual mutations from each sample. 

You need to know the trinucleotide context channel of each mutation in order to do the attribution

We will to a toy example

In [109]:
# For each line in the mutations file
sig_list = ['SBS1', 'SBS5', 'SBS17a', 'SBS17b', 'SBS18' 'SBS31', 'SBS35']
new_columns_dict = defaultdict(list)
for _, row in new_input_mutations.iterrows():
    sample = row['sample']
    muttype = row['mutation_type']
    # Get probabilities for this mutation type in the sample
    info = results.loc[(results['mutation_type'] == muttype) & (results['sample'] == sample)]
    if len(info) > 0:
        # Here you can do this either signature by signature, as below ("hardcoded"), or with a loop across signatures
        list_of_p = []
        for sig in signatures_under_analysis: 

            prob = info[sig].iloc[0]
            list_of_p.append((sig, prob))
            new_columns_dict[sig] += [prob]
        #print(sample, muttype, list_of_p)

        # Then, you can attribute a mutation to a signature by finding the signature with maximum probability
        attr_sig = sorted(list_of_p, key=lambda x: x[1], reverse=True)[0]
        new_columns_dict['max_sig'] += [attr_sig[0]] 
    #     print(sample, muttype, attr_sig)

        # Now you could write here a new file containing mutations as in the original input file, adding additional
        # columns with the probabilities per signature in each mutation and the signature to which the mutation
        # has been assigned... 
        # write file...
    else:
        for sig in signatures_under_analysis: 
            new_columns_dict[sig] += [-1]
        new_columns_dict['max_sig'] += ['None']


In [110]:
new_input_mutations.head()
print(len(new_input_mutations.chr))
print(len(new_columns_dict["max_sig"]))

306220
306220


In [111]:
new_input_mutations['SBS1'] = new_columns_dict['SBS1']
new_input_mutations['SBS5'] = new_columns_dict['SBS5']
new_input_mutations['SBS17a'] = new_columns_dict['SBS17a']
new_input_mutations['SBS17b'] = new_columns_dict['SBS17b']
new_input_mutations['SBS18'] = new_columns_dict['SBS18']
new_input_mutations['SBS31'] = new_columns_dict['SBS31']
new_input_mutations['SBS35'] = new_columns_dict['SBS35']
new_input_mutations['max_sig'] = new_columns_dict['max_sig']

new_input_mutations.head()

Unnamed: 0,chr,pos,ref,alt,sample,mutation_type,SBS1,SBS5,SBS17a,SBS17b,SBS18,SBS31,SBS35,max_sig
0,chr1,1541060,C,T,5FU-PATIENT1-N-CLONE1,A[C>T]C,0.185835,0.649494,0.0,0.0,0.070642,0.056216,0.037813,SBS5
0,chr1,1937385,C,A,5FU-PATIENT1-N-CLONE1,T[C>A]T,7e-05,0.18532,0.0,0.0,0.745359,0.010938,0.058313,SBS18
0,chr1,2612326,C,T,5FU-PATIENT1-N-CLONE1,C[C>T]G,0.856552,0.094373,0.0,0.0,0.037932,0.007383,0.003761,SBS1
0,chr1,3510207,C,T,5FU-PATIENT1-N-CLONE1,C[C>T]A,0.047488,0.568417,0.0,0.0,0.143529,0.201537,0.039029,SBS5
0,chr1,3510227,C,A,5FU-PATIENT1-N-CLONE1,G[C>A]C,0.008619,0.240045,0.0,0.0,0.227282,0.112632,0.411422,SBS35


In [115]:
#saving the data to a file

new_input_mutations.to_csv("sig_assignments.txt", sep = "\t")

In [85]:
from collections import defaultdict