# Identify the reference 96-based trinucleotide of a mutation

This code queries the hg19 reference genome to get the trinucleotide sequence overlapping a mutated position

**Note**: by consensus the trinucleotides are only represented for pyrimidines; this is, the A[A>C]A trinucleotide 
    includes mutations in A[C>A]A and T[G>T]T.

In [15]:
import pandas as pd

from bgreference import hg19

In [16]:
def rev_comp(seq):
    """Compute reverse complementary of a sequence"""
    comp_nucleotides = {
        'A': 'T',
        'C': 'G',
        'G': 'C',
        'T': 'A'
    }

    return ''.join(list(map(lambda x: comp_nucleotides[x], seq[::-1])))

In [18]:
f = '/home/carnedo/Downloads/mut_type_data.txt'
input_mutations = pd.read_csv(f, sep='\t', header=0)
input_mutations.head(3)

Unnamed: 0,chr,pos,ref,alt,sample,mutation_type
0,1,1541060,G,A,5FU-PATIENT1-N-CLONE1,GG->AG
1,1,1937385,C,A,5FU-PATIENT1-N-CLONE1,CC->AC
2,1,2612326,C,T,5FU-PATIENT1-N-CLONE1,CC->CT


In [26]:
len(input_mutations)

409359

In [27]:
# Subset substitutions
nucleotides = ['A', 'C', 'G', 'T']
input_mutations = input_mutations.loc[(input_mutations['ref'].isin(nucleotides)) & (input_mutations['alt'].isin(nucleotides))].copy()

In [28]:
len(input_mutations)

306220

In [29]:
# Create a new dataframe with the updated mutation_type format
# Pandas dataframes can be created line by line (each line is an individual dataframe)
# These lines can be saved in a list of lines
# You then need to concatenate the df in the list of lines and add a header again
# This chunk of code takes a while to run
lines = []

for _, row in input_mutations.iterrows():
    
    chromosome = row['chr']
    start = row['pos'] - 1    # trinucleotide = the position and the 5' and 3' nucleotides, so we need to query one position before the mutation
    ref = row['ref']
    alt = row['alt']
    sample = row['sample']
    
    # Get sequence
    trinucleotide = hg19(chromosome, start, size=3)
    
    # If reference nucleotide is pyrimidine-based, write as: 
    if ref in ['C', 'T']: 
        mut_type = f'{trinucleotide[0]}[{ref}>{alt}]{trinucleotide[2]}'
    
    # If reference nucleotide is purine-based, write as: 
    if ref in ['A', 'G']: 
        # Get the reverse complementary of the sequences
        trinucleotide = rev_comp(trinucleotide)
        ref = rev_comp(ref)
        alt = rev_comp(alt)
        mut_type = f'{trinucleotide[0]}[{ref}>{alt}]{trinucleotide[2]}'
        
    # Add to lines
    lines += [pd.DataFrame([[chromosome, start, ref, alt, sample, mut_type]])]

Unnamed: 0,chr,pos,ref,alt,sample,mutation_type
0,1,1541059,C,T,5FU-PATIENT1-N-CLONE1,A[C>T]C
0,1,1937384,C,A,5FU-PATIENT1-N-CLONE1,T[C>A]T
0,1,2612325,C,T,5FU-PATIENT1-N-CLONE1,C[C>T]G
0,1,3510206,C,T,5FU-PATIENT1-N-CLONE1,C[C>T]A
0,1,3510226,C,A,5FU-PATIENT1-N-CLONE1,G[C>A]C


In [30]:
# Concat lines into the new dataframe
new_input_mutations = pd.concat(lines)

# Add header
new_input_mutations.columns = input_mutations.columns
new_input_mutations.head()

Unnamed: 0,chr,pos,ref,alt,sample,mutation_type
0,1,1541059,C,T,5FU-PATIENT1-N-CLONE1,A[C>T]C
0,1,1937384,C,A,5FU-PATIENT1-N-CLONE1,T[C>A]T
0,1,2612325,C,T,5FU-PATIENT1-N-CLONE1,C[C>T]G
0,1,3510206,C,T,5FU-PATIENT1-N-CLONE1,C[C>T]A
0,1,3510226,C,A,5FU-PATIENT1-N-CLONE1,G[C>A]C
