# tNGS Starlims Import

## Mutation Surveyor import formats
- SNV = c.[x]+[y/=],p.A123A/B
- Structural variants = c.123_456_hetdup/hetdel/hetinsACTG or c.123_456_del/dup/ins

## Extra Columns
- Accession number
- Single gene
- NucleotideNom
- AminoNom
- GenomicNom

## Pattern logic
- Variant class: SNV or structural variant
- Genotype: heterozygous (0/1) or homozygous (1/1)

## Column for mutation details import
- GENE p./N c./N

In [1]:
import pandas as pd
import numpy as np
import re
# Load the tNGS import table; the path is relative to the notebook's working directory.
df_all = pd.read_csv('data/tNGS_import.csv')

In [29]:
# Get rid of unwanted columns and create a new df.
# Only the first 23 columns are kept; the verbose headers are shortened to the
# names the later cells use (Ref, Alt, varType, insBases).
df = df_all.iloc[:, :23].copy().rename(columns={
    'WT nucleotides': 'Ref',
    'Variant nucleotides': 'Alt',
    'Variant type': 'varType',
    'Inserted nucleotides': 'insBases',
})

# Folder numbers that start with "EX". Non-string entries (e.g. NaN from an
# empty cell) are skipped instead of raising AttributeError on .startswith.
sample_ids = [
    folder for folder in df['Folder number']
    if isinstance(folder, str) and folder.startswith("EX")
]

In [19]:
# Preview the last rows of the working frame.
df.tail()

Unnamed: 0,Folder number,Report variant?,Reason for selection,Gene,Genotype,Genomic nomenclature,cDNA nomenclature,Protein nomenclature,Coding effect,varType,...,insBases,Deleted nucleotides,Ref,Alt,Chromosome,AccessionNo,cDNANo,AminoNo,GenomicNo,MutDetails
48,snvcontrol05,Variant detected,HGMD sub-category,SRD5A2,0/1,NG_008365.1:g.5367T>C,NM_000348.3:c.281+15T>C,p.?,,substitution,...,,,T,C,2,NM_000348.3,281+15,,5367,SRD5A2 int1 p.?/N c.281+15T>C/N
49,snvcontrol05,Variant detected,Distance to splice site,CYP11B1,0/1,Chr8(GRCh37):g.143957129G>T,NM_000497.3:c.1120C>A,p.Arg374=,synonymous,substitution,...,,,C,A,8,NM_000497.3,1120,Arg374=,143957129,CYP11B1 ex6 p.?A?r?g?3?7?4?A?r?g?/N c.1120C>A/N
50,snvcontrol06,Variant detected,HGMD sub-category,ZMPSTE24,0/1,Chr1(GRCh37):g.40756551dup,NM_005857.4:c.1085dup,p.Leu362Phefs*19,frameshift,duplication,...,T,,,,1,NM_005857.4,1085,Leu362Phefs*,40756551,ZMPSTE24 ex9 p.?L?e?u?3?6?2?f?s?/N c.1085dup/N
51,EX1903669,Confirmation required,,"PHEX_ex_2,PHEX_ex_3",0.661,chrX:22056421-22065408,,,,deletion,...,,,,,X,,,,22056421-22065408,PHEX ex2-3 p.? g.22056421-22065408del
52,EX1903886,Confirmation required,,"CDC73_ex_11,CDC73_ex_12,CDC73_ex_13",0.67,chr1:193172754-193181723,,,,deletion,...,,,,,1,,,,193172754-193181723,CDC73 ex11-13 p.? g.193172754-193181723del


In [30]:
# Accession number: the text before the first ":" (e.g. "NM_000348.3").
# str() is kept from the original, so a missing value yields the string "nan".
df['AccessionNo'] = df['cDNA nomenclature'].apply(lambda x: str(x).split(":", 1)[0])

# cDNA position: the coordinates that follow "c." (or "n.").
# The previous pattern "[^c\.][0-9]+[+-_]*[0-9]+" had two problems:
#   * "[+-_]" is a character RANGE from "+" to "_" (digits, uppercase letters,
#     most punctuation), not the three characters "+", "-" and "_";
#   * it needed at least three characters, so "c.12C>A" did not match and the
#     last match fell back to part of the accession (e.g. "_000497.3").
# Anchoring on the "c." prefix avoids both.
CDNA_POSITION = re.compile(
    r"[cn]\.("
    r"[*-]?[0-9]+(?:[+-][0-9]+)?"        # start: 123, -15, *15, 281+15, 100-2
    r"(?:_[*-]?[0-9]+(?:[+-][0-9]+)?)?"  # optional "_end" for ranges: 123_456
    r")"
)


def _cdna_position(nomenclature):
    """Return the coordinate part of a c./n. description, or "" if none is found."""
    found = CDNA_POSITION.search(str(nomenclature))
    return found.group(1) if found else ""


df['cDNANo'] = df['cDNA nomenclature'].apply(_cdna_position)

# Amino acid: three-letter residue, position and any trailing residue / "=" / "*".
# Same pattern as before, written as a raw string without the redundant escapes.
df['AminoNo'] = df['Protein nomenclature'].apply(
    lambda x: str(re.findall(r"[a-zA-Z]{3}[0-9]+_*[a-zA-Z]*[0-9]*[=*]*", str(x))).strip("[]''")
)

# Genomic: the last run of digits, optionally "start-end" (e.g. "22056421-22065408").
df['GenomicNo'] = df['Genomic nomenclature'].apply(
    lambda x: str(re.findall(r"[0-9]+-*[0-9]*", str(x))[-1:]).strip("[]''")
)

In [126]:
# Functions
def gene_name(gene):
    """Return the text before the first underscore of ``gene``.

    ``gene`` is converted with ``str()`` first, so non-string input (e.g. NaN)
    does not raise; "PHEX_ex_2,PHEX_ex_3" gives "PHEX" and "ABCC8" is returned
    unchanged.
    """
    # str.split replaces re.split("\_", ..., 1): "\_" is an invalid escape in a
    # normal string literal and positional maxsplit is deprecated for re.split.
    return str(gene).split("_", 1)[0]

def single_amino_code(aa3):
    """Translate a three-letter amino-acid code (e.g. 'Gly') to its one-letter code.

    Only the 20 residues listed below are known; any other key raises KeyError.
    """
    three_to_one = {
        'Ala': 'A', 'Arg': 'R', 'Asn': 'N', 'Asp': 'D',
        'Cys': 'C', 'Glu': 'E', 'Gln': 'Q', 'Gly': 'G',
        'His': 'H', 'Ile': 'I', 'Leu': 'L', 'Lys': 'K',
        'Met': 'M', 'Phe': 'F', 'Pro': 'P', 'Ser': 'S',
        'Thr': 'T', 'Trp': 'W', 'Tyr': 'Y', 'Val': 'V',
    }
    one_letter = three_to_one[aa3]
    return one_letter
    


def zygosity(genotype):
    """Map a numeric genotype value onto a "0/1" or "1/1" call where possible.

    If ``genotype`` can be converted with ``float()``:
      * 0.4-0.6 or 1.4-1.6 (inclusive) -> "0/1"
      * <= 0.1                         -> "1/1"
      * anything else                  -> the value as a float
    (presumably these are copy-number dosage ratios; confirm with the data owner).

    If it cannot be converted (e.g. it already is "0/1", or is None) the input
    is returned unchanged.
    """
    try:
        ratio = float(genotype)
    except (TypeError, ValueError):
        # Only the conversion is guarded; the former bare ``except:`` around the
        # whole body would also have hidden unrelated errors.
        return genotype

    if 0.4 <= ratio <= 0.6 or 1.4 <= ratio <= 1.6:
        return "0/1"
    if ratio <= 0.1:
        return "1/1"
    return ratio
    
    
def get_exons(gene):
    """Extract the exon number(s) from a comma-separated target list.

    Each comma-separated item is expected to contain "_<digits>", e.g.
    "PHEX_ex_2,PHEX_ex_3"; the first such number of every item is used.

    Returns
    -------
    int
        when there is exactly one item ("PHEX_ex_2" -> 2);
    str
        "first-last" when there are several ("PHEX_ex_2,PHEX_ex_3" -> "2-3");
    the input itself, unchanged
        when it is not a string or an item has no "_<digits>" part.
    """
    try:
        # Do not rebind ``gene`` here: the original overwrote it with the split
        # list, so a gene without exon numbers came back as ['ABCC8'].
        exons = [
            re.findall(r"_[0-9]+", target)[0].replace("_", "")
            for target in gene.split(",")
        ]
    except (AttributeError, TypeError, IndexError):
        # AttributeError/TypeError: not a str (e.g. NaN); IndexError: no "_<digits>".
        return gene

    if len(exons) == 1:
        return int(exons[0])
    return exons[0] + "-" + exons[-1]
    
    
def variant_type(varType, insBases):
    """Return the nomenclature suffix for a variant type.

    "duplication" -> "dup", "deletion" -> "del", "insertion" -> "ins" + insBases,
    "delins" -> "delins" + insBases, anything else -> "".
    ``insBases`` is only touched for the two insertion types, where it must be
    a string (string concatenation raises TypeError otherwise).
    """
    if varType == "duplication":
        return "dup"
    if varType == "deletion":
        return "del"
    if varType == "insertion" or varType == "delins":
        keyword = "ins" if varType == "insertion" else "delins"
        return keyword + insBases
    return ""


def mutation_details(chrom, gene, exon, intron, amino, cdna, ref, alt, genomic, genotype, varType, insBases):
    """Build the mutation-details text "GENE ex|int<N> p.<protein> <nucleotide>".

    Parameters
    ----------
    chrom : chromosome label; "X" selects the form without allele suffixes.
    gene : gene symbol or comma-separated target list (see ``get_exons``).
    exon, intron : exon / intron number, or null when not applicable.
    amino : protein change without the "p." prefix (e.g. "Arg374=").
    cdna : cDNA coordinates without the "c." prefix.
    ref, alt : reference / variant bases; ``ref`` is null for non-substitutions.
    genomic : genomic coordinates, used when neither ``ref`` nor ``exon`` is set.
    genotype : "0/1", "1/1" or a numeric value (passed through ``zygosity``).
    varType, insBases : passed to ``variant_type`` to build the suffix.

    Returns
    -------
    str or None
        * chrom == "X" or a float genotype: "GENE exN p.X nuc"
        * "0/1":                            "GENE exN p.X/N nuc/N"
        * "1/1":                            "GENE exN p.X/X nuc/nuc-without-prefix"
        * any other genotype:               None
    """
    # Region label and the number that belongs to it. The number now follows
    # the label: an intronic variant uses the intron number instead of pairing
    # "int" with whatever the exon column / gene name happened to give.
    if pd.isnull(intron):
        coding, region = "ex", exon
    else:
        coding, region = "int", intron

    if pd.isnull(region):
        # No explicit number: try to read it from the target name(s).
        ex_int = get_exons(gene)
    else:
        try:
            ex_int = int(region)
        except (TypeError, ValueError, OverflowError):
            ex_int = np.nan

    gene = gene_name(gene)
    genotype = zygosity(genotype)
    varType = variant_type(varType, insBases)

    if pd.isnull(ref) and pd.isnull(exon):
        nucleotide = f"g.{genomic}{varType}"
    elif pd.isnull(ref):
        nucleotide = f"c.{cdna}{varType}"
    else:
        nucleotide = f"c.{cdna}{ref}>{alt}{varType}"

    # Normalise the protein description; non-string input is left untouched.
    if isinstance(amino, str):
        if amino == "":
            # Unknown protein effect is written "p.?". (The abandoned
            # amino.replace("", "?") attempt aimed at this but also mangled
            # non-empty values.)
            amino = "?"
        else:
            amino = amino.replace("=", amino[:3])             # "Arg374=" -> "Arg374Arg"
            amino = re.sub(r"[a-zA-Z]{3}fs\*", "fs", amino)   # "Leu362Phefs*" -> "Leu362fs"
            amino = amino.replace("*", "Ter")                 # remaining "*" -> "Ter"

    location = f"{gene} {coding}{ex_int}"
    if chrom == "X" or isinstance(genotype, float):
        return f"{location} p.{amino} {nucleotide}"
    if genotype == "0/1":
        return f"{location} p.{amino}/N {nucleotide}/N"
    if genotype == "1/1":
        # The second allele repeats the change without the "c."/"g." prefix.
        return f"{location} p.{amino}/{amino} {nucleotide}/{nucleotide[2:]}"
    # Unrecognised genotype: the original fell through and returned None too.
    return None
    
    
def mut_surveyor(genotype, amino, nucleotide, varType):
    """Format an SNV in the Mutation Surveyor import form "c.[x]+[y/=],p.A123B".

    Parameters
    ----------
    genotype : "0/1" (second allele written "=") or "1/1" (change repeated).
    amino : three-letter protein change, e.g. "Gly706Gly" or "Leu362fs".
    nucleotide : change including its two-character prefix, e.g. "c.2116G>A".
    varType : suffix from ``variant_type``; only "" (no suffix) is handled.

    Returns
    -------
    str or None
        The formatted string, or None when ``varType`` is not "" or the
        genotype is neither "0/1" nor "1/1".

    Raises
    ------
    IndexError
        if ``amino`` has no residue or position, or no second residue and no
        "fs"/"*" marker.
    KeyError
        if a residue is not known to ``single_amino_code``.
    """
    nucleotide = nucleotide[2:]  # drop the "c." prefix

    residues = re.findall("[a-zA-Z]{3}", amino)
    positions = re.findall("[0-9]+", amino)
    try:
        amino = single_amino_code(residues[0]) + positions[0] + single_amino_code(residues[1])
    except IndexError:
        # No second residue: expect a frameshift / stop marker instead.
        # Raw string: "\*" is an invalid escape in a normal string literal.
        amino = single_amino_code(residues[0]) + positions[0] + re.findall(r"fs|\*", amino)[0]

    if varType == "":
        if genotype == "0/1":
            return f"c.[{nucleotide}]+[=],p.{amino}"
        elif genotype == "1/1":
            return f"c.[{nucleotide}]+[{nucleotide}],p.{amino}"
    # Structural variants and other genotypes are not handled: None, as before.
    return None
    

In [54]:
# Build the mutation-details text row by row.
def _mutation_details_for_row(row):
    """Adapter: unpack one DataFrame row into mutation_details()."""
    return mutation_details(
        chrom=row.Chromosome,
        gene=row.Gene,
        exon=row.Exon,
        intron=row.Intron,
        amino=row.AminoNo,
        cdna=row.cDNANo,
        ref=row.Ref,
        alt=row.Alt,
        genomic=row.GenomicNo,
        genotype=row.Genotype,
        varType=row.varType,
        insBases=row.insBases,
    )


df['MutDetails'] = df.apply(_mutation_details_for_row, axis=1)

In [128]:
# Sanity check: a heterozygous SNV should print c.[2116G>A]+[=],p.G706G
print(mut_surveyor("0/1", 'Gly706Gly', 'c.2116G>A', ""))

c.[2116G>A]+[=],p.G706G


In [127]:
# Inspect one record by position (row 38) to see its raw field values.
df.iloc[38]

Folder number                                                  m034
Report variant?                                    Variant detected
Reason for selection                              HGMD sub-category
Gene                                                          ABCC8
Genotype                                                        0/1
Genomic nomenclature                    Chr11(GRCh37):g.17464303T>C
cDNA nomenclature                          NM_001287174.1:c.1594A>G
Protein nomenclature                                    p.Ser532Gly
Coding effect                                              missense
varType                                                substitution
Variant location                                               exon
Exon                                                            NaN
Intron                                                          NaN
Quality (vcf)                                               11595.8
Filter (vcf)                                    

In [8]:
# SNV or Structural variant


In [9]:
# Scratch check: a [1:-1] slice drops the first and last character of a string.
a = "amino"
a[1:-1]

'min'