# Creating a MAF file from a CSV file of the 34 variants with evidence of pathogenicity

2023/12/12

In [1]:
import os

import pandas as pd

In [2]:
# Record the pandas version that produced the outputs shown in this notebook.
print(pd.__version__)

1.5.3


In [3]:
# Input table of variants (semicolon-separated). The default is the author's
# local copy; set the ONCOPLOT_CSV environment variable to point at the file on
# another machine instead of editing this cell.
csv_file_path = os.environ.get(
    "ONCOPLOT_CSV",
    r'C:\Users\Ana\Documents\GitHub\dissertation\data\data_exome_and_clinical_MAF\for_oncoplot.csv',
)

df = pd.read_csv(csv_file_path, sep=';')

In [4]:
# Preview the first rows to confirm the file was parsed into the expected columns.
df.head()

Unnamed: 0,ID,Gene,Chr,Start,End,Type,Ref,Alt,Consequence,Exon,Transcript,c.DNA,Protein,VAF,DP,CGI,AMP,ClinGen
0,GIST-1,BCL9L,11,118769194,118769194,SNP,C,T,missense,8,NM_182557,c.G4430A,p.R1477H,262,423,driver (oncodriveMUT),III,VUS
1,GIST-1,KIT,4,55593579,55593605,DEL,CAGAAACCCATGTATGAAGTACAGTGGA,C,in-frame deletion,11,NM_000222,c.1648_1674del,p.K550_K558del,177,176,passenger,I,LO
2,GIST-10,LEF1,4,109010359,109010359,SNP,G,C,missense,4,NM_016269,c.C469G,p.L157V,112,279,driver (oncodriveMUT),III,VUS
3,GIST-10,KIT,4,55593661,55593661,SNP,T,C,missense,11,NM_000222,c.T1727C,p.L576P,117,150,driver (boostDM: non-tissue-specific model),I,LO
4,GIST-11,KIT,4,55593601,55593605,DEL,AGTGG,-,frameshift deletion,11,NM_000222,c.1667_1671del,p.W557Gfs*3,288,87,driver (oncodriveMUT),I,VUS


In [5]:
# List the column labels available for building the MAF table.
df.columns

Index(['ID', 'Gene', 'Chr', 'Start', 'End', 'Type', 'Ref', 'Alt',
       'Consequence', 'Exon', 'Transcript', 'c.DNA', 'Protein', 'VAF', 'DP',
       'CGI', 'AMP', 'ClinGen'],
      dtype='object')

In [6]:
# Inspect the raw consequence labels before they are translated to MAF terms.
df["Consequence"]

0                 missense
1        in-frame deletion
2                 missense
3                 missense
4      frameshift deletion
5     frameshift insertion
6                 missense
7                 missense
8        in-frame deletion
9       in-frame insertion
10                  splice
11       in-frame deletion
12                missense
13       in-frame deletion
14                missense
15                missense
16                  splice
17       in-frame deletion
18                missense
19       in-frame deletion
20       in-frame deletion
21       in-frame deletion
22                missense
23                missense
24       in-frame deletion
25                missense
26                missense
27     frameshift deletion
28       in-frame deletion
29                missense
30                missense
Name: Consequence, dtype: object

In [7]:
# Translate the consequence labels used in the CSV into the
# Variant_Classification terms written to the MAF file.
consequence_mapping = {
    "missense": "Missense_Mutation",
    "frameshift insertion": "Frame_Shift_Ins",
    "frameshift deletion": "Frame_Shift_Del",
    "in-frame insertion": "In_Frame_Ins",
    "in-frame deletion": "In_Frame_Del",
    "splice": "Splice_Site"
}

# Series.replace leaves values that are not keys of the mapping untouched, so
# re-running this cell is a no-op. Series.map with a dict would instead turn
# the already-translated labels (and any label missing from the mapping) into
# NaN without warning.
df["Consequence"] = df["Consequence"].replace(consequence_mapping)

# Fail loudly if any row is left without a MAF term (unknown label or missing
# value) rather than writing an incomplete Variant_Classification column.
unmapped_consequences = df.loc[
    ~df["Consequence"].isin(list(consequence_mapping.values())), "Consequence"
]
if not unmapped_consequences.empty:
    raise ValueError(
        "Consequence values with no MAF Variant_Classification mapping: "
        f"{sorted(unmapped_consequences.astype(str).unique())}"
    )

df["Consequence"]

0     Missense_Mutation
1          In_Frame_Del
2     Missense_Mutation
3     Missense_Mutation
4       Frame_Shift_Del
5       Frame_Shift_Ins
6     Missense_Mutation
7     Missense_Mutation
8          In_Frame_Del
9          In_Frame_Ins
10          Splice_Site
11         In_Frame_Del
12    Missense_Mutation
13         In_Frame_Del
14    Missense_Mutation
15    Missense_Mutation
16          Splice_Site
17         In_Frame_Del
18    Missense_Mutation
19         In_Frame_Del
20         In_Frame_Del
21         In_Frame_Del
22    Missense_Mutation
23    Missense_Mutation
24         In_Frame_Del
25    Missense_Mutation
26    Missense_Mutation
27      Frame_Shift_Del
28         In_Frame_Del
29    Missense_Mutation
30    Missense_Mutation
Name: Consequence, dtype: object

In [8]:
# Source column -> MAF column. The order of the entries is the column order of
# the resulting MAF table.
maf_column_names = {
    "Gene": "Hugo_Symbol",
    "Chr": "Chromosome",
    "Start": "Start_Position",
    "End": "End_Position",
    "Type": "Variant_Type",  # ins, del, indel, snp
    "Ref": "Reference_Allele",
    "Alt": "Tumor_Seq_Allele2",
    "Consequence": "Variant_Classification",  # missense, nonsense, frameshift, in-frame, splice
    "ID": "Tumor_Sample_Barcode",
    "Exon": "Exon_Number",
    "Transcript": "Transcript_ID",  # canonical transcript NM_
    "c.DNA": "cDNA_Change",
    "Protein": "Protein_Change",
    "VAF": "Allele_Frequency",
    "DP": "Read_Depth",
    "CGI": "CGI_Classification",  # driver or passenger
    "AMP": "AMP_Classification",  # tier I, II, III
    "ClinGen": "ClinGen_Classification",  # oncogenic, likely oncogenic, uncertain significance
}

# Select the source columns in MAF order, then relabel them.
maf_df = df[list(maf_column_names)].rename(columns=maf_column_names)

In [9]:
# Spot-check the transcript identifiers carried over into the MAF table.
maf_df["Transcript_ID"]

0        NM_182557
1        NM_000222
2        NM_016269
3        NM_000222
4        NM_000222
5        NM_000222
6        NM_006218
7     NM_001008781
8        NM_000222
9        NM_000222
10    NM_001164273
11       NM_000222
12       NM_020987
13       NM_000222
14       NM_000088
15       NM_000222
16    NM_001134407
17       NM_000222
18       NM_006206
19       NM_000222
20       NM_000222
21       NM_000222
22       NM_005228
23       NM_000222
24       NM_006206
25       NM_014384
26       NM_000222
27       NM_000051
28       NM_000222
29       NM_000926
30       NM_006206
Name: Transcript_ID, dtype: object

In [10]:
# Export the MAF table as a tab-delimited file (relative path, so it lands in
# the current working directory), leaving out the pandas row index.
maf_output_path = "oncoplot.maf"
maf_df.to_csv(maf_output_path, index=False, sep="\t")