In [77]:
import pandas as pd
import os

In [78]:
parent_folder = os.path.dirname(os.getcwd())+"\\"
#Set output folder:

#Folder with all input files:
data_folder = parent_folder+"input"+"\\"

#HGNC gene annotations from https://www.genenames.org/
hgnc_f = data_folder + "hgnc5.txt"

#Mutation data from https://gdc.cancer.gov/about-data/publications/pancanatlas
mut_f = data_folder +"mc3.v0.2.8.PUBLIC.maf" 

#mRNA expression data from https://gdc.cancer.gov/about-data/publications/pancanatlas
exp_f = data_folder + "EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.tsv" 
    
#CNV data from https://gdc.cancer.gov/about-data/publications/pancan-aneuploidy
cnv_f = data_folder + "all_thresholded.by_genes_whitelisted.tsv"

In [84]:
def gene_lookup(exp_f,hgnc_f):
    #Make a table with gene symbol and gene ID based on the mRNA exp file
    out = []
    n = 0
    with open(exp_f) as f:
        for line in f:
            if n > 0:
                line = line.strip()
                line = line.replace('"','').split('\t')
                out.append([line[0].split('|')[0].strip(),line[0].split('|')[1].strip()])
            n = 1
    
    lookup_df = pd.DataFrame(out,columns = ['old_symbol','gene_id'])

    #Get hgnc symbols
    hgnc_df = pd.read_csv(hgnc_f,sep='\t',index_col = 0)
    hgnc_df['NCBI Gene ID']= [str(x) for x in hgnc_df['NCBI Gene ID'].values]
    #Add current symbols
    out = []
    for i in lookup_df.index:
        gene_id = str(lookup_df.loc[i]['gene_id'])
        hgnc_symbol = hgnc_df.loc[hgnc_df['NCBI Gene ID']==gene_id]['Approved symbol'].values
        if len(hgnc_symbol)>0:
            out.append(hgnc_symbol[0])
        else:
            out.append('')  
    lookup_df['current_symbol']=out
    print("DONE!")
    return(lookup_df)

In [85]:
def refine_mRNA_gene_names(exp_f,lookup_df,outfile):
    exp_f2 = open(outfile, "w")#we will save the refined file here
    n=0
    with open(exp_f) as f:
        for line in f:
            line = line.strip()
            if line != '':
                line = line.replace('"','').split('\t')
            if n > 0 and line != '':
                gene_id = line[0].split("|")[1].strip()
                symbol = lookup_df.loc[lookup_df['gene_id']==str(gene_id)]['current_symbol'].values
                if len(symbol)==0:
                    symbol = ''
                else:
                    symbol = symbol[0]
                if symbol == '':
                    symbol = lookup_df.loc[lookup_df['gene_id']==str(gene_id)]['old_symbol'].values[0]
                    if symbol == '?':
                        symbol = str(gene_id)
                line[0] = symbol
            line = [x.strip() for x in line]
            line = ('\t').join(line)+"\n"
            exp_f2.write(line)
            n=1
    exp_f2.close()
    print("Refined file saved to: ",outfile)

In [86]:
def refine_CNV_names(cnv_f,lookup_df,outfile):
    #update gene names in CNV file
    n = 0
    cnv_f2 = open(outfile,'w')
    with open(cnv_f) as f:
        for line in f:
            if n > 0:
                line = line.split('\t')
                if line[0] in lookup_df['old_symbol'].values:
                    line[0] = lookup_df.loc[lookup_df['old_symbol']==line[0]]['current_symbol'].values[0]
                line = [x.strip().replace("\n", "") for x in line]
                line = ('\t').join(line)+'\n'
            cnv_f2.write(line)
            n=1
    cnv_f2.close()   
    print("Refined file saved to: ",outfile)

In [76]:
def refine_MUT_names(mut_f,lookup_df,outfile):
    #update gene names in CNV file
    n = 0
    mut_f2 = open(outfile,'w')

    with open(mut_f) as f:
        for line in f:
            line = line.split('\t')
            line = [x.replace("\n", "").replace('"', '').strip() for x in line]
            if line[0] in lookup_df['old_symbol'].values:
                line[0] = lookup_df.loc[lookup_df['old_symbol']==line[0]]['current_symbol'].values[0]
            line = ('\t').join([line[0],line[8],line[9],line[15],line[36]])+'\n'
            mut_f2.write(line)
            n=1
    mut_f2.close()   
    print("Refined file saved to: ",outfile)

In [87]:
#Compare gene symbols in the mRNA expression file with current HGNC gene symbols
#and create a look-up table to update gene names
lookup_df = gene_lookup(exp_f,hgnc_f)

  hgnc_df = pd.read_csv(hgnc_f,sep='\t',index_col = 0)


DONE!


In [88]:
#Check and rename genes in the mRNA expression file:
outfile = ('.').join(exp_f.split('.')[:-1]+['refined.tsv'])#we will save the refined file here
refine_mRNA_gene_names(exp_f,lookup_df,outfile)

Refined file saved to:  c:\Users\aaivano\OneDrive - Emory University\AVERON\AVERON_Share\input\EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.refined.tsv


In [89]:
#Check and rename genes in the CNV file:
outfile = ('.').join(cnv_f.split(".")[:-1]+['.refined.tsv'])
refine_CNV_names(cnv_f,lookup_df,outfile)

Refined file saved to:  c:\Users\aaivano\OneDrive - Emory University\AVERON\AVERON_Share\input\all_thresholded.by_genes_whitelisted..refined.tsv
