# Sometimes it's easier to look at large tables if there is an extra 'gene name' column

In [1]:
import glob
import os
import gffutils
import pandas as pd
from tqdm import tnrange, tqdm_notebook
from tqdm import tqdm
tqdm.pandas()

In [2]:
# Directory that is searched for the EPKM tables.
input_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA_APOBEC_RPS2/outputs'
# The output directory is currently the very same location as the input one.
output_dir = input_dir

In [3]:
# Collect every '*cds.epkm.tsv' file in input_dir, in sorted order.
table_pattern = os.path.join(input_dir, '*cds.epkm.tsv')
merged_tables = glob.glob(table_pattern)
merged_tables.sort()
# Bare expression so the notebook displays the list that was found.
merged_tables

['/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA_APOBEC_RPS2/outputs/Apo_Control_possorted_genome_bam_MD.merged.epkm.cds.epkm.tsv',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA_APOBEC_RPS2/outputs/RPS2_possorted_genome_bam_MD.merged.epkm.cds.epkm.tsv']

In [4]:
def gene_id_to_name(db, gencode=True):
    """
    Returns a dictionary containing a gene_id:name translation
    Note: may be different if the 'gene_id' or 'gene_name'
    keys are not in the source GTF file
    (taken from gscripts.region_helpers)

    db: object exposing features_of_type('gene') (e.g. a gffutils.FeatureDB);
        every yielded feature must provide an `attributes` mapping.
    gencode: True if we're using Gencode-style annotations, gene IDs are kept
        as-is (ie. ENSG00000100320.18). False if we're using ENSEMBL-style,
        the version suffix is stripped from the gene ID (ie. ENSG00000100320).

    Returns: dict mapping gene ID -> gene name. Genes without a 'gene_name'
        attribute are reported with a warning and left out of the dict (the
        function always returns a dict, never an error code).
    Raises: KeyError if a gene has no 'gene_id' attribute.
    """
    def first_value(value):
        # Attribute values may be a list of strings or a bare string.
        return value[0] if isinstance(value, list) else value

    gene_name_dict = {}
    for gene in db.features_of_type('gene'):
        gene_id = first_value(gene.attributes['gene_id'])
        if not gencode:
            # Only the ID carries a version suffix; strip it for both the
            # list and the bare-string form of the attribute.
            gene_id = gene_id.split('.')[0]
        try:
            # The name is stored untouched: gene names may legitimately
            # contain dots (e.g. 'RP11-34P13.7') and must not be truncated.
            gene_name = first_value(gene.attributes['gene_name'])
        except KeyError:
            print(gene.attributes.keys())
            print("Warning. Key not found for {}".format(gene))
            continue
        gene_name_dict[gene_id] = gene_name
    return gene_name_dict

# Path of the gffutils database file used for the gene ID -> gene name lookup.
# Alternative (GENCODE v19) database, kept for reference:
# db_file = '/projects/ps-yeolab3/bay001/annotations/hg19/gencode_v19/gencode.v19.annotation.gtf.db'
db_file = '/projects/ps-yeolab3/bay001/annotations/hg19/refdata-cellranger-hg19-3.0.0/genes.gtf.db'
# Open the database once and build the lookup dictionary from it.
# gencode=False selects the ENSEMBL-style branch of gene_id_to_name.
DATABASE = gffutils.FeatureDB(db_file)
gene_id_to_name_dictionary = gene_id_to_name(DATABASE, gencode=False)

# test to make sure function works: look up one gene ID (the notebook displays the result)
# versioned key, for use with the GENCODE database instead:
# gene_id_to_name_dictionary['ENSG00000100320.18']
gene_id_to_name_dictionary['ENSG00000100320']

'RBFOX2'

In [5]:
def id2name(row, d=gene_id_to_name_dictionary):
    """
    Row-wise helper for DataFrame.apply(axis=1): return the entry of d
    stored under the row's 'Geneid' value.

    row: mapping-like row that has a 'Geneid' field
    d: lookup dictionary (defaults to gene_id_to_name_dictionary, bound
       when this function is defined)
    Raises KeyError if the row's 'Geneid' is not a key of d.
    """
    gene_id = row['Geneid']
    return d[gene_id]

for table in merged_tables:
    # load the tab-separated table
    df = pd.read_csv(table, sep='\t')
    # translate each row's 'Geneid' into a gene name (tqdm shows a progress bar)
    gene_names = df.progress_apply(id2name, axis=1)
    df['genename'] = gene_names
    # using 'genename' as the index makes it the first column of the written file
    df = df.set_index('genename')
    # write next to the input table: same path with the last extension
    # replaced by '.wgenenames.txt'
    out_stem = os.path.splitext(table)[0]
    df.to_csv(out_stem + ".wgenenames.txt", sep='\t')

100%|██████████| 20356/20356 [00:27<00:00, 750.06it/s] 
100%|██████████| 20356/20356 [00:36<00:00, 559.66it/s]
