# Join all the scored files together

In [1]:
import glob
import os
import gffutils
import pandas as pd
from tqdm import tnrange, tqdm_notebook

In [2]:
# Directory that is searched for the '*.tsv' tables to annotate.
input_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/05_scRNA_RBFOX2_APO_transient/outputs'
# Same path as input_dir.
# NOTE(review): output_dir is not referenced by the cells visible here; the
# annotated tables are written next to their source tables instead — confirm
# whether this variable is still needed.
output_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/05_scRNA_RBFOX2_APO_transient/outputs'

In [3]:
def gene_id_to_name(db):
    """
    Returns a dictionary containing a gene_id:name translation
    Note: may be different if the 'gene_id' or 'gene_name'
    keys are not in the source GTF file
    (taken from gscripts.region_helpers)

    :param db: object exposing features_of_type('gene'), which yields
        features whose ``attributes`` mapping holds 'gene_id' and
        (usually) 'gene_name'; e.g. a gffutils.FeatureDB.
    :return: dict mapping each gene_id to its gene_name. Genes without a
        'gene_name' attribute are reported on stdout and left out of the
        dictionary, so the return value is always a dict.
    :raises KeyError: if a gene feature has no 'gene_id' attribute.
    """
    def _first(value):
        # Attribute values may be a list (GENCODE-style) or a bare string;
        # only index into real sequences so a string is never truncated
        # to its first character.
        return value[0] if isinstance(value, (list, tuple)) else value

    gene_name_dict = {}
    for gene in db.features_of_type('gene'):
        attributes = gene.attributes
        gene_id = _first(attributes['gene_id'])
        try:
            gene_name = attributes['gene_name']
        except KeyError:
            # Warn and keep going: one unnamed gene should not discard the
            # whole mapping (or turn the result into a non-dict).
            print(attributes.keys())
            print("Warning. Key not found for {}".format(gene))
            continue
        gene_name_dict[gene_id] = _first(gene_name)
    return gene_name_dict

# Path to an existing gffutils database file.
# NOTE(review): the filename suggests GENCODE v19 on hg19 — confirm this
# matches the annotation used to produce the tables being annotated.
db_file = '/projects/ps-yeolab3/bay001/annotations/hg19/gencode_v19/gencode.v19.annotation.gtf.db'
DATABASE = gffutils.FeatureDB(db_file)
# Build the gene_id -> gene_name lookup once; later cells reuse it.
gene_id_to_name_dictionary = gene_id_to_name(DATABASE)

# Sanity check: look up a single gene ID and display the result
# (the recorded cell output for this ID is 'RBFOX2').
gene_id_to_name_dictionary['ENSG00000100320.18']

  "method of this object." % self.version)


'RBFOX2'

In [4]:
# Collect every '*.tsv' file directly inside input_dir; sorting makes the
# processing order reproducible, since glob order is not guaranteed.
merged_tables = sorted(glob.glob(os.path.join(input_dir, '*.tsv')))
# Bare expression so the notebook displays the list of matched paths.
merged_tables

['/home/bay001/projects/kris_apobec_20200121/permanent_data2/05_scRNA_RBFOX2_APO_transient/outputs/APOBEC_transient_possorted_genome_bam_MD.exons.merged.edited_over_all_c.tsv',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/05_scRNA_RBFOX2_APO_transient/outputs/RBFOX2_transient_possorted_genome_bam_MD.exons.merged.edited_over_all_c.tsv']

In [5]:
def id2name(row, d=gene_id_to_name_dictionary):
    """
    Row-wise helper: translate the gene ID held in row['name'] through d.

    Intended for DataFrame.apply(..., axis=1). The lookup is a plain
    d[...] subscript, so an ID missing from d propagates the lookup
    error (KeyError for a dict).
    """
    gene_id = row['name']
    return d[gene_id]

for table in merged_tables:
    # load the tab-separated table
    df = pd.read_csv(table, sep='\t')
    # look up the gene name for every row; the 'name' column must hold the gene ID
    gene_names = df.apply(id2name, axis=1)
    # add the names as 'genename' and make that column the index, so it is
    # written out as the first column
    df = df.assign(genename=gene_names).set_index('genename')
    # write next to the source table: <table without extension>.wgenenames.txt
    out_path = os.path.splitext(table)[0] + ".wgenenames.txt"
    df.to_csv(out_path, sep='\t')