In [1]:
import pandas as pd
import gffutils
import os
from collections import defaultdict

In [2]:
# Location of the gffutils database file built from the annotation GTF
# (per the path: GENCODE v19 on hg19)
db_file = '/projects/ps-yeolab/genomes/hg19/gencode_v19/gencode.v19.annotation.gtf.db'
# Handle to the annotation database; passed to gene_id_to_name() below
DATABASE = gffutils.FeatureDB(db_file)

  "method of this object." % self.version)


In [3]:
# Working directory: the expression table is read from, and the renamed
# table is written back to, this folder.
# Previous working directory, kept for reference:
# wd = '/projects/ps-yeolab3/iachaim/Cleber_Organoids/6_months/'
wd = '/projects/ps-yeolab3/bay001/Cleber_Organoids_tempdir/'

# Build a gene id to name dictionary
- {EnsemblID:name}

In [4]:
def gene_id_to_name(db):
    '''
    Returns a dictionary containing a gene_id:name translation
    (adapted from gscripts.region_helpers).

    Parameters
    ----------
    db : object exposing ``features_of_type('gene')`` (e.g. a
        gffutils.FeatureDB); every yielded feature must provide a
        dict-like ``attributes`` mapping.

    Returns
    -------
    dict
        {ensembl_id: gene_name}, where ensembl_id is the 'gene_id'
        attribute with everything after the first '.' removed.
        A dict is always returned, possibly empty.

    Notes
    -----
    Features lacking a 'gene_id' or 'gene_name' attribute (possible when
    the source GTF does not define those keys) are reported with a
    warning and skipped, so one incomplete record no longer discards the
    translations collected for every other gene.
    '''
    def _first(value):
        # attribute values may be a list of strings or a bare string;
        # take the first entry of a list so exactly one value is kept
        return value[0] if isinstance(value, list) else value

    gene_name_dict = {}
    for gene in db.features_of_type('gene'):
        try:
            gene_id = _first(gene.attributes['gene_id'])
            # first gene name found per id (no aliases)
            gene_name = _first(gene.attributes['gene_name'])
        except KeyError:
            print(gene.attributes.keys())
            print("Warning. Key not found for {}".format(gene))
            continue
        # gencode and ensembl differ in that gencode includes annotation version
        ensembl_id = gene_id.split('.')[0]
        gene_name_dict[ensembl_id] = gene_name
    return gene_name_dict

# create the dictionary once from the annotation database; it is reused
# below as the lookup table when relabelling the expression matrix
gene_id_to_name_dictionary = gene_id_to_name(DATABASE)

# let's test it out: this Ensembl ID should translate to 'RBFOX2'
gene_id_to_name_dictionary['ENSG00000100320']

'RBFOX2'

In [5]:
# Load the comma-separated expression matrix from the working directory,
# using its first column as the row index
fn = 'H09_expression.csv' # filename (fn)

csv = pd.read_csv(os.path.join(wd, fn), index_col=0)
print("original matrix sizes (row, column)", csv.shape)
csv.head()

('original matrix sizes (row, column)', (32738, 1739))


Unnamed: 0,AAACCTGAGATCTGAA-1,AAACCTGCATCCTTGC-1,AAACCTGGTCAGAGGT-1,AAACGGGCAATGTTGC-1,AAACGGGCAGTTCATG-1,AAACGGGGTGTGGTTT-1,AAACGGGGTTAAGAAC-1,AAACGGGTCAGTCCCT-1,AAAGATGAGCGCTCCA-1,AAAGATGGTTACCAGT-1,...,TTTGCGCTCAGGCAAG-1,TTTGGTTAGGACAGAA-1,TTTGGTTTCAGGTAAA-1,TTTGGTTTCTCGTATT-1,TTTGTCAAGACCTAGG-1,TTTGTCACACAACGCC-1,TTTGTCACAGCCTTGG-1,TTTGTCAGTATGAATG-1,TTTGTCAGTCTTTCAT-1,TTTGTCAGTTCGAATC-1
ENSG00000243485,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000237613,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000186092,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000238009,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000239945,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Convert gene ids

In [6]:
def geneid2name(row, d=gene_id_to_name_dictionary):
    '''
    Build the combined '<id>_<gene name>' label for one row.

    row : object whose ``name`` attribute is the gene id (when used with
          DataFrame.apply(axis=1) this is the row's index label)
    d   : mapping of gene id -> gene name

    Ids missing from ``d`` are labelled '<id>_NA'.
    '''
    ensembl_id = row.name
    try:
        symbol = d[ensembl_id]
    except KeyError:
        # id is not present in the annotation lookup
        symbol = 'NA'
    return '{}_{}'.format(ensembl_id, symbol)

# Apply row-wise (axis=1) so geneid2name receives each row and can read
# its index label via row.name; the labels go into a new 'gene_name' column
csv['gene_name'] = csv.apply(geneid2name, axis=1)
csv.head()

Unnamed: 0,AAACCTGAGATCTGAA-1,AAACCTGCATCCTTGC-1,AAACCTGGTCAGAGGT-1,AAACGGGCAATGTTGC-1,AAACGGGCAGTTCATG-1,AAACGGGGTGTGGTTT-1,AAACGGGGTTAAGAAC-1,AAACGGGTCAGTCCCT-1,AAAGATGAGCGCTCCA-1,AAAGATGGTTACCAGT-1,...,TTTGGTTAGGACAGAA-1,TTTGGTTTCAGGTAAA-1,TTTGGTTTCTCGTATT-1,TTTGTCAAGACCTAGG-1,TTTGTCACACAACGCC-1,TTTGTCACAGCCTTGG-1,TTTGTCAGTATGAATG-1,TTTGTCAGTCTTTCAT-1,TTTGTCAGTTCGAATC-1,gene_name
ENSG00000243485,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ENSG00000243485_MIR1302-11
ENSG00000237613,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ENSG00000237613_FAM138A
ENSG00000186092,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ENSG00000186092_OR4F5
ENSG00000238009,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ENSG00000238009_RP11-34P13.7
ENSG00000239945,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ENSG00000239945_RP11-34P13.8


In [7]:
# Replace the Ensembl-ID index with the combined 'gene_name' labels.
# Promoting the column to the index also moves it to the left-hand side
# of the table (there are other ways to achieve this).
csv = csv.set_index('gene_name')
csv.head()

Unnamed: 0_level_0,AAACCTGAGATCTGAA-1,AAACCTGCATCCTTGC-1,AAACCTGGTCAGAGGT-1,AAACGGGCAATGTTGC-1,AAACGGGCAGTTCATG-1,AAACGGGGTGTGGTTT-1,AAACGGGGTTAAGAAC-1,AAACGGGTCAGTCCCT-1,AAAGATGAGCGCTCCA-1,AAAGATGGTTACCAGT-1,...,TTTGCGCTCAGGCAAG-1,TTTGGTTAGGACAGAA-1,TTTGGTTTCAGGTAAA-1,TTTGGTTTCTCGTATT-1,TTTGTCAAGACCTAGG-1,TTTGTCACACAACGCC-1,TTTGTCACAGCCTTGG-1,TTTGTCAGTATGAATG-1,TTTGTCAGTCTTTCAT-1,TTTGTCAGTTCGAATC-1
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000243485_MIR1302-11,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000237613_FAM138A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000186092_OR4F5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000238009_RP11-34P13.7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000239945_RP11-34P13.8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Sanity check: the shape should equal the original matrix size printed
# after loading, because the added 'gene_name' column is now the index.
# Duplicate labels are checked separately further down.
print("new matrix sizes (row, column)", csv.shape)

('new matrix sizes (row, column)', (32738, 1739))


In [9]:
# Write the relabelled matrix into the working directory, next to the input
csv.to_csv(
    os.path.join(wd, fn + '.genename.csv'), # append a '.genename.csv' suffix to the original filename
    sep=','
)

In [10]:
# check for duplicate indices: index.duplicated() flags repeated labels
# (every occurrence after the first, by default), so an empty table here
# means no label is repeated
csv[csv.index.duplicated()].head()

Unnamed: 0_level_0,AAACCTGAGATCTGAA-1,AAACCTGCATCCTTGC-1,AAACCTGGTCAGAGGT-1,AAACGGGCAATGTTGC-1,AAACGGGCAGTTCATG-1,AAACGGGGTGTGGTTT-1,AAACGGGGTTAAGAAC-1,AAACGGGTCAGTCCCT-1,AAAGATGAGCGCTCCA-1,AAAGATGGTTACCAGT-1,...,TTTGCGCTCAGGCAAG-1,TTTGGTTAGGACAGAA-1,TTTGGTTTCAGGTAAA-1,TTTGGTTTCTCGTATT-1,TTTGTCAAGACCTAGG-1,TTTGTCACACAACGCC-1,TTTGTCACAGCCTTGG-1,TTTGTCAGTATGAATG-1,TTTGTCAGTCTTTCAT-1,TTTGTCAGTTCGAATC-1
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
