In [1]:
import pyensembl
from pyensembl import Genome
import pandas as pd
import csv
import pymysql

In [12]:
# PYENSEMBL
# https://buildmedia.readthedocs.org/media/pdf/pyensembl/latest/pyensembl.pdf

#### Import reference GTF to retrieve genes names by coordinates

In [2]:
# wget ftp://ftp.ensembl.org/pub/release-98/gtf/homo_sapiens/Homo_sapiens.GRCh38.98.gtf.gz
data = Genome(reference_name='GRCh38', annotation_name='98', gtf_path_or_url='REF/Homo_sapiens.GRCh38.98.gtf')

In [3]:
# only if index is not built already -> saved in current folder
# data.index()

In [3]:
# Pysensembl function to retrieve gene name from coordinates
data.gene_names_at_locus(contig=1, position=630755, end=634922, strand='+') 

['MTATP6P1', 'MTATP8P1', 'MTCO1P12', 'MTCO2P12', 'MTCO3P12']

In [4]:
# Pyensembl function retrieves gene ID from name
data.gene_ids_of_gene_name('FAM138A')

['ENSG00000237613']

In [22]:
# Pyensembl function retrieves gene name from ID
data.gene_name_of_gene_id('ENSG00000260500')

'AC010336.2'

In [6]:
# Pyensembl function retrieves gene SYMBOL from TX_ID
data.gene_name_of_transcript_id('ENST00000461467')

'FAM138A'

In [7]:
# Pyensembl function retrieves gene SYMBOL from TX_NAMES
data.gene_name_of_transcript_name('FAM138A-202')

'FAM138A'

In [8]:
# Pyensembl function retrieves TX_NAME from TX_ID
data.transcript_name_of_transcript_id('ENST00000461467')

'FAM138A-202'

# SALMON

### A) ddsSE (after TXIMETA): get genes symbols and transcripts names

In [4]:
# read TX_ID/GENE_ID list, strip decimals, retrieve corresponding gene symbol and tx_name, add symbols column, export
tx2gene_df = pd.read_csv('SALMON_TEMP/TXIMETA_salmon_txIdList_ddsSE.txt', 
                         sep='\t', 
                         skiprows=(0), 
                         header=(0))
ti_list = []
tn_list = []
gi_list = []
gn_list = []
i=0
j=0
for idx, row in tx2gene_df.iterrows():
    gene_symbol = ''
    new_tx = row['TX_ID'].split('.')[0]
    ti_list.append(new_tx)
    tx_name = data.transcript_name_of_transcript_id(new_tx)
    tn_list.append(tx_name)
    new_ga = row['GENE_ID'].split('.')[0]
    gi_list.append(new_ga)
    gene_symbol = data.gene_name_of_gene_id(new_ga)
    if gene_symbol != '':
        i+=1
        gn_list.append(gene_symbol)
    else:
        j+=1
        gs_list.append('.')
tx2gene_df['TX_ID'] = ti_list
tx2gene_df['TX_NAME'] = tn_list
tx2gene_df['GENE_ID'] = gi_list
tx2gene_df['GENE_NAME'] = gn_list
tx2gene_df = tx2gene_df[['TX_ID', 'TX_NAME', 'GENE_ID', 'GENE_NAME']]
tx2gene_df.to_csv('SALMON_TEMP/TXIMETA_salmon_txIdList_ddsSE_OK.txt', 
                  sep='\t', 
                  index=False)
print(f'Filled {i} genes symbols, still {j} remain unknown')

Filled 67111 genes symbols, still 0 remain unknown


### B) cts DTU (after TXIMPORT): get TX_NAME and GENE_ID

In [3]:
id2genes_df = pd.read_csv("SALMON_TEMP/TXIMPORT_salmon_txID_dtu.txt", 
                          sep='\t', 
                          skiprows=(0), 
                          header=(0))
i = 0
j = 0
gn_list = []
ga_list = []
tx_list = []
for idx, row in id2genes_df.iterrows():
    txname = ''
    txname = data.transcript_name_of_transcript_id(row['TX_ID'].split('.')[0])
    gname = '' 
    gname = data.gene_name_of_transcript_id(row['TX_ID'].split('.')[0])
    gnid = ''
    gnid = data.gene_ids_of_gene_name(gname)[0]
    if gnid != '':
        i+=1
    else:
        j+=1
    tx_list.append(txname)
    ga_list.append(gnid)
    gn_list.append(gname)
id2genes_df['TX_NAME'] = tx_list
id2genes_df['GENE_NAME'] = gn_list
id2genes_df['GENE_ID'] = ga_list
id2genes_df.to_csv("SALMON_TEMP/TXIMPORT_salmon_txID_dtu_OK.txt", 
                   sep='\t', 
                   index=False)
print(f'Filled {i} genes IDs, still {j} remain unknown')

Filled 70781 genes IDs, still 0 remain unknown


### C) dds_salmon (after TXIMPORT): get genes symbols


In [15]:
id2genes_df = pd.read_csv("SALMON_TEMP/TXIMPORT_salmon_geneID_dds.txt", 
                          sep='\t', 
                          skiprows=(0), 
                          header=(0))
i = 0
j = 0
gn_list = []
ga_list = []
for idx, row in id2genes_df.iterrows():
    gid = row['GENE_ID'].split('.')[0]
    gname = ''
    gname = data.gene_name_of_gene_id(gid)
    if gname != '':
        i+=1
    else:
        j+=1
    ga_list.append(gid)
    gn_list.append(gname)
id2genes_df['GENE_NAME'] = gn_list
id2genes_df['GENE_ID'] = ga_list
id2genes_df.to_csv("SALMON_TEMP/TXIMPORT_salmon_geneID_dds_OK.txt", 
                   sep='\t', 
                   index=False)
print(f'Filled {i} genes symbols, still {j} remain unknown')

Filled 25134 genes symbols, still 0 remain unknown


### D) DGE / DTE / DGU

In [14]:
id2genes_df = pd.read_csv("SALMON_TEMP/DEXSeq_geneID_DTU.txt", 
                          sep='\t', 
                          skiprows=(0), 
                          header=(0))
i = 0
j = 0
gn_list = []
ga_list = []
for idx, row in id2genes_df.iterrows():
    gid = row['GENE_ID'].split('.')[0]
    gname = ''
    try:
        data.gene_name_of_gene_id(gid)
    except:
        print(gid)
        ga_list.append(gid)
        gn_list.append(gid)
        #id2genes_df.drop(idx, inplace=True)
    else:
        gname = data.gene_name_of_gene_id(gid)
        if gname != '':
            i+=1
        else:
            j+=1
        ga_list.append(gid)
        gn_list.append(gname)
id2genes_df['GENE_NAME'] = gn_list
id2genes_df['GENE_ID'] = ga_list
id2genes_df.to_csv("SALMON_TEMP/DEXSeq_geneID_DTU_OK.txt", 
                   sep='\t', 
                   index=False)
print(f'Filled {i} genes symbols, still {j} remain unknown')

ENSG00000273000
ENSG00000228439
Filled 1499 genes symbols, still 0 remain unknown


### E) Karyotype plot

In [4]:
id2genes_df = pd.read_csv("SALMON_TEMP/Karyo_geneIDs.txt", 
                          sep='\t', 
                          skiprows=(0), 
                          header=(0))
i = 0
j = 0
gn_list = []
ga_list = []
for idx, row in id2genes_df.iterrows():
    gid = row['GENE_ID'].split('.')[0]
    gname = ''
    try:
        data.gene_name_of_gene_id(gid)
    except:
        print(gid)
        ga_list.append(gid)
        gn_list.append(gid)
        #id2genes_df.drop(idx, inplace=True)
    else:
        gname = data.gene_name_of_gene_id(gid)
        if gname != '':
            i+=1
        else:
            j+=1
        ga_list.append(gid)
        gn_list.append(gname)
id2genes_df['GENE_NAME'] = gn_list
id2genes_df['GENE_ID'] = ga_list
id2genes_df.to_csv("SALMON_TEMP/Karyo_geneIDs_OK.txt", 
                   sep='\t', 
                   index=False)
print(f'Filled {i} genes symbols, still {j} remain unknown')

Filled 45 genes symbols, still 0 remain unknown


### F) dge LIMMA 

In [16]:
id2genes_df = pd.read_csv("SALMON_TEMP/LIMMA_salmon_id2genes.txt", sep='\t', skiprows=(0), header=(0))
i = 0
j = 0
gn_list = []
for idx, row in id2genes_df.iterrows():
    gname = ''
    gname = data.gene_name_of_gene_id(row['GENE_ID'])
    if gname != '':
        i+=1
    else:
        j+=1
    gn_list.append(gname)
id2genes_df['GENE_NAME'] = gn_list
id2genes_df.to_csv("SALMON_TEMP/LIMMA_salmon_id2genes_OK.txt", sep='\t', index=False)
print(f'Filled {i} genes symbols, still {j} remain unknown')

Filled 14239 genes symbols, still 0 remain unknown
