In [25]:
#Import packages
import json
import os

import numpy as np
import pandas as pd
from biomart import BiomartServer
from IPython.core.display import HTML
# Display-only tweak: inject CSS that forces the `.container` element to 100% width.
# This is the last expression of the cell, so the HTML object is rendered as the cell output.
HTML("<style>.container { width:100% !important; }</style>")

In [3]:
#Reading the Zinc-finger domain data
# os.getcwd() replaces the `!pwd` shell escape: it needs no subprocess, works on
# non-POSIX systems and is plain Python. The value is kept as a one-element list
# so that any existing `curr_dir[0]` usage keeps working.
curr_dir = [os.getcwd()]
# my_path keeps its trailing slash because file names are appended to it directly.
my_path = curr_dir[0]+"/hmm_domains/"
filename = "zf-C2H2.csv"
# Despite the .csv extension the file is read as tab-separated; the first column becomes the index.
zinc_finger = pd.read_csv(my_path+filename, sep='\t', index_col=0)

In [4]:
#Connecting to the Biomart server and selecting the dataset that is queried for gene ids
BIOMART_URL = "http://grch37.ensembl.org/biomart"
GENE_DATASET = u'hsapiens_gene_ensembl'

server = BiomartServer(BIOMART_URL)
# ens_genes is read as a global by get_uniprot_id
ens_genes = server.datasets[GENE_DATASET]

In [5]:
#Look up the Uniprot id of an Ensembl gene (used for querying json entries in Uniprot); "" if not found
def get_uniprot_id(ensembl_id):
    """Return the first `uniprot_swissprot` value Biomart reports for a gene.

    Parameters
    ----------
    ensembl_id : value passed as the `ensembl_gene_id` filter of the query.

    Returns
    -------
    The first tab-separated field of the first response line, ASCII-encoded
    (non-ASCII characters are dropped), or "" when Biomart counts zero matches
    or the response contains no lines.

    Reads the module-level `ens_genes` dataset; two requests are sent per call
    (a count, then the actual search).
    """
    def build_query():
        # A fresh dict per request, so the count and the search never share state
        return {
            'filters': {
                'ensembl_gene_id': ensembl_id
            },
            'attributes': [
                'uniprot_swissprot'
            ]
        }

    #No match at all: nothing to fetch
    if ens_genes.count(build_query()) == 0:
        return ""

    #At least one match: fetch the rows and keep only the first one
    result = ens_genes.search(build_query())
    for raw_line in result.iter_lines():
        first_field = raw_line.decode('utf-8').split("\t")[0]
        return first_field.encode('ascii', 'ignore')

    #The response had no lines
    return ""

In [57]:
#Given transcript ids for particular gene, return the transcript id and protein id with the biggest length
def find_longest_trans_prot(transcript_ids):
    """Return `[transcript_id, prot_id]` of the longest entry among the given transcripts.

    Reads the module-level `gene_table` DataFrame (columns "transcript_id",
    "prot_id" and "length"), which the caller must set before calling.

    Parameters
    ----------
    transcript_ids : iterable of transcript ids to look up in `gene_table`.

    Returns
    -------
    A two-element list `[transcript_id, prot_id]`. Ties keep the first pair
    encountered. `["", ""]` is returned when nothing has a length above 0
    (including an empty `transcript_ids`).
    """
    max_len = 0
    max_trans_id = ""
    max_prot_id = ""
    for trans in transcript_ids:
        trans_table = gene_table[gene_table["transcript_id"] == trans]
        for prot in trans_table["prot_id"].unique():
            #all the lines with the same transcript id and protein id should have the same length
            trans_prot_len = trans_table[trans_table["prot_id"] == prot]["length"].tolist()[0]
            if trans_prot_len > max_len:
                # Fixed: this used to assign the undefined name `trans_len`,
                # raising NameError on the first candidate with a positive length.
                max_len = trans_prot_len
                max_trans_id = trans
                max_prot_id = prot
    return [max_trans_id, max_prot_id]

In [58]:
#Set up the dictionary of canonical transcript ids and protein ids, keyed by gene id
canonic_transcript = dict()
#Every distinct gene id present in the zinc-finger table
genes_ids = zinc_finger["ensembl_id"].unique()
#Counters: genes with no Uniprot id / genes where no transcript matched the Uniprot length
no_uniprot = no_canonic_len = 0

In [67]:
# For every gene, pick one [transcript_id, prot_id] pair and store it in canonic_transcript.
# The print calls use the parenthesised single-argument form, which prints the same text
# under Python 2 and is also valid Python 3 syntax.
for ens_gene in genes_ids:
    # gene_table is deliberately a module-level name: find_longest_trans_prot reads it.
    gene_table = zinc_finger[zinc_finger["ensembl_id"] == ens_gene]
    transcript_ids = gene_table["transcript_id"].unique()

    if len(transcript_ids) == 1:
        #Just find the longest protein for this transcript
        canonic_transcript[ens_gene] = find_longest_trans_prot(transcript_ids)
        continue

    #If there's more than one transcript: finding what's the canonic protein length from uniprot
    uniprot_id = get_uniprot_id(ens_gene)
    if uniprot_id == "":
        print(ens_gene+": uniprot wasn't found for "+ens_gene)
        canonic_transcript[ens_gene] = find_longest_trans_prot(transcript_ids)
        no_uniprot += 1
        continue

    # One HTTP request per gene; the `aalen` field of the returned entry is used as the
    # canonical length (presumably the amino-acid length -- confirm against the service).
    uniprot_url = "http://togows.dbcls.jp/entry/uniprot/"+uniprot_id+".json"
    uniprot_json = pd.read_json(uniprot_url)
    canonic_len = uniprot_json.aalen[0]

    # Take the first (transcript, protein) pair whose length equals the canonical length
    found = False
    for trans in transcript_ids:
        trans_table = gene_table[gene_table["transcript_id"] == trans]
        protein_ids = trans_table["prot_id"].unique()
        for prot in protein_ids:
            #all the lines with the same transcript id and protein id should have the same length
            trans_prot_len = trans_table[trans_table["prot_id"] == prot]["length"].tolist()[0]

            #If the length equal the canonical, this is the canonical transcript id
            if trans_prot_len == canonic_len:
                found = True
                canonic_transcript[ens_gene] = [trans, prot]
                break

        if found:
            break

    #The canonic length isn't one of the ensembl transcripts, taking the longest transcript(?)
    if not found:
        print(ens_gene+": Transcripts don't match the Uniprot canonic")
        no_canonic_len += 1
        canonic_transcript[ens_gene] = find_longest_trans_prot(transcript_ids)

ENSG00000089335: Transcripts don't match the Uniprot canonic
ENSG00000256463: Transcripts don't match the Uniprot canonic
ENSG00000124782: Transcripts don't match the Uniprot canonic


In [70]:
# Write the gene -> [transcript_id, prot_id] dictionary as JSON into the data directory.
# open() replaces the Python 2-only file() builtin, and the with-block closes (and
# flushes) the handle as soon as the dump finishes instead of leaving it to the GC.
with open(my_path+'zinc_canonic_trans_prot.json', 'w') as out_file:
    json.dump(canonic_transcript, out_file)

In [None]:
#Printouts from creating the canonic dictionary (pasted log output kept for reference, not code to run)
ENSG00000234669: Transcripts don't match the Uniprot canonic
ENSG00000148516: Transcripts don't match the Uniprot canonic
ENSG00000184937: Transcripts don't match the Uniprot canonic
ENSG00000147124: Transcripts don't match the Uniprot canonic
ENSG00000130544: Transcripts don't match the Uniprot canonic
ENSG00000170954: Transcripts don't match the Uniprot canonic
ENSG00000203326: Transcripts don't match the Uniprot canonic
ENSG00000164011: Transcripts don't match the Uniprot canonic
ENSG00000171606: Transcripts don't match the Uniprot canonic
ENSG00000066827: Transcripts don't match the Uniprot canonic
ENSG00000263310: uniprot wasn't found for ENSG00000263310
ENSG00000170325: Transcripts don't match the Uniprot canonic
ENSG00000272602: uniprot wasn't found for ENSG00000272602
ENSG00000167840: Transcripts don't match the Uniprot canonic
ENSG00000187607: uniprot wasn't found for ENSG00000187607
ENSG00000030419: Transcripts don't match the Uniprot canonic
ENSG00000063587: Transcripts don't match the Uniprot canonic
ENSG00000141040: Transcripts don't match the Uniprot canonic
ENSG00000196381: Transcripts don't match the Uniprot canonic
ENSG00000089335: Transcripts don't match the Uniprot canonic
ENSG00000256463: Transcripts don't match the Uniprot canonic
ENSG00000124782: Transcripts don't match the Uniprot canonic