In [1]:
#Import packages
import pandas as pd
import numpy as np
from biomart import BiomartServer
import json
import pickle
from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

In [2]:
#Reading the domain data table. The active file is the Homeobox domain;
#the zinc-finger (zf-C2H2) file is the commented-out alternative below.
#`!pwd` is IPython shell capture: it yields the command's output lines,
#so curr_dir[0] is the current working directory.
curr_dir = !pwd
my_path = curr_dir[0]+"/hmm_domains/"
#filename = "zf-C2H2.csv"
filename = "Homeobox.csv"
#Tab-separated file; the first column is used as the DataFrame index
domain_data = pd.read_csv(my_path+filename, sep='\t', index_col=0)

In [3]:
#Connect to the GRCh37 Ensembl BioMart service and pick the human gene dataset
biomart_url = "http://grch37.ensembl.org/biomart"
dataset_name = u'hsapiens_gene_ensembl'
server = BiomartServer(biomart_url)
ens_genes = server.datasets[dataset_name]

In [4]:
#Return the Uniprot id for querying json entries in Uniprot, or "" if not found
def get_uniprot_id(ensembl_id):
    """Look up the 'uniprot_swissprot' attribute of an Ensembl gene in BioMart.

    Args:
        ensembl_id: value passed to the 'ensembl_gene_id' filter.

    Returns:
        The first non-blank value found in the first tab-separated column of
        the response, ASCII-encoded, or "" when BioMart reports no matching
        rows or every returned row is blank.
    """
    def build_query():
        #A fresh dict per request, so the count and search calls never share state
        return {
            'filters': {
                'ensembl_gene_id': ensembl_id
            },
            'attributes': [
                'uniprot_swissprot'
            ]
        }

    #Querying the Biomart database to get number of matches
    if ens_genes.count(build_query()) == 0:
        return ""

    #In case of at least one match: querying the Biomart database
    response = ens_genes.search(build_query())

    #Reading the query result. Blank rows are skipped instead of stopping at
    #the first row, so an accession listed after an empty row is not lost.
    for raw_line in response.iter_lines():
        accession = raw_line.decode('utf-8').split("\t")[0].strip()
        if accession:
            #Encoded back to ASCII as before (a plain str on Python 2)
            return accession.encode('ascii', 'ignore')

    return ""

In [5]:
def find_longest_prot(prot_ids, table=None):
    """Return the protein id with the largest "length" value among prot_ids.

    Args:
        prot_ids: iterable of protein ids to compare.
        table: DataFrame with "prot_id" and "length" columns in which the
            lengths are looked up. When omitted, the module-level
            gene_table is used, as the function always did before.

    Returns:
        The id whose length is the largest (the first such id wins a tie),
        or "" when prot_ids is empty or no id has a length above 0 in the
        table.
    """
    if table is None:
        #Backward-compatible default: the per-gene slice set by the caller
        table = gene_table

    max_len = 0
    max_prot_id = ""
    for prot in prot_ids:
        prot_lens = table.loc[table["prot_id"] == prot, "length"].unique()
        if (len(prot_lens) == 0):
            #No rows for this id: nothing to compare, and prot_lens[0] would raise
            continue
        if (len(prot_lens) > 1):
            #Sanity check; only the first listed length is compared below
            print("Error: more than one length for protein id: "+prot)
        if (prot_lens[0] > max_len):
            max_len = prot_lens[0]
            max_prot_id = prot
    return max_prot_id

In [6]:
#Accumulators for the canonical-protein selection:
#  canonic_protein - maps each Ensembl gene id to its chosen protein id
#  genes_ids       - the distinct gene ids present in the domain table
#  no_uniprot / no_canonic_len - counters for the two fallback cases
canonic_protein = dict()
genes_ids = domain_data["ensembl_id"].unique()
no_uniprot = no_canonic_len = 0

In [7]:
#Choose one protein id per gene and store it in canonic_protein
for ens_gene in genes_ids:
    #gene_table has to remain a module-level name: find_longest_prot reads it
    gene_table = domain_data[domain_data["ensembl_id"] == ens_gene]
    protein_ids = gene_table["prot_id"].unique()

    #Exactly one protein id available: save it, nothing to disambiguate
    if len(protein_ids) == 1:
        canonic_protein[ens_gene] = protein_ids[0]
        continue

    #More than one protein id: find the canonic protein length from Uniprot
    uniprot_id = get_uniprot_id(ens_gene)
    if uniprot_id == "":
        #No Uniprot id for this gene: fall back to the longest protein
        print(ens_gene+": uniprot wasn't found for "+ens_gene)
        canonic_protein[ens_gene] = find_longest_prot(protein_ids)
        no_uniprot += 1
        continue

    uniprot_url = "http://togows.dbcls.jp/entry/uniprot/"+uniprot_id+".json"
    uniprot_json = pd.read_json(uniprot_url)
    canonic_len = uniprot_json.aalen[0]

    for prot in protein_ids:
        prot_lens = gene_table[gene_table["prot_id"] == prot]["length"].unique()
        if len(prot_lens) > 1:
            print("Error: more than one length for protein id: "+prot) #Sanity check

        #A protein whose length equals the canonical one is the canonical protein id
        if prot_lens[0] == canonic_len:
            canonic_protein[ens_gene] = prot
            break
    else:
        #The loop ended without a break: no length matched, use the longest protein
        print(ens_gene+": Proteins don't match the Uniprot canonic")
        no_canonic_len += 1
        canonic_protein[ens_gene] = find_longest_prot(protein_ids)

ENSG00000234669: Proteins don't match the Uniprot canonic
ENSG00000148516: Proteins don't match the Uniprot canonic
ENSG00000184937: Proteins don't match the Uniprot canonic
ENSG00000147124: Proteins don't match the Uniprot canonic
ENSG00000130544: Proteins don't match the Uniprot canonic
ENSG00000170954: Proteins don't match the Uniprot canonic
ENSG00000203326: Proteins don't match the Uniprot canonic
ENSG00000164011: Proteins don't match the Uniprot canonic
ENSG00000171606: Proteins don't match the Uniprot canonic
ENSG00000066827: Proteins don't match the Uniprot canonic
ENSG00000263310: uniprot wasn't found for ENSG00000263310
ENSG00000170325: Proteins don't match the Uniprot canonic
ENSG00000272602: uniprot wasn't found for ENSG00000272602
ENSG00000167840: Proteins don't match the Uniprot canonic
ENSG00000187607: uniprot wasn't found for ENSG00000187607
ENSG00000030419: Proteins don't match the Uniprot canonic
ENSG00000063587: Proteins don't match the Uniprot canonic
ENSG0000014104

In [9]:
#Persist the gene id -> canonical protein id dictionary as a pickle file
#NOTE(review): the output name says "zinc" although the loader cell currently
#reads Homeobox.csv - confirm this file is the intended target before running
pickle_path = my_path+'zinc_canonic_prot.pik'
with open(pickle_path, 'wb') as out_file:
    pickle.dump(canonic_protein, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
#Printouts from creating the canonic dictionary
ENSG00000234669: Proteins don't match the Uniprot canonic
ENSG00000148516: Proteins don't match the Uniprot canonic
ENSG00000184937: Proteins don't match the Uniprot canonic
ENSG00000147124: Proteins don't match the Uniprot canonic
ENSG00000130544: Proteins don't match the Uniprot canonic
ENSG00000170954: Proteins don't match the Uniprot canonic
ENSG00000203326: Proteins don't match the Uniprot canonic
ENSG00000164011: Proteins don't match the Uniprot canonic
ENSG00000171606: Proteins don't match the Uniprot canonic
ENSG00000066827: Proteins don't match the Uniprot canonic
ENSG00000263310: uniprot wasn't found for ENSG00000263310
ENSG00000170325: Proteins don't match the Uniprot canonic
ENSG00000272602: uniprot wasn't found for ENSG00000272602
ENSG00000167840: Proteins don't match the Uniprot canonic
ENSG00000187607: uniprot wasn't found for ENSG00000187607
ENSG00000030419: Proteins don't match the Uniprot canonic
ENSG00000063587: Proteins don't match the Uniprot canonic
ENSG00000141040: Proteins don't match the Uniprot canonic
ENSG00000196381: Proteins don't match the Uniprot canonic
ENSG00000089335: Proteins don't match the Uniprot canonic
ENSG00000256463: Proteins don't match the Uniprot canonic
ENSG00000124782: Proteins don't match the Uniprot canonic