In [1]:
#Import packages
import pandas as pd
import numpy as np
from biomart import BiomartServer
import json
import pickle
from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>");

In [2]:
#Return the Uniprot id for querying json entries in Uniprot, or "" if not found
def get_uniprot_id(ensembl_id):
    """Look up the Swiss-Prot accession Biomart associates with an Ensembl gene.

    The query is sent through the notebook-level ``ens_genes`` Biomart dataset
    handle, filtering on ``ensembl_gene_id`` and requesting only the
    ``uniprotswissprot`` attribute.

    Parameters
    ----------
    ensembl_id : str
        Ensembl gene id as passed to the ``ensembl_gene_id`` filter (the
        caller in this notebook strips the ".N" version suffix first).

    Returns
    -------
    str
        The first non-empty accession in the response, ASCII-encoded, or ""
        when the response has no rows or only blank rows.
    """
    #A single search is enough: a response without rows simply yields no lines,
    #so the former count() pre-query only doubled the number of Biomart requests
    response = ens_genes.search({
        'filters': {
                'ensembl_gene_id': ensembl_id
        },
        'attributes': [
                'uniprotswissprot'
        ]
    })

    #Reading the query result: skip blank rows so that an empty first row does
    #not hide an accession that appears on a later row
    for line in response.iter_lines():
        candidate = line.decode('utf-8').split("\t")[0].strip()
        if candidate:
            return candidate.encode('ascii', 'ignore')

    return ""

In [3]:
def find_longest_prot(prot_ids, table=None):
    """Return the protein id with the greatest "length" value.

    Parameters
    ----------
    prot_ids : iterable of str
        Candidate protein ids.
    table : pandas.DataFrame, optional
        Table with "prot" and "length" columns. When omitted, the
        notebook-level ``gene_table`` is used, which is how existing
        callers invoke this function.

    Returns
    -------
    str
        The id whose first recorded length is strictly the largest (on ties
        the earliest id in ``prot_ids`` wins), or "" when no id has a
        length greater than 0.
    """
    if table is None:
        #Backward-compatible fallback to the table set by the calling cell
        table = gene_table

    max_len = 0
    max_prot_id = ""
    for prot in prot_ids:
        prot_lens = table.loc[table["prot"] == prot, "length"].unique()
        if len(prot_lens) == 0:
            #No rows for this id in the table: nothing to compare
            continue
        if len(prot_lens) > 1:
            print("Error: more than one length for protein id: "+prot) #Sanity check
        if prot_lens[0] > max_len:
            max_len = prot_lens[0]
            max_prot_id = prot
    return max_prot_id

In [4]:
#Reading the domain data
#`!pwd` is an IPython shell capture: curr_dir is the list of output lines,
#so curr_dir[0] is the notebook's working directory
curr_dir = !pwd
pfam_version = "31"
#Input: one per-domain file per entry in this directory (read later with pd.read_csv, tab-separated)
in_path = curr_dir[0]+"/../3.parse_HMMER/hmm_domains/pfam-v"+pfam_version+"/"
#Output: where the per-domain canonic-protein pickles are written
#NOTE(review): nothing here creates this directory - presumably it must already exist; confirm
out_path = curr_dir[0]+"/domains_canonic_prot/pfam-v"+pfam_version+"/"
#File names in in_path, captured from `ls` (IPython expands $in_path from the Python variable)
domains_files = !ls $in_path

#Importing Biomart database
#The server URL points at the grch37 Ensembl Biomart; ens_genes is the dataset
#handle that get_uniprot_id() queries
server = BiomartServer("http://grch37.ensembl.org/biomart")
ens_genes = server.datasets[u'hsapiens_gene_ensembl']

In [6]:
%%time
print "Starting..."
for dom_filename in domains_files[:]:
    domain_data = pd.read_csv(in_path+dom_filename, sep='\t', index_col=0)
    canonic_protein = {}
    genes_ids = (domain_data["gene"]).unique()
    no_uniprot = 0
    no_canonic_len = 0
    
    for i in range(0,len(genes_ids)):
        ens_gene = genes_ids[i]
        gene_table = domain_data[domain_data["gene"] == ens_gene]
        protein_ids = (gene_table["prot"]).unique()

        if (len(protein_ids) == 1):
            #Saving the one protein id available
            canonic_protein[ens_gene] = protein_ids[0]

        #If there's more then one transcript: finding what's the canonic protein length from uniprot
        else:
            ens_gene_major_number = ens_gene[:ens_gene.find(".")]
            uniprot_id = get_uniprot_id(ens_gene_major_number)
            if (uniprot_id == ""):
                #print ens_gene+": uniprot wasn't found for "+ens_gene
                canonic_protein[ens_gene] = find_longest_prot(protein_ids)
                no_uniprot += 1
                continue
            uniprot_url = "http://togows.dbcls.jp/entry/uniprot/"+uniprot_id+".json"
            try:
                uniprot_json = pd.read_json(uniprot_url)
                canonic_len = uniprot_json.aalen[0]
            except:
                canonic_protein[ens_gene] = find_longest_prot(protein_ids)
                no_uniprot += 1
                continue

            found = False
            for prot in protein_ids:
                prot_lens = (gene_table[gene_table["prot"] == prot]["length"]).unique()
                if (len(prot_lens) > 1):
                    print "Error: more than one length for protein id: "+prot #Sanity check

                #If the length equal the canonical, this is the canonical protein id
                if (prot_lens[0] == canonic_len):
                    found = True
                    canonic_protein[ens_gene] = prot
                    break

            if (found == False):
                #print ens_gene+": Proteins don't match the Uniprot canonic"
                no_canonic_len += 1
                canonic_protein[ens_gene] = find_longest_prot(protein_ids)
                
    domain_sym = dom_filename[:dom_filename.find(".")]
    with open(out_path+domain_sym+"_canonic_prot.pik", 'wb') as handle:
        pickle.dump(canonic_protein, handle, protocol=pickle.HIGHEST_PROTOCOL)  
        
    print "Finished "+dom_filename

Starting...
Finished 14-3-3.csv
Finished 1-cysPrx_C.csv
Finished 23ISL.csv
Finished 2Fe-2S_thioredx.csv
Finished 2-Hacid_dh_C.csv
Finished 2-Hacid_dh.csv
Finished 2OG-FeII_Oxy_2.csv
Finished 2OG-FeII_Oxy_3.csv
Finished 2OG-FeII_Oxy_4.csv
Finished 2OG-FeII_Oxy.csv
Finished 2-oxoacid_dh.csv
Finished 2-oxogl_dehyd_N.csv
Finished 3Beta_HSD.csv
Finished 3-HAO.csv
Finished 3HCDH.csv
Finished 3HCDH_N.csv
Finished 3-PAP.csv
Finished 40S_S4_C.csv
Finished 40S_SA_C.csv
Finished 4_1_CTD.csv
Finished 4F5.csv
Finished 4HBT_2.csv
Finished 4HBT_3.csv
Finished 4HBT.csv
Finished 53-BP1_Tudor.csv
Finished 5-FTHF_cyc-lig.csv
Finished 5HT_transport_N.csv
Finished 5-nucleotidase.csv
Finished 5_nucleotid_C.csv
Finished 5_nucleotid.csv
Finished 60KD_IMP.csv
Finished 6PF2K.csv
Finished 6PGD.csv
Finished 7tm_1.csv
Finished 7tm_2.csv
Finished 7tm_3.csv
Finished 7tm_4.csv
Finished 7TM_GPCR_Srsx.csv
Finished 7TM_GPCR_Srv.csv
Finished 7TM_GPCR_Srw.csv
Finished 7TM_GPCR_Srx.csv
Finished A1_Propeptide.csv
Finished A

Finished ArsA_ATPase.csv
Finished ART.csv
Finished Arv1.csv
Finished Arylesterase.csv
Finished ASC.csv
Finished ASCH.csv
Finished ASD1.csv
Finished ASD2.csv
Finished ASF1_hist_chap.csv
Finished ASH.csv
Finished Ashwin.csv
Finished ASL_C2.csv
Finished Asn_synthase.csv
Finished Asparaginase_2.csv
Finished Asparaginase.csv
Finished Asp_Arg_Hydrox.csv
Finished Asp-B-Hydro_N.csv
Finished Asp.csv
Finished Asp_Glu_race_2.csv
Finished Asp_protease_2.csv
Finished Asp_protease.csv
Finished Astacin.csv
Finished AstE_AspA.csv
Finished ASXH.csv
Finished ATAD4.csv
Finished ATE_C.csv
Finished ATE_N.csv
Finished ATF7IP_BD.csv
Finished ATG101.csv
Finished ATG11.csv
Finished ATG13.csv
Finished Atg14.csv
Finished ATG16.csv
Finished ATG2_CAD.csv
Finished ATG7_N.csv
Finished Atg8.csv
Finished ATG_C.csv
Finished AT_hook.csv
Finished ATP11.csv
Finished ATP12.csv
Finished ATP1G1_PLM_MAT8.csv
Finished ATP_bind_1.csv
Finished ATP_bind_3.csv
Finished ATP_Ca_trans_C.csv
Finished ATP-cone.csv
Finished ATP-grasp_2.

Finished CCDC50_N.csv
Finished CCDC53.csv
Finished CCDC66.csv
Finished CCDC71L.csv
Finished CCDC73.csv
Finished CCDC74_C.csv
Finished CCDC84.csv
Finished CCDC85.csv
Finished CCDC92.csv
Finished CCER1.csv
Finished CCM2_C.csv
Finished CCSAP.csv
Finished CCSMST1.csv
Finished CD20.csv
Finished CD225.csv
Finished CD24.csv
Finished CD34_antigen.csv
Finished CD36.csv
Finished CD45.csv
Finished CD47.csv
Finished CD4-extracel.csv
Finished CD52.csv
Finished CD99L2.csv
Finished CDC24.csv
Finished CDC27.csv
Finished CDC37_C.csv
Finished CDC37_M.csv
Finished CDC37_N.csv
Finished CDC45.csv
Finished CDC48_2.csv
Finished CDC48_N.csv
Finished CDC50.csv
Finished Cdc6_C.csv
Finished CDC73_C.csv
Finished CDC73_N.csv
Finished CDI.csv
Finished CDK5_activator.csv
Finished CDKN3.csv
Finished CDO_I.csv
Finished CDP-OH_P_transf.csv
Finished CDRT4.csv
Finished CDT1_C.csv
Finished CDT1.csv
Finished CDV3.csv
Finished CEBP1_N.csv
Finished CEBP_ZZ.csv
Finished CECR6_TMEM121.csv
Finished cEGF.csv
Finished Cementoin.c

Finished DAGAT.csv
Finished DAGK_acc.csv
Finished DAGK_cat.csv
Finished DAG_kinase_N.csv
Finished Dak1.csv
Finished Dak2.csv
Finished Dala_Dala_lig_C.csv
Finished DALR_1.csv
Finished DAN.csv
Finished DAOA.csv
Finished DAO_C.csv
Finished DAO.csv
Finished DAP10.csv
Finished DAP3.csv
Finished DAP.csv
Finished Dapper.csv
Finished DARPP-32.csv
Finished DASH_Hsk3.csv
Finished Daxx.csv
Finished DAZAP2.csv
Finished DBB.csv
Finished DBC1.csv
Finished DBINO.csv
Finished DBP10CT.csv
Finished dbPDZ_assoc.csv
Finished DBR1.csv
Finished DCA16.csv
Finished DCAF15_WD40.csv
Finished DCAF17.csv
Finished DCB.csv
Finished Dcc1.csv
Finished dCMP_cyt_deam_1.csv
Finished DCP1.csv
Finished DCP2.csv
Finished DcpS_C.csv
Finished DcpS.csv
Finished DCR.csv
Finished DC_STAMP.csv
Finished DCX.csv
Finished DDA1.csv
Finished DDDD.csv
Finished DDE_1.csv
Finished dDENN.csv
Finished DDE_Tnp_1_7.csv
Finished DDE_Tnp_4.csv
Finished DDHD.csv
Finished DDOST_48kD.csv
Finished DDRGK.csv
Finished DDT.csv
Finished DEAD_2.csv
Fi

Finished DUF4392.csv
Finished DUF4414.csv
Finished DUF4430.csv
Finished DUF4440.csv
Finished DUF4455.csv
Finished DUF4456.csv
Finished DUF4457.csv
Finished DUF4460.csv
Finished DUF4461.csv
Finished DUF4464.csv
Finished DUF4470.csv
Finished DUF4471.csv
Finished DUF4472.csv
Finished DUF4476.csv
Finished DUF4477.csv
Finished DUF4481.csv
Finished DUF4482.csv
Finished DUF4483.csv
Finished DUF4485.csv
Finished DUF4486.csv
Finished DUF4487.csv
Finished DUF4490.csv
Finished DUF4495.csv
Finished DUF4496.csv
Finished DUF4497.csv
Finished DUF4498.csv
Finished DUF4499.csv
Finished DUF4500.csv
Finished DUF4501.csv
Finished DUF4502.csv
Finished DUF4503.csv
Finished DUF4504.csv
Finished DUF4505.csv
Finished DUF4506.csv
Finished DUF4507.csv
Finished DUF4508.csv
Finished DUF4509.csv
Finished DUF4510.csv
Finished DUF4512.csv
Finished DUF4513.csv
Finished DUF4514.csv
Finished DUF4515.csv
Finished DUF4516.csv
Finished DUF4517.csv
Finished DUF4518.csv
Finished DUF4519.csv
Finished DUF4520.csv
Finished DUF4

Finished ELYS-bb.csv
Finished ELYS.csv
Finished EMG1.csv
Finished EMI.csv
Finished EMP24_GP25L.csv
Finished EMP70.csv
Finished Enamelin.csv
Finished EndIII_4Fe-2S.csv
Finished Endomucin.csv
Finished Endonuclease_5.csv
Finished Endonuclease_NS.csv
Finished Endostatin.csv
Finished Endosulfine.csv
Finished Endothelin.csv
Finished Engrail_1_C_sig.csv
Finished Enkurin.csv
Finished Enolase_C.csv
Finished Enolase_N.csv
Finished ENT.csv
Finished ENTH.csv
Finished EnY2.csv
Finished EP400_N.csv
Finished E_Pc_C.csv
Finished Ependymin.csv
Finished EphA2_TM.csv
Finished Ephrin.csv
Finished Ephrin_lbd.csv
Finished Ephrin_rec_like.csv
Finished Epiglycanin_C.csv
Finished Epiglycanin_TR.csv
Finished Epimerase_2.csv
Finished Epimerase.csv
Finished EPL1.csv
Finished EpoR_lig-bind.csv
Finished EPO_TPO.csv
Finished EPTP.csv
Finished ERAP1_C.csv
Finished ERbeta_N.csv
Finished ERCC3_RAD25_C.csv
Finished ERCC4.csv
Finished ER.csv
Finished eRF1_1.csv
Finished eRF1_2.csv
Finished eRF1_3.csv
Finished Erf4.csv
Fi

Finished GATase_7.csv
Finished GATase.csv
Finished GatB_N.csv
Finished GatB_Yqey.csv
Finished GAT.csv
Finished Gate.csv
Finished Gb3_synth.csv
Finished GBP_C.csv
Finished GBP.csv
Finished Gcd10p.csv
Finished GCD14.csv
Finished GCFC.csv
Finished GCIP.csv
Finished GCM.csv
Finished GCN5L1.csv
Finished GCOM2.csv
Finished GCR.csv
Finished GCS.csv
Finished GCSF.csv
Finished GCV_H.csv
Finished GCV_T_C.csv
Finished GCV_T.csv
Finished GDA1_CD39.csv
Finished GDC-P.csv
Finished GDE_C.csv
Finished GDI.csv
Finished GDNF.csv
Finished GDPD.csv
Finished GDP_Man_Dehyd.csv
Finished GDWWSH.csv
Finished Ge1_WD40.csv
Finished GED.csv
Finished Gelsolin.csv
Finished Gemin6.csv
Finished Gemin7.csv
Finished GEMIN8.csv
Finished Geminin.csv
Finished GFA.csv
Finished GFO_IDH_MocA.csv
Finished GF_recep_IV.csv
Finished GFRP.csv
Finished GGACT.csv
Finished G-gamma.csv
Finished G_glu_transpept.csv
Finished GGN.csv
Finished GH3.csv
Finished GHBP.csv
Finished GHMP_kinases_C.csv
Finished GHMP_kinases_N.csv
Finished GIDA

Finished HS1_rep.csv
Finished hSac2.csv
Finished HSA.csv
Finished HSBP1.csv
Finished HSCB_C.csv
Finished HSD3.csv
Finished HSF_DNA-bind.csv
Finished hSH3.csv
Finished HSL_N.csv
Finished HSP20.csv
Finished HSP70.csv
Finished HSP90.csv
Finished HSR.csv
Finished HTH_11.csv
Finished HTH_23.csv
Finished HTH_3.csv
Finished HTH_40.csv
Finished HTH_44.csv
Finished HTH_9.csv
Finished HTH_psq.csv
Finished HTH_Tnp_Tc5.csv
Finished Humanin.csv
Finished HUN.csv
Finished Hus1.csv
Finished HVSL.csv
Finished Hyccin.csv
Finished Hydant_A_N.csv
Finished Hydantoinase_A.csv
Finished Hydantoinase_B.csv
Finished Hydin_ADK.csv
Finished Hydrolase_6.csv
Finished Hydrolase.csv
Finished Hydrolase_like.csv
Finished Hyd_WA.csv
Finished HYLS1_C.csv
Finished HYR.csv
Finished IATP.csv
Finished IBB.csv
Finished IBN_N.csv
Finished IBR.csv
Finished ICA69.csv
Finished ICAM_N.csv
Finished ICAP-1_inte_bdg.csv
Finished ICAT.csv
Finished ICMT.csv
Finished IDO.csv
Finished IER.csv
Finished IF-2B.csv
Finished IF-2.csv
Finished

Finished LRR_9.csv
Finished LRRC37AB_C.csv
Finished LRRC37.csv
Finished LRRCT.csv
Finished LRRFIP.csv
Finished LRRNT.csv
Finished LSM14.csv
Finished LsmAD.csv
Finished LSM.csv
Finished LSM_int_assoc.csv
Finished Lsm_interact.csv
Finished LSR.csv
Finished LST1.csv
Finished LTD.csv
Finished LTV.csv
Finished LUC7.csv
Finished Lung_7-TM_R.csv
Finished LURAP.csv
Finished Ly49.csv
Finished Lyase_1.csv
Finished Lyase_aromatic.csv
Finished LYRIC.csv
Finished Lys.csv
Finished LysM.csv
Finished Lysyl_oxidase.csv
Finished M16C_assoc.csv
Finished M20_dimer.csv
Finished M60-like_N.csv
Finished MA3.csv
Finished Mab-21.csv
Finished Macoilin.csv
Finished MACPF.csv
Finished Macro.csv
Finished Macscav_rec.csv
Finished Mad3_BUB1_I.csv
Finished MAD.csv
Finished Maelstrom.csv
Finished Maf1.csv
Finished MafB19-deam.csv
Finished Maf.csv
Finished Maf_N.csv
Finished MAGE.csv
Finished MAGE_N.csv
Finished MAGI_u1.csv
Finished MAGI_u5.csv
Finished Mago-bind.csv
Finished Mago_nashi.csv
Finished MAGP.csv
Finished M

Finished Myb_DNA-binding.csv
Finished MYCBPAP.csv
Finished Myc-LZ.csv
Finished Myc_N.csv
Finished Myc_target_1.csv
Finished Myelin_MBP.csv
Finished Myelin_PLP.csv
Finished Myelin-PO_C.csv
Finished MYEOV2.csv
Finished Myf5.csv
Finished MYO10_CC.csv
Finished Myosin_head.csv
Finished Myosin_N.csv
Finished Myosin_tail_1.csv
Finished Myosin_TH1.csv
Finished Myosin-VI_CBD.csv
Finished Myotub-related.csv
Finished MYT1.csv
Finished MyTH4.csv
Finished N1221.csv
Finished N2227.csv
Finished N6-adenineMlase.csv
Finished NAAA-beta.csv
Finished Nab1.csv
Finished Na_Ca_ex_C.csv
Finished Na_Ca_ex.csv
Finished NAC.csv
Finished NACHT.csv
Finished NAD_binding_10.csv
Finished NAD_binding_11.csv
Finished NAD_binding_1.csv
Finished NAD_binding_2.csv
Finished NAD_binding_3.csv
Finished NAD_binding_4.csv
Finished NAD_binding_5.csv
Finished NAD_binding_6.csv
Finished NAD_binding_8.csv
Finished NAD_Gly3P_dh_C.csv
Finished NAD_Gly3P_dh_N.csv
Finished NADH_4Fe-4S.csv
Finished NADH_B2.csv
Finished NADHdh_A3.csv
Fi

Finished PAC1.csv
Finished PAC2.csv
Finished PAC3.csv
Finished PAC4.csv
Finished Pacs-1.csv
Finished PA.csv
Finished PACT_coil_coil.csv
Finished PAD.csv
Finished PAD_M.csv
Finished PAD_N.csv
Finished PADR1.csv
Finished PAE.csv
Finished Paf1.csv
Finished Paf67.csv
Finished PAF-AH_p_II.csv
Finished PAF.csv
Finished PAG.csv
Finished PAH.csv
Finished PALB2_WD40.csv
Finished Palm_thioest.csv
Finished PALP.csv
Finished Pam16.csv
Finished PAM2.csv
Finished PAN_1.csv
Finished PAN_4.csv
Finished Pannexin_like.csv
Finished PAP2_C.csv
Finished PAP2.csv
Finished PAPA-1.csv
Finished PAP_assoc.csv
Finished PAP_central.csv
Finished PapD-like.csv
Finished Papilin_u7.csv
Finished PAP_RNA-bind.csv
Finished PAPS_reduct.csv
Finished ParA.csv
Finished Paralemmin.csv
Finished Parathyroid.csv
Finished ParBc.csv
Finished ParcG.csv
Finished PARG_cat.csv
Finished PARM.csv
Finished PARP.csv
Finished PARP_reg.csv
Finished PAS_11.csv
Finished PAS_3.csv
Finished PAS_4.csv
Finished PAS_8.csv
Finished PAS_9.csv
Finis

Finished P_proprotein.csv
Finished PPTA.csv
Finished PQ-loop.csv
Finished PQQ_2.csv
Finished PQQ_3.csv
Finished PRA1.csv
Finished PRAP.csv
Finished PRAS.csv
Finished Pr_beta_C.csv
Finished PRCC.csv
Finished Prefoldin_2.csv
Finished Prefoldin_3.csv
Finished Prefoldin.csv
Finished PRELI.csv
Finished Prenylcys_lyase.csv
Finished Prenyltrans.csv
Finished Prenyltransf.csv
Finished Presenilin.csv
Finished Preseq_ALAS.csv
Finished Pre-SET.csv
Finished Pribosyl_synth.csv
Finished Pribosyltran.csv
Finished Pribosyltran_N.csv
Finished PRIMA1.csv
Finished Prion_bPrPp.csv
Finished Prion.csv
Finished Prion_octapep.csv
Finished PRKCSH_1.csv
Finished PRKCSH.csv
Finished PRKCSH-like.csv
Finished PRK.csv
Finished PRKG1_interact.csv
Finished PRMT5_C.csv
Finished PRMT5.csv
Finished PRMT5_TIM.csv
Finished PRNT.csv
Finished PRO8NT.csv
Finished PROCN.csv
Finished PROCT.csv
Finished Pro_dh.csv
Finished Profilin.csv
Finished Prog_receptor.csv
Finished Pro_isomerase.csv
Finished Prokineticin.csv
Finished Prok-

Finished Ribosomal_L34.csv
Finished Ribosomal_L34e.csv
Finished Ribosomal_L35Ae.csv
Finished Ribosomal_L35p.csv
Finished Ribosomal_L36.csv
Finished Ribosomal_L36e.csv
Finished Ribosomal_L37ae.csv
Finished Ribosomal_L37e.csv
Finished Ribosomal_L38e.csv
Finished Ribosomal_L39.csv
Finished Ribosomal_L3.csv
Finished Ribosomal_L40e.csv
Finished Ribosomal_L41.csv
Finished Ribosomal_L44.csv
Finished Ribosomal_L4.csv
Finished Ribosomal_L50.csv
Finished Ribosomal_L5_C.csv
Finished Ribosomal_L5.csv
Finished Ribosomal_L5e.csv
Finished Ribosomal_L6.csv
Finished Ribosomal_L6e.csv
Finished Ribosomal_L6e_N.csv
Finished Ribosomal_L7Ae.csv
Finished Ribosomal_L9_N.csv
Finished Ribosomal_S10.csv
Finished Ribosomal_S11.csv
Finished Ribosomal_S13.csv
Finished Ribosomal_S13_N.csv
Finished Ribosomal_S14.csv
Finished Ribosomal_S15.csv
Finished Ribosomal_S16.csv
Finished Ribosomal_S17.csv
Finished Ribosomal_S17e.csv
Finished Ribosomal_S17_N.csv
Finished Ribosomal_S18.csv
Finished Ribosomal_S19.csv
Finished Rib

Finished Sgf11.csv
Finished SGIII.csv
Finished SGL.csv
Finished SGS.csv
Finished SGT1.csv
Finished SGTA_dimer.csv
Finished SH2_2.csv
Finished SH2.csv
Finished SH3_1.csv
Finished SH3_2.csv
Finished SH3_3.csv
Finished SH3_4.csv
Finished SH3_9.csv
Finished SH3BGR.csv
Finished SH3BP5.csv
Finished SH3-RhoG_link.csv
Finished SH3-WW_linker.csv
Finished Shadoo.csv
Finished Shal-type.csv
Finished Sharpin_PH.csv
Finished Shikimate_DH.csv
Finished SHIPPO-rpt.csv
Finished Shisa.csv
Finished ShK.csv
Finished SHMT.csv
Finished SHNi-TPR.csv
Finished SHQ1.csv
Finished SHR-BD.csv
Finished SHS2_Rpb7-N.csv
Finished Shugoshin_C.csv
Finished Shugoshin_N.csv
Finished Siah-Interact_N.csv
Finished SID-1_RNA_chan.csv
Finished Sigma54_activat.csv
Finished SIKE.csv
Finished SIM_C.csv
Finished SIMPL.csv
Finished SIN1.csv
Finished Sin3a_C.csv
Finished Sin3_corepress.csv
Finished Sina.csv
Finished Sin_N.csv
Finished SIP1.csv
Finished SIR2_2.csv
Finished SIR2.csv
Finished SIS.csv
Finished SIT.csv
Finished Siva.csv
F

Finished Tet_JBP.csv
Finished Tetraspannin.csv
Finished TEX12.csv
Finished TEX13.csv
Finished TEX15.csv
Finished TEX19.csv
Finished TEX29.csv
Finished TEX33.csv
Finished Tex_N.csv
Finished Tex_YqgF.csv
Finished TF_AP-2.csv
Finished Tfb2.csv
Finished Tfb4.csv
Finished Tfb5.csv
Finished TFCD_C.csv
Finished TFIIA.csv
Finished TFIIA_gamma_C.csv
Finished TFIIA_gamma_N.csv
Finished TFIIB.csv
Finished TFIID-18kDa.csv
Finished TFIID_20kDa.csv
Finished TFIID_30kDa.csv
Finished TFIID-31kDa.csv
Finished TFIID_NTD2.csv
Finished TFIIE-A_C.csv
Finished TFIIE_alpha.csv
Finished TFIIE_beta.csv
Finished TFIIF_alpha.csv
Finished TFIIF_beta.csv
Finished TFIIIC_delta.csv
Finished TFIIIC_sub6.csv
Finished TFIIS_C.csv
Finished TFIIS_M.csv
Finished TF_Otx.csv
Finished TFR_dimer.csv
Finished TF_Zn_Ribbon.csv
Finished TGF_beta.csv
Finished TGF_beta_GS.csv
Finished TGFb_propeptide.csv
Finished TGS.csv
Finished TGT.csv
Finished TH1.csv
Finished THAP.csv
Finished THEG4.csv
Finished THEG.csv
Finished THF_DHG_CYH_C

Finished Ubiq_cyt_C_chap.csv
Finished Ubiq-Cytc-red_N.csv
Finished Ubiquitin_2.csv
Finished Ubiquitin_3.csv
Finished ubiquitin.csv
Finished UBN_AB.csv
Finished U-box.csv
Finished UBX.csv
Finished UBZ_FAAP20.csv
Finished UCH_1.csv
Finished UCH.csv
Finished UCH_N.csv
Finished UCMA.csv
Finished UCR_14kD.csv
Finished UCR_6-4kD.csv
Finished UCR_hinge.csv
Finished UcrQ.csv
Finished UCR_TM.csv
Finished UCR_UQCRX_QCR9.csv
Finished uDENN.csv
Finished UDG.csv
Finished UDP-g_GGTase.csv
Finished UDPG_MGDP_dh_C.csv
Finished UDPG_MGDP_dh.csv
Finished UDPG_MGDP_dh_N.csv
Finished UDPGP.csv
Finished UDPGT.csv
Finished UEV.csv
Finished UFC1.csv
Finished UFD1.csv
Finished Ufd2P_core.csv
Finished Ufm1.csv
Finished UIM.csv
Finished ULD.csv
Finished UME.csv
Finished UMP1.csv
Finished UMPH-1.csv
Finished UnbV_ASPIC.csv
Finished UNC119_bdg.csv
Finished UNC45-central.csv
Finished UNC-50.csv
Finished UNC-79.csv
Finished UNC80.csv
Finished UNC-93.csv
Finished UN_NPL4.csv
Finished UPA.csv
Finished UPAR_LY6.csv
Fi

Finished zf-TRAF.csv
Finished zf-trcl.csv
Finished zf-TRM13_CCCH.csv
Finished zf-U11-48K.csv
Finished zf-U1.csv
Finished zf-UBP.csv
Finished zf-UBR.csv
Finished Zfx_Zfy_act.csv
Finished ZFYVE21_C.csv
Finished zf-ZPR1.csv
Finished zinc_ribbon_10.csv
Finished zinc_ribbon_16.csv
Finished zinc_ribbon_2.csv
Finished zinc_ribbon_6.csv
Finished zinc_ribbon_9.csv
Finished Zip.csv
Finished Zn_dep_PLPC.csv
Finished zn-ribbon_14.csv
Finished Zn_ribbon_17.csv
Finished Zona_pellucida.csv
Finished ZU5.csv
Finished Zw10.csv
Finished Zwilch.csv
Finished Zwint.csv
Finished ZZ.csv
CPU times: user 13min 45s, sys: 39.2 s, total: 14min 24s
Wall time: 12h 58min 50s


### Draft - old code kept for reference only - do not run

In [36]:
#Create a dictionary of the canonical transcript ids and protein ids
#DRAFT: single-domain setup. It reads `domain_data`, which this cell does not
#define - it relies on whatever value is left in the kernel (hidden state).
#canonic_transcript = {}
canonic_protein = {}
genes_ids = (domain_data["gene"]).unique()
#Counters for the two fallback cases handled in the loop cell below
no_uniprot = 0
no_canonic_len = 0

In [37]:
#DRAFT: single-domain version of the per-domain loop above, kept for reference.
#Unlike that loop, the Uniprot fetch here is not wrapped in try/except, so a
#failing pd.read_json call stops the cell, and the fallback cases are printed.
for i in range(0,len(genes_ids)):
    ens_gene = genes_ids[i]
    #gene_table is read by find_longest_prot() as a notebook-level name
    gene_table = domain_data[domain_data["gene"] == ens_gene]
    protein_ids = (gene_table["prot"]).unique()

    if (len(protein_ids) == 1):
        #Saving the one protein id available
        canonic_protein[ens_gene] = protein_ids[0]

    #If there's more than one transcript: finding what's the canonic protein length from uniprot
    else:
        #Keeps the part of the gene id before the first "."
        #NOTE(review): if the id has no ".", find() returns -1 and the last character is dropped
        ens_gene_major_number = ens_gene[:ens_gene.find(".")]
        uniprot_id = get_uniprot_id(ens_gene_major_number)
        if (uniprot_id == ""):
            print ens_gene+": uniprot wasn't found for "+ens_gene
            #Fallback: longest protein of this gene
            canonic_protein[ens_gene] = find_longest_prot(protein_ids)
            no_uniprot += 1
            continue
        uniprot_url = "http://togows.dbcls.jp/entry/uniprot/"+uniprot_id+".json"
        uniprot_json = pd.read_json(uniprot_url)
        #First "aalen" value of the fetched entry is used as the canonical length
        canonic_len = uniprot_json.aalen[0]

        found = False
        for prot in protein_ids:
            prot_lens = (gene_table[gene_table["prot"] == prot]["length"]).unique()
            if (len(prot_lens) > 1):
                print "Error: more than one length for protein id: "+prot #Sanity check

            #If the length equal the canonical, this is the canonical protein id
            if (prot_lens[0] == canonic_len):
                found = True
                canonic_protein[ens_gene] = prot
                break

        if (found == False):
            print ens_gene+": Proteins don't match the Uniprot canonic"
            no_canonic_len += 1
            #Fallback: longest protein of this gene
            canonic_protein[ens_gene] = find_longest_prot(protein_ids)

ENSG00000167840.9: Proteins don't match the Uniprot canonic
ENSG00000105261.5: uniprot wasn't found for ENSG00000105261.5
ENSG00000170325.10: Proteins don't match the Uniprot canonic
ENSG00000147124.8: Proteins don't match the Uniprot canonic
ENSG00000171606.13: Proteins don't match the Uniprot canonic
ENSG00000184937.8: Proteins don't match the Uniprot canonic
ENSG00000196381.6: Proteins don't match the Uniprot canonic
ENSG00000164011.13: Proteins don't match the Uniprot canonic
ENSG00000204644.5: Proteins don't match the Uniprot canonic
ENSG00000206510.5: Proteins don't match the Uniprot canonic
ENSG00000141040.10: Proteins don't match the Uniprot canonic
ENSG00000227124.4: Proteins don't match the Uniprot canonic
ENSG00000089335.16: Proteins don't match the Uniprot canonic
ENSG00000234669.3: Proteins don't match the Uniprot canonic
ENSG00000223858.3: Proteins don't match the Uniprot canonic
ENSG00000187607.11: uniprot wasn't found for ENSG00000187607.11
ENSG00000223852.3: Proteins d

In [38]:
#DRAFT: save the single-domain dictionary built by the loop cell above.
#NOTE(review): `domain` is not defined anywhere in the visible notebook -
#presumably a domain name set in a since-removed cell; confirm before reuse.
with open(out_path+domain+"_canonic_prot.pik", 'wb') as handle:
    pickle.dump(canonic_protein, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
#Printouts from creating the canonic dictionary for Zinc finger domain
ENSG00000167840.9: Proteins don't match the Uniprot canonic
ENSG00000105261.5: uniprot wasn't found for ENSG00000105261.5
ENSG00000170325.10: Proteins don't match the Uniprot canonic
ENSG00000147124.8: Proteins don't match the Uniprot canonic
ENSG00000171606.13: Proteins don't match the Uniprot canonic
ENSG00000184937.8: Proteins don't match the Uniprot canonic
ENSG00000196381.6: Proteins don't match the Uniprot canonic
ENSG00000164011.13: Proteins don't match the Uniprot canonic
ENSG00000204644.5: Proteins don't match the Uniprot canonic
ENSG00000206510.5: Proteins don't match the Uniprot canonic
ENSG00000141040.10: Proteins don't match the Uniprot canonic
ENSG00000227124.4: Proteins don't match the Uniprot canonic
ENSG00000089335.16: Proteins don't match the Uniprot canonic
ENSG00000234669.3: Proteins don't match the Uniprot canonic
ENSG00000223858.3: Proteins don't match the Uniprot canonic
ENSG00000187607.11: uniprot wasn't found for ENSG00000187607.11
ENSG00000223852.3: Proteins don't match the Uniprot canonic
ENSG00000232099.3: Proteins don't match the Uniprot canonic
ENSG00000226858.3: Proteins don't match the Uniprot canonic
ENSG00000169955.6: Proteins don't match the Uniprot canonic
ENSG00000255073.4: uniprot wasn't found for ENSG00000255073.4
ENSG00000263310.1: uniprot wasn't found for ENSG00000263310.1
ENSG00000272602.1: uniprot wasn't found for ENSG00000272602.1
ENSG00000272580.1: uniprot wasn't found for ENSG00000272580.1

In [None]:
#Printouts from creating the canonic dictionary for Homeobox domain
ENSG00000006377.9: Proteins don't match the Uniprot canonic
ENSG00000257923.5: Proteins don't match the Uniprot canonic
ENSG00000106331.10: Proteins don't match the Uniprot canonic
ENSG00000109072.9: Proteins don't match the Uniprot canonic
ENSG00000258389.2: uniprot wasn't found for ENSG00000258389.2

In [None]:
#Printouts from creating the canonic dictionary for WW domain
ENSG00000081026.14: Proteins don't match the Uniprot canonic
ENSG00000151276.19: Proteins don't match the Uniprot canonic
ENSG00000196504.11: Proteins don't match the Uniprot canonic

In [None]:
#Printouts from creating the canonic dictionary for PUF domain

In [None]:
#Printouts from creating the canonic dictionary for SH3_1 domain
ENSG00000008735.10: Proteins don't match the Uniprot canonic
ENSG00000163486.8: uniprot wasn't found for ENSG00000163486.8
ENSG00000143514.12: Proteins don't match the Uniprot canonic
ENSG00000188747.4: Proteins don't match the Uniprot canonic
ENSG00000266028.3: uniprot wasn't found for ENSG00000266028.3