## Map domains' gene IDs to canonical protein sequences

In [1]:
import pandas as pd
import numpy as np
import math
from collections import defaultdict
import pickle
import sys
import subprocess

sys.path.append('/home/anat/Research/ExAC/5.HMM_alter_align') 
from calc_exac_freq_func import codon_table, retrieve_codon_seq
from mapping_func import create_exon_pos_table

In [8]:
curr_dir = !pwd
pfam_version = "32"
domains_th = "10"
TEST_PROCCESSED_DOMAINS = False

#Read the list of domains
if (TEST_PROCCESSED_DOMAINS):
    with open(curr_dir[0]+"/../13.Process_domains_not_in_training/processed_domains_not_in_pipeline_final_list.pik", 'rb') as handle:
        filtered_domains_list = pickle.load(handle)
else:
    if (pfam_version=="32"):
        with open(curr_dir[0]+"/../5.domains_stats/pfam-v"+pfam_version+"/human_domains_list.pik", 'rb') as handle:
            filtered_domains_list = pickle.load(handle)
    else:
        with open(curr_dir[0]+"/../5.domains_stats/pfam-v"+pfam_version+"/filtered"+domains_th+"_list.pik", 'rb') as handle:
            filtered_domains_list = pickle.load(handle)
filtered_domains_list.sort()

#Read the substitutions table (for the exons translation)
with open("/home/anat/Research/ExAC/9.Features_exploration/codon_ns_table.pik", 'rb') as handle:
    codon_ns_table = pickle.load(handle)
    
chromosome_names = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y"]
len(filtered_domains_list)

6503

In [3]:
def reverse_complement(seq):
    """
    Given a DNA sequence, return the reverse-complement of that sequence.

    The returned string is always upper-case, whatever the case of the input.
    A, C, G and T are replaced by their pairing base. Any other character
    (e.g. the ambiguity code 'N') is returned as 'N', instead of being
    silently converted to 'G'.
    """

    #Base pairing table; anything that is not listed here is treated as unknown
    pairing = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}

    #Walking the sequence backwards and complementing each base on the way
    return ''.join([pairing.get(base.upper(), 'N') for base in reversed(seq)])

In [4]:
def retrieve_exon_seq(exon_start, exon_end, chrom):
    """
    Retrieve the exon sequence from the ref sequence, according to exons start and end positions.

    exon_start: exon start position; converted to int and decremented by 1 before
                it is passed to twoBitToFa as -start
                (presumably a 1-based start turned into a 0-based one - TODO confirm)
    exon_end:   exon end position, passed to twoBitToFa as -end unchanged
    chrom:      chromosome name without the "chr" prefix (the prefix is added here)

    Returns the sequence as one upper-case string, without the header line
    and without any whitespace.
    Raises subprocess.CalledProcessError if twoBitToFa exits with a non-zero status.
    """
    chromosome_name = "chr"+chrom
    seq_start = int(exon_start) - 1

    #Calling hg19.2bit to retrieve the DNA sequence.
    #The arguments are passed as a list (no shell), so no value is ever interpreted by a shell.
    #The tool and the 2bit file paths are relative to the current working directory.
    command = ["../5.HMM_alter_align/twoBitToFa",
               "../5.HMM_alter_align/hg19.2bit",
               "stdout",
               "-seq="+chromosome_name,
               "-start="+str(seq_start),
               "-end="+str(exon_end)]
    query = subprocess.check_output(command)

    #check_output returns bytes on Python 3; decode so the join below works on both versions
    if (not isinstance(query, str)):
        query = query.decode()

    query = ''.join(query.split()[1:]) #Remove 1st line, whitespaces and newlines

    return query.upper()

In [5]:
def exons_translate_to_prot(exon_table, chrom_raw_data, chrom):
    """
    Build the DNA sequence of all the exons in exon_table and translate it to a protein string.

    exon_table:     table iterated row by row; each row supplies "start_pos" and "end_pos"
    chrom_raw_data: raw chromosome description string; when it contains "complement",
                    every exon sequence is reverse-complemented before it is appended
    chrom:          chromosome name, forwarded to retrieve_exon_seq

    Returns one string with the codon_table value of every complete codon.
    Trailing bases that do not fill a whole codon are ignored.
    """

    #The strand indication is the same for all the exons
    is_complement = chrom_raw_data.find("complement") >= 0

    #Collect the exons dna sequences, in the table order
    exon_seqs = []
    for _, row in exon_table.iterrows():
        seq = retrieve_exon_seq(row["start_pos"], row["end_pos"], chrom)
        if (is_complement):
            seq = reverse_complement(seq)
        exon_seqs.append(seq)
    coding_dna = ''.join(exon_seqs)

    #Translate codon by codon, stopping before an incomplete codon at the end
    full_codons_len = len(coding_dna) - (len(coding_dna) % 3)
    amino_acids = [codon_table[coding_dna[pos:pos+3]] for pos in range(0, full_codons_len, 3)]

    return ''.join(amino_acids)

### One dictionary for all domains (no gene duplication)

In [7]:
%%time
#gene_dict[gene][prot_id] -> the protein sequence translated from the exons.
#One dictionary is shared by all the domains, so a gene that appears in several
#domains is translated only once.
gene_dict = defaultdict(dict)

#These folders do not depend on the domain, so they are built once
in_path = curr_dir[0]+"/hmm_domains/pfam-v"+pfam_version+"/"
canonic_prot_path = curr_dir[0]+"/../4.parse_Uniprot/domains_canonic_prot/pfam-v"+pfam_version+"/"

for domain_name in filtered_domains_list:

    filename = domain_name+".csv"
    domain_data = pd.read_csv(in_path+filename, sep='\t', index_col=0, dtype={"chrom_num": str})

    #Sort the domain data
    sorted_domain_data = domain_data.sort_values(by=["chrom_num", "gene", "TargetStart"])
    sorted_domain_data = sorted_domain_data.reset_index(drop=True)

    #Get the canonic protein ids file for the domain
    with open(canonic_prot_path+domain_name+"_canonic_prot.pik", 'rb') as handle:
        canonic_protein = pickle.load(handle)

    for gene in sorted_domain_data["gene"].unique():

        prot_id = canonic_protein[gene]
        #Already translated while processing a previous domain
        #("in" replaces dict.has_key, which does not exist in Python 3)
        if (gene in gene_dict and prot_id in gene_dict[gene]):
            continue

        #All the rows of this gene, selected once and reused below
        gene_rows = sorted_domain_data[sorted_domain_data["gene"] == gene]

        #Get the exons sequence file for the protein
        chrom = gene_rows["chrom_num"].unique()[0]
        if (chrom not in chromosome_names):
            continue
        #NOTE(review): exons_file is not used below; the read only verifies that the file exists and parses
        exons_file = pd.read_csv(curr_dir[0]+"/from_shilpa/exons_seqs/"+chrom+"/"+gene+"/"+prot_id+".exons.txt", skiprows=1, header=None, names=["pos", "exon_seq"], sep='\t')

        #Get the chrom raw data.
        #The protein mask is built on gene_rows itself, so pandas does not need to reindex it
        chrom_raw_data = gene_rows[gene_rows["#TargetID"] == prot_id]["chromosome"].unique() #there should be only one element here
        if (len(chrom_raw_data) > 1):
            print(" Error: "+gene+": more than one chromosome raw data") #sanity check
        chrom_raw_data = chrom_raw_data[0]

        #Create exons table
        exon_table = create_exon_pos_table(chrom_raw_data, prot_id) #exons frameshifts are also fixed here!

        #Translate all the exons dna sequences to one protein sequence
        prot_seq = exons_translate_to_prot(exon_table, chrom_raw_data, chrom)
        gene_dict[gene][prot_id] = prot_seq

    print("Finished domain "+str(domain_name))

#Saving one dictionary for all the domains together
if (TEST_PROCCESSED_DOMAINS):
    out_filename = "processed_domains_genes_prot_seq.pik"
else:
    out_filename = "all_domains_genes_prot_seq.pik"
with open(curr_dir[0]+"/canonic_prot_seq/pfam-v"+pfam_version+"/"+out_filename, 'wb') as handle:
    pickle.dump(gene_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    

Finished domain 1-cysPrx_C
Finished domain 14-3-3
Finished domain 2-Hacid_dh
Finished domain 2-Hacid_dh_C
Finished domain 2-oxoacid_dh
Finished domain 2-oxogl_dehyd_N
Finished domain 23ISL
Finished domain 2Fe-2S_thioredx
Finished domain 2OG-FeII_Oxy
Finished domain 2OG-FeII_Oxy_2
Finished domain 2OG-FeII_Oxy_3
Finished domain 2OG-FeII_Oxy_4
Finished domain 3-HAO
Finished domain 3-PAP
Finished domain 3Beta_HSD
Finished domain 3HCDH
Finished domain 3HCDH_N
Finished domain 40S_S4_C
Finished domain 40S_SA_C
Finished domain 4F5
Finished domain 4HBT
Finished domain 4HBT_2
Finished domain 4HBT_3
Finished domain 4_1_CTD
Finished domain 5-FTHF_cyc-lig
Finished domain 5-nucleotidase
Finished domain 53-BP1_Tudor
Finished domain 5HT_transport_N
Finished domain 5_nucleotid
Finished domain 5_nucleotid_C
Finished domain 60KD_IMP
Finished domain 6PF2K
Finished domain 6PGD
Finished domain 7TM_GPCR_Srsx
Finished domain 7TM_GPCR_Srv
Finished domain 7TM_GPCR_Srw
Finished domain 7TM_GPCR_Srx
Finished domai

Finished domain CABS1
Finished domain CAC1F_C
Finished domain CAF-1_p150
Finished domain CAF-1_p60_C
Finished domain CAF1
Finished domain CAF1-p150_C2
Finished domain CAF1-p150_N
Finished domain CAF1A
Finished domain CAF1C_H4-bd
Finished domain CAGE1
Finished domain CALCOCO1
Finished domain CALM_bind
Finished domain CAML
Finished domain CAMSAP_CC1
Finished domain CAMSAP_CH
Finished domain CAMSAP_CKK
Finished domain CAP
Finished domain CAP-ZIP_m
Finished domain CAP18_C
Finished domain CAP_C
Finished domain CAP_GLY
Finished domain CAP_N
Finished domain CARD
Finished domain CARD_2
Finished domain CARM1
Finished domain CARMIL_C
Finished domain CART
Finished domain CASP_C
Finished domain CAS_C
Finished domain CAS_CSE1
Finished domain CATSPERB
Finished domain CATSPERD
Finished domain CATSPERG
Finished domain CBAH
Finished domain CBF
Finished domain CBFB_NFYA
Finished domain CBFD_NFYB_HMF
Finished domain CBFNT
Finished domain CBF_beta
Finished domain CBM53
Finished domain CBM_14
Finished doma

Finished domain Carn_acyltransf
Finished domain Cas1_AcylT
Finished domain Casc1_N
Finished domain Casein
Finished domain Casein_kappa
Finished domain Caskin-Pro-rich
Finished domain Caskin-tail
Finished domain Caskin1-CID
Finished domain Cast
Finished domain Castor1_N
Finished domain Catalase
Finished domain Catalase-rel
Finished domain Cathelicidins
Finished domain CathepsinC_exc
Finished domain Cation_ATPase
Finished domain Cation_ATPase_C
Finished domain Cation_ATPase_N
Finished domain Cation_efflux
Finished domain Caudal_act
Finished domain Cauli_VI
Finished domain Caveolin
Finished domain CbiA
Finished domain Cbl_N
Finished domain Cbl_N2
Finished domain Cbl_N3
Finished domain Ccdc124
Finished domain Cdc6_C
Finished domain Cementoin
Finished domain Centro_C10orf90
Finished domain Cep57_CLD
Finished domain Cep57_MT_bd
Finished domain Ceramidase
Finished domain Ceramidase_alk
Finished domain Ceramidse_alk_C
Finished domain Cg6151-P
Finished domain Cgr1
Finished domain ChaC
Finished 

Finished domain DUF4645
Finished domain DUF4647
Finished domain DUF4648
Finished domain DUF4650
Finished domain DUF4653
Finished domain DUF4655
Finished domain DUF4656
Finished domain DUF4657
Finished domain DUF4658
Finished domain DUF4659
Finished domain DUF4660
Finished domain DUF4661
Finished domain DUF4662
Finished domain DUF4663
Finished domain DUF4665
Finished domain DUF4671
Finished domain DUF4672
Finished domain DUF4674
Finished domain DUF4675
Finished domain DUF4677
Finished domain DUF4678
Finished domain DUF4679
Finished domain DUF4680
Finished domain DUF4681
Finished domain DUF4682
Finished domain DUF4683
Finished domain DUF4684
Finished domain DUF4685
Finished domain DUF4686
Finished domain DUF4687
Finished domain DUF4688
Finished domain DUF4689
Finished domain DUF4690
Finished domain DUF4691
Finished domain DUF4692
Finished domain DUF4693
Finished domain DUF4695
Finished domain DUF4696
Finished domain DUF4698
Finished domain DUF4699
Finished domain DUF4701
Finished domain 

Finished domain EMC1_C
Finished domain EMC3_TMCO1
Finished domain EMG1
Finished domain EMI
Finished domain EMP24_GP25L
Finished domain EMP70
Finished domain ENT
Finished domain ENTH
Finished domain EP400_N
Finished domain EPL1
Finished domain EPOP
Finished domain EPO_TPO
Finished domain EPTP
Finished domain ER
Finished domain ERAP1_C
Finished domain ERCC3_RAD25_C
Finished domain ERCC4
Finished domain ERG2_Sigma1R
Finished domain ERG4_ERG24
Finished domain ERGIC_N
Finished domain ERK-JNK_inhib
Finished domain ERM
Finished domain ERO1
Finished domain ER_lumen_recept
Finished domain ERbeta_N
Finished domain ERp29
Finished domain ERp29_N
Finished domain ESCRT-II
Finished domain ESR1_C
Finished domain ESSS
Finished domain EST1
Finished domain EST1_DNA_bind
Finished domain ETAA1
Finished domain ETC_C1_NDUFA4
Finished domain ETC_C1_NDUFA5
Finished domain ETF
Finished domain ETF_QO
Finished domain ETF_alpha
Finished domain ETS_PEA3_N
Finished domain EURL
Finished domain EVC2_like
Finished doma

Finished domain HCNGP
Finished domain HCO3_cotransp
Finished domain HCR
Finished domain HD
Finished domain HDAC4_Gln
Finished domain HDNR
Finished domain HD_2
Finished domain HD_3
Finished domain HD_4
Finished domain HEAT
Finished domain HEAT_2
Finished domain HEAT_EZ
Finished domain HEAT_PBS
Finished domain HECA
Finished domain HECT
Finished domain HECT_2
Finished domain HECW1_helix
Finished domain HECW_N
Finished domain HELP
Finished domain HEM4
Finished domain HEPN
Finished domain HEPN_DZIP3
Finished domain HEXIM
Finished domain HGAL
Finished domain HGTP_anticodon
Finished domain HGTP_anticodon2
Finished domain HHH
Finished domain HHH_2
Finished domain HHH_3
Finished domain HHH_5
Finished domain HHH_7
Finished domain HHH_8
Finished domain HHH_9
Finished domain HH_signal
Finished domain HIF-1
Finished domain HIF-1a_CTAD
Finished domain HIG_1_N
Finished domain HILPDA
Finished domain HIN
Finished domain HIP1_clath_bdg
Finished domain HIRAN
Finished domain HIRA_B
Finished domain HIT
Fin

Finished domain MIEAP
Finished domain MIF
Finished domain MIF4G
Finished domain MIF4G_like
Finished domain MIF4G_like_2
Finished domain MIG-14_Wnt-bd
Finished domain MIIP
Finished domain MINDY_DUB
Finished domain MIOX
Finished domain MIP
Finished domain MIP-T3
Finished domain MIP-T3_C
Finished domain MIR
Finished domain MIS13
Finished domain MISS
Finished domain MIT
Finished domain MITF_TFEB_C_3_N
Finished domain MIT_C
Finished domain MJ1316
Finished domain MKLP1_Arf_bdg
Finished domain MKRN1_C
Finished domain MLANA
Finished domain MLIP
Finished domain MMACHC
Finished domain MMADHC
Finished domain MMR_HSR1
Finished domain MMR_HSR1_Xtn
Finished domain MMS19_C
Finished domain MMS19_N
Finished domain MMS1_N
Finished domain MMS22L_C
Finished domain MMS22L_N
Finished domain MM_CoA_mutase
Finished domain MMgT
Finished domain MMtag
Finished domain MNNL
Finished domain MNR
Finished domain MOEP19
Finished domain MOFRL
Finished domain MOR2-PAG1_C
Finished domain MOR2-PAG1_N
Finished domain MORN


Finished domain NAD_binding_4
Finished domain NAD_binding_5
Finished domain NAD_binding_6
Finished domain NAD_binding_8
Finished domain NAD_kinase
Finished domain NAD_synthase
Finished domain NAGLU
Finished domain NAGLU_C
Finished domain NAGLU_N
Finished domain NAGPA
Finished domain NAGidase
Finished domain NAP
Finished domain NAPRTase
Finished domain NAPRTase_C
Finished domain NAPRTase_N
Finished domain NARG2_C
Finished domain NARP1
Finished domain NAT
Finished domain NB-ARC
Finished domain NCBP3
Finished domain NCD1
Finished domain NCD2
Finished domain NCD3G
Finished domain NCKAP5
Finished domain NCOA_u2
Finished domain NCU-G1
Finished domain NDK
Finished domain NDT80_PhoG
Finished domain NDUFA12
Finished domain NDUFB10
Finished domain NDUFV3
Finished domain NDUF_B12
Finished domain NDUF_B4
Finished domain NDUF_B5
Finished domain NDUF_B6
Finished domain NDUF_B7
Finished domain NDUF_B8
Finished domain NDUF_C2
Finished domain NECFESHC
Finished domain NEMO
Finished domain NEMP
Finished 

Finished domain P21-Arc
Finished domain P2X_receptor
Finished domain P33MONOX
Finished domain P34-Arc
Finished domain P4Ha_N
Finished domain P5-ATPase
Finished domain P53
Finished domain P53_TAD
Finished domain P53_tetramer
Finished domain P5CR_dimer
Finished domain P66_CC
Finished domain P68HR
Finished domain PA
Finished domain PA14
Finished domain PA26
Finished domain PA28_alpha
Finished domain PA28_beta
Finished domain PABP
Finished domain PAC1
Finished domain PAC2
Finished domain PAC3
Finished domain PAC4
Finished domain PACT_coil_coil
Finished domain PAD
Finished domain PADR1
Finished domain PAD_M
Finished domain PAD_N
Finished domain PAE
Finished domain PAF
Finished domain PAF-AH_p_II
Finished domain PAG
Finished domain PAH
Finished domain PALB2_WD40
Finished domain PALP
Finished domain PAM2
Finished domain PAN_1
Finished domain PAN_4
Finished domain PAP2
Finished domain PAP2_C
Finished domain PAPA-1
Finished domain PAPS_reduct
Finished domain PAP_RNA-bind
Finished domain PAP_ass

Finished domain Pcc1
Finished domain Pdase_M17_N2
Finished domain Pecanex_C
Finished domain Pellino
Finished domain Penicillinase_R
Finished domain Pentapeptide
Finished domain Pentapeptide_3
Finished domain Pentapeptide_4
Finished domain Pentaxin
Finished domain Pep3_Vps18
Finished domain Pep_M12B_propep
Finished domain Pep_deformylase
Finished domain Pepdidase_M14_N
Finished domain Pept_tRNA_hydro
Finished domain Peptidase_A22B
Finished domain Peptidase_C1
Finished domain Peptidase_C101
Finished domain Peptidase_C12
Finished domain Peptidase_C13
Finished domain Peptidase_C14
Finished domain Peptidase_C15
Finished domain Peptidase_C1_2
Finished domain Peptidase_C2
Finished domain Peptidase_C26
Finished domain Peptidase_C39_2
Finished domain Peptidase_C48
Finished domain Peptidase_C50
Finished domain Peptidase_C54
Finished domain Peptidase_C65
Finished domain Peptidase_C78
Finished domain Peptidase_C97
Finished domain Peptidase_C98
Finished domain Peptidase_M1
Finished domain Peptidase

Finished domain Ribosomal_S17e
Finished domain Ribosomal_S18
Finished domain Ribosomal_S19
Finished domain Ribosomal_S19e
Finished domain Ribosomal_S2
Finished domain Ribosomal_S21
Finished domain Ribosomal_S21e
Finished domain Ribosomal_S24e
Finished domain Ribosomal_S25
Finished domain Ribosomal_S26e
Finished domain Ribosomal_S27
Finished domain Ribosomal_S27e
Finished domain Ribosomal_S28e
Finished domain Ribosomal_S30
Finished domain Ribosomal_S3Ae
Finished domain Ribosomal_S3_C
Finished domain Ribosomal_S4
Finished domain Ribosomal_S4e
Finished domain Ribosomal_S5
Finished domain Ribosomal_S5_C
Finished domain Ribosomal_S6
Finished domain Ribosomal_S6e
Finished domain Ribosomal_S7
Finished domain Ribosomal_S7e
Finished domain Ribosomal_S8
Finished domain Ribosomal_S8e
Finished domain Ribosomal_S9
Finished domain Ribul_P_3_epim
Finished domain Ric8
Finished domain RicinB_lectin_2
Finished domain Ricin_B_lectin
Finished domain Rieske
Finished domain Rieske_2
Finished domain Rif1_N
F

Finished domain SYCE1
Finished domain SYCP2_ARLD
Finished domain SYCP2_SLD
Finished domain SYF2
Finished domain SYS1
Finished domain S_100
Finished domain Sacchrp_dh_C
Finished domain Sacchrp_dh_NADP
Finished domain Sad1_UNC
Finished domain Sam68-YY
Finished domain SapA
Finished domain SapB_1
Finished domain SapB_2
Finished domain Sarcoglycan_1
Finished domain Sarcoglycan_2
Finished domain Sarcolipin
Finished domain Sas10
Finished domain Sas10_Utp3
Finished domain Sas6_CC
Finished domain SbcCD_C
Finished domain Sclerostin
Finished domain Scm3
Finished domain Scramblase
Finished domain Scs3p
Finished domain Sde2_N_Ubi
Finished domain Sdh5
Finished domain Sdh_cyt
Finished domain Sds3
Finished domain Sec1
Finished domain Sec10
Finished domain Sec15
Finished domain Sec16
Finished domain Sec16_C
Finished domain Sec20
Finished domain Sec23_BS
Finished domain Sec23_helical
Finished domain Sec23_trunk
Finished domain Sec2p
Finished domain Sec3-PIP2_bind
Finished domain Sec34
Finished domain Se

Finished domain TIR
Finished domain TIR_2
Finished domain TIR_3
Finished domain TK
Finished domain TLD
Finished domain TLE_N
Finished domain TLV_coat
Finished domain TM140
Finished domain TM2
Finished domain TM231
Finished domain TMA7
Finished domain TMC
Finished domain TMCCDC2
Finished domain TMCO5
Finished domain TMEM100
Finished domain TMEM101
Finished domain TMEM107
Finished domain TMEM108
Finished domain TMEM117
Finished domain TMEM119
Finished domain TMEM125
Finished domain TMEM126
Finished domain TMEM131_like
Finished domain TMEM132
Finished domain TMEM132D_C
Finished domain TMEM132D_N
Finished domain TMEM135_C_rich
Finished domain TMEM138
Finished domain TMEM141
Finished domain TMEM144
Finished domain TMEM151
Finished domain TMEM154
Finished domain TMEM156
Finished domain TMEM164
Finished domain TMEM169
Finished domain TMEM171
Finished domain TMEM173
Finished domain TMEM174
Finished domain TMEM18
Finished domain TMEM187
Finished domain TMEM189_B_dmain
Finished domain TMEM190
Fi

Finished domain UAA
Finished domain UAE_UbL
Finished domain UBA
Finished domain UBA2_C
Finished domain UBA_4
Finished domain UBA_5
Finished domain UBA_6
Finished domain UBA_e1_thiolCys
Finished domain UBD
Finished domain UBM
Finished domain UBN_AB
Finished domain UBX
Finished domain UBZ_FAAP20
Finished domain UCH
Finished domain UCH_1
Finished domain UCH_C
Finished domain UCH_N
Finished domain UCMA
Finished domain UCR_14kD
Finished domain UCR_6-4kD
Finished domain UCR_TM
Finished domain UCR_UQCRX_QCR9
Finished domain UCR_hinge
Finished domain UDG
Finished domain UDP-g_GGTase
Finished domain UDPGP
Finished domain UDPGT
Finished domain UDPG_MGDP_dh
Finished domain UDPG_MGDP_dh_C
Finished domain UDPG_MGDP_dh_N
Finished domain UEV
Finished domain UFC1
Finished domain UFD1
Finished domain UIM
Finished domain ULD
Finished domain UME
Finished domain UMP1
Finished domain UMPH-1
Finished domain UNC-50
Finished domain UNC-79
Finished domain UNC-93
Finished domain UNC119_bdg
Finished domain UNC45

Finished domain dNK
Finished domain dUTPase
Finished domain dbPDZ_assoc
Finished domain dsDNA_bind
Finished domain dsrm
Finished domain eIF-1a
Finished domain eIF-3_zeta
Finished domain eIF-3c_N
Finished domain eIF-5_eIF-2B
Finished domain eIF-5a
Finished domain eIF-6
Finished domain eIF2A
Finished domain eIF2_C
Finished domain eIF3_N
Finished domain eIF3_p135
Finished domain eIF3_subunit
Finished domain eIF3g
Finished domain eIF3m_C_helix
Finished domain eIF_4EBP
Finished domain eRF1_1
Finished domain eRF1_2
Finished domain eRF1_3
Finished domain ecTbetaR2
Finished domain efThoc1
Finished domain fn1
Finished domain fn2
Finished domain fn3
Finished domain fn3_4
Finished domain gag-asp_proteas
Finished domain hDGE_amylase
Finished domain hEGF
Finished domain hGDE_N
Finished domain hGDE_central
Finished domain hNIFK_binding
Finished domain hSH3
Finished domain hSac2
Finished domain hnRNP_Q_AcD
Finished domain ig
Finished domain ketoacyl-synt
Finished domain mRNA_cap_C
Finished domain mRN

### A different dictionary for each domain (draft code)

In [6]:
%%time

#These folders do not depend on the domain, so they are built once
in_path = curr_dir[0]+"/hmm_domains/pfam-v"+pfam_version+"/"
canonic_prot_path = curr_dir[0]+"/../4.parse_Uniprot/domains_canonic_prot/pfam-v"+pfam_version+"/"
out_path = curr_dir[0]+"/canonic_prot_seq/pfam-v"+pfam_version+"/"

for domain_name in filtered_domains_list:

    filename = domain_name+".csv"
    domain_data = pd.read_csv(in_path+filename, sep='\t', index_col=0, dtype={"chrom_num": str})
    #A fresh dictionary per domain: gene_dict[gene] -> the translated protein sequence
    gene_dict = defaultdict(dict)

    #Sort the domain data
    sorted_domain_data = domain_data.sort_values(by=["chrom_num", "gene", "TargetStart"])
    sorted_domain_data = sorted_domain_data.reset_index(drop=True)

    #Get the canonic protein ids file for the domain
    with open(canonic_prot_path+domain_name+"_canonic_prot.pik", 'rb') as handle:
        canonic_protein = pickle.load(handle)

    for gene in sorted_domain_data["gene"].unique():

        prot_id = canonic_protein[gene]

        #All the rows of this gene, selected once and reused below
        gene_rows = sorted_domain_data[sorted_domain_data["gene"] == gene]

        #Get the exons sequence file for the protein
        chrom = gene_rows["chrom_num"].unique()[0]
        if (chrom not in chromosome_names):
            continue
        #NOTE(review): exons_file is not used below; the read only verifies that the file exists and parses
        exons_file = pd.read_csv(curr_dir[0]+"/from_shilpa/exons_seqs/"+chrom+"/"+gene+"/"+prot_id+".exons.txt", skiprows=1, header=None, names=["pos", "exon_seq"], sep='\t')

        #Get the chrom raw data.
        #The protein mask is built on gene_rows itself, so pandas does not need to reindex it
        chrom_raw_data = gene_rows[gene_rows["#TargetID"] == prot_id]["chromosome"].unique() #there should be only one element here
        if (len(chrom_raw_data) > 1):
            print(" Error: "+gene+": more than one chromosome raw data") #sanity check
        chrom_raw_data = chrom_raw_data[0]

        #Create exons table
        exon_table = create_exon_pos_table(chrom_raw_data, prot_id) #exons frameshifts are also fixed here!

        #Translate all the exons dna sequences to one protein sequence
        prot_seq = exons_translate_to_prot(exon_table, chrom_raw_data, chrom)
        gene_dict[gene] = prot_seq

    #Saving a separate dictionary file for this domain
    with open(out_path+domain_name+"_prot_seq.pik", 'wb') as handle:
        pickle.dump(gene_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("Finished domain "+str(domain_name))
    

Finished domain 1-cysPrx_C
Finished domain 14-3-3
Finished domain 2-Hacid_dh
Finished domain 2-Hacid_dh_C
Finished domain 2-oxoacid_dh
Finished domain 2-oxogl_dehyd_N
Finished domain 23ISL
Finished domain 2Fe-2S_thioredx
Finished domain 2OG-FeII_Oxy
Finished domain 2OG-FeII_Oxy_2
Finished domain 2OG-FeII_Oxy_3
Finished domain 2OG-FeII_Oxy_4
Finished domain 3-HAO
Finished domain 3-PAP
Finished domain 3Beta_HSD
Finished domain 3HCDH
Finished domain 3HCDH_N
Finished domain 40S_S4_C
Finished domain 40S_SA_C
Finished domain 4F5
Finished domain 4HBT




KeyboardInterrupt: 