## Map domains' gene ids to canonical protein sequences

In [1]:
import pandas as pd
import numpy as np
import math
from collections import defaultdict
import pickle
import sys
import subprocess

sys.path.append('/home/anat/Research/ExAC/5.HMM_alter_align') 
from calc_exac_freq_func import codon_table, retrieve_codon_seq
from mapping_func import create_exon_pos_table

In [3]:
curr_dir = !pwd
pfam_version = "31"
domains_th = "10"
TEST_PROCCESSED_DOMAINS = True

#Read the list of domains
if (TEST_PROCCESSED_DOMAINS):
    with open(curr_dir[0]+"/../13.Process_domains_not_in_training/processed_domains_not_in_pipeline_final_list.pik", 'rb') as handle:
        filtered_domains_list = pickle.load(handle)
else:
    with open(curr_dir[0]+"/../5.domains_stats/pfam-v"+pfam_version+"/filtered"+domains_th+"_list.pik", 'rb') as handle:
        filtered_domains_list = pickle.load(handle)
filtered_domains_list.sort()

#Read the substitutions table (for the exons translation)
with open("/home/anat/Research/ExAC/9.Features_exploration/codon_ns_table.pik", 'rb') as handle:
    codon_ns_table = pickle.load(handle)
    
chromosome_names = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y"]
len(filtered_domains_list)

825

In [4]:
def reverse_complement(seq):
    """
    Given a DNA sequence, return the reverse-complement of that sequence.

    The input is read case-insensitively and the result is always upper-case.
    Any character that is not one of A/C/G/T (e.g. the ambiguity code 'N')
    has no known complement and is emitted as 'N', instead of being
    silently treated as a 'C'.
    """

    #Base pairing used for building the complement strand
    base_pairs = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}

    #Walk the sequence backwards and complement each base on the way
    return ''.join(base_pairs.get(c.upper(), 'N') for c in reversed(seq))

In [5]:
def retrieve_exon_seq(exon_start, exon_end, chrom):
    """
    Retrieve the exon sequence from the ref sequence, according to exons start and end positions.

    exon_start: start position of the exon; converted to int and decremented by one
                before it is passed as -start (presumably because twoBitToFa's
                -start is 0-based -- TODO confirm)
    exon_end:   end position of the exon, passed as is as -end
    chrom:      chromosome name without the "chr" prefix (a string)

    Returns the sequence as one upper-case string.
    Raises subprocess.CalledProcessError if twoBitToFa exits with a non-zero status.
    """
    chromosome_name = "chr"+chrom
    seq_start = int(exon_start) - 1

    #Calling hg19.2bit to retrieve the DNA sequence.
    #The command is given as an arguments list (no shell), so the values are never
    #interpreted by a shell. universal_newlines makes the output text on python 3 too.
    fasta = subprocess.check_output(["../5.HMM_alter_align/twoBitToFa",
                                     "../5.HMM_alter_align/hg19.2bit",
                                     "stdout",
                                     "-seq=%s" % chromosome_name,
                                     "-start=%s" % seq_start,
                                     "-end=%s" % exon_end],
                                    universal_newlines=True)

    #Remove 1st token (the fasta header line), whitespaces and newlines
    sequence = ''.join(fasta.split()[1:])

    return sequence.upper()

In [6]:
def exons_translate_to_prot(exon_table, chrom_raw_data, chrom):
    """
    Translate the exons listed in exon_table to one protein string.

    exon_table:     table whose rows have "start_pos" and "end_pos" values
    chrom_raw_data: raw chromosome description; when it contains the word
                    "complement", every exon sequence is reverse-complemented
    chrom:          chromosome name, handed to retrieve_exon_seq

    The exons sequences are joined in the table's row order and translated
    codon by codon using codon_table. Trailing bases that do not complete
    a codon are ignored.
    """

    #Collect the dna sequence of every exon
    exon_seqs = []
    for _, exon_row in exon_table.iterrows():
        seq = retrieve_exon_seq(exon_row["start_pos"], exon_row["end_pos"], chrom)

        if ("complement" in chrom_raw_data):
            seq = reverse_complement(seq)

        exon_seqs.append(seq)

    full_dna = ''.join(exon_seqs)

    #Translate each complete codon to its amino acid
    amino_acids = [codon_table[full_dna[codon_start:codon_start+3]]
                   for codon_start in range(0, len(full_dna)-2, 3)]

    #Convert all amino acids to one string
    return ''.join(amino_acids)

### A different dictionary for each domain

In [6]:
%%time

#For every domain: translate the canonic protein of each of the domain's genes
#from its exons dna, and save a {gene: protein sequence} dictionary per domain.

#The input folder is the same for all the domains
in_path = curr_dir[0]+"/hmm_domains/pfam-v"+pfam_version+"/"

for domain_name in filtered_domains_list:

    filename = domain_name+".csv"
    domain_data = pd.read_csv(in_path+filename, sep='\t', index_col=0, dtype={"chrom_num": str})
    #Kept as defaultdict(dict) so the pickled object type stays the same; the values stored below are strings
    gene_dict = defaultdict(dict)

    #Sort the domain data
    sorted_domain_data = domain_data.sort_values(by=["chrom_num", "gene", "TargetStart"])
    sorted_domain_data = sorted_domain_data.reset_index(drop=True)

    #Get the canonic protein ids file for the domain
    with open(curr_dir[0]+"/../4.parse_Uniprot/domains_canonic_prot/pfam-v"+pfam_version+"/"+domain_name+"_canonic_prot.pik", 'rb') as handle:
        canonic_protein = pickle.load(handle)

    for gene in sorted_domain_data["gene"].unique():

        prot_id = canonic_protein[gene]

        #Rows of the current gene
        gene_mask = (sorted_domain_data["gene"] == gene)

        #Skip genes that are not on one of the known chromosomes
        chrom = sorted_domain_data[gene_mask]["chrom_num"].unique()[0]
        if (chrom not in chromosome_names):
            continue

        #Get the chrom raw data. Both conditions are combined to one mask,
        #instead of indexing the filtered table with a mask of the full table.
        prot_mask = gene_mask & (sorted_domain_data["#TargetID"] == prot_id)
        chrom_raw_data = sorted_domain_data[prot_mask]["chromosome"].unique() #there should be only one element here
        if (len(chrom_raw_data) > 1):
            print(" Error: "+gene+": more than one chromosome raw data") #sanity check
        #NOTE(review): raises IndexError when the gene has no row for its canonic protein
        chrom_raw_data = chrom_raw_data[0]

        #Create exons table
        exon_table = create_exon_pos_table(chrom_raw_data, prot_id) #exons frameshifts are also fixed here!

        #Translate all the exons dna sequences to one protein sequence
        prot_seq = exons_translate_to_prot(exon_table, chrom_raw_data, chrom)
        gene_dict[gene] = prot_seq

    #Save the dictionary of the domain
    with open(curr_dir[0]+"/canonic_prot_seq/pfam-v"+pfam_version+"/"+domain_name+"_prot_seq.pik", 'wb') as handle:
        pickle.dump(gene_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("Finished domain "+str(domain_name))
    

Finished domain 2OG-FeII_Oxy_3
Finished domain 7TM_GPCR_Srsx
Finished domain 7tm_1
Finished domain 7tm_2
Finished domain 7tm_3
Finished domain 7tm_4
Finished domain A2M
Finished domain A2M_N
Finished domain A2M_N_2
Finished domain A2M_comp
Finished domain A2M_recep
Finished domain AAA
Finished domain AAA_11
Finished domain AAA_12
Finished domain AAA_17
Finished domain AAA_18
Finished domain AAA_33
Finished domain AAA_5
Finished domain AAA_6
Finished domain AAA_7
Finished domain AAA_8
Finished domain AAA_9
Finished domain AA_permease
Finished domain AA_permease_2
Finished domain ABC2_membrane_3
Finished domain ABC_membrane
Finished domain ABC_tran
Finished domain ADAM_CR
Finished domain ADAM_spacer1
Finished domain ADH_N
Finished domain ADH_zinc_N
Finished domain ADK
Finished domain AMP-binding
Finished domain AMP-binding_C
Finished domain ANAPC3
Finished domain ANAPC4_WD40
Finished domain ANATO
Finished domain ANF_receptor
Finished domain APC_r
Finished domain APOBEC_C
Finished domain 



### One dictionary for all domains (no gene duplication)

In [7]:
%%time
#One dictionary for all the domains: gene_dict[gene][prot_id] = protein sequence.
#A (gene, protein) pair that was already translated through a previous domain is not translated again.
gene_dict = defaultdict(dict)

#The input folder is the same for all the domains
in_path = curr_dir[0]+"/hmm_domains/pfam-v"+pfam_version+"/"

for domain_name in filtered_domains_list:

    filename = domain_name+".csv"
    domain_data = pd.read_csv(in_path+filename, sep='\t', index_col=0, dtype={"chrom_num": str})

    #Sort the domain data
    sorted_domain_data = domain_data.sort_values(by=["chrom_num", "gene", "TargetStart"])
    sorted_domain_data = sorted_domain_data.reset_index(drop=True)

    #Get the canonic protein ids file for the domain
    with open(curr_dir[0]+"/../4.parse_Uniprot/domains_canonic_prot/pfam-v"+pfam_version+"/"+domain_name+"_canonic_prot.pik", 'rb') as handle:
        canonic_protein = pickle.load(handle)

    for gene in sorted_domain_data["gene"].unique():

        prot_id = canonic_protein[gene]
        #The gene membership is tested first, so no empty entry is created in the defaultdict
        if (gene in gene_dict and prot_id in gene_dict[gene]):
            continue

        #Rows of the current gene
        gene_mask = (sorted_domain_data["gene"] == gene)

        #Skip genes that are not on one of the known chromosomes
        chrom = sorted_domain_data[gene_mask]["chrom_num"].unique()[0]
        if (chrom not in chromosome_names):
            continue

        #Get the chrom raw data. Both conditions are combined to one mask,
        #instead of indexing the filtered table with a mask of the full table.
        prot_mask = gene_mask & (sorted_domain_data["#TargetID"] == prot_id)
        chrom_raw_data = sorted_domain_data[prot_mask]["chromosome"].unique() #there should be only one element here
        if (len(chrom_raw_data) > 1):
            print(" Error: "+gene+": more than one chromosome raw data") #sanity check
        #NOTE(review): raises IndexError when the gene has no row for its canonic protein
        chrom_raw_data = chrom_raw_data[0]

        #Create exons table
        exon_table = create_exon_pos_table(chrom_raw_data, prot_id) #exons frameshifts are also fixed here!

        #Translate all the exons dna sequences to one protein sequence
        prot_seq = exons_translate_to_prot(exon_table, chrom_raw_data, chrom)
        gene_dict[gene][prot_id] = prot_seq

    print("Finished domain "+str(domain_name))

#Saving one dictionary for all the domains together (the file name depends on the running mode)
if (TEST_PROCCESSED_DOMAINS):
    out_filename = "processed_domains_genes_prot_seq.pik"
else:
    out_filename = "all_domains_genes_prot_seq.pik"
with open(curr_dir[0]+"/canonic_prot_seq/pfam-v"+pfam_version+"/"+out_filename, 'wb') as handle:
    pickle.dump(gene_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    

Finished domain 14-3-3
Finished domain 2-Hacid_dh
Finished domain 2OG-FeII_Oxy_2
Finished domain 3HCDH
Finished domain 4HBT
Finished domain 4HBT_2
Finished domain 5-FTHF_cyc-lig
Finished domain 5_nucleotid_C
Finished domain 6PGD
Finished domain 7TM_GPCR_Srv
Finished domain 7TM_GPCR_Srw
Finished domain AAA_29
Finished domain AA_kinase
Finished domain ABM
Finished domain ACPS
Finished domain ACP_syn_III_C
Finished domain ACT
Finished domain ADH_N_2
Finished domain ADK_lid
Finished domain ADP_PFK_GK
Finished domain ADP_ribosyl_GH
Finished domain AICARFT_IMPCHas
Finished domain AIF_C
Finished domain AIRC
Finished domain AIRS
Finished domain AIRS_C
Finished domain ALAD
Finished domain AMPK1_CBM
Finished domain APG12
Finished domain AP_endonuc_2
Finished domain ART
Finished domain ASF1_hist_chap
Finished domain ATP-cone
Finished domain ATP-grasp
Finished domain ATP-grasp_2
Finished domain ATP-grasp_3
Finished domain ATP-grasp_4
Finished domain ATP-gua_Ptrans
Finished domain ATP-gua_PtransN
F

Finished domain Glyco_hydro_99
Finished domain Glyco_tran_28_C
Finished domain Glyco_tranf_2_3
Finished domain Glyco_trans_1_2
Finished domain Glyco_trans_1_4
Finished domain Glyco_trans_4_4
Finished domain Glyco_transf_4
Finished domain Glyco_transf_41
Finished domain Glyco_transf_64
Finished domain Glycogen_syn
Finished domain Glycohydro_20b2
Finished domain Glycolytic
Finished domain Glycos_transf_1
Finished domain Glycos_transf_3
Finished domain Glyoxalase
Finished domain Glyoxalase_4
Finished domain Gp_dh_C
Finished domain H2TH
Finished domain HAD
Finished domain HAT_KAT11
Finished domain HBB
Finished domain HD
Finished domain HD_3
Finished domain HEAT_PBS
Finished domain HGTP_anticodon
Finished domain HHH
Finished domain HIN
Finished domain HIT
Finished domain HMGL-like
Finished domain HMG_CoA_synt_C
Finished domain HMG_CoA_synt_N
Finished domain HNOB
Finished domain HORMA
Finished domain HSP90
Finished domain Ham1p_like
Finished domain Helicase_C_2
Finished domain Heme_oxygenase

Finished domain Ribonuclease_T2
Finished domain Ribosomal_L1
Finished domain Ribosomal_L10
Finished domain Ribosomal_L11
Finished domain Ribosomal_L11_N
Finished domain Ribosomal_L13
Finished domain Ribosomal_L14
Finished domain Ribosomal_L15e
Finished domain Ribosomal_L16
Finished domain Ribosomal_L17
Finished domain Ribosomal_L19
Finished domain Ribosomal_L2
Finished domain Ribosomal_L20
Finished domain Ribosomal_L21e
Finished domain Ribosomal_L21p
Finished domain Ribosomal_L22
Finished domain Ribosomal_L23
Finished domain Ribosomal_L26
Finished domain Ribosomal_L27
Finished domain Ribosomal_L27A
Finished domain Ribosomal_L28
Finished domain Ribosomal_L29
Finished domain Ribosomal_L2_C
Finished domain Ribosomal_L3
Finished domain Ribosomal_L30
Finished domain Ribosomal_L32e
Finished domain Ribosomal_L34
Finished domain Ribosomal_L35p
Finished domain Ribosomal_L36
Finished domain Ribosomal_L37e
Finished domain Ribosomal_L39
Finished domain Ribosomal_L4
Finished domain Ribosomal_L5
Fin

