In [2]:
import pandas as pd
import numpy as np
import math
from collections import defaultdict
import matplotlib.pyplot as plt
from os import environ
import pickle
import fileinput
import sys
import datetime
from mapping_func import create_exon_pos_table, find_chrom_bps, protein_pos_to_hmm_state_and_aa
from calc_exac_freq_func import create_alt_codon, exac_validation_checks, retrieve_codon_seq, codon_table
from indels_func import is_indel, table_editing, indel_type
from entropy_func import JSD_background, JSD, SE
from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

In [3]:
#Getting path
curr_dir = !pwd

#Reading the list of filtered domains
with open(curr_dir[0]+"/../5.domains_stats/filtered50_list.pik", 'rb') as handle:
    filtered_domains_list = pickle.load(handle)
filtered_domains_list.sort()

#Getting the domain index as environment variable called "idx"
try:
    domain_index = int(environ['idx'])
except:
    domain_index = 0
domain_name = filtered_domains_list[domain_index]

In [6]:
filtered_domains_list

['7TM_GPCR_Srsx',
 '7tm_1',
 '7tm_4',
 'AAA',
 'AAA_5',
 'ABC_membrane',
 'ABC_tran',
 'Ank',
 'Ank_2',
 'Ank_3',
 'Ank_4',
 'Ank_5',
 'Annexin',
 'Arf',
 'Arm',
 'BACK',
 'BTB',
 'BTB_2',
 'Bromodomain',
 'C1-set',
 'C1_1',
 'C2',
 'C2-set_2',
 'CBFD_NFYB_HMF',
 'CH',
 'CUB',
 'Cadherin',
 'Cadherin_2',
 'Cadherin_3',
 'Cadherin_C_2',
 'Cadherin_tail',
 'Collagen',
 'DEAD',
 'DUF1220',
 'EF-hand_1',
 'EF-hand_5',
 'EF-hand_6',
 'EF-hand_7',
 'EF-hand_8',
 'EGF',
 'EGF_2',
 'EGF_3',
 'EGF_CA',
 'F-box',
 'F-box-like',
 'FXa_inhibition',
 'Filament',
 'Filamin',
 'Forkhead',
 'HLH',
 'HMG_box',
 'HMG_box_2',
 'Helicase_C',
 'Hemopexin',
 'Histone',
 'Homeobox',
 'Homeobox_KN',
 'Hormone_recep',
 'I-set',
 'IQ',
 'Ig_2',
 'Ig_3',
 'Ion_trans',
 'Ion_trans_2',
 'KH_1',
 'KRAB',
 'Kazal_2',
 'Kelch_1',
 'Kelch_2',
 'Kelch_3',
 'Kelch_4',
 'Kelch_6',
 'Keratin_B2_2',
 'LIM',
 'LRRNT',
 'LRR_4',
 'LRR_5',
 'LRR_6',
 'LRR_8',
 'Laminin_EGF',
 'Laminin_G_1',
 'Laminin_G_2',
 'Ldl_recept_a',
 '

In [4]:
#Load the HMMER matches table of the selected domain (tab-separated; chromosome numbers are kept as strings)
in_path = curr_dir[0]+"/../3.parse_HMMER/hmm_domains/pfam-v30/"
filename = domain_name+".csv"
domain_data = pd.read_csv(in_path+filename, sep='\t', index_col=0, dtype={"chrom_num": str})

#Order the domain instances by chromosome, gene and match start, and renumber the rows from 0
sorted_domain_data = (
    domain_data
    .sort_values(by=["chrom_num", "gene", "TargetStart"])
    .reset_index(drop=True)
)

#Load the canonic protein id of each gene for this domain
canonic_prot_path = curr_dir[0]+"/../4.parse_Uniprot/domains_canonic_prot/pfam-v30/"+domain_name+"_canonic_prot.pik"
with open(canonic_prot_path, 'rb') as handle:
    canonic_protein = pickle.load(handle)

#The chromosome names that are analyzed: 1-22 followed by X and Y
chromosome_names = [str(chrom_idx) for chrom_idx in range(1, 23)] + ["X", "Y"]

In [5]:
#Counting how many domain instances are left out because their chromosome name isn't in chromosome_names.
chrom_names_list = sorted_domain_data["chrom_num"].tolist()
strange_chrom_sum = sum(1 for name in chrom_names_list if name not in chromosome_names)
print("Leaving outside "+str(strange_chrom_sum)+" out of "+str(len(chrom_names_list)))

Leaving outside 19 out of 239


In [23]:
#A function that return a dict with the MAF info for the protein position and corresponding chromosomal location
def calc_exac_maf_data(chrom_pos_list, chrom_gene_table, indels_table, protein_pos, aa, chrom_raw_data, chrom, hmm_state):
    """
    Collect the ExAC allele-frequency data of one protein position (one codon).

    Parameters:
        chrom_pos_list:   the chromosomal positions of the codon bps; the index inside
                          the sequence is used as the position inside the codon.
        chrom_gene_table: the ExAC rows of the gene. The columns read here are "pos", "ref",
                          "alt", "filter", "AF", "AN_Adj", "AC_Adj", "codons" and "comments".
        indels_table:     not used by this function (kept for interface compatibility).
        protein_pos:      the position inside the protein.
        aa:               the HMMER amino acid of the position ('X' is treated as the stop codon '*').
        chrom_raw_data:   the raw chromosome location text, passed on to the helper functions.
        chrom:            the chromosome name.
        hmm_state:        the HMM state of the position (used only in the error message).

    Returns:
        (res_dict, frameshift_cnt, errors_cnt, filter_cnt)
        res_dict keys: "chrom", "chrom_pos", "prot_pos", "aa_ref", "bp_ref", "an_adj", "filter",
        "af", "af_adj" and "alterations_af_adj_dict".
        frameshift_cnt counts the skipped frameshift indels, errors_cnt counts the alterations
        that failed exac_validation_checks. filter_cnt is never incremented here, so it's always 0.

    When the summed AF of one alternative amino acid is above 0.5, that amino acid becomes the
    reference ("aa_ref"/"bp_ref"), the old reference is added to the alterations dicts, and
    "af"/"af_adj" become 1 minus the frequency of the new reference.
    """

    def format_af(value):
        #Keeping 4 significant digits of an allele frequency
        return float('{:.3e}'.format(float(value)))

    res_dict = {}
    res_dict["chrom"] = chrom
    res_dict["chrom_pos"] = chrom_pos_list
    res_dict["prot_pos"] = protein_pos
    res_dict["aa_ref"] = aa
    res_dict["bp_ref"] = retrieve_codon_seq(chrom_pos_list, chrom_raw_data, chrom)
    res_dict["an_adj"] = []
    res_dict["filter"] = []
    frameshift_cnt = 0
    errors_cnt = 0
    filter_cnt = 0
    #The ids of the inframe indels - collected, but not consumed yet (see the TODO below)
    inframe_ids = []

    #For error logging
    functionNameAsString = sys._getframe().f_code.co_name

    #Converting the HMMER 'X' char to '*' for stop codon notation uniformity
    #(res_dict["aa_ref"] keeps the original HMMER char)
    if (aa == 'X'):
        aa = '*'

    #Validation: checking that the returned codon sequence from hg19 match the HMMER amino-acid
    translated_aa = codon_table[(res_dict["bp_ref"]).upper()]
    if (translated_aa != aa):
        print("chrom_pos_list = " +str(chrom_pos_list)+" protein_pos = "+str(protein_pos)+" hmm_state = "+str(hmm_state))
        print(functionNameAsString+" Error: hg19 codon sequence retrieved "+(res_dict["bp_ref"]).upper()+"="+translated_aa+" doesn't match HMMER amino-acid "+aa)

    alterations_af_dict = defaultdict(list)
    alterations_af_adj_dict = defaultdict(list)
    #For each logged alt amino acid: (AF, ExAC "codons" text) of its most frequent alteration.
    #Needed for updating the reference codon from the right ExAC row.
    top_alteration_by_aa = {}

    for alt_codon_pos, chrom_pos in enumerate(chrom_pos_list):

        #Retrieving the relevant ExAC entries (no rows when ExAC has no entry for this position)
        chrom_alter_table = chrom_gene_table[chrom_gene_table["pos"] == chrom_pos]
        chrom_alter_table = chrom_alter_table.reset_index(drop=True)

        #In case there are several alterations for that position, iterating
        for index, chrom_alter in chrom_alter_table.iterrows():

            res_dict["filter"].append(chrom_alter["filter"])
            res_dict["an_adj"].append(chrom_alter["AN_Adj"])

            #Extracting ref and alt
            exac_ref_bp = chrom_alter["ref"]
            exac_alt_bp = chrom_alter["alt"]

            #Classifying the alteration once, and using the result for both indel checks
            alter_indel_type = is_indel(exac_ref_bp, exac_alt_bp, chrom_alter)

            #Frameshift indel - skip (we assume the whole protein may not function and don't add those to the MAF count)
            if (alter_indel_type == indel_type.FRAME_SHIFT_INDEL):
                frameshift_cnt += 1
                continue

            #Inframe indel - take data from the indels table later
            if (alter_indel_type == indel_type.IN_FRAME_INDEL):
                indel_id = chrom_alter["comments"][chrom_alter["comments"].find("-"):]
                inframe_ids.append(indel_id)
                continue

            #Perform validation checks (comparing ExAC and HMMER data)
            (exac_prot_data, exac_alt_aa, exac_alt_codon, errors) = exac_validation_checks(chrom_alter, protein_pos, aa, alt_codon_pos, chrom_pos, res_dict["bp_ref"])
            if (errors):
                #Validation errors are counted, but the alteration is still processed
                errors_cnt += 1

            #Extracting ExAC allele frequency data
            af = chrom_alter["AF"]
            an_adj = int(chrom_alter["AN_Adj"])
            ac_adj = chrom_alter["AC_Adj"]

            #Calculating the alteration relevant data
            alt_codon = create_alt_codon(exac_ref_bp, exac_alt_bp, res_dict["bp_ref"], alt_codon_pos, chrom_raw_data)
            if (len(alt_codon) != 3):
                continue #TODO: handle inframe indels
            alt_aa = codon_table[alt_codon.upper()]

            #Validation: ExAC alt aa for the alteration match the calculated alt
            if (exac_prot_data and exac_alt_aa != alt_aa):
                print(functionNameAsString+" "+ str(chrom_pos)+" Error: the ExAC alt aa "+exac_alt_aa+" doesn't match my alt aa calculation "+alt_aa)

            #Not logging alteration for synonymous mutations.
            #Comparing to the unified aa, so a stop-to-stop change of an 'X' position is synonymous too.
            if (alt_aa == aa):
                continue

            #Non-synonymous(!!!) - logging the alteration
            af_value = float(af)
            alterations_af_dict[alt_aa].append(af_value)
            if (an_adj == 0):
                af_adj = 0
            else:
                af_adj = float(ac_adj)/float(an_adj)
            alterations_af_adj_dict[alt_aa].append(format_af(af_adj))

            if (alt_aa not in top_alteration_by_aa or af_value > top_alteration_by_aa[alt_aa][0]):
                top_alteration_by_aa[alt_aa] = (af_value, chrom_alter["codons"])

    #Calculating the overall MAF from the alteration dicts
    res_dict["af"] = 0
    res_dict["af_adj"] = 0

    #Iterating over a snapshot of the keys, since the dicts are edited inside the loop
    for alt_aa in list(alterations_af_dict.keys()):
        aa_sum = sum(alterations_af_dict[alt_aa])
        aa_adj_sum = sum(alterations_af_adj_dict[alt_aa])

        #Checking if any alteration is above 0.5, and changing the ref accordingly
        if (alt_aa != "indel" and aa_sum > 0.5):

            #Adding the reference allele to the alterations dicts
            old_ref = res_dict["aa_ref"]
            sum_of_all_alt = sum(sum(alterations_af_dict.values(), []))
            sum_of_all_alt_adj = sum(sum(alterations_af_adj_dict.values(), []))
            alterations_af_dict[old_ref] = [1 - sum_of_all_alt]
            alterations_af_adj_dict[old_ref] = [1 - sum_of_all_alt_adj]

            #Updating the aa to be the ref
            res_dict["aa_ref"] = alt_aa
            #Updating the alt bp to be the ref - taken from the ExAC row of this alteration,
            #and not from whichever row happened to be iterated last
            exac_codons = top_alteration_by_aa[alt_aa][1]
            exac_alt_codon = exac_codons[exac_codons.find("/")+1:]
            res_dict["bp_ref"] = exac_alt_codon.upper()
            #Formatting here as well: the "break" below skips the formatting at the end of the loop body
            res_dict["af"] = format_af(1 - aa_sum)
            res_dict["af_adj"] = format_af(1 - aa_adj_sum)

            #Deleting from the alterations dicts
            del alterations_af_dict[alt_aa]
            del alterations_af_adj_dict[alt_aa]
            break
        else:
            res_dict["af"] += aa_sum
            res_dict["af_adj"] += aa_adj_sum

        #Fix the AF format
        res_dict["af"] = format_af(res_dict["af"])
        res_dict["af_adj"] = format_af(res_dict["af_adj"])

    #Calculating the overall Jensen-Shannon Divergence
    #if (len(alterations_af_dict.keys()) == 0):
        #res_dict["JSD"] = 0
        #res_dict["JSD_adj"] = 0
    #else:
        #res_dict["JSD"] = JSD(alterations_af_dict, res_dict["aa_ref"], res_dict["af"], hmm_state, JSD_background.MAJOR_ALLELE)
        #res_dict["JSD_adj"] = JSD(alterations_af_adj_dict, res_dict["aa_ref"], res_dict["af_adj"], hmm_state, JSD_background.MAJOR_ALLELE)

    res_dict["alterations_af_adj_dict"] = alterations_af_adj_dict

    return (res_dict, frameshift_cnt, errors_cnt, filter_cnt)

In [6]:
chrom_path = curr_dir[0]+"/../1.parse_ExAC/parsed/"
chrom_filename = "parsed_chrom"
states_dict = defaultdict(list)
print "Starting...."

#For error logging
functionNameAsString = sys._getframe().f_code.co_name

#A list of all the ens genes
domain_ens_genes_all = []

#A list to count frameshifts per gene
domain_ens_genes_frameshifts = []

#A list to count validation errors per gene
domain_ens_genes_errors = []

#A list to count ExAC filtered-out per gene
domain_ens_genes_filter = []

for chrom in chromosome_names:
    
    #Filtering the domain data relevant to this chromosome
    domain_chrom_data = sorted_domain_data[sorted_domain_data["chrom_num"] == chrom]
    
    #Loading the ExAC parsed data of this chromosome
    fields = ['chrom', 'pos', 'ref', 'alt', "filter", 'AC', 'AC_Adj', 'AF', 'AN', 'AN_Adj', 'gene', 'feature', 
              'feature_type', 'conseq', 'prot_pos', 'amino_acids', 'codons', 'strand', 'ENSP', 'exon', 
              'intron', 'domains']
    chrom_csv = pd.read_csv(chrom_path+chrom_filename+chrom+".csv", sep='\t', index_col=0, usecols=fields)
    chrom_csv = chrom_csv.sort_values(by=["pos"])
    chrom_csv = chrom_csv.reset_index(drop=True)
    chrom_csv.fillna('', inplace=True)
    chrom_csv["comments"] = ""
    
    #Getting a list of all the relevant ensembl gene ids for this chromosome
    domain_ens_genes = (domain_chrom_data["gene"]).unique()
    domain_ens_genes_all.extend(domain_ens_genes)
    
    #For each ensembl gene in the domain data - finding all the ExAC alterations
    for ens_gene in domain_ens_genes:
        
        #Filtering the domain data for this gene according to the canonical protein id
        canonic_prot = canonic_protein[ens_gene]
        canonic_prot_t = canonic_prot[:canonic_prot.find(".")] #Trimming the ".#" at the end
        domain_gene_table = domain_chrom_data[domain_chrom_data["prot"] == canonic_prot]
        #Making sure that if two HMM-matches overlaps, the higher bit score will come first in the table
        domain_gene_table = domain_gene_table.sort_values(by="BitScore", ascending=False)
        domain_gene_name = domain_gene_table["hugoSymbol"].unique()[0]
        if (len(domain_gene_table["hugoSymbol"].unique()) > 1):
            print functionNameAsString+" Error: "+ens_gene+": more than one Hugo symbol" #sanity check
        
        #Creating a table of the exons for this gene, according to the canonical protein
        chrom_raw_data = domain_gene_table["chromosome"].unique()[0] #there should be only one element here
        if (len(domain_gene_table["chromosome"].unique()) > 1):
            print functionNameAsString+" Error: "+ens_gene+": more than one chromosome raw data" #sanity check
        targetid = domain_gene_table["#TargetID"].unique()[0]
        exon_table = create_exon_pos_table(chrom_raw_data, targetid)
        
        #Filtering the chromosome data to the gene exons region
        exons_start_pos = min(exon_table["start_pos"][0],exon_table["start_pos"][len(exon_table)-1]) #in case of complelemt, the minimal position could be at the last row
        exons_end_pos = max(exon_table["end_pos"][0],exon_table["end_pos"][len(exon_table)-1]) #in case of complelemt, the maximal position could be at the first row
        chrom_gene_table = chrom_csv[chrom_csv["pos"] >= int(exons_start_pos)][chrom_csv["pos"] <= int(exons_end_pos)][chrom_csv["ENSP"] == canonic_prot_t]
        chrom_gene_table = chrom_gene_table.reset_index(drop=True)
        
        #Handling indels
        indels_table = table_editing(chrom_gene_table)
        
        #A counter for frameshifts inside the domain
        protein_frameshifts_cnt = 0
        #A counter for validation errors inside the domain
        protein_errors_cnt = 0
        #A counter for ExAC filter-out inside the domain
        protein_filter_cnt = 0
        
        #Iterating over the amino-acids of the protein
        prot_len = int(domain_gene_table["length"].unique()[0])
        for protein_pos in range(1,prot_len+1):
    
            #Trying to match HMM-state, and retreive the aa from HMMER results
            (hmm_state, aa) = protein_pos_to_hmm_state_and_aa(protein_pos, domain_gene_table) #TODO: what happens when two matches overlap? maybe sort to the best bit score?
                
            #If there's a match to HMM-state: find the corresponding codon bps chromosome positions
            if (hmm_state > 0):
                chrom_pos_list =find_chrom_bps(protein_pos, exon_table, chrom_raw_data)
                
                #Analysis of the amino-acid MAF and realted data, returned in a dictionary
                (info_dict, frameshift_cnt, errors_cnt, filter_cnt) = calc_exac_maf_data(chrom_pos_list, chrom_gene_table, indels_table, protein_pos, aa, chrom_raw_data, chrom, hmm_state)
                info_dict["ens_gene"] = ens_gene
                
                #Adding the dictionary to the HMM-state list
                states_dict[hmm_state].append(info_dict)
                
                #Adding the frameshifts to the global counter
                protein_frameshifts_cnt += frameshift_cnt
                
                #Adding the errors to the global counter
                protein_errors_cnt += errors_cnt
                
                #Adding the filtered to the global counter
                protein_filter_cnt += filter_cnt
        
        
        domain_ens_genes_frameshifts.append(protein_frameshifts_cnt)
        domain_ens_genes_errors.append(protein_errors_cnt)
        domain_ens_genes_filter.append(protein_filter_cnt)
        print "Finished protein "+ens_gene
                                
    print "Finished chromosome "+chrom

!mkdir -p domains_states_dicts/pfam-v30/$domain_name
with open(curr_dir[0]+"/domains_states_dicts/pfam-v30/"+domain_name+"/"+domain_name+"_hmm_states_dict_"+datetime.date.today().strftime("%m.%d")+".pik", 'wb') as handle:
    pickle.dump(states_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

Starting....
Finished protein ENSG00000116329.6
Finished protein ENSG00000121764.7
Finished protein ENSG00000121933.13
Finished protein ENSG00000133019.7
Finished protein ENSG00000163485.11
Finished protein ENSG00000170989.8
Finished protein ENSG00000177151.3
Finished protein ENSG00000179546.3
Finished protein ENSG00000188822.6
Finished protein ENSG00000196240.2
Finished protein ENSG00000198049.5
Finished chromosome 1
Finished protein ENSG00000121966.6
Finished protein ENSG00000135914.5
Finished protein ENSG00000171596.6
Finished protein ENSG00000222040.3
Finished chromosome 2
Finished protein ENSG00000121807.5
Finished protein ENSG00000121853.3
Finished protein ENSG00000151577.8
Finished protein ENSG00000160791.12
Finished protein ENSG00000163823.3
Finished protein ENSG00000163914.4
Finished protein ENSG00000179097.4
Finished protein ENSG00000179934.5
Finished protein ENSG00000180914.6
Finished protein ENSG00000183625.10
Finished protein ENSG00000183813.6
Finished protein ENSG00000196



In [30]:
sum(domain_ens_genes_errors)

0

In [30]:
sum(domain_ens_genes_frameshifts)

1360

In [31]:
sum(domain_ens_genes_filter)

5056

In [28]:
sum(domain_ens_genes_filter)

5690

In [35]:
max(chrom_gene_table["AN"])

68528

### Debugging code

In [20]:
#Debugging setup: loading the data of one chromosome, with the same steps as the main loop
chrom_path = curr_dir[0]+"/../1.parse_ExAC/parsed/"
chrom_filename = "parsed_chrom"
states_dict = defaultdict(list)
print("Starting....")

#The chromosome to debug
chrom = "6"

#For error logging
functionNameAsString = sys._getframe().f_code.co_name

#Per-gene accumulators: all the ens genes, and the frameshifts / validation errors / ExAC filtered-out counts
domain_ens_genes_all = []
domain_ens_genes_frameshifts = []
domain_ens_genes_errors = []
domain_ens_genes_filter = []

#Keeping only the domain instances of this chromosome
domain_chrom_data = sorted_domain_data[sorted_domain_data["chrom_num"] == chrom]

#Reading the ExAC parsed data of this chromosome: sorted by position, renumbered from 0, missing values as empty text
fields = ['chrom', 'pos', 'ref', 'alt', "filter", 'AC', 'AC_Adj', 'AF', 'AN', 'AN_Adj', 'gene', 'feature',
          'feature_type', 'conseq', 'prot_pos', 'amino_acids', 'codons', 'strand', 'ENSP', 'exon',
          'intron', 'domains']
chrom_csv = (
    pd.read_csv(chrom_path+chrom_filename+chrom+".csv", sep='\t', index_col=0, usecols=fields)
    .sort_values(by=["pos"])
    .reset_index(drop=True)
    .fillna('')
)
chrom_csv["comments"] = ""

#The ensembl gene ids that have this domain on this chromosome
domain_ens_genes = domain_chrom_data["gene"].unique()

Starting....


In [8]:
domain_ens_genes

array(['ENSG00000112038.13', 'ENSG00000112218.7', 'ENSG00000118432.11',
       'ENSG00000135312.4', 'ENSG00000135577.4', 'ENSG00000137252.5',
       'ENSG00000146378.5', 'ENSG00000146383.7', 'ENSG00000146385.1',
       'ENSG00000146399.1', 'ENSG00000152034.6', 'ENSG00000168830.6',
       'ENSG00000204703.3', 'ENSG00000237110.1'], dtype=object)

In [21]:
#Debugging: running the per-gene analysis of the main loop for a single gene
ens_gene = "ENSG00000237110.1"

#Filtering the domain data for this gene according to the canonical protein id
canonic_prot = canonic_protein[ens_gene]
#Trimming the ".#" at the end (split keeps the whole id when there is no "." in it)
canonic_prot_t = canonic_prot.split(".")[0]
domain_gene_table = domain_chrom_data[domain_chrom_data["prot"] == canonic_prot]
#Making sure that if two HMM-matches overlaps, the higher bit score will come first in the table
domain_gene_table = domain_gene_table.sort_values(by="BitScore", ascending=False)
domain_gene_name = domain_gene_table["hugoSymbol"].unique()[0]
if (len(domain_gene_table["hugoSymbol"].unique()) > 1):
    print(functionNameAsString+" Error: "+ens_gene+": more than one Hugo symbol") #sanity check

#Creating a table of the exons for this gene, according to the canonical protein
chrom_raw_data = domain_gene_table["chromosome"].unique()[0] #there should be only one element here
if (len(domain_gene_table["chromosome"].unique()) > 1):
    print(functionNameAsString+" Error: "+ens_gene+": more than one chromosome raw data") #sanity check
targetid = domain_gene_table["#TargetID"].unique()[0]
exon_table = create_exon_pos_table(chrom_raw_data, targetid)

#Filtering the chromosome data to the gene exons region
last_exon_idx = len(exon_table)-1
exons_start_pos = min(exon_table["start_pos"][0],exon_table["start_pos"][last_exon_idx]) #in case of complement, the minimal position could be at the last row
exons_end_pos = max(exon_table["end_pos"][0],exon_table["end_pos"][last_exon_idx]) #in case of complement, the maximal position could be at the first row
#One combined mask, instead of chained df[..][..][..] that applies masks to already-filtered frames
in_gene_exons = (
    (chrom_csv["pos"] >= int(exons_start_pos))
    & (chrom_csv["pos"] <= int(exons_end_pos))
    & (chrom_csv["ENSP"] == canonic_prot_t)
)
chrom_gene_table = chrom_csv[in_gene_exons].reset_index(drop=True)

#Handling indels
indels_table = table_editing(chrom_gene_table)

#A counter for frameshifts inside the domain
protein_frameshifts_cnt = 0
#A counter for validation errors inside the domain
protein_errors_cnt = 0
#A counter for ExAC filter-out inside the domain
protein_filter_cnt = 0

#Iterating over the amino-acids of the protein
prot_len = int(domain_gene_table["length"].unique()[0])
for protein_pos in range(1,prot_len+1):

    #Trying to match HMM-state, and retrieve the aa from HMMER results
    (hmm_state, aa) = protein_pos_to_hmm_state_and_aa(protein_pos, domain_gene_table)

    #Only positions that match an HMM-state are analyzed
    if (hmm_state <= 0):
        continue

    #Finding the corresponding codon bps chromosome positions
    chrom_pos_list = find_chrom_bps(protein_pos, exon_table, chrom_raw_data)

    #Analysis of the amino-acid MAF and related data, returned in a dictionary
    (info_dict, frameshift_cnt, errors_cnt, filter_cnt) = calc_exac_maf_data(chrom_pos_list, chrom_gene_table, indels_table, protein_pos, aa, chrom_raw_data, chrom, hmm_state)
    info_dict["ens_gene"] = ens_gene

    #Adding the dictionary to the HMM-state list
    states_dict[hmm_state].append(info_dict)

    #Adding the position's counts to the counters of the protein
    protein_frameshifts_cnt += frameshift_cnt
    protein_errors_cnt += errors_cnt
    protein_filter_cnt += filter_cnt

domain_ens_genes_frameshifts.append(protein_frameshifts_cnt)
domain_ens_genes_errors.append(protein_errors_cnt)
domain_ens_genes_filter.append(protein_filter_cnt)
print("Finished protein "+ens_gene)

chrom_pos_list = (132859609, 132859610, 132859611) protein_pos = 61 hmm_state = 19
calc_exac_maf_data Error: hg19 codon sequence retrieved TAA=* doesn't match HMMER amino-acid X
exac_validation_checks 132859609 Error: ExAC amino acid identity * doesn't match HMMER amino-acid X
Finished protein ENSG00000237110.1


In [22]:
states_dict[19]

[{'JSD': 3.2768464049097433e-06,
  'JSD_adj': 3.266939962647083e-06,
  'aa_ref': 'K',
  'af': 0.245,
  'af_adj': 0.24470000000000003,
  'alterations_af_adj_dict': defaultdict(list, {'X': [0.24470000000000003]}),
  'an_adj': [105448],
  'bp_ref': 'Aaa',
  'chrom': '6',
  'chrom_pos': (132859609, 132859610, 132859611),
  'ens_gene': 'ENSG00000237110.1',
  'filter': ['PASS'],
  'prot_pos': 61}]

In [12]:
domain_gene_table["Target_Seq"].tolist()[0][18]

'X'

In [24]:
domain_gene_table["chromosome"]

190    GRCh37:6:132859429..132860470
Name: chromosome, dtype: object

In [10]:
domain_gene_table

Unnamed: 0,#TargetID,pfam_id,domain_name,E-value,BitScore,TargetStart,TargetEnd,HMM_Seq,Target_Seq,HMM_Pos,...,transcript,gene_biotype,transcript_biotype,hgncID,hugoSymbol,refseq,entrez,length,HMMStart,HMMEnd
190,ENSP00000424607.1,PF10320,7TM_GPCR_Srsx,1.3e-08,29.5,43,326,lviGliGNvllilltlkkkkLrskssiLicvlciadllclvgelvf...,AVLAAFGNLLVMIAILHFXQLHTPTNFLIASLACADFLVGVTVMPF...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",...,ENST00000434551.1,polymorphic_pseudogene,polymorphic_pseudogene,20977,TAAR9,NP_778227,134860,346,1,257


In [13]:
chrom_gene_table[chrom_gene_table["pos"] == 132859609]

Unnamed: 0,pos,ref,alt,filter,AC,AC_Adj,AF,AN,AN_Adj,gene,...,conseq,prot_pos,amino_acids,codons,strand,ENSP,exon,intron,domains,comments
32,132859609,T,A,PASS,79916,79646,0.755,105892,105448,ENSG00000237110,...,stop_lost,61,*/K,Taa/Aaa,1,ENSP00000424607,1/1,,,


In [14]:
states_dict[19]

[{'JSD': 0,
  'JSD_adj': 0,
  'aa_ref': 'X',
  'af': 0,
  'af_adj': 0,
  'alterations_af_adj_dict': defaultdict(list, {}),
  'an_adj': [105448],
  'bp_ref': 'TAA',
  'chrom': '6',
  'chrom_pos': (132859609, 132859610, 132859611),
  'ens_gene': 'ENSG00000237110.1',
  'filter': ['PASS'],
  'prot_pos': 61}]

In [None]:
#Showing the dictionaries of HMM-state 19 whose codon covers chromosomal position 132859609.
#NOTE(review): the original cell was left unfinished - presumably this is the intended lookup; confirm.
#The info dictionaries have no "pos" key; the positions are integers stored under "chrom_pos".
for d in states_dict[19]:
    if (132859609 in d["chrom_pos"]):
        print(d)

In [109]:
protein_pos_to_hmm_state_and_aa(204, domain_gene_table)

(122, 'K')

In [72]:
#Debugging: reproducing the first steps of protein_pos_to_hmm_state_and_aa for one row and one position
protein_pos = 276
#.loc selects the row by its index label. The deprecated .ix did the same for an integer index,
#and no longer exists in newer pandas versions.
row = domain_gene_table.loc[9]
target_start = row["TargetStart"]
target_end = row["TargetEnd"]
hmm_pos = (row["HMM_Pos"]).split(",")
target_seq = list(row["Target_Seq"])
#The offset of the position from the beginning of the match
index_inside_match = int(protein_pos - target_start)
orig_index = index_inside_match
aa = (target_seq[index_inside_match]).upper()

In [75]:
index_inside_match

240

In [76]:
#Advancing the index by the number of deletion chars ('-') found before the original index.
#Note: this cell isn't idempotent - every run shifts index_inside_match again.
index_inside_match += sum(1 for j in range(orig_index) if target_seq[j] == "-")

In [69]:
index_inside_match

141

In [58]:
HMM_Seq

'vllyGppGtGKTllakavakel......gvefleisgsellsk......................yvgesekkirelfkeakeka....kpsilfiDEidalaksrsgseseeeervvnqLlteldgvkkkeskvivigatnrpdkldpallrgRfdrkieiel'

In [94]:
#The indices of the deletion chars ('-') inside the aligned target sequence
indices = [col for col, residue in enumerate(target_seq) if residue == "-"]

#Dropping the deletion columns from the target sequence and from the HMM positions
target_seq_no_del = [residue for col, residue in enumerate(target_seq) if col not in indices]
hmm_pos_no_del = [state for col, state in enumerate(hmm_pos) if col not in indices]

In [105]:
hmm_pos_no_del[141]

'122'

In [86]:
target_seq_no_del = remove_values_from_list(target_seq, "-")

In [88]:
target_seq_no_del[141]

'K'

In [90]:
indices = [i for i, x in enumerate(target_seq) if x == "-"]

In [91]:
indices

[106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 145, 152]

In [21]:
def is_number(s):
    """
    Boolean function - determine if a given text can be converted to a number.

    Returns True when float(s) succeeds, and False otherwise - including for
    values that float() doesn't accept at all (e.g. None or a list), which
    used to escape as a TypeError.
    """
    try:
        float(s)
    except (TypeError, ValueError):
        #ValueError: text that isn't a number. TypeError: an object float() can't convert.
        return False
    return True


In [107]:
def protein_pos_to_hmm_state_and_aa(protein_pos, domain_gene_table):
    """
    A function that return the hmm state of that protein position, and the amino acid.

    The rows of domain_gene_table are scanned in order, and the first row whose
    [TargetStart, TargetEnd] range contains protein_pos is used. The deletion chars ('-')
    are removed from "Target_Seq" together with the matching entries of the comma-separated
    "HMM_Pos", and the offset of the position from TargetStart is looked up in the result.

    Returns (hmm_state, aa):
        hmm_state is the HMM state number, or -2 when the "HMM_Pos" entry isn't a number
        (the position match insertion); aa is the upper-case amino acid.
        (-1, '-') is returned for positions outside of all the domain regions.

    Note: this definition re-binds the name that the first cell imports from mapping_func.
    #TODO: do we need also transcript id? do we want to consider more than 1 transcript per gene?
    """
    for index, row in domain_gene_table.iterrows():
        target_start = row["TargetStart"]
        target_end = row["TargetEnd"]

        #Skipping the domain instances that don't contain the position
        if not (protein_pos >= target_start and protein_pos <= target_end):
            continue

        hmm_pos = (row["HMM_Pos"]).split(",")
        target_seq = list(row["Target_Seq"])
        index_inside_match = int(protein_pos - target_start)

        #Get deletions indices (a set, so every membership test below takes constant time)
        deletion_indices = set(i for i, x in enumerate(target_seq) if x == "-")

        #Remove deletions from both lists
        target_seq_no_del = [x for i, x in enumerate(target_seq) if i not in deletion_indices]
        hmm_pos_no_del = [x for i, x in enumerate(hmm_pos) if i not in deletion_indices]

        #Get the aa
        aa = (target_seq_no_del[index_inside_match]).upper()

        #Find the HMM match state
        hmm_state_text = hmm_pos_no_del[index_inside_match]
        if (is_number(hmm_state_text)):
            hmm_state = int(hmm_state_text)
        else:
            #the position match insertion
            hmm_state = -2

        #Returning hmm_state and aa for match inside a domain's regions
        return (hmm_state, aa)

    #The protein position isn't in any domain region
    return (-1,'-')

In [None]:
def protein_pos_to_hmm_state_and_aa(protein_pos, domain_gene_table):
    """
    A function that return the hmm state of that protein position, and the amino acid.
    return -1 for positions outside of domains regions, -2 for matching insertion

    The first row of domain_gene_table whose [TargetStart, TargetEnd] range contains
    protein_pos is used. The deletion chars ('-') of "Target_Seq" are dropped together
    with the matching "HMM_Pos" entries, and both the amino acid and the state are read
    at the offset of the position from TargetStart.

    This draft used to return aa = '-' for every position (the amino acid was never read),
    looked the state up in the list that still contained the deletions, and carried an
    unreachable index-correction section after its first return statement.
    #TODO: do we need also transcript id? do we want to consider more than 1 transcript per gene?
    """
    for index, row in domain_gene_table.iterrows():
        target_start = row["TargetStart"]
        target_end = row["TargetEnd"]

        #Check if the position is inside this domain instance of the gene
        if (protein_pos >= target_start and protein_pos <= target_end):

            hmm_pos = (row["HMM_Pos"]).split(",")
            target_seq = list(row["Target_Seq"])
            index_inside_match = int(protein_pos - target_start)

            #Get deletions indices (a set, for constant-time membership tests)
            indices = set(i for i, x in enumerate(target_seq) if x == "-")

            #Remove deletions from both lists
            target_seq_no_del = [i for j, i in enumerate(target_seq) if j not in indices]
            hmm_pos_no_del = [i for j, i in enumerate(hmm_pos) if j not in indices]

            #Get the aa - the offset counts protein residues, so it indexes the lists without the deletions
            aa = (target_seq_no_del[index_inside_match]).upper()

            #Find the HMM match state
            hmm_state_text = hmm_pos_no_del[index_inside_match]
            if (is_number(hmm_state_text) == True):
                hmm_state = int(hmm_state_text)
            else:
                #the position match insertion
                hmm_state = -2

            #Returning hmm_state and aa for match inside a domain's regions
            return(hmm_state, aa)

    #The protein position isn't in any domain region
    return (-1,'-')

In [None]:
###from the .py file
def protein_pos_to_hmm_state_and_aa(protein_pos, domain_gene_table):
    """
    A function that return the hmm state of that protein position, and the amino acid.
    return -1 for positions outside of domains regions, -2 for matching insertion

    The rows of domain_gene_table are scanned in order, and the first row whose
    [TargetStart, TargetEnd] range contains protein_pos is used. The offset of the
    position from TargetStart is corrected for the deletion chars ('-') of "Target_Seq"
    in several passes, and the corrected index is used for reading both the amino acid
    and the comma-separated "HMM_Pos" entry.

    Returns (hmm_state, aa); (-1, '-') when no row contains the position.

    NOTE(review): this definition re-binds the name that the first cell imports from
    mapping_func, and the same name is defined by other cells of this notebook - the
    last executed definition is the one in effect.
    #TODO: do we need also transcript id? do we want to consider more than 1 transcript per gene?
    """
    for index, row in domain_gene_table.iterrows():
        target_start = row["TargetStart"]
        target_end = row["TargetEnd"]
        #Starting from "-" makes the first while loop below run at least once
        aa = "-"
        
        #Check if the position is inside this domain instance of the gene
        if (protein_pos >= target_start and protein_pos <= target_end):
            
            hmm_pos = (row["HMM_Pos"]).split(",")
            target_seq = list(row["Target_Seq"])
            #The offset of the position from the beginning of the match (before any correction)
            index_inside_match = int(protein_pos - target_start)
            orig_index = index_inside_match
            
            #If original index is inside a deletion, advance index to the end of the deletion.
            #NOTE(review): no bounds check - target_seq[...] raises IndexError if the index passes the end of the sequence
            while (aa == "-"):
                aa = (target_seq[index_inside_match]).upper()
                if (aa == "-"):
                    index_inside_match += 1

            #Correct index_inside_match for previous deletions '-' inside the HMM alignment
            #(one step for every '-' found before the original index; prev_del_move counts the steps)
            prev_del_move = 0
            for j in range(orig_index):
                if (target_seq[j] == "-"):
                    index_inside_match += 1
                    prev_del_move += 1
                    
            #Reading aa again in case index has changed
            aa = (target_seq[index_inside_match]).upper()
            
            #Check previous deletions added while correcting counter
            #(counting the '-' chars inside the last prev_del_move places before the corrected index)
            corr_del = 0
            for j in range(index_inside_match-prev_del_move, index_inside_match):
                if (target_seq[j] == "-"):
                    corr_del += 1
            
            #If corrected counter led to a new deletion, advance counter to the end of the new deletion
            while (aa == "-"):
                aa = (target_seq[index_inside_match]).upper()
                if (aa == "-"):
                    index_inside_match += 1
            
            #Update in case previous deletions were added for corrected index
            #NOTE(review): aa is read here once, without checking for '-' again, so '-' may be returned as the aa
            if (corr_del > 0):
                index_inside_match += corr_del
                aa = (target_seq[index_inside_match]).upper()
            
            #Find the HMM match state
            hmm_state_text = hmm_pos[index_inside_match]
            if (is_number(hmm_state_text) == True):
                hmm_state = int(hmm_state_text)
            else:
                #the position match insertion
                hmm_state = -2
            
            #Returning hmm_state and aa for match inside a domain's regions
            return(hmm_state, aa)
            
    #The protein position isn't in any domain region        
    return (-1,'-')