# Parse the HMM results, filter by domain, try to map Hugo Symbol to Emsembl ID

In [289]:
#Import packages
import pandas as pd
import numpy as np
import mygene
import unicodedata
from biomart import BiomartServer
from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

### Reading the file as is into a data-frame

In [309]:
curr_dir = !pwd
my_path = curr_dir[0]+"/from_shilpa/"
filename = "allhmmresbyprot.txt"
allhmm = pd.read_csv(my_path+filename, sep='\t')
allhmm.columns = ["Hugo_gene_id", "Pfam_id_name", "e-value", "bit_score", "protein_start", "protein_end",
                  "HMM_alignment", "sequence_aligment", "HMM_match_states", "refSeq_id_HMM_start_HMM_end"]

#### A little more processing to the data to look better in the data-frame

In [310]:
#Disable false positive Pandas warning for 'SettingWithCopyWarning'
pd.options.mode.chained_assignment = None

#Splitting the Pfam id and domain name column
allhmm['pfam_id'], allhmm['domain_name'] = zip(*allhmm['Pfam_id_name'].apply(lambda x: x.split('_', 1)))
del allhmm['Pfam_id_name']
#Get the columns to the original order
cols = allhmm.columns.tolist()
cols = cols[0:1] + cols[-2:] + cols[1:-2]
allhmm = allhmm[cols]

#Splitting the refSeq and HMM start end column
allhmm['refSeq_id'], allhmm['HMM_start'], allhmm['HMM_end'] = zip(*allhmm['refSeq_id_HMM_start_HMM_end'].apply(lambda x: x.split(' ', 2)))
del allhmm['refSeq_id_HMM_start_HMM_end']

#Extracting the numbers alone from HMM start and HMM end
allhmm['HMM_start'] = allhmm['HMM_start'].apply(lambda x: x[x.find('=')+1:x.find(';')])
allhmm['HMM_end'] = allhmm['HMM_end'].apply(lambda x: x[x.find('=')+1:x.find(';')])

#Extract only the hugo symbol (without the .number) and add to a new column
allhmm["Hugo_symbol"] = allhmm['Hugo_gene_id'].apply(lambda x: x.split('.')[0])
#Get the columns to the original order
cols = allhmm.columns.tolist()
cols = cols[0:1] + cols[-1:] + cols[1:-1]
allhmm = allhmm[cols]

#Saving the processed data-frame
allhmm.to_csv(my_path+"allhmm_parsed.csv", sep='\t')

In [308]:
allhmm

Unnamed: 0,Hugo_gene_id,Hugo_symbol,pfam_id,domain_name,e-value,bit_score,protein_start,protein_end,HMM_alignment,sequence_aligment,HMM_match_states,refSeq_id,HMM_start,HMM_end
0,A1BG.001,A1BG,PF13895,Ig_2,2.500000e-10,35.2,28,112,kpvleapp.tvltegsdvtLtCsadgnptpklqwykegsllt.......,QPSLWAESeSLLKPLANVTLTCQAHLE-TPDFQLFKNGVAQEpvhl...,"1,2,3,4,5,6,7,8,a8-0,9,10,11,12,13,14,15,16,17...",NP_570602.2,1,80
1,A1BG.001,A1BG,PF13895,Ig_2,2.000000e-14,48.8,209,301,kpvle...apptvltegsdvtLtCsadgnptpklqwykegsllt.....,PPVLMhhgESSQVLHPGNKVTLTCVAPLS-GVDFQLRRGEKELLvp...,"1,2,3,4,5,a5-0,a5-1,a5-2,6,7,8,9,10,11,12,13,1...",NP_570602.2,1,80
2,A1BG.001,A1BG,PF13895,Ig_2,5.700000e-12,31.9,29,111,pvleapp.tvltegsdvtLtCsadgnptpklqwykegsllt....q...,PSLWAESeSLLKPLANVTLTCQAHLE-TPDFQLFKNGVAQEpvhlD...,"2,3,4,5,6,7,8,a8-0,9,10,11,12,13,14,15,16,17,1...",NP_570602.2,2,79
3,A1CF.001,A1CF,PF00076,RRM_1,1.000000e-11,30.9,138,199,lfVgnLppdvteeeLkelFsk.fGpiesiklvrd..etgrskgfaf...,LFVGGIPKTKKREEILSEMKKvTEGVVDVIVYPSaaDKTKNRGFAF...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",NP_001185747.1,1,59
4,A1CF.001,A1CF,PF00076,RRM_1,2.100000e-09,32.2,138,206,lfVgnLppdvteeeLkelFsk.fGpiesiklvrd..etgrskgfaf...,LFVGGIPKTKKREEILSEMKKvTEGVVDVIVYPSaaDKTKNRGFAF...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",NP_001185747.1,1,70
5,A1CF.001,A1CF,PF00076,RRM_1,3.100000e-21,61.3,58,124,lfVgnLppdvteeeLkelFskfGpiesiklvrdetgrskgfafVeF...,IFIGKLPRDLFEDELIPLCEKIGKIYEMRMMMDFNGNNRGYAFVTF...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",NP_001185747.1,1,67
6,A1CF.001,A1CF,PF00076,RRM_1,3.000000e-15,42.2,233,296,lfVgnLppdvteeeLkelFskf..GpiesiklvrdetgrskgfafV...,LYVRNLMLSTSEEMIEKEFNNIkpGAVERVKKIRD-------YAFV...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",NP_001185747.1,1,69
7,A1CF.001,A1CF,PF00076,RRM_1,5.100000e-15,50.8,233,297,lfVgnLppdvteeeLkelFskf..GpiesiklvrdetgrskgfafV...,LYVRNLMLSTSEEMIEKEFNNIkpGAVERVKKIRD-------YAFV...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",NP_001185747.1,1,70
8,A1CF.001,A1CF,PF00076,RRM_1,9.100000e-21,69.9,58,127,lfVgnLppdvteeeLkelFskfGpiesiklvrdetgrskgfafVeF...,IFIGKLPRDLFEDELIPLCEKIGKIYEMRMMMDFNGNNRGYAFVTF...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",NP_001185747.1,1,70
9,A1CF.001,A1CF,PF13893,RRM_5,1.900000e-08,29.0,247,301,Lynlfskf..GnverikflekkkgfAlvefsdeaeAavaikklngv...,IEKEFNNIkpGAVERVK---KIRDYAFVHFSNREDAVEAMKALNGK...,"1,2,3,4,5,6,7,8,a8-0,a8-1,9,10,11,12,13,14,15,...",NP_001185747.1,1,56


### Filtering all Zinc finger domains

In [312]:
#Get the Zinc finger domain
zinc_domain = allhmm[allhmm["domain_name"] == "zf-C2H2"]

#Saving the Zinc finger domain to file
zinc_domain.to_csv(curr_dir[0]+"/hmm_domains/zf-C2H2.csv", sep='\t')

### Trying the map the Hugo symbols Ensembl Gene ID in two different ways:
#### 1) Using mygene Python package: https://pypi.python.org/pypi/mygene; http://nbviewer.jupyter.org/gist/newgene/6771106
This uses the latest builds according to their website: http://mygene.info/index.html
Status: NCBI snapshot: 20160227, Ensembl release: 83, UniProt: 20160218, UCSC: 20160228, NetAffy: na35.

##### Possible issues:
1. Some symbols have more than one match. In this case we take the first because the results are returned sorted. It says on the mg.query documentation: "Default: sort by matching scores in decending order."
2. Some symbols don't have any match: only very few

In [334]:
#Importing ensmbl database
mg = mygene.MyGeneInfo()

#Getting a list of unique hugo symbols
hugo_sym = (zinc_domain['Hugo_symbol']).unique()

#Creating a dictionary of Hugo symbol mapped to ensmbl gene id
hugo_ens_dict = {}
for sym in hugo_sym:
    query_res = (mg.query(sym, scopes='symbol', fields='ensembl.gene', species='human', as_dataframe = True))
    query_cols = query_res.columns.tolist()
    if ("ensembl" in query_cols) == False:
        continue
    ensmbl_res = query_res["ensembl"][0]
    #A case where the first element in the result is nan - taking the second element
    if type(ensmbl_res) != dict and type(ensmbl_res) != list:
        ensmbl_res = query_res["ensembl"][1]
        ensmbl_id = (ensmbl_res[u'gene']).encode('ascii','ignore')
    
    #A case where we have several ensmbl ids in the first element - taking the first from inside the first element
    elif len(ensmbl_res) > 1:
        ensmbl_id = (ensmbl_res[0][u'gene']).encode('ascii','ignore')
    
    #The general case: one ensmbl id in the first element
    else:
        ensmbl_id = (ensmbl_res[u'gene']).encode('ascii','ignore')
    hugo_ens_dict[sym] = ensmbl_id

#Create a column of the ensmbl and add to the zinc finger data-frame
ensmbl_id_col = []
for sym in zinc_domain["Hugo_symbol"]:
    if (sym in hugo_ens_dict.keys()):
        ensmbl_id_col.append(hugo_ens_dict[sym])
    else:
        ensmbl_id_col.append("")
zinc_domain["mygene_Ensembl_id"] = ensmbl_id_col

#### 2) Using Biomart Python package: https://pypi.python.org/pypi/biomart using the grch37 Ensembl version
##### Possible issues:
1. Some symbols have more than one match. There's no indication wheather taking the first one is the better match (most are with 2, but a few with 5,7 or 8 matches).
2. Some symbols don't have any match, sometimes this is because there are several Hugo symbols for the gene, so the other ones are shown as "synonyms" to the gene, but the biomart doesn't let you query the synonyms field as well. Possible solution: for the unmatched, find other gene aliases from other databases and then query them in biomart (not implemented yet).

In [283]:
#Importing Biomart database
server = BiomartServer("http://grch37.ensembl.org/biomart")
ens_genes = server.datasets[u'hsapiens_gene_ensembl']

#Getting a list of unique hugo symbols
hugo_sym = ((zinc_domain['Hugo_symbol']).unique()).tolist()

#Creating a dictionary of Hugo symbol mapped to ensmbl gene id
hugo_ens_dict = {}
not_found = 0
for sym in hugo_sym:
    #Querying the Biomart database to get number of matches
    num = ens_genes.count({
        'filters': {
                'hgnc_symbol': sym
        },
        'attributes': [
                'ensembl_gene_id'
        ]
    })
    
    #In case no match is found
    if (num == 0):
        not_found += 1
        continue
    
    #In case of at least one match: querying the Biomart database
    response = ens_genes.search({
        'filters': {
                'hgnc_symbol': sym
        },
        'attributes': [
                'ensembl_gene_id'
        ]
    })
    
    #Reading the query result
    for line in response.iter_lines():
        line = line.decode('utf-8')
        ensmbl_id = (line.split("\t")[0]).encode('ascii','ignore')
        break
    
    #Updating the dictionary
    hugo_ens_dict[sym] = ensmbl_id
    
#Create a column of the ensmbl and add to the zinc finger data-frame
ensmbl_id_col = []
for sym in zinc_domain["Hugo_symbol"]:
    if (sym in hugo_ens_dict.keys()):
        ensmbl_id_col.append(hugo_ens_dict[sym])
    else:
        ensmbl_id_col.append("")
zinc_domain["biomart_Ensembl_id"] = ensmbl_id_col

more than 1 match for EGR2
2
more than 1 match for GFI1
2
more than 1 match for KLF14
2
No match for LOC100287477
No match for LOC101060138
No match for LOC101060181
No match for LOC101060200
No match for LOC400682
more than 1 match for SALL3
2
more than 1 match for SCRT1
2
more than 1 match for ZBTB12
5
more than 1 match for ZBTB22
5
more than 1 match for ZBTB24
2
No match for ZFP112
No match for ZFP161
more than 1 match for ZFP57
8
more than 1 match for ZFP92
2
more than 1 match for ZNF100
2
more than 1 match for ZNF141
2
No match for ZNF167
more than 1 match for ZNF182
2
No match for ZNF187
No match for ZNF192
No match for ZNF193
No match for ZNF238
more than 1 match for ZNF275
2
No match for ZNF295
more than 1 match for ZNF311
7
more than 1 match for ZNF322
2
No match for ZNF323
more than 1 match for ZNF43
2
more than 1 match for ZNF430
2
No match for ZNF434
more than 1 match for ZNF449
2
more than 1 match for ZNF479
2
more than 1 match for ZNF486
2
No match for ZNF498
more than 1 

In [288]:
zinc_domain[zinc_domain["biomart"] != zinc_domain["ensmbl_gene_id"]]

Unnamed: 0,Hugo_gene_id,pfam_id,domain_name,e-value,bit_score,protein_start,protein_end,HMM_alignment,sequence_aligment,HMM_match_states,refSeq_id,HMM_start,HMM_end,Hugo_symbol,ensmbl_gene_id,biomart
129263,ZBTB12.001,PF00096,zf-C2H2,6.000000e-10,34.0,359,381,ykCpdCgksFkrksnLkrHirtH,FMCPRCGKQFNHSSNLNRHMNVH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",NP_862825.1,1,23,ZBTB12,ENSG00000204366,ENSG00000206366
129956,ZFP112.001,PF00096,zf-C2H2,1.100000e-08,29.7,777,799,ykCpdCgksFkrksnLkrHirtH,YKCEVCTKGFSESSRLQAHQRVH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",NP_001076804.1,1,23,ZFP112,ENSG00000062370,
129957,ZFP112.001,PF00096,zf-C2H2,1.200000e-08,29.7,805,827,ykCpdCgksFkrksnLkrHirtH,YKCEQCGKGFSGYSSLQAHHRVH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",NP_001076804.1,1,23,ZFP112,ENSG00000062370,
129958,ZFP112.001,PF00096,zf-C2H2,1.200000e-09,33.0,665,687,ykCpdCgksFkrksnLkrHirtH,YKCEECGKGFSKASTLLAHQRVH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",NP_001076804.1,1,23,ZFP112,ENSG00000062370,
129959,ZFP112.001,PF00096,zf-C2H2,1.200000e-10,36.2,553,575,ykCpdCgksFkrksnLkrHirtH,YKCEECDKGFSRSSYLQAHQRVH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",NP_001076804.1,1,23,ZFP112,ENSG00000062370,
129960,ZFP112.001,PF00096,zf-C2H2,1.200000e-10,36.3,693,715,ykCpdCgksFkrksnLkrHirtH,YQCDECGKSFSQRSYLQSHQSVH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",NP_001076804.1,1,23,ZFP112,ENSG00000062370,
129961,ZFP112.001,PF00096,zf-C2H2,1.500000e-09,32.7,749,771,ykCpdCgksFkrksnLkrHirtH,YKCEMCGKGFSQSSRLEAHRRVH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",NP_001076804.1,1,23,ZFP112,ENSG00000062370,
129962,ZFP112.001,PF00096,zf-C2H2,1.800000e-09,32.4,525,547,ykCpdCgksFkrksnLkrHirtH,YKCNICGKGFNHRSVLNVHQRVH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",NP_001076804.1,1,23,ZFP112,ENSG00000062370,
129963,ZFP112.001,PF00096,zf-C2H2,1.800000e-10,27.3,693,715,ykCpdCgksFkrksnLkrHirtH,YQCDECGKSFSQRSYLQSHQSVH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",NP_001076804.1,1,23,ZFP112,ENSG00000062370,
129964,ZFP112.001,PF00096,zf-C2H2,1.800000e-10,35.7,609,631,ykCpdCgksFkrksnLkrHirtH,YKCEECGKGFSRSSHLQGHQRVH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",NP_001076804.1,1,23,ZFP112,ENSG00000062370,


In [335]:
zinc_domain

Unnamed: 0,Hugo_gene_id,Hugo_symbol,pfam_id,domain_name,e-value,bit_score,protein_start,protein_end,HMM_alignment,sequence_aligment,HMM_match_states,refSeq_id,HMM_start,HMM_end,mygene_Ensembl_id
14471,BCL11A.001,BCL11A,PF00096,zf-C2H2,2.400000e-08,28.6,377,399,ykCpdCgksFkrksnLkrHirtH,KSCEFCGKTFKFQSNLVVHRRSH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",NP_060484.2,1,23,ENSG00000119866
14472,BCL11A.001,BCL11A,PF00096,zf-C2H2,6.000000e-08,27.3,405,427,ykCpdCgksFkrksnLkrHirtH,YKCNLCDHACTQASKLKRHMKTH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",NP_060484.2,1,23,ENSG00000119866
14473,BCL11A.003,BCL11A,PF00096,zf-C2H2,2.400000e-08,28.6,377,399,ykCpdCgksFkrksnLkrHirtH,KSCEFCGKTFKFQSNLVVHRRSH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",NP_075044.2,1,23,ENSG00000119866
14474,BCL11A.003,BCL11A,PF00096,zf-C2H2,2.500000e-08,28.6,770,792,ykCpdCgksFkrksnLkrHirtH,YKCELCNYACAQSSKLTRHMKTH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",NP_075044.2,1,23,ENSG00000119866
14475,BCL11A.003,BCL11A,PF00096,zf-C2H2,4.100000e-09,31.2,800,823,ykCpdCgksFkrksnLkrHirt.H,YKCEICKMPFSVYSTLEKHMKKwH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",NP_075044.2,1,23,ENSG00000119866
14476,BCL11A.003,BCL11A,PF00096,zf-C2H2,6.000000e-08,27.3,405,427,ykCpdCgksFkrksnLkrHirtH,YKCNLCDHACTQASKLKRHMKTH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",NP_075044.2,1,23,ENSG00000119866
14477,BCL11A.003,BCL11A,PF00096,zf-C2H2,8.500000e-10,33.4,742,764,ykCpdCgksFkrksnLkrHirtH,DTCEYCGKVFKNCSNLTVHRRSH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",NP_075044.2,1,23,ENSG00000119866
14478,BCL11B.001,BCL11B,PF00096,zf-C2H2,2.500000e-08,28.6,753,775,ykCpdCgksFkrksnLkrHirtH,YKCELCNYACAQSSKLTRHMKTH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",NP_075049.1,1,23,ENSG00000127152
14479,BCL11B.001,BCL11B,PF00096,zf-C2H2,2.900000e-08,28.4,356,378,ykCpdCgksFkrksnLkrHirtH,KSCEFCGKTFKFQSNLIVHRRSH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",NP_075049.1,1,23,ENSG00000127152
14480,BCL11B.001,BCL11B,PF00096,zf-C2H2,5.700000e-09,30.7,783,806,ykCpdCgksFkrksnLkrHirt.H,YRCDICQMPFSVYSTLEKHMKKwH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",NP_075049.1,1,23,ENSG00000127152


In [284]:
not_found

19

In [285]:
#Create a column of the ensmbl and add to the data-frame
ensmbl_id_col = []
for sym in zinc_domain["Hugo_symbol"]:
    if (sym in hugo_ens_dict.keys()):
        ensmbl_id_col.append(hugo_ens_dict[sym])
    else:
        ensmbl_id_col.append("")
zinc_domain["biomart"] = ensmbl_id_col

In [280]:
response = ens_genes.count({
  'filters': {
      'hgnc_symbol': 'BCL11A'
  },
  'attributes': [
      'ensembl_gene_id' 
  ]
})

In [267]:
for line in response.iter_lines():
    line = line.decode('utf-8')
    #print len(line)
    y = line.split("\t")
    #print(line.split("\t"))
    

In [281]:
response

1