# Parse the HMM results, filter by domain

In [1]:
#Import packages
import os
import unicodedata

import mygene
import numpy as np
import pandas as pd
from biomart import BiomartServer
from IPython.core.display import HTML
#Stretch the notebook container to the full browser width; being the last
#expression of the cell, the HTML object is what the cell renders
full_width_css = "<style>.container { width:100% !important; }</style>"
HTML(full_width_css)

### Reading the file as-is into a data-frame

In [64]:
#Resolve the working directory in pure Python instead of the IPython shell escape `!pwd`,
#so the cell does not depend on a POSIX shell being available.
#Kept as a one-element list because other cells index it as curr_dir[0].
curr_dir = [os.getcwd()]
my_path = curr_dir[0]+"/from_shilpa/"
filename = "allhmmresbyprot-new.tsv"
#Tab-separated HMM results; the column names are taken from the header row of the file
allhmm = pd.read_csv(my_path+filename, sep='\t')
#allhmm.columns = ["Hugo_gene_id", "Pfam_id_name", "e-value", "bit_score", "protein_start", "protein_end",
                  #"HMM_alignment", "sequence_aligment", "HMM_match_states", "refSeq_id_HMM_start_HMM_end"]

#### A little more processing to make the data look better in the data-frame

In [65]:
#Disable false positive Pandas warning for 'SettingWithCopyWarning'
pd.options.mode.chained_assignment = None

#NOTE: this cell consumes the raw 'HMM_Name' and 'Description' columns of allhmm,
#so the loading cell has to be re-run before this cell can be run a second time.


def text_between(text, start_tag, end_tag, skip=0):
    """Return the text found between two markers of a string.

    The result starts `skip` characters after the first occurrence of
    `start_tag` and stops right before the next occurrence of `end_tag`
    (or at the end of the string when `end_tag` does not follow).
    Surrounding whitespace is stripped. An empty string is returned
    when `start_tag` does not occur in `text`.
    """
    _, found, rest = text.partition(start_tag)
    if not found:
        return ""
    return rest[skip:].split(end_tag, 1)[0].strip()


def move_last_columns(frame, count):
    """Return `frame` with its last `count` columns moved right after the first column."""
    names = frame.columns.tolist()
    kept, moved = names[:-count], names[-count:]
    return frame[kept[:1] + moved + kept[1:]]


#Splitting the Pfam id and domain name column (on the first '_' only, domain names may contain '_')
pfam_parts = allhmm['HMM_Name'].str.split('_', n=1, expand=True)
#Fail loudly instead of silently storing missing values when a name has no '_' separator
assert pfam_parts.shape[1] == 2 and pfam_parts.notnull().all().all(), \
    "every HMM_Name is expected to look like '<pfam id>_<domain name>'"
allhmm['pfam_id'] = pfam_parts[0]
allhmm['domain_name'] = pfam_parts[1]
del allhmm['HMM_Name']
#Get the columns to the original order
allhmm = move_last_columns(allhmm, 2)

#Splitting the Description (at most 5 splits: the 6th piece keeps the refseq/HMMStart/HMMEnd part whole)
description_fields = ['prot_id', 'ensembl_id', 'transcript_id', 'chromosome_id', 'length', 'refseq_hmm_start_end']
description_parts = allhmm['Description'].str.split(' ', n=5, expand=True)
assert description_parts.shape[1] == len(description_fields) and description_parts.notnull().all().all(), \
    "every Description is expected to hold 6 space-separated fields"
for field_position, field_name in enumerate(description_fields):
    allhmm[field_name] = description_parts[field_position]
del allhmm['Description']

#Splitting the last column to deal with missing refseq ids
#(a missing id yields an empty string; whitespace around the id is stripped)
refseq_and_hmm = allhmm['refseq_hmm_start_end']
allhmm['refseq'] = refseq_and_hmm.apply(lambda text: text_between(text, "refseq:", "HMMStart"))
#skip=1 drops the single separator character that follows the HMMStart / HMMEnd tag
allhmm["hmm_start"] = refseq_and_hmm.apply(lambda text: text_between(text, "HMMStart", ";", skip=1))
#The value ends at the next ';' or at the end of the string, so no character is cut blindly
allhmm["hmm_end"] = refseq_and_hmm.apply(lambda text: text_between(text, "HMMEnd", ";", skip=1))
del allhmm['refseq_hmm_start_end']

#Extracting the numbers alone from the description columns:
#keep what follows the first ':' (the value is left unchanged when there is no ':')
for labelled_column in ['prot_id', 'ensembl_id', 'transcript_id', 'chromosome_id', 'length']:
    allhmm[labelled_column] = allhmm[labelled_column].str.split(':', n=1).str[-1]

#Extract only the hugo symbol (without the .number) and add to a new column
allhmm["Hugo_symbol"] = allhmm['#TargetID'].str.split('.').str[0]
#Get the columns to the original order
allhmm = move_last_columns(allhmm, 1)

#Saving the processed data-frame
allhmm.to_csv(my_path+"allhmm_parsed.csv", sep='\t')

In [71]:
#Preview only the first rows instead of rendering the whole data-frame
allhmm.head(10)

Unnamed: 0,#TargetID,Hugo_symbol,pfam_id,domain_name,E-value,BitScore,TargetStart,TargetEnd,HMM_Seq,Target_Seq,HMM_Pos,prot_id,ensembl_id,transcript_id,chromosome_id,length,refseq,hmm_start,hmm_end
0,OR10C1.002,OR10C1,PF00001,7tm_1,2.800000e-26,88.2,42,291,gNllvllviltkkslrtstnyfilsLaisDlllgllvlpfaiiyel...,GNFLIVVLVSTDAALQSPMYFFLRTLSALEIGYTSVTVPLLLHHLL...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000449846,ENSG00000235441,ENST00000550132,GRCh37.75:HSCHR6_MHC_SSTO:join(29384748..29384...,294,,1,268
1,OR10C1.002,OR10C1,PF00001,7tm_1,5.600000e-28,84.0,42,291,gNllvllviltkkslrtstnyfilsLaisDlllgllvlpfaiiyel...,GNFLIVVLVSTDAALQSPMYFFLRTLSALEIGYTSVTVPLLLHHLL...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000449846,ENSG00000235441,ENST00000550132,GRCh37.75:HSCHR6_MHC_SSTO:join(29384748..29384...,294,,1,268
2,OPN1LW.002,OPN1LW,PF00001,7tm_1,3.400000e-21,61.7,3,125,silnllaisiDRYvaivkplkykrlkrrakav.illvWvlslllav...,GLWSLAIISWERWLVVCKPFGNVRFDAKLAIVgIAFSWIWSAVWTA...,"71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,8...",ENSP00000402493,ENSG00000102076,ENST00000442922,"GRCh37.75:X:join(153418415..153418581,15342004...",164,,71,187
3,P2RY2.001,P2RY2,PF00001,7tm_1,3.700000e-42,130.5,51,306,NllvllviltkkslrtstnyfilsLaisDlllgllvlpfaiiyel....,NAVALYIFLCRLKTWNASTTYMFHLAVSDALYAA-SLPLLVYYYAr...,"2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,...",ENSP00000310305,ENSG00000175591,ENST00000311131,GRCh37.75:11:72945205..72946338,377,NP_788086,2,268
4,P2RY2.002,P2RY2,PF00001,7tm_1,3.700000e-42,130.5,51,306,NllvllviltkkslrtstnyfilsLaisDlllgllvlpfaiiyel....,NAVALYIFLCRLKTWNASTTYMFHLAVSDALYAA-SLPLLVYYYAr...,"2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,...",ENSP00000377221,ENSG00000175591,ENST00000393596,GRCh37.75:11:72945205..72946338,377,NP_788085,2,268
5,P2RY2.003,P2RY2,PF00001,7tm_1,3.700000e-42,130.5,51,306,NllvllviltkkslrtstnyfilsLaisDlllgllvlpfaiiyel....,NAVALYIFLCRLKTWNASTTYMFHLAVSDALYAA-SLPLLVYYYAr...,"2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,...",ENSP00000377222,ENSG00000175591,ENST00000393597,GRCh37.75:11:72945205..72946338,377,NP_002555,2,268
6,P2RY2.001,P2RY2,PF00001,7tm_1,4.200000e-41,137.5,50,306,gNllvllviltkkslrtstnyfilsLaisDlllgllvlpfaiiyel...,LNAVALYIFLCRLKTWNASTTYMFHLAVSDALYAA-SLPLLVYYYA...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000310305,ENSG00000175591,ENST00000311131,GRCh37.75:11:72945205..72946338,377,NP_788086,1,268
7,P2RY2.002,P2RY2,PF00001,7tm_1,4.200000e-41,137.5,50,306,gNllvllviltkkslrtstnyfilsLaisDlllgllvlpfaiiyel...,LNAVALYIFLCRLKTWNASTTYMFHLAVSDALYAA-SLPLLVYYYA...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000377221,ENSG00000175591,ENST00000393596,GRCh37.75:11:72945205..72946338,377,NP_788085,1,268
8,P2RY2.003,P2RY2,PF00001,7tm_1,4.200000e-41,137.5,50,306,gNllvllviltkkslrtstnyfilsLaisDlllgllvlpfaiiyel...,LNAVALYIFLCRLKTWNASTTYMFHLAVSDALYAA-SLPLLVYYYA...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000377222,ENSG00000175591,ENST00000393597,GRCh37.75:11:72945205..72946338,377,NP_002555,1,268
9,OR8B8.001,OR8B8,PF00001,7tm_1,2.600000e-24,72.0,41,235,gNllvllviltkkslrtstnyfilsLaisDlllgllvlpfaiiyel...,GNLGLITLIRLNSHLHTPMYFFLYNLSFIDFCYSSVITPKMLMSFV...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000330280,ENSG00000197125,ENST00000328064,GRCh37.75:11:complement(124310046..124310981),311,NP_036510,1,206


### Filtering all Zinc finger domains

In [72]:
#Boolean mask marking the rows whose domain is the zf-C2H2 Zinc finger
is_zinc_finger = allhmm["domain_name"].eq("zf-C2H2")
zinc_domain = allhmm.loc[is_zinc_finger]

#Write the Zinc finger rows as a tab-separated file under hmm_domains/
zinc_out_path = curr_dir[0]+"/hmm_domains/zf-C2H2.csv"
zinc_domain.to_csv(zinc_out_path, sep='\t')