# Parse the HMM results, filter by domain

In [2]:
#Import packages
import pandas as pd
import numpy as np
import unicodedata
from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

### Reading the file as is into a data-frame

In [3]:
curr_dir = !pwd
my_path = curr_dir[0]+"/from_shilpa/"
filename = "allhmmresbyprot-new.tsv"
allhmm = pd.read_csv(my_path+filename, sep='\t')

#### A little more processing to the data to look better in the data-frame

In [23]:
#Disable false positive Pandas warning for 'SettingWithCopyWarning'
pd.options.mode.chained_assignment = None

#Splitting the Pfam id and domain name column
allhmm['pfam_id'], allhmm['domain_name'] = zip(*allhmm['HMM_Name'].apply(lambda x: x.split('_', 1)))
del allhmm['HMM_Name']
#Get the columns to the original order
cols = allhmm.columns.tolist()
cols = cols[0:1] + cols[-2:] + cols[1:-2]
allhmm = allhmm[cols]

#Splitting the Description
allhmm['prot_id'], allhmm['ensembl_id'], allhmm['transcript_id'], allhmm['chromosome_id'], allhmm['length'], allhmm['refseq_hmm_start_end'] = zip(*allhmm['Description'].apply(lambda x: x.split(' ', 5)))
del allhmm['Description']

#Splitting the last column to deal with missing refseq ids
allhmm['refseq'] = allhmm['refseq_hmm_start_end'].apply(lambda x: x[x.find("refseq:")+7:x.find("HMMStart")])
allhmm["hmm_start"] = allhmm['refseq_hmm_start_end'].apply(lambda x: x[x.find("HMMStart")+9:x.find(";")])
allhmm["hmm_end"] = allhmm['refseq_hmm_start_end'].apply(lambda x: x[x.find("HMMEnd")+7:-1])
del allhmm['refseq_hmm_start_end']

#Extracting the numbers alone from the description columns
allhmm['prot_id'] = allhmm['prot_id'].apply(lambda x: x[x.find(':')+1:])
allhmm['ensembl_id'] = allhmm['ensembl_id'].apply(lambda x: x[x.find(':')+1:])
allhmm['transcript_id'] = allhmm['transcript_id'].apply(lambda x: x[x.find(':')+1:])
allhmm['chromosome_id'] = allhmm['chromosome_id'].apply(lambda x: x[x.find(':')+1:])
allhmm['length'] = allhmm['length'].apply(lambda x: x[x.find(':')+1:])

#Extract only the hugo symbol (without the .number) and add to a new column
allhmm["Hugo_symbol"] = allhmm['#TargetID'].apply(lambda x: x.split('.')[0])
#Get the columns to the original order
cols = allhmm.columns.tolist()
cols = cols[0:1] + cols[-1:] + cols[1:-1]
allhmm = allhmm[cols]

#Seperate chromosome number to a different column
allhmm["chrom_num"] = allhmm["chromosome_id"].apply(lambda x: x[x.find(":")+1:x.find(":", x.find(":")+1)])
#Get the columns to the original order
cols = allhmm.columns.tolist()
cols = cols[:15] + cols[-1:] + cols[15:-1]
allhmm = allhmm[cols]

#Saving the processed data-frame
allhmm.to_csv(my_path+"allhmm_parsed.csv", sep='\t')

In [4]:
allhmm = pd.read_csv(my_path+"allhmm_parsed.csv", sep='\t')

In [8]:
allhmm[allhmm["chrom_num"] == "1"]

Unnamed: 0.1,Unnamed: 0,#TargetID,Hugo_symbol,pfam_id,domain_name,E-value,BitScore,TargetStart,TargetEnd,HMM_Seq,...,HMM_Pos,prot_id,ensembl_id,transcript_id,chromosome_id,chrom_num,length,refseq,hmm_start,hmm_end
118,118,OR2T5.001,OR2T5,PF00001,7tm_1,1.600000e-25,75.9,45,294,gNllvllviltkkslrtstnyfilsLaisDlllgllvlpfaiiyel...,...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000355429,ENSG00000203661,ENST00000366473,GRCh37.75:1:248651890..248652837,1,315,NP_001004697,1,268
119,119,OR2T5.001,OR2T5,PF00001,7tm_1,5.400000e-24,80.6,45,294,gNllvllviltkkslrtstnyfilsLaisDlllgllvlpfaiiyel...,...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000355429,ENSG00000203661,ENST00000366473,GRCh37.75:1:248651890..248652837,1,315,NP_001004697,1,268
120,120,OR2T29.001,OR2T29,PF00001,7tm_1,1.600000e-25,75.9,45,294,gNllvllviltkkslrtstnyfilsLaisDlllgllvlpfaiiyel...,...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000331774,ENSG00000182783,ENST00000328570,GRCh37.75:1:complement(248721845..248722792),1,315,NP_001004694,1,268
121,121,OR2T29.001,OR2T29,PF00001,7tm_1,5.400000e-24,80.6,45,294,gNllvllviltkkslrtstnyfilsLaisDlllgllvlpfaiiyel...,...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000331774,ENSG00000182783,ENST00000328570,GRCh37.75:1:complement(248721845..248722792),1,315,NP_001004694,1,268
128,128,OR14A16.001,OR14A16,PF00001,7tm_1,4.300000e-28,84.3,39,226,gNllvllviltkkslrtstnyfilsLaisDlllgllvlpfaiiyel...,...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000350248,ENSG00000196772,ENST00000357627,GRCh37.75:1:complement(247978102..247979031),1,309,NP_001001966,1,174
129,129,OR14A16.001,OR14A16,PF00001,7tm_1,6.400000e-26,87.0,39,287,gNllvllviltkkslrtstnyfilsLaisDlllgllvlpfaiiyel...,...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000350248,ENSG00000196772,ENST00000357627,GRCh37.75:1:complement(247978102..247979031),1,309,NP_001001966,1,268
139,139,OR14A2.001,OR14A2,PF00001,7tm_1,2.900000e-26,78.4,40,267,NllvllviltkkslrtstnyfilsLaisDlllgllvlpfaiiyelt...,...,"2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,...",ENSP00000355441,ENSG00000241128,ENST00000366485,GRCh37.75:1:complement(247886401..247887345),1,314,,2,240
140,140,OR14A2.001,OR14A2,PF00001,7tm_1,8.400000e-24,80.0,39,288,gNllvllviltkkslrtstnyfilsLaisDlllgllvlpfaiiyel...,...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000355441,ENSG00000241128,ENST00000366485,GRCh37.75:1:complement(247886401..247887345),1,314,,1,268
166,166,GPR25.001,GPR25,PF00001,7tm_1,1.900000e-45,141.2,56,307,gNllvllviltkkslrtstnyfilsLaisDlllgllvlpfaiiyel...,...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000301917,ENSG00000170128,ENST00000304244,GRCh37.75:1:200842166..200843251,1,361,NP_005289,1,268
167,167,GPR25.001,GPR25,PF00001,7tm_1,9.500000e-45,149.6,56,307,gNllvllviltkkslrtstnyfilsLaisDlllgllvlpfaiiyel...,...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000301917,ENSG00000170128,ENST00000304244,GRCh37.75:1:200842166..200843251,1,361,NP_005289,1,268


### Filtering all Zinc finger domains

In [29]:
#Get the Zinc finger domain
zinc_domain = allhmm[allhmm["domain_name"] == "zf-C2H2"]
zinc_domain = zinc_domain.reset_index(drop=True)

#Saving the Zinc finger domain to file
zinc_domain.to_csv(curr_dir[0]+"/hmm_domains/zf-C2H2.csv", sep='\t')