# Parse the HMM results, filter by domain

In [1]:
#Import packages
import pandas as pd
import numpy as np
import unicodedata
from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

### Reading the file as is into a data-frame

In [2]:
curr_dir = !pwd
my_path = curr_dir[0]+"/from_shilpa/"
filename = "allhmmresbyprot-new.tsv"
allhmm = pd.read_csv(my_path+filename, sep='\t')

#### A little more processing to the data to look better in the data-frame

In [3]:
#Disable false positive Pandas warning for 'SettingWithCopyWarning'
pd.options.mode.chained_assignment = None

#Splitting the Pfam id and domain name column
allhmm['pfam_id'], allhmm['domain_name'] = zip(*allhmm['HMM_Name'].apply(lambda x: x.split('_', 1)))
del allhmm['HMM_Name']
#Get the columns to the original order
cols = allhmm.columns.tolist()
cols = cols[0:1] + cols[-2:] + cols[1:-2]
allhmm = allhmm[cols]

#Splitting the Description
allhmm['prot_id'], allhmm['ensembl_id'], allhmm['transcript_id'], allhmm['chromosome_id'], allhmm['length'], allhmm['refseq_hmm_start_end'] = zip(*allhmm['Description'].apply(lambda x: x.split(' ', 5)))
del allhmm['Description']

#Splitting the last column to deal with missing refseq ids
allhmm['refseq'] = allhmm['refseq_hmm_start_end'].apply(lambda x: x[x.find("refseq:")+7:x.find("HMMStart")])
allhmm["hmm_start"] = allhmm['refseq_hmm_start_end'].apply(lambda x: x[x.find("HMMStart")+9:x.find(";")])
allhmm["hmm_end"] = allhmm['refseq_hmm_start_end'].apply(lambda x: x[x.find("HMMEnd")+7:-1])
del allhmm['refseq_hmm_start_end']

#Extracting the numbers alone from the description columns
allhmm['prot_id'] = allhmm['prot_id'].apply(lambda x: x[x.find(':')+1:])
allhmm['ensembl_id'] = allhmm['ensembl_id'].apply(lambda x: x[x.find(':')+1:])
allhmm['transcript_id'] = allhmm['transcript_id'].apply(lambda x: x[x.find(':')+1:])
allhmm['chromosome_id'] = allhmm['chromosome_id'].apply(lambda x: x[x.find(':')+1:])
allhmm['length'] = allhmm['length'].apply(lambda x: x[x.find(':')+1:])

#Extract only the hugo symbol (without the .number) and add to a new column
allhmm["Hugo_symbol"] = allhmm['#TargetID'].apply(lambda x: x.split('.')[0])
#Get the columns to the original order
cols = allhmm.columns.tolist()
cols = cols[0:1] + cols[-1:] + cols[1:-1]
allhmm = allhmm[cols]

#Seperate chromosome number to a different column
allhmm["chrom_num"] = allhmm["chromosome_id"].apply(lambda x: x[x.find(":")+1:x.find(":", x.find(":")+1)])
#Get the columns to the original order
cols = allhmm.columns.tolist()
cols = cols[:15] + cols[-1:] + cols[15:-1]
allhmm = allhmm[cols]

#Saving the processed data-frame
allhmm.to_csv(my_path+"allhmm_parsed.csv", sep='\t')

In [35]:
allhmm = pd.read_csv(my_path+"allhmm_parsed.csv", sep='\t')

In [36]:
allhmm

Unnamed: 0.1,Unnamed: 0,#TargetID,Hugo_symbol,pfam_id,domain_name,E-value,BitScore,TargetStart,TargetEnd,HMM_Seq,...,HMM_Pos,prot_id,ensembl_id,transcript_id,chromosome_id,chrom_num,length,refseq,hmm_start,hmm_end
0,0,OR10C1.002,OR10C1,PF00001,7tm_1,2.800000e-26,88.2,42,291,gNllvllviltkkslrtstnyfilsLaisDlllgllvlpfaiiyel...,...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000449846,ENSG00000235441,ENST00000550132,GRCh37.75:HSCHR6_MHC_SSTO:join(29384748..29384...,HSCHR6_MHC_SSTO,294,,1,268
1,1,OR10C1.002,OR10C1,PF00001,7tm_1,5.600000e-28,84.0,42,291,gNllvllviltkkslrtstnyfilsLaisDlllgllvlpfaiiyel...,...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000449846,ENSG00000235441,ENST00000550132,GRCh37.75:HSCHR6_MHC_SSTO:join(29384748..29384...,HSCHR6_MHC_SSTO,294,,1,268
2,2,OPN1LW.002,OPN1LW,PF00001,7tm_1,3.400000e-21,61.7,3,125,silnllaisiDRYvaivkplkykrlkrrakav.illvWvlslllav...,...,"71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,8...",ENSP00000402493,ENSG00000102076,ENST00000442922,"GRCh37.75:X:join(153418415..153418581,15342004...",X,164,,71,187
3,3,P2RY2.001,P2RY2,PF00001,7tm_1,3.700000e-42,130.5,51,306,NllvllviltkkslrtstnyfilsLaisDlllgllvlpfaiiyel....,...,"2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,...",ENSP00000310305,ENSG00000175591,ENST00000311131,GRCh37.75:11:72945205..72946338,11,377,NP_788086,2,268
4,4,P2RY2.002,P2RY2,PF00001,7tm_1,3.700000e-42,130.5,51,306,NllvllviltkkslrtstnyfilsLaisDlllgllvlpfaiiyel....,...,"2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,...",ENSP00000377221,ENSG00000175591,ENST00000393596,GRCh37.75:11:72945205..72946338,11,377,NP_788085,2,268
5,5,P2RY2.003,P2RY2,PF00001,7tm_1,3.700000e-42,130.5,51,306,NllvllviltkkslrtstnyfilsLaisDlllgllvlpfaiiyel....,...,"2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,...",ENSP00000377222,ENSG00000175591,ENST00000393597,GRCh37.75:11:72945205..72946338,11,377,NP_002555,2,268
6,6,P2RY2.001,P2RY2,PF00001,7tm_1,4.200000e-41,137.5,50,306,gNllvllviltkkslrtstnyfilsLaisDlllgllvlpfaiiyel...,...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000310305,ENSG00000175591,ENST00000311131,GRCh37.75:11:72945205..72946338,11,377,NP_788086,1,268
7,7,P2RY2.002,P2RY2,PF00001,7tm_1,4.200000e-41,137.5,50,306,gNllvllviltkkslrtstnyfilsLaisDlllgllvlpfaiiyel...,...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000377221,ENSG00000175591,ENST00000393596,GRCh37.75:11:72945205..72946338,11,377,NP_788085,1,268
8,8,P2RY2.003,P2RY2,PF00001,7tm_1,4.200000e-41,137.5,50,306,gNllvllviltkkslrtstnyfilsLaisDlllgllvlpfaiiyel...,...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000377222,ENSG00000175591,ENST00000393597,GRCh37.75:11:72945205..72946338,11,377,NP_002555,1,268
9,9,OR8B8.001,OR8B8,PF00001,7tm_1,2.600000e-24,72.0,41,235,gNllvllviltkkslrtstnyfilsLaisDlllgllvlpfaiiyel...,...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000330280,ENSG00000197125,ENST00000328064,GRCh37.75:11:complement(124310046..124310981),11,311,NP_036510,1,206


In [32]:
#Reading the table of domains gathering threshold
domain_GA = pd.read_csv(curr_dir[0]+"/Pfam/domains_GA.csv", sep='\t', index_col=0)

### Filtering all Zinc finger domains

In [37]:
#Get the Zinc finger domain
zinc_domain = allhmm[allhmm["domain_name"] == "zf-C2H2"]
zinc_domain = zinc_domain.reset_index(drop=True)

In [33]:
#Get the zinc finger gathering threshold
zinc_GA = float(domain_GA[domain_GA["name"] == "zf-C2H2"]["GA"])

In [48]:
zinc_ga_filtered = zinc_domain[zinc_domain["BitScore"] >= zinc_GA]

In [None]:
#Saving the Zinc finger domain to file
zinc_ga_filtered.to_csv(curr_dir[0]+"/hmm_domains/zf-C2H2.csv", sep='\t')

### Filtering all Homeobox  domains

In [8]:
#Get the Homeodomain domain
homeobox_domain = allhmm[allhmm["domain_name"] == "Homeobox"]
homeobox_domain = homeobox_domain.reset_index(drop=True)

#Saving the Homeobox domain to file
homeobox_domain.to_csv(curr_dir[0]+"/hmm_domains/Homeobox.csv", sep='\t')

In [6]:
homeobox_domain

Unnamed: 0,#TargetID,Hugo_symbol,pfam_id,domain_name,E-value,BitScore,TargetStart,TargetEnd,HMM_Seq,Target_Seq,HMM_Pos,prot_id,ensembl_id,transcript_id,chromosome_id,chrom_num,length,refseq,hmm_start,hmm_end
0,POU5F1.006,POU5F1,PF00046,Homeobox,1.900000e-22,75.5,61,117,rrkRttftkeqleeLeelFeenrypsaeereeLAkklgLeerqVkv...,KRKRTSIENRVRGNLENLFLQCPKPTLQQISHIAQQLGLEKDVVRV...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000475880,ENSG00000204531,ENST00000606567,GRCh37.75:6:complement(join(31133704..31133719...,6,190,,1,57
1,POU5F1.006,POU5F1,PF00046,Homeobox,2.800000e-23,67.9,61,117,rrkRttftkeqleeLeelFeenrypsaeereeLAkklgLeerqVkv...,KRKRTSIENRVRGNLENLFLQCPKPTLQQISHIAQQLGLEKDVVRV...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000475880,ENSG00000204531,ENST00000606567,GRCh37.75:6:complement(join(31133704..31133719...,6,190,,1,57
2,GSC2.001,GSC2,PF00046,Homeobox,1.100000e-24,83.0,127,183,rrkRttftkeqleeLeelFeenrypsaeereeLAkklgLeerqVkv...,RRHRTIFSEEQLQALEALFVQNQYPDVSTRERLAGRIRLREERVEV...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000086933,ENSG00000063515,ENST00000086933,GRCh37.75:22:complement(join(19137538..1913779...,22,205,NP_005306,1,57
3,GSC2.001,GSC2,PF00046,Homeobox,1.100000e-25,75.5,127,183,rrkRttftkeqleeLeelFeenrypsaeereeLAkklgLeerqVkv...,RRHRTIFSEEQLQALEALFVQNQYPDVSTRERLAGRIRLREERVEV...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000086933,ENSG00000063515,ENST00000086933,GRCh37.75:22:complement(join(19137538..1913779...,22,205,NP_005306,1,57
4,PAX7.001,PAX7,PF00046,Homeobox,1.800000e-27,81.3,218,274,rrkRttftkeqleeLeelFeenrypsaeereeLAkklgLeerqVkv...,RRSRTTFTAEQLEELEKAFERTHYPDIYTREELAQRTKLTEARVQV...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000364524,ENSG00000009709,ENST00000375375,"GRCh37.75:1:join(18958098..18958182,18960797.....",1,520,NP_002575,1,57
5,PAX7.001,PAX7,PF00046,Homeobox,3.200000e-27,91.3,218,274,rrkRttftkeqleeLeelFeenrypsaeereeLAkklgLeerqVkv...,RRSRTTFTAEQLEELEKAFERTHYPDIYTREELAQRTKLTEARVQV...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000364524,ENSG00000009709,ENST00000375375,"GRCh37.75:1:join(18958098..18958182,18960797.....",1,520,NP_002575,1,57
6,PAX7.003,PAX7,PF00046,Homeobox,1.700000e-27,81.4,218,274,rrkRttftkeqleeLeelFeenrypsaeereeLAkklgLeerqVkv...,RRSRTTFTAEQLEELEKAFERTHYPDIYTREELAQRTKLTEARVQV...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000403389,ENSG00000009709,ENST00000420770,"GRCh37.75:1:join(18958098..18958182,18960797.....",1,505,NP_001128726,1,57
7,PAX7.003,PAX7,PF00046,Homeobox,3.200000e-27,91.3,218,274,rrkRttftkeqleeLeelFeenrypsaeereeLAkklgLeerqVkv...,RRSRTTFTAEQLEELEKAFERTHYPDIYTREELAQRTKLTEARVQV...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000403389,ENSG00000009709,ENST00000420770,"GRCh37.75:1:join(18958098..18958182,18960797.....",1,505,NP_001128726,1,57
8,PAX7.002,PAX7,PF00046,Homeobox,1.700000e-27,81.3,216,272,rrkRttftkeqleeLeelFeenrypsaeereeLAkklgLeerqVkv...,RRSRTTFTAEQLEELEKAFERTHYPDIYTREELAQRTKLTEARVQV...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000383502,ENSG00000009709,ENST00000400661,"GRCh37.75:1:join(18958098..18958182,18960797.....",1,518,NP_039236,1,57
9,PAX7.002,PAX7,PF00046,Homeobox,3.200000e-27,91.3,216,272,rrkRttftkeqleeLeelFeenrypsaeereeLAkklgLeerqVkv...,RRSRTTFTAEQLEELEKAFERTHYPDIYTREELAQRTKLTEARVQV...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000383502,ENSG00000009709,ENST00000400661,"GRCh37.75:1:join(18958098..18958182,18960797.....",1,518,NP_039236,1,57
