In [1]:
import os
import pickle
from collections import defaultdict

import numpy as np
import pandas as pd
from IPython.core.display import HTML
# Inject a CSS override so the notebook container spans the full page width.
full_width_css = "<style>.container { width:100% !important; }</style>"
HTML(full_width_css)

### Using the Pfam-A.clans.tsv file

In [2]:
# Build the path to the parsed Pfam release directory, relative to the current
# working directory. os.getcwd() replaces the IPython-only `!pwd` shell capture
# so the cell does not depend on a `pwd` executable; curr_dir stays a
# one-element list so `curr_dir[0]` keeps working.
curr_dir = [os.getcwd()]
pfam_version = "32"
input_path = curr_dir[0]+"/../2.parse_Pfam/v"+pfam_version+"/"
filename = "Pfam-A.clans.tsv"

# Read the tab-separated file into a data frame. It is parsed as headerless
# (header=None), so the column names are assigned by hand on the next line.
# NOTE(review): the variable name says "human_proteome" but this frame holds
# the clans table; the name is kept because later cells reference it.
pfam_human_proteome = pd.read_csv(input_path+filename, sep='\t', index_col=False, header=None)
pfam_human_proteome.columns = ["pfam_id", "clan_id", "clan_name", "domain_name", "description"]

#### Creating a dict to map domain to clan and vice versa

In [3]:
# Build the domain -> clan lookup and its inverse (clan -> set of domains) in
# one pass. Iterating the two columns directly avoids iterrows(), which
# constructs a full Series for every row just to read two fields.
# NOTE(review): rows without a clan presumably carry NaN in "clan_name" and so
# end up grouped under a NaN key in clan_to_domain -- confirm this is intended.
domain_to_clan = {}
clan_to_domain = defaultdict(set)
for domain_name, clan in zip(pfam_human_proteome["domain_name"],
                             pfam_human_proteome["clan_name"]):
    domain_to_clan[domain_name] = clan
    clan_to_domain[clan].add(domain_name)

In [4]:
# Persist both lookup tables as pickle files inside input_path.
for pik_name, mapping in (
    ("updated_domain_to_clan_dict.pik", domain_to_clan),
    ("updated_clan_to_domains_dict.pik", clan_to_domain),
):
    with open(input_path+pik_name, 'wb') as handle:
        pickle.dump(mapping, handle, protocol=pickle.HIGHEST_PROTOCOL)

#### Mapping of domain names to PFAM ids

In [5]:
# Map each domain name to its Pfam accession by pairing the two columns
# directly instead of walking the frame with iterrows(). If a domain name
# occurs more than once, the last row wins, exactly as in a row-by-row loop.
domain_to_pfam_acc = dict(zip(pfam_human_proteome["domain_name"],
                              pfam_human_proteome["pfam_id"]))

In [6]:
# Serialize the domain-name -> accession mapping next to the input data.
pik_path = input_path+"updated_domain_to_pfam_acc_dict.pik"
with open(pik_path, 'wb') as handle:
    pickle.dump(domain_to_pfam_acc, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Using the 9606.tsv file (that had some clan associations missing!)

In [2]:
# Build the path to the parsed Pfam release directory, relative to the current
# working directory. os.getcwd() replaces the IPython-only `!pwd` shell capture
# so the cell does not depend on a `pwd` executable; curr_dir stays a
# one-element list so `curr_dir[0]` keeps working.
curr_dir = [os.getcwd()]
pfam_version = "31"
input_path = curr_dir[0]+"/../2.parse_Pfam/v"+pfam_version+"/"
# NOTE(review): the download URL below names release Pfam30.0 while
# pfam_version is "31" -- confirm which release this file actually came from.
filename = "9606.tsv" #Downloaded from: ftp://ftp.ebi.ac.uk/pub/databases/Pfam/releases/Pfam30.0/proteomes/

# Read the data rows: the first three lines are skipped, and the remainder is
# parsed as tab-separated with no header row.
pfam_human_proteome = pd.read_csv(input_path+filename, sep='\t', skiprows=[0,1,2], index_col=False, header=None)

# The header uses a different delimiter from the data, so it is read on its own:
# the third line is split on '<', the leading "#" field is dropped, and each
# remaining field is cut at its closing '>' to leave the bare column name.
pfam_human_proteome_header = pd.read_csv(input_path+filename, sep='<', skiprows=[0,1], nrows=1, header=None)
pfam_human_proteome_header = pfam_human_proteome_header.iloc[0].tolist()
pfam_human_proteome_header.remove("#")
pfam_human_proteome_header = [x[:x.find('>')] for x in pfam_human_proteome_header]
pfam_human_proteome.columns = pfam_human_proteome_header

#### Creating a dict to map domain to clan and vice versa

In [3]:
# Build the domain -> clan lookup and its inverse (clan -> set of domains) in
# one pass. Iterating the two columns directly avoids iterrows(), which
# constructs a full Series for every row just to read two fields.
domain_to_clan = {}
clan_to_domain = defaultdict(set)
for domain_name, clan in zip(pfam_human_proteome["hmm name"],
                             pfam_human_proteome["clan"]):
    domain_to_clan[domain_name] = clan
    clan_to_domain[clan].add(domain_name)

In [4]:
# Write each mapping to its own pickle file under input_path.
pickle_targets = {
    "domain_to_clan_dict.pik": domain_to_clan,
    "clan_to_domains_dict.pik": clan_to_domain,
}
for pik_name, mapping in pickle_targets.items():
    with open(input_path+pik_name, 'wb') as handle:
        pickle.dump(mapping, handle, protocol=pickle.HIGHEST_PROTOCOL)

#### Mapping of domain names to PFAM ids

In [6]:
# Map each HMM (domain) name to its accession by pairing the two columns
# directly instead of walking the frame with iterrows(). If a domain name
# occurs more than once, the last row wins, exactly as in a row-by-row loop.
domain_to_pfam_acc = dict(zip(pfam_human_proteome["hmm name"],
                              pfam_human_proteome["hmm acc"]))

In [8]:
# Serialize the domain-name -> accession mapping next to the input data.
pik_path = input_path+"domain_to_pfam_acc_dict.pik"
with open(pik_path, 'wb') as handle:
    pickle.dump(domain_to_pfam_acc, handle, protocol=pickle.HIGHEST_PROTOCOL)