In [35]:
import numpy as np
import pandas as pd
import pickle
from collections import defaultdict
import operator

curr_dir = !pwd
instance_cutoff = "10"

# Read the list of filtered domains
with open(curr_dir[0] + "/../5.domains_stats/filtered"+instance_cutoff+"_list.pik", 'rb') as handle:
    filtered_domains_list = pickle.load(handle)
filtered_domains_list.sort()

domains_dict = defaultdict(defaultdict)
# Read GO terms line by line
with open(curr_dir[0] + '/../pfam2GO/pfam2go.txt','r') as f:
    for line in f.readlines():
        # Skip heading
        if line[0] != 'P':
            continue
        # Skip domains that are not included in the analysis
        domain = line[13:line.find('>')-1]
        if not domain in filtered_domains_list:
            continue
        # Map domain to descriptions and GO numbers
        tokens = line.split('GO:')
        name = tokens[1][0:len(tokens[1])-3]
        number = int(tokens[2])
        # If domain not already present, create a new dictionary and add it
        if not domain in domains_dict:
            temp = defaultdict(list)
            temp['names'].append(name)
            temp['numbers'].append(number)
            domains_dict[domain] = temp
        # Otherwise, append to the existing dictionary
        else:
            domains_dict[domain]['names'].append(name)
            domains_dict[domain]['numbers'].append(number)
            
# Compute frequency counts
# counts:             GO number -> how many domains carry it
# numbers_to_names:   GO number -> description (taken from the first domain seen)
# numbers_to_domains: GO number -> period-separated domain names
counts = {}
numbers_to_names = {}
numbers_to_domains = {}
for domain_name, go_info in domains_dict.items():
    for go_number, go_name in zip(go_info['numbers'], go_info['names']):
        counts[go_number] = counts.get(go_number, 0) + 1
        if go_number in numbers_to_names:
            numbers_to_domains[go_number] += "." + domain_name
        else:
            numbers_to_names[go_number] = go_name
            numbers_to_domains[go_number] = domain_name
# Most frequent GO numbers first; ties keep first-seen order (stable sort)
sorted_counts = sorted(counts.items(), key=lambda item: item[1], reverse=True)

# Add to dataframe
# Build every row first and construct the frame once; growing a DataFrame
# one cell at a time with .loc re-allocates on every new row.
records = [
    (number, count, numbers_to_names[number], numbers_to_domains[number])
    for number, count in sorted_counts
]
# Rows are labelled 1..n, in the order of sorted_counts
all_counts = pd.DataFrame(records,
                          columns=['number','count','name','domains'],
                          index=range(1, len(records) + 1))

# Save to csv
all_counts.to_csv('freq_counts' + instance_cutoff + '.csv')

In [1]:
# Reorganize by domain — traverse GO term tree first!
import numpy as np
import pandas as pd

instance_cutoff = "10"

# Read file
freq_counts_broad = pd.read_csv("freq_counts_broad"+instance_cutoff,index_col=0)

# Invert the term -> domains table into domain -> terms.
# Accumulate in plain Python and build the frame once; growing a DataFrame
# row by row with .loc (and concatenating strings per hit) is quadratic.
domain_order = []       # domains in first-seen order (becomes the index order)
groups_by_domain = {}   # domain -> list of term names, in table order
for term_name, term_domains in zip(freq_counts_broad["name"], freq_counts_broad["domains"]):
    for d in term_domains.split("."):
        if d not in groups_by_domain:
            groups_by_domain[d] = []
            domain_order.append(d)
        groups_by_domain[d].append(term_name)

# "count" is the number of terms listing the domain; "groups" is their names
# joined with periods. dtype=object keeps the columns as they were when the
# frame started empty, so later row additions via .loc behave the same way.
domain_counts = pd.DataFrame(
    {"count": [len(groups_by_domain[d]) for d in domain_order],
     "groups": [".".join(groups_by_domain[d]) for d in domain_order]},
    index=domain_order, columns=["count","groups"], dtype=object)

# Display only: the sorted view is not assigned back, so domain_counts keeps
# its first-seen row order.
domain_counts.sort_index()

Unnamed: 0,count,groups
2OG-FeII_Oxy_3,8,oxidation-reduction process.metabolic process....
7tm_1,26,integral component of membrane.membrane.G-prot...
7tm_2,26,integral component of membrane.membrane.G-prot...
7tm_3,26,integral component of membrane.membrane.G-prot...
7tm_4,26,integral component of membrane.membrane.G-prot...
A2M,2,endopeptidase inhibitor activity.endopeptidase...
A2M_N,8,endopeptidase inhibitor activity.peptidase inh...
A2M_comp,4,extracellular region.extracellular space.cellu...
A2M_recep,2,extracellular region.cellular_component
AAA,10,ATP binding.purine ribonucleoside binding.aden...


In [2]:
# For domains that have no pfam GO term, find domain with GO term from the same family if possible
import numpy as np
import pandas as pd
import pickle

curr_dir = !pwd

# Read the list of filtered domains
with open(curr_dir[0] + "/../5.domains_stats/filtered"+instance_cutoff+"_list.pik", 'rb') as handle:
    filtered_domains_list = pickle.load(handle)
filtered_domains_list.sort()

# Map family to domain name
# Any match is fine, so overwriting on repeats is allowable
fam_map = {}
for row in domain_counts.iterrows():
    fam = row[0].replace('-','_').split('_')[0].lower()
    fam_map[fam] = row[0]
    
# Find matches
for domain in filtered_domains_list:
    if not domain in domain_counts.index:
        fam = domain.replace('-','_').split('_')[0].lower()
        if not fam in fam_map:
            continue
        close_domain = fam_map[fam]
        # Copy row
        domain_counts.loc[domain,"count"] = domain_counts.loc[close_domain,"count"]
        domain_counts.loc[domain,"groups"] = domain_counts.loc[close_domain,"groups"]
        # Update freq_counts
        terms = domain_counts.loc[domain,"groups"].split(".")
        for t in terms:
            freq_counts_broad.loc[freq_counts_broad.loc[:,"name"] == t,"domains"] += "." + domain
            freq_counts_broad.loc[freq_counts_broad.loc[:,"name"] == t,"count"] += 1

In [21]:
# Check which domains are omitted
# (filtered domains that still have no row in domain_counts)
unmapped = [domain for domain in filtered_domains_list if not domain in domain_counts.index]
# `with` closes the file even if a write raises
with open("unmapped_domains", 'w') as f:
    for domain in unmapped:
        f.write(domain + '\n')
count = len(unmapped)
print(count)

260


In [3]:
# Save to csv
expanded_csv_name = 'freq_counts_expanded' + instance_cutoff + '.csv'
freq_counts_broad.to_csv(expanded_csv_name)
# Print (display only; the sorted view is not assigned back)
freq_counts_broad.sort_values(by=["count"], ascending=False)

Unnamed: 0,number,count,name,domains
244,3674,337,molecular_function,ECH_1.PP2C.Prenyltrans.Epimerase.AMP-binding.M...
216,5488,231,binding,LRR_6.WW.EF-hand_4.Fz.Kazal_2.Kazal_1.Spectrin...
238,8150,189,biological_process,Methyltransf_11.Aldedh.ECH_1.Sulfatase.Acyltra...
443,9987,111,cellular process,Zip.Ion_trans.Sugar_tr.ABC_membrane.AA_permeas...
218,5575,107,cellular_component,Zip.Ion_trans.AA_permease_2.MIP.Frizzled.HRM.L...
1,5515,105,protein binding,LRR_6.WW.EF-hand_4.Fz.Kazal_2.Kazal_1.Spectrin...
26,3824,93,catalytic activity,ECH_1.PP2C.Prenyltrans.Epimerase.AMP-binding.A...
19,8152,92,metabolic process,Methyltransf_11.Aldedh.ECH_1.Sulfatase.Acyltra...
451,43167,91,ion binding,FYVE.HMA.zf-CCCH.zf-C3HC4.Metallothio.EF-hand_...
442,44699,91,single-organism process,Acyl-CoA_dh_N.Acyl-CoA_dh_M.Aldedh.2OG-FeII_Ox...


In [35]:
# Modify freq_counts_broad for logical operations: surround every domain list
# with periods so a single domain can be matched exactly as ".name."
for row_label in freq_counts_broad.index:
    current = freq_counts_broad.loc[row_label,"domains"]
    # This cell may be rerun, so lists that are already wrapped are left alone
    if current[0] == '.':
        continue
    freq_counts_broad.loc[row_label,"domains"] = "." + current + "."
        
# The union of domains in two or more groups
def combine(groups, freq_counts=None):
    """Return the union of the domains listed under the given GO term names.

    groups      -- iterable of strings; each string is one term name or
                   several term names separated by periods.
    freq_counts -- optional table with "name" and "domains" columns; defaults
                   to the notebook-level freq_counts_broad.

    Returns a string of the form ".d1.d2...dn." (just "." when empty), with
    each domain listed once, in first-seen order.

    Raises IndexError when a term name has no row in the table.
    """
    if freq_counts is None:
        freq_counts = freq_counts_broad
    seen = set()
    ordered = []
    for g in groups:
        # A list of terms can also be a string separated by periods
        for t in g.split("."):
            term_domains = freq_counts.loc[freq_counts.loc[:,"name"] == t,"domains"].values[0]
            for d in term_domains.split("."):
                # Empty pieces come from the leading/trailing periods
                if d and d not in seen:
                    seen.add(d)
                    ordered.append(d)
    return "." + "".join(d + "." for d in ordered)

# Manually curate group forming
binding_group_names = ["ATP binding","DNA binding","protein binding","nucleic acid binding","metal ion binding"]
other_group_names = ["catalytic activity","metabolic process","membrane","signal transduction",
                     "intracellular"]

comb_test = combine(["nucleic acid binding","nucleotide binding","ATP binding"]).split(".")
comb_binding = combine(binding_group_names).split(".")
comb_other = combine(other_group_names).split(".")
comb_all = combine(binding_group_names + ["nucleotide binding"] + other_group_names +
                   ["enzyme regulator activity"]).split(".")

# Each split list starts and ends with an empty string (from the wrapping
# periods), so the number of domains is the list length minus 2
for combined in (comb_test, comb_binding, comb_other, comb_all):
    print(len(combined) - 2)

69
221
220
410


In [None]:
# Remove one or more groups and all domains associated from freq_counts_broad
def remove(groups, freq_counts=None):
    """Return a copy of the table without the given groups' domains.

    groups      -- iterable of term names (values of the "name" column).
    freq_counts -- optional table with "name", "count" and "domains" columns;
                   defaults to the notebook-level freq_counts_broad. It is
                   not modified.

    Domain lists are expected in the wrapped form ".d1.d2." and are written
    back in that form ("." when nothing is left). Every domain listed under
    one of the groups is deleted from every row, each row's count drops by
    one per distinct domain deleted, and rows whose count reaches 0 are
    dropped.

    Raises IndexError when a group name has no row in the table.
    """
    if freq_counts is None:
        freq_counts = freq_counts_broad
    freq_counts_removed = freq_counts.copy()
    for g in groups:
        group_domains = freq_counts_removed.loc[freq_counts_removed.loc[:,"name"] == g,"domains"].values[0]
        # Empty pieces come from the wrapping periods and are not domains
        to_remove = set(d for d in group_domains.split(".") if d)
        if not to_remove:
            continue
        for row_label in freq_counts_removed.index:
            row_domains = freq_counts_removed.loc[row_label,"domains"].split(".")
            hits = to_remove.intersection(row_domains)
            if not hits:
                continue
            # Filter on whole names: a substring replace of "d." would also
            # cut "d." off the end of a longer name (e.g. "HLH." inside "bHLH.")
            kept = [d for d in row_domains if d and d not in to_remove]
            freq_counts_removed.loc[row_label,"count"] -= len(hits)
            freq_counts_removed.loc[row_label,"domains"] = "." + "".join(d + "." for d in kept)
    # Remove rows with no entries left
    emptied = freq_counts_removed.index[freq_counts_removed.loc[:,"count"] == 0]
    freq_counts_removed = freq_counts_removed.drop(emptied)
    return(freq_counts_removed)

# Check for more groups to include: drop everything already covered by the
# groups below and list what remains, largest term first
groups_to_remove = [
    "ATP binding","DNA binding","protein binding","nucleic acid binding","metal ion binding",
    "nucleotide binding", "lipid binding","catalytic activity","metabolic process","membrane",
    "signal transduction","intracellular",
    "extracellular region", "enzyme regulator activity","transport",
]
freq_counts_removed = remove(groups_to_remove)
freq_counts_removed = freq_counts_removed.sort_values(by=["count"], ascending=False)

In [50]:
# Organize categories into a table
import pandas as pd

# List of groups — a '.' indicates a group is a combination of GO terms
groups = [
    "ATP binding",
    "DNA binding",
    "nucleic acid binding.nucleotide binding",
    "protein binding.enzyme regulator activity",
    "metal ion binding",
    "catalytic activity",
    "metabolic process",
    "membrane",
    "signal transduction",
    "intracellular",
]
# Working list: inside the loop it holds every group except the current one
copy = list(groups)

# Union of all groups. Like every split list below it carries two empty
# strings from the wrapping periods, hence the "- 2" in the overlap formulas
# (|A| + |B| - |A or B| expressed in list lengths).
comb = combine(groups).split(".")

# Initialize and populate dataframe (one row per group, labelled by its first term)
categories = pd.DataFrame(columns=["domains","total overlap","ATP binding overlap"])
for g in groups:
    terms = g.split(".")
    row_name = terms[0]
    # NOTE(review): "domains" adds up the per-term counts, whereas the overlap
    # columns use union sizes - for multi-term groups a domain listed under
    # several terms is presumably counted more than once here; confirm intended.
    categories.loc[row_name,"domains"] = 0
    for t in terms:
        is_term = freq_counts_broad.loc[:,"name"] == t
        categories.loc[row_name,"domains"] += freq_counts_broad.loc[is_term,"count"].values[0]
    term = combine(terms).split(".")
    # Take the current group out while comparing it with the rest
    copy.remove(g)
    others = combine(copy).split(".")
    categories.loc[row_name,"total overlap"] = len(term) + len(others) - len(comb) - 2
    # Pairwise overlap with each remaining group
    for other_group in copy:
        other_terms = other_group.split(".")
        other = combine(other_terms).split(".")
        both = combine(terms+other_terms).split(".")
        column_name = other_terms[0]+" overlap"
        categories.loc[row_name,column_name] = len(term) + len(other) - len(both) - 2
    # Put the group back (at the end) for the next iteration
    copy.append(g)
categories

Unnamed: 0,domains,total overlap,ATP binding overlap,DNA binding overlap,nucleic acid binding overlap,protein binding overlap,metal ion binding overlap,catalytic activity overlap,metabolic process overlap,membrane overlap,signal transduction overlap,intracellular overlap
ATP binding,24,14,,2.0,14.0,1.0,0.0,9.0,3.0,1.0,0.0,2.0
DNA binding,22,22,2.0,,22.0,0.0,4.0,1.0,10.0,0.0,0.0,5.0
nucleic acid binding,62,45,14.0,22.0,,3.0,6.0,14.0,16.0,1.0,2.0,8.0
protein binding,111,10,1.0,0.0,3.0,,0.0,2.0,0.0,0.0,5.0,3.0
metal ion binding,66,16,0.0,4.0,6.0,0.0,,6.0,9.0,1.0,0.0,5.0
catalytic activity,93,73,9.0,1.0,14.0,2.0,6.0,,56.0,3.0,6.0,7.0
metabolic process,92,77,3.0,10.0,16.0,0.0,9.0,56.0,,1.0,5.0,14.0
membrane,56,14,1.0,0.0,1.0,0.0,1.0,3.0,1.0,,8.0,3.0
signal transduction,35,20,0.0,0.0,2.0,5.0,0.0,6.0,5.0,8.0,,1.0
intracellular,39,27,2.0,5.0,8.0,3.0,5.0,7.0,14.0,3.0,1.0,
