## Group domain families by GO terms

### Requirements:
1. List of domains
2. Mapping of pfam domains to GO terms

### Instructions:
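Run the cells in order. When you reach the "Swim up GO tree" section, run the R script domain_go.R before continuing with the remaining cells.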

### Output:
1. freq_counts: A csv mapping GO terms to domains before traversing the GO tree
2. domain_counts: A csv mapping domains to GO terms after traversing the GO tree
3. freq_counts_expanded: A csv mapping GO terms to domains after traversing the GO tree and giving domains without GO terms the same terms as a domain in the same family (built from the freq_counts_broad table that is read after the R step)

### Other utility:
Functions that combine, remove, and compute the overlap of GO terms facilitate the manual formation of groups.

In [1]:
import cPickle as pickle
import operator
import os
from collections import defaultdict

import numpy as np
import pandas as pd

# Directory the notebook is run from. It is kept as a one-element list because
# every later cell reads it as curr_dir[0]; os.getcwd() replaces the IPython
# shell call so the cell no longer depends on a `pwd` executable.
curr_dir = [os.getcwd()]
# Minimum-instance cutoff, kept as a string because it is spliced into file names
instance_cutoff = "10"

# Read the list of filtered domains
# NOTE(review): pickle.load can execute arbitrary code while loading; only
# point this at a file produced by a trusted step of the pipeline.
with open(curr_dir[0] + "/../../5.domains_stats/filtered"+instance_cutoff+"_list.pik", 'rb') as handle:
    filtered_domains_list = pickle.load(handle)
# Sorted in place so later loops visit the domains in a stable order
filtered_domains_list.sort()

### Map GO terms to domains

In [2]:
# Build a map domain -> {'names': [...], 'numbers': [...]} from pfam2go.txt,
# count how many analysed domains carry each GO number, and save the result.

# Set copy of the domain list: membership is tested once per pfam2go line, and
# a set makes each test constant-time instead of a scan of the whole list.
filtered_domains = set(filtered_domains_list)

domains_dict = defaultdict(defaultdict)
# Read GO terms line by line (iterating the handle avoids loading the whole file)
with open(curr_dir[0] + '/../../pfam2GO/pfam2go.txt','r') as f:
    for line in f:
        # Skip heading
        if line[0] != 'P':
            continue
        # Skip domains that are not included in the analysis
        # The domain name is read from column 13 up to the character before '>'
        # (presumably the lines start with a fixed-width "Pfam:PFxxxxx " prefix
        # -- TODO confirm against the pfam2go file in use)
        domain = line[13:line.find('>')-1]
        if domain not in filtered_domains:
            continue
        # Map domain to descriptions and GO numbers
        tokens = line.split('GO:')
        # Drop the last three characters that precede the second 'GO:' marker
        name = tokens[1][0:len(tokens[1])-3]
        number = int(tokens[2])
        # If domain not already present, create a new dictionary for it
        if domain not in domains_dict:
            domains_dict[domain] = defaultdict(list)
        domains_dict[domain]['names'].append(name)
        domains_dict[domain]['numbers'].append(number)

# Compute frequency counts
counts = {}                # GO number -> number of domains annotated with it
numbers_to_names = {}      # GO number -> GO term description
numbers_to_domains = {}    # GO number -> period-separated domain names
for k in domains_dict.keys():
    entry = domains_dict[k]
    for num, go_name in zip(entry['numbers'], entry['names']):
        counts[num] = counts.get(num, 0) + 1
        # Direct dict membership; .keys() would build a list on every lookup
        if num not in numbers_to_names:
            numbers_to_names[num] = go_name
            numbers_to_domains[num] = k
        else:
            numbers_to_domains[num] += "." + k
# Most frequent GO numbers first
sorted_counts = sorted(counts.items(),key=operator.itemgetter(1),reverse=True)

# Add to dataframe
# Built in one call from the sorted records (row labels 1..n as before)
# instead of enlarging the frame one cell at a time.
all_counts = pd.DataFrame(
    [(num, cnt, numbers_to_names[num], numbers_to_domains[num]) for num, cnt in sorted_counts],
    columns=['number','count','name','domains'],
    index=range(1, len(sorted_counts) + 1))

# Save to csv
all_counts.to_csv('freq_counts' + instance_cutoff + '.csv')

## Swim up GO tree

Before continuing, run domain_go.R. It moves domains associated with narrow GO terms to broader ones to facilitate grouping, and the next cell reads the resulting freq_counts_broad table.

In [3]:
# Reorganize by domain: turn the term -> domains table into a
# domain -> terms table with one row per domain.

# Read the broadened GO-term table (first csv column becomes the index)
broad_path = curr_dir[0]+"/../freq_counts_broad"+instance_cutoff
freq_counts_broad = pd.read_csv(broad_path,index_col=0)

# "count" is the number of terms a domain appears under; "groups" is the
# period-separated list of those term names.
domain_counts = pd.DataFrame(columns=["count","groups"])
for term_label in freq_counts_broad.index:
    term_name = freq_counts_broad.loc[term_label,"name"]
    for d in freq_counts_broad.loc[term_label,"domains"].split("."):
        if d not in domain_counts.index:
            # First time this domain is seen: start its row
            domain_counts.loc[d,"count"] = 1
            domain_counts.loc[d,"groups"] = term_name
            continue
        # Domain already has a row: extend it
        domain_counts.loc[d,"count"] += 1
        domain_counts.loc[d,"groups"] += "." + term_name

# Alphabetical by domain name
domain_counts = domain_counts.sort_index()

In [4]:
# For domains that have no pfam GO term, find domain with GO term from the same family if possible

def _family_key(domain_name):
    """Return the lower-cased text before the first '_' or '-' of a domain name."""
    return domain_name.replace('-','_').split('_')[0].lower()

# Map family to domain name
# Any match is fine, so a later domain of the same family simply replaces an earlier one
fam_map = {}
for annotated in domain_counts.index:
    fam_map[_family_key(annotated)] = annotated

# Find matches
for domain in filtered_domains_list:
    # Domains that already have GO terms need no borrowing
    if domain in domain_counts.index:
        continue
    fam = _family_key(domain)
    if fam not in fam_map:
        continue
    close_domain = fam_map[fam]
    # Copy row
    domain_counts.loc[domain,"count"] = domain_counts.loc[close_domain,"count"]
    domain_counts.loc[domain,"groups"] = domain_counts.loc[close_domain,"groups"]
    # Update freq_counts: add the domain to every term it has just inherited
    for t in domain_counts.loc[domain,"groups"].split("."):
        term_rows = freq_counts_broad.loc[:,"name"] == t
        freq_counts_broad.loc[term_rows,"domains"] += "." + domain
        freq_counts_broad.loc[term_rows,"count"] += 1

# Save to csv
freq_counts_broad.to_csv('freq_counts_expanded' + instance_cutoff + '.csv')

### Functions to help manual group formation

In [2]:
# Modify freq_counts_broad for logical operations: surround each "domains"
# string with periods so a single domain can be matched as ".name."
for term_label in freq_counts_broad.index:
    current = freq_counts_broad.loc[term_label,"domains"]
    # This cell may be rerun accidentally; a leading period means the row
    # has already been wrapped, so it is left alone
    if current[0] == '.':
        continue
    freq_counts_broad.loc[term_label,"domains"] = "." + current + "."
        
# The union of domains in two or more groups
def combine(groups):
    """Return the union of the domains listed under the given groups.

    groups: iterable of group strings. Each string is one GO term name or
        several names separated by periods; every name is looked up in the
        "name" column of the module-level freq_counts_broad table.

    Returns a period-delimited string (".d1.d2." style, or "." when nothing
    is found) holding each domain once, in order of first appearance.
    Raises IndexError if a name has no row in freq_counts_broad.
    """
    already_added = set()
    ordered = []
    for group in groups:
        # A list of terms can also be a string separated by periods
        for term_name in group.split("."):
            is_term = freq_counts_broad.loc[:,"name"] == term_name
            listed = freq_counts_broad.loc[is_term,"domains"].values[0]
            for d in listed.split("."):
                # Splitting yields empty strings around the delimiters; skip them
                if d and d not in already_added:
                    already_added.add(d)
                    ordered.append(d)
    return "".join(["."] + [d + "." for d in ordered])

# Remove one or more groups and all domains associated from freq_counts_broad
def remove(groups):
    """Return a copy of freq_counts_broad without the given groups' domains.

    groups: iterable of GO term names, each matched against the "name"
        column of the module-level freq_counts_broad table.

    Every domain listed under one of these groups is deleted from the
    "domains" string of every row, and that row's "count" is lowered by one
    per deleted domain. Rows whose count reaches 0 are dropped. The
    module-level table itself is not modified.

    The "domains" strings are expected in the period-wrapped form
    (".d1.d2."); a domain is only matched when it has a period on both sides.
    Raises IndexError if a name has no row in the table.
    """
    freq_counts_removed = freq_counts_broad.copy()
    for g in groups:
        is_group = freq_counts_removed.loc[:,"name"] == g
        group_domains = freq_counts_removed.loc[is_group,"domains"].values[0]
        for d in group_domains.split("."):
            # Splitting a wrapped string yields empty tokens at both ends;
            # they are not domains and must not take part in the removal
            if not d:
                continue
            delimited = "." + d + "."
            for row_label in freq_counts_removed.index:
                row_domains = freq_counts_removed.loc[row_label,"domains"]
                if delimited not in row_domains:
                    continue
                freq_counts_removed.loc[row_label,"count"] -= 1
                # Drop only whole tokens equal to d. A plain substring replace
                # of d + "." would also cut the tail off any longer domain
                # name that merely ends with d.
                kept = [token for token in row_domains.split(".") if token != d]
                freq_counts_removed.loc[row_label,"domains"] = ".".join(kept)
    # Remove rows with no entries left
    emptied = freq_counts_removed.loc[freq_counts_removed.loc[:,"count"] == 0,:].index
    freq_counts_removed = freq_counts_removed.drop(emptied)
    return freq_counts_removed

In [9]:
# Organize categories into a table
import pandas as pd

# List of groups — a '.' indicates a group is a combination of GO terms
groups = [
    "ATP binding",
    "DNA binding",
    "nucleic acid binding.nucleotide binding",
    "protein binding",
    "metal ion binding",
    "catalytic activity",
    "metabolic process",
    "membrane",
    "signal transduction",
    "intracellular",
]
# Working list from which the current group is temporarily taken out
copy = list(groups)


def _n_domains(term_list):
    """Number of distinct domains in the union returned by combine()."""
    # combine() wraps its result in periods, so split() yields two empty
    # strings in addition to the domain names
    return len(combine(term_list).split(".")) - 2


# Size of the union over every group
n_all = _n_domains(groups)

# Initialize and populate dataframe (one row per group, labelled by its first term)
categories = pd.DataFrame(columns=["domains","total overlap","ATP binding overlap"])
for g in groups:
    terms = g.split(".")
    label = terms[0]

    # Find number of domains in group
    # NOTE(review): this adds up the per-term "count" values, so a domain
    # listed under two terms of a combined group is counted twice -- confirm
    # that this is intended.
    categories.loc[label,"domains"] = 0
    for t in terms:
        categories.loc[label,"domains"] += freq_counts_broad.loc[freq_counts_broad.loc[:,"name"] == t,"count"].values[0]
    n_group = _n_domains(terms)

    # Compute overlap — total and with other groups
    # |A ∩ B| = |A| + |B| - |A ∪ B|, with B being all the other groups
    copy.remove(g)
    n_rest = _n_domains(copy)
    categories.loc[label,"total overlap"] = n_group + n_rest - n_all
    for other_group in copy:
        other_terms = other_group.split(".")
        n_other = _n_domains(other_terms)
        n_union = _n_domains(terms+other_terms)
        categories.loc[label,other_terms[0]+" overlap"] = n_group + n_other - n_union
    # Put the group back for the next iteration
    copy.append(g)
categories

Unnamed: 0,domains,total overlap,ATP binding overlap,DNA binding overlap,nucleic acid binding overlap,protein binding overlap,metal ion binding overlap,catalytic activity overlap,metabolic process overlap,membrane overlap,signal transduction overlap,intracellular overlap
ATP binding,7,7,,1.0,7.0,0.0,0.0,4.0,0.0,1.0,0.0,0.0
DNA binding,5,5,1.0,,5.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0
nucleic acid binding,20,14,7.0,5.0,,0.0,0.0,4.0,3.0,1.0,2.0,1.0
protein binding,51,2,0.0,0.0,0.0,,2.0,0.0,0.0,0.0,0.0,0.0
metal ion binding,31,16,0.0,0.0,0.0,2.0,,1.0,1.0,5.0,0.0,8.0
catalytic activity,10,10,4.0,1.0,4.0,0.0,1.0,,6.0,1.0,0.0,0.0
metabolic process,11,10,0.0,2.0,3.0,0.0,1.0,6.0,,0.0,1.0,1.0
membrane,14,10,1.0,0.0,1.0,0.0,5.0,1.0,0.0,,4.0,0.0
signal transduction,11,7,0.0,0.0,2.0,0.0,0.0,0.0,1.0,4.0,,0.0
intracellular,12,9,0.0,0.0,1.0,0.0,8.0,0.0,1.0,0.0,0.0,
