## Compute clustering of non-synonymous mutations

### Requirements:
1. List of domains
2. Domain dictionaries

### Instructions:
Run cells in order

### Output:
Two CSV files (`clustering_percentile.csv` and `clustering_binom.csv`), each containing clustering scores for every domain

In [1]:
import pandas as pd
import numpy as np
import cPickle as pickle
import datetime
import math
import sys
curr_dir = !pwd
sys.path.append(curr_dir[0] + "/../5.HMM_alter_align") 
from calc_exac_freq_func import codon_table
from dnds_func import calculate_ns, seq_ns
from collections import defaultdict

# Resolve the notebook's working directory with the IPython shell capture.
# The capture is list-like (one entry per output line), so curr_dir[0] is the
# directory path used as the base for every relative path below.
curr_dir = !pwd
# Cutoff value embedded in the input file names ("filtered10_...").
# NOTE(review): the name looks like a typo of "instance_cutoff"; it is kept
# as-is because other cells may reference it.
intance_cutoff = "10"

# Load the pickled list of filtered domain names and sort it in place so the
# domains are always processed in the same order.
# NOTE(review): pickle.load executes arbitrary code from the file; only use
# with pickles produced by this project.
with open(curr_dir[0]+"/../5.domains_stats/filtered"+intance_cutoff+"_list.pik", 'rb') as handle:
    filtered_domains_list = pickle.load(handle)
filtered_domains_list.sort()

# Load the tab-separated table of per-domain statistics (first column is the index).
filtered_domains_df = pd.read_csv(curr_dir[0]+"/../5.domains_stats/filtered"+intance_cutoff+"_domains_df.csv",
                                  sep='\t', index_col=0)
# Base directory holding one sub-directory of pickled state dictionaries per domain.
input_path = curr_dir[0]+"/../5.HMM_alter_align/domains_states_dicts/pfam-v30/"

### Compute P<sub>u</sub> from Wagner, 2007

In [2]:
# Computes the variance of the distance array
def calc_stat(sample):
    """Return the population variance of the gaps between marked positions.

    ``sample`` is a 0/1 sequence. The positions holding a 1 are padded with
    the two boundaries, giving ``[0, <indices of ones>, len(sample) - 1]``,
    and the statistic is the variance (ddof=0) of the consecutive
    differences of that list.

    The variance is evaluated as ``(n*sum(d^2) - sum(d)^2) / n^2`` with exact
    integer sums, so two arrangements with the same multiset of gaps always
    yield bit-identical values regardless of gap order. This matters because
    the permutation test below compares statistics with ``<=``.
    """
    marks = [0]
    marks.extend(idx for idx, flag in enumerate(sample) if flag == 1)
    marks.append(len(sample) - 1)

    # Consecutive differences; marks always has >= 2 entries, so gaps is non-empty
    gaps = [right - left for left, right in zip(marks, marks[1:])]
    n_gaps = len(gaps)
    gap_sum = sum(gaps)
    gap_sq_sum = sum(g * g for g in gaps)
    return (n_gaps * gap_sq_sum - gap_sum * gap_sum) / float(n_gaps * n_gaps)

# Build the null distribution: the statistic of n_iter random permutations
def gen_random(sample, n_iter=10000):
    """Return ``calc_stat`` evaluated on ``n_iter`` random shuffles of ``sample``.

    The shuffling is done on a private copy, so the caller's sequence keeps
    its original order.
    """
    shuffled = list(sample)
    stats = []
    for _ in range(n_iter):
        np.random.shuffle(shuffled)
        stats.append(calc_stat(shuffled))
    return(stats)

# Performs the permutation test to calculate Pu
def p_val(sample, n_iter=10000):
    """Return the fraction of random permutations whose statistic is >= the observed one.

    ``n_iter`` is the number of permutations (default 10000).
    Raises ValueError if ``n_iter`` is smaller than 1.
    """
    if n_iter < 1:
        raise ValueError("n_iter must be at least 1")
    stat = calc_stat(sample)
    rands = gen_random(sample, n_iter)
    # Less than or equals fixes the case if there are 0 mutations: every
    # permutation then has the same statistic and the result is 1.0, not 0.0
    counter = sum(1 for rand in rands if stat <= rand)
    return(counter / float(n_iter))

### Find the most recent file for a given domain

In [3]:
def recent_file(path,domain):
    """Return the name of the most recently dated file in the directory ``path + domain``.

    The date is parsed from the last ``_``-separated token of each file name,
    which must look like ``<month>.<day>.pik`` or ``<month>.<day>.<year>.pik``.
    Files without a year are treated as year 0, so any file carrying a year
    outranks them.

    Entries are visited newest-modified first (hidden entries skipped), the
    same order ``ls -t`` produces, so among files with equal parsed dates the
    most recently modified one is returned.

    Returns "" when the directory contains no visible entries.
    Raises OSError if the directory cannot be listed, and ValueError or
    IndexError if a file name does not follow the expected pattern.
    """
    # Local import: the notebook's import cell does not bring in os
    import os

    domain_dir = path + domain
    names = [f for f in os.listdir(domain_dir) if not f.startswith(".")]
    # Newest modification time first, name as a deterministic tie-breaker
    names.sort(key=lambda f: (-os.path.getmtime(os.path.join(domain_dir, f)), f))

    recent_date = None
    recent_filename = ""
    for f in names:
        date = f.split("_")[-1].split(".")
        month = int(date[0])
        day = int(date[1])
        # Not all files have years, but those that do are the most recent
        year = int(date[2]) if date[2] != "pik" else 0
        parsed = (year, month, day)
        # Strict comparison keeps the first (newest-modified) file on ties
        if recent_date is None or parsed > recent_date:
            recent_date = parsed
            recent_filename = f
    return(recent_filename)

### Average non-synonymous mutation rates across instances

In [7]:
# One averaged rate per position (state), pooled over every domain in the list.
# NOTE(review): positions are visited in dict iteration order.
nonsyn_rates = []

for domain_name in filtered_domains_list:
    # Unpickle the newest state dictionary stored for this domain
    domain_dir = input_path + domain_name + "/"
    with open(domain_dir + recent_file(input_path, domain_name), 'rb') as handle:
        states_dict = pickle.load(handle)

    for state, instances in states_dict.items():
        # Add up the first element of every alteration entry over all instances
        # (presumably the adjusted frequency of the alteration -- TODO confirm)
        rate_total = 0
        for instance in instances:
            for alteration in instance['alterations_af_adj_dict'].values():
                rate_total += alteration[0]
        # A zero total is stored as 0 directly, which also avoids dividing by
        # an empty instance list
        nonsyn_rates.append(rate_total / len(instances) if rate_total != 0 else 0)

# Rate cut-offs at the 95th, 90th, 85th and 80th percentiles of the pooled rates
rates = np.asarray(nonsyn_rates)
percentiles = np.percentile(rates, [95, 90, 85, 80])

### Compute clustering from data binned with fixed thresholds

In [None]:
def clustering_fixed_cutoff(thresh,labels):
    """Score every filtered domain for clustering at each fixed rate threshold.

    thresh -- sequence of rate cut-offs
    labels -- sequence of names, one per cut-off, used as column headers

    For each domain the per-position rates are binarised (1 where the rate is
    >= the cut-off, 0 otherwise). Two columns are filled per cut-off:
    ``<label>`` holds ``p_val`` of the binary vector and ``<label> ratio``
    holds the fraction of positions set to 1.

    Returns a DataFrame indexed by domain name.
    """
    scores = pd.DataFrame()
    for domain_name in filtered_domains_list:
        # Unpickle the newest state dictionary stored for this domain
        newest = recent_file(input_path, domain_name)
        with open(input_path + domain_name + "/" + newest, 'rb') as handle:
            states_dict = pickle.load(handle)

        # Averaged rate per position; a zero total is stored as 0 directly,
        # which also avoids dividing by an empty instance list.
        # NOTE(review): positions follow dict iteration order.
        position_rates = []
        for state in states_dict:
            instances = states_dict[state]
            rate_total = 0
            for instance in instances:
                for alteration in instance['alterations_af_adj_dict'].values():
                    rate_total += alteration[0]
            position_rates.append(0 if rate_total == 0 else rate_total / len(instances))

        # Binarise against each cut-off and record the score and marked fraction
        for idx, cutoff in enumerate(thresh):
            column = str(labels[idx])
            flags = [1 if rate >= cutoff else 0 for rate in position_rates]
            scores.loc[domain_name, column] = p_val(flags)
            scores.loc[domain_name, column + " ratio"] = float(sum(flags)) / len(flags)
    return(scores)

# Example run: the percentile cut-offs serve as thresholds, and each label
# becomes a column header in the CSV that is written out.
percentile_labels = ["95", "90", "85", "80"]
scores = clustering_fixed_cutoff(percentiles, percentile_labels)
scores.to_csv("clustering_percentile.csv")

### Compute clustering from data binned with the binomial test of Tamborero et al., 2013

In [None]:
from scipy.stats import binom

# Success probability for the binomial test: mean of the pooled per-position rates
thresh = np.mean(nonsyn_rates)
scores = pd.DataFrame()
for domain_name in filtered_domains_list:
    # Unpickle the newest state dictionary stored for this domain
    newest = recent_file(input_path, domain_name)
    with open(input_path + domain_name + "/" + newest, 'rb') as handle:
        states_dict = pickle.load(handle)

    # Per position: the estimated mutation count and the number sampled.
    # Instances with no alterations contribute to neither total.
    mutations = []
    people = []
    for state in states_dict:
        mutation_count = 0
        sampled = 0
        for instance in states_dict[state]:
            alterations = instance['alterations_af_adj_dict']
            if not alterations:
                continue
            # Largest 'an_adj' value of the instance, used as its sample size
            # (presumably the adjusted allele number -- TODO confirm)
            sample_size = max(instance['an_adj'])
            for alteration in alterations.values():
                mutation_count += alteration[0] * sample_size
            sampled += sample_size
        mutations.append(mutation_count)
        people.append(sampled)

    # Mark a position with 1 when the binomial CDF at its rounded count reaches 0.99
    bin_rates = [
        1 if binom.cdf(np.round(count), trials, thresh) >= 0.99 else 0
        for count, trials in zip(mutations, people)
    ]

    # Clustering score and fraction of marked positions for this domain
    scores.loc[domain_name, "0.99"] = p_val(bin_rates)
    scores.loc[domain_name, "0.99 ratio"] = float(sum(bin_rates)) / len(bin_rates)

# Write the per-domain scores to disk
scores.to_csv("clustering_binom.csv")