In [18]:
from Bio import SeqIO
import itertools
import numpy as np
import pandas as pd

In [122]:
np.set_printoptions(suppress=True)

In [227]:
class Neighbor_Joining_Tree(object):
    
    def __init__(self, fna):
        self.D = None
        self.sums = None
        self.n = 0
        self.node_ids = None
        self.sequences, self.identifiers = self.parse_sequences(fna)
        self.tree = np.zeros(shape=(1, 3))
        self.Q = None
    
    
    def initialize_last_variables(self):
        self.sums = self.D.sum(axis=0)
        self.n = len(self.D[0])
        self.node_ids = np.arange(1, len(self.D)+1, 1) # 1 based counting, not 0 based
    
    
    def parse_sequences(self, fna):
        sequences = []
        identifiers = []
        fna_obj = list(SeqIO.parse(fna, 'fasta'))
        for i, seq in enumerate(fna_obj):
            sequences.append(seq.seq)
            identifiers.append(seq.id)
        return sequences, identifiers
    
    
    def calculate_dissimilarity_score(self, a, b):
        '''
        If letters do not match, add 1. Otherwise add 0. 
        Sequences a and b are the same length.
        Returns dissimilarity score as float. 
        '''
        score = 0.00
        length = len(a)

        for i, char_a in enumerate(a):
            char_b = b[i]
            if char_a != char_b:
                score += 1

        return score/length
    
    
    def calculate_D_matrix(self):
        print('Calculating D matrix...')
        
        distances = []
        for pair in itertools.product(self.sequences, repeat=2): # repeat=2? ... and we are doing too much work here but its ok for now, can just do half of the work and flip the matrix across the diagonal
            distances.append(self.calculate_dissimilarity_score(pair[0], pair[1]))
        
        # Reshape distances array into matrix based on number of sequences
        self.D = np.reshape(distances, (len(self.sequences), len(self.sequences)))

        
    def print_D_matrix(self):
        filename = 'pairwise_dissimilarity.txt'
        print('Printing D matrix to ' + filename + '...')
        
        distances_df = pd.DataFrame(self.D, columns=self.identifiers) 
        distances_df.insert(0, 'id', self.identifiers) # Add row index
        distances_df = distances_df.reset_index(drop=True).set_index('id')
        
        # Save pairwise dissimilarities to tab-delimited file 
        distances_df.to_csv(filename, sep='\t')
        
    
    def calculate_q_cell(self, distance, sums_i, sums_j):
        '''
        Calculates and returns cell value for the Q matrix. 

        n — number of sequences
        distance — pairwise dissimilarity between pair
        a_sums — summed distances from node a to all the other nodes
        b_sums — summed distances from node b to all the other nodes
        '''

        return (self.n-2) * distance - sums_i - sums_j

    
    def calculate_Q_matrix(self):
        Q_matrix = np.zeros_like(self.D)

        for i, row in enumerate(self.D):
            sums_i = self.sums[i]
            for j, distance in enumerate(row):
                # If diagonal cell, set to infinity (to avoid it from ever being the minimum)
                if i == j:
                    Q_matrix[j, i] = None
                else: 
                    # Get terms for neighbor joining equation
                    sums_j = self.sums[j]
                    # Calculate and set value for this cell in Q matrix
                    Q_matrix[j, i] = self.calculate_q_cell(distance, sums_i, sums_j)

        return Q_matrix
    
    
    def calculate_first_branch_length(self, distance, sums_f, sums_g):
        return distance/2 + (sums_f - sums_g)/(2*(self.n-2))

    
    def calculate_second_branch_length(self, distance, first_branch_length):
        return abs(distance - first_branch_length) 
    
    def calculate_uk(self, fk, gk, fg):
        '''
        Calculates and returns distance of node u to node k. 

        dist_fk — distance of node f to node k
        dist_gk — distance of node g to node k
        dist_fg — distance of node f to node g

        where f and g are members of the pair just joined.
        '''

        return (fk + gk - fg)/2
    
    
    def get_branches(self, f, g):
        # Get ids for each element
        u_id = self.node_ids[-1]
        self.node_ids = np.append(self.node_ids, self.node_ids[-1] + 1)
        u_id = self.node_ids[-1]
        f_id = self.node_ids[f]
        g_id = self.node_ids[g]

        # Calculate lengths of branches joining f and g to u
        distance_fg = self.D[f, g]
        delta_fu = self.calculate_first_branch_length(distance_fg, self.sums[f], self.sums[g])
        delta_gu = self.calculate_second_branch_length(distance_fg, delta_fu)

        branch_fu = [u_id, f_id, delta_fu]
        branch_gu = [u_id, g_id, delta_gu]
        return branch_fu, branch_gu
        
    def update_D(self, f, g):
        row_u = np.zeros_like(self.D[0]) 
        for k,_ in enumerate(self.D):
            row_u[k] = self.calculate_uk(self.D[f, k], self.D[g, k], self.D[f, g])

        # Update D matrix with these distances
        column_u = row_u[:, np.newaxis] 
        column_u = np.vstack([column_u, 0]) # Add its own diagonal value
        self.D = np.vstack((self.D, row_u))
        self.D = np.hstack((self.D, column_u))

        
    def run(self):
        # Task 1
        self.calculate_D_matrix()
        self.print_D_matrix()
        
        # Task 2
        self.initialize_last_variables() # which rely on D
        
        # Loop until D contains 59 elements
        while len(self.D) >= 3:
            # Calculate the join score of each pair (#1)
            #print('\nCalculating Q matrix...')
            self.Q = np.round(self.calculate_Q_matrix(), 10) #1 (rounding cuz weirdly it had an issue with precision past this amount, would be slightly different row vs column)
            self.Q = np.ma.masked_invalid(self.Q) # need to mask nans cuz nan cannot be used in np.where
            # np.savetxt('foo ' + str(len(self.Q)) + '.csv', self.Q, delimiter=",")
            
            # Find the pair with the minimum join score (#2)
            #print(np.ma.where(self.Q == np.amin(self.Q)))
            f, g = np.ma.where(self.Q == np.amin(self.Q))[0]
            #print('The two nodes to be joined have ids: ' + str(f) + ' and ' + str(g))
            
            # Add branch lengths for this pair to their ancestral node u (#3)
            #print('Adding branch lengths from this pair to new, ancestral node...')
            branch_fu, branch_gu = self.get_branches(f, g)
            self.tree = np.vstack((self.tree, branch_fu))
            self.tree = np.vstack((self.tree, branch_gu))
            
            # Calculate distance between u and every other node k outside of this pair 
            # and update D matrix with these distances (#4)
            #print('Updating distances between new, ancestral node and every other node...')
            self.update_D(f, g)
            
            # Remove this pair 
            #print('Removing two nodes...')
            self.D = np.delete(self.D, [f, g], axis=1)
            self.D = np.delete(self.D, [f, g], axis=0)
            self.sums = np.delete(self.sums, [f, g])
            self.node_ids = np.delete(self.node_ids, [f, g])
            # Update distance sums
            self.sums = np.append(self.sums, self.D[-1].sum())
        
        self.tree = np.delete(njt.tree, 0, axis=0) # remove that initialized beginning [0,0,0]
        print(self.tree)
        
        

In [221]:
alignment_file = 'hw3.fna'
njt = Neighbor_Joining_Tree(alignment_file)
njt.run()

Calculating D matrix...
Printing D matrix to pairwise_dissimilarity.txt...
[[  0.           0.           0.        ]
 [ 62.           1.           0.11558729]
 [ 62.           2.           0.12465497]
 [ 63.           3.           0.13464938]
 [ 63.          62.           0.01911912]
 [ 64.          16.           0.04752264]
 [ 64.          33.           0.03390401]
 [ 65.          30.           0.03442868]
 [ 65.          60.           0.0261366 ]
 [ 66.           9.           0.08185437]
 [ 66.          59.           0.07561535]
 [ 67.          25.           0.02273764]
 [ 67.          29.           0.020331  ]
 [ 68.          10.           0.0613865 ]
 [ 68.          57.           0.05436047]
 [ 69.          17.           0.03507881]
 [ 69.          18.           0.03154299]
 [ 70.          23.           0.03406369]
 [ 70.          28.           0.03996054]
 [ 71.          35.           0.06020456]
 [ 71.          65.           0.00204308]
 [ 72.          34.           0.04173985]
 

In [226]:
np.delete(njt.tree, 0, axis=0)

array([[ 62.        ,   1.        ,   0.11558729],
       [ 62.        ,   2.        ,   0.12465497],
       [ 63.        ,   3.        ,   0.13464938],
       [ 63.        ,  62.        ,   0.01911912],
       [ 64.        ,  16.        ,   0.04752264],
       [ 64.        ,  33.        ,   0.03390401],
       [ 65.        ,  30.        ,   0.03442868],
       [ 65.        ,  60.        ,   0.0261366 ],
       [ 66.        ,   9.        ,   0.08185437],
       [ 66.        ,  59.        ,   0.07561535],
       [ 67.        ,  25.        ,   0.02273764],
       [ 67.        ,  29.        ,   0.020331  ],
       [ 68.        ,  10.        ,   0.0613865 ],
       [ 68.        ,  57.        ,   0.05436047],
       [ 69.        ,  17.        ,   0.03507881],
       [ 69.        ,  18.        ,   0.03154299],
       [ 70.        ,  23.        ,   0.03406369],
       [ 70.        ,  28.        ,   0.03996054],
       [ 71.        ,  35.        ,   0.06020456],
       [ 71.        ,  65.     

Tomorrow: 
- Need to figure out how to assign new u nodes ids that counting backwards from 119. And why do I have 120?
