In [77]:
from newick import loads
from newick import read

In [2]:
class Node():
    ''' A binary tree node. From A3 2a.py

    Arguments:
        name: label of the node
        left: left child (Node), or None if there is none (default: None)
        right: right child (Node), or None if there is none (default: None)
    '''
    def __init__(self, name, left=None, right=None):
        # Defaults let a leaf be written as Node(name); the explicit
        # three-argument form used elsewhere keeps working unchanged.
        self.name = name
        self.left = left
        self.right = right

In [3]:
# One placeholder leaf per label "H0".."H53", stored so that list index == label number.
# Index 0 ("H0") is a dummy entry that will not be used.
H = [Node("H" + str(idx), None, None) for idx in range(54)]

In [4]:
# Parse a small example tree to explore the objects returned by newick.loads().
newick_string = "(A:0.1,B:0.2,(C:0.3,D:0.4)E:0.5)F;"
tree = loads(newick_string)[0]  # the result of loads() is indexed; [0] is this tree's root
print(tree.ascii_art())

# Access nodes in the tree
print(tree.name)  # F
print([n.descendants for n in tree.descendants])  # children of A, B and E: [[], [], [C, D]]
print(tree.descendants)  # direct children of the root: [A, B, E]

    ┌─A
──F─┼─B
    │   ┌─C
    └─E─┤
        └─D
F
[[], [], [Node("C"), Node("D")]]
[Node("A"), Node("B"), Node("E")]


In [5]:
def post_order(root):
    '''Returns the node names of the tree in post-order (children before parent).

    Siblings are emitted right-to-left: the last entry of `descendants`
    comes first. For the tree (A,B,(C,D)E)F the result is
    ['D', 'C', 'E', 'B', 'A', 'F'].

    input:  root: a node exposing `.name` and a list `.descendants`, or None
    output: list of names (None for unnamed nodes); [] when root is None
    '''
    if root is None:
        return []

    names = []
    # Appending to one list replaces the original `post_order(n) + result`,
    # which copied the whole accumulated list for every child.
    for child in reversed(root.descendants):
        names.extend(post_order(child))
    names.append(root.name)
    return names
    
# Module-level state shared by create_tree():
#   nj_dict: node label -> list of the labels of its children
#   count:   number used to build the generated "I<count>" labels
nj_dict = {}
count = 1
def create_tree(root):
    '''Recursively records the tree under `root` in the global nj_dict.

    Named nodes keep their name as label; unnamed nodes (name is None) are
    labelled "I<count>", with the global counter incremented before each
    unnamed child is visited.

    input:  root: a node exposing `.name` and a list `.descendants`, or None
    output: the label of `root` (string), or [] when root is None
    '''
    # Required: `count` is assigned below, so without this declaration Python
    # treats it as a local and raises UnboundLocalError on the first unnamed node.
    global count

    if root is None:
        return []

    name = root.name
    if name is None:
        name = "I" + str(count)

    nj_dict[name] = []

    for n in root.descendants:
        if n.name is None:
            count = count + 1
        nj_dict[name].append(create_tree(n))

    return name

In [6]:
class Tree():
    ''' Labels the nodes of a parsed tree and records each node's children.

    Arguments:
        count: counter used to build "I<count>" labels for unnamed nodes (int)
        root: root of tree (Node)

    Attributes:
        format_dict: each key is a label for a node mapping to a list of
            the labels of its children nodes
            (dict with keys as strings and items as lists of strings);
            starts empty and is filled by create_tree
    '''
    def __init__(self, count, root):
        self.count, self.root = count, root
        self.format_dict = {}

    def create_tree(self, parent):
        '''Recursively fills format_dict for the subtree under `parent`.

        A node with a name keeps it as its label. A node whose name is None
        is labelled "I<count>"; the counter is bumped right before each
        unnamed child is visited.

        input:  parent: root of the (sub)tree (Node from the newick library)
        output: the label given to `parent`, or [] when parent is None
        '''
        if parent is None:
            return []

        label = parent.name
        if label is None:
            label = "I" + str(self.count)

        # Register the parent before descending, so keys appear parent-first.
        self.format_dict[label] = []

        for child in parent.descendants:
            if child.name is None:
                self.count += 1
            # Look the list up before recursing, as the original expression did.
            add_child = self.format_dict[label].append
            add_child(self.create_tree(child))

        return label

In [7]:
# Check post_order on the example tree: children are listed before their parent.
post_order(tree)

['D', 'C', 'E', 'B', 'A', 'F']

In [8]:
# Build the label -> child-labels dictionary for the example tree.
# Every node in this tree is named, so no "I<k>" labels are generated here.
tst = Tree(1, tree)
tst.create_tree(tree)
print(tst.format_dict)

{'F': ['A', 'B', 'E'], 'A': [], 'B': [], 'E': ['C', 'D'], 'C': [], 'D': []}


In [9]:
nj_newick = "((HQ992964.1:0.009801, ((HQ992967.1:0.006329, AF291586.2:0.007497):0.001608, (((HQ992975.1:0.007220, (AY849734.1:0.006738, AY849716.1:0.007087):0.012594):0.006436, (((MT891307.1:0.006635, HQ992970.1:0.011856):0.006557, ((((HQ992978.1:0.002646, (AF291581.2:0.001918, HQ992965.1:0.004963):0.001948):0.003689, (HQ992969.1:0.003126, AF291585.2:0.003755):0.000361):0.002854, (HQ992966.1:0.017244, (HQ992974.1:0.002841, AY849721.1:0.004040):0.003681):0.001843):0.003716, ((MT891296.1:0.005974, (MT891298.1:0.002834, MT891300.1:0.006354):0.000938):0.007321, ((MT891295.1:0.004965, (HQ992985.1:0.007388, (MT891297.1:0.004644, MT891294.1:0.004544):0.001847):0.001948):0.004581, (MT891306.1:0.002503, (MT891304.1:0.014300, (MT891299.1:0.002205, (MT891305.1:0.004514, (MT891302.1:0.002112, (MT891301.1:0.006916, MT891303.1:0.011575):0.000149):0.002405):0.000076):0.001495):0.001144):0.001425):0.003603):0.003762):0.000907):0.001928, (AF291583.2:0.010963, ((MT891308.1:0.008849, (((HQ992980.1:0.005906, AY849731.1:0.010248):0.003757, (HQ992984.1:0.005731, MT891309.1:0.008094):0.000920):0.003093, (HQ992982.1:0.008279, ((HQ992968.1:0.005489, HQ992971.1:0.010665):0.002199, (AY849715.1:0.005954, (AF291579.2:0.006098, AF291582.2:0.007727):0.002123):0.000141):0.002138):0.001019):0.000690):0.005387, (AF291584.2:0.007851, HQ992973.1:0.012983):0.001698):0.001661):0.001065):0.001099):0.000975, ((HQ992983.1:0.007949, AY849733.1:0.010542):0.002695, (AY849729.1:0.007352, (MT891310.1:0.006215, (HQ992976.1:0.006035, (HQ992977.1:0.007522, AY849725.1:0.006304):0.002042):0.001864):0.000143):0.002140):0.001676):0.008108):0.003072):0.005517, HQ992981.1:0.005517);"

In [10]:
# Show the raw Newick string held in nj_newick.
print(nj_newick)

((HQ992964.1:0.009801, ((HQ992967.1:0.006329, AF291586.2:0.007497):0.001608, (((HQ992975.1:0.007220, (AY849734.1:0.006738, AY849716.1:0.007087):0.012594):0.006436, (((MT891307.1:0.006635, HQ992970.1:0.011856):0.006557, ((((HQ992978.1:0.002646, (AF291581.2:0.001918, HQ992965.1:0.004963):0.001948):0.003689, (HQ992969.1:0.003126, AF291585.2:0.003755):0.000361):0.002854, (HQ992966.1:0.017244, (HQ992974.1:0.002841, AY849721.1:0.004040):0.003681):0.001843):0.003716, ((MT891296.1:0.005974, (MT891298.1:0.002834, MT891300.1:0.006354):0.000938):0.007321, ((MT891295.1:0.004965, (HQ992985.1:0.007388, (MT891297.1:0.004644, MT891294.1:0.004544):0.001847):0.001948):0.004581, (MT891306.1:0.002503, (MT891304.1:0.014300, (MT891299.1:0.002205, (MT891305.1:0.004514, (MT891302.1:0.002112, (MT891301.1:0.006916, MT891303.1:0.011575):0.000149):0.002405):0.000076):0.001495):0.001144):0.001425):0.003603):0.003762):0.000907):0.001928, (AF291583.2:0.010963, ((MT891308.1:0.008849, (((HQ992980.1:0.005906, AY849731.

In [11]:
# Parse the Newick string; nj_tree[0] is used as the tree's root node from here on.
nj_tree = loads(nj_newick)
nj_post_order = post_order(nj_tree[0])
print(nj_post_order)  # unnamed internal nodes appear as None


['HQ992981.1', 'AY849725.1', 'HQ992977.1', None, 'HQ992976.1', None, 'MT891310.1', None, 'AY849729.1', None, 'AY849733.1', 'HQ992983.1', None, None, 'HQ992973.1', 'AF291584.2', None, 'AF291582.2', 'AF291579.2', None, 'AY849715.1', None, 'HQ992971.1', 'HQ992968.1', None, None, 'HQ992982.1', None, 'MT891309.1', 'HQ992984.1', None, 'AY849731.1', 'HQ992980.1', None, None, None, 'MT891308.1', None, None, 'AF291583.2', None, 'MT891303.1', 'MT891301.1', None, 'MT891302.1', None, 'MT891305.1', None, 'MT891299.1', None, 'MT891304.1', None, 'MT891306.1', None, 'MT891294.1', 'MT891297.1', None, 'HQ992985.1', None, 'MT891295.1', None, None, 'MT891300.1', 'MT891298.1', None, 'MT891296.1', None, None, 'AY849721.1', 'HQ992974.1', None, 'HQ992966.1', None, 'AF291585.2', 'HQ992969.1', None, 'HQ992965.1', 'AF291581.2', None, 'HQ992978.1', None, None, None, None, 'HQ992970.1', 'MT891307.1', None, None, None, 'AY849716.1', 'AY849734.1', None, 'HQ992975.1', None, None, None, 'AF291586.2', 'HQ992967.1', Non

In [12]:
# Draw the parsed tree as text to eyeball its topology.
print(nj_tree[0].ascii_art())

                          ┌─HQ992964.1
                          │                         ┌─HQ992967.1
                          │            ┌────────────┤
                          │            │            └─AF291586.2
                          │            │                                      ┌─HQ992975.1
                          │            │                         ┌────────────┤
                          │            │                         │            │            ┌─AY849734.1
                          │            │                         │            └────────────┤
                          │            │                         │                         └─AY849716.1
                          │            │                         │                                      ┌─MT891307.1
                          │            │                         │                         ┌────────────┤
                          │            │                         │                

In [13]:
# Label every node of the parsed tree; unnamed internal nodes become "I1", "I2", ...
t = Tree(1, nj_tree[0])
t.create_tree(nj_tree[0])
nj_dict = t.format_dict  # NOTE: rebinds the module-level nj_dict defined next to create_tree()
print(nj_dict)

{'I1': ['I2', 'HQ992981.1'], 'I2': ['HQ992964.1', 'I3'], 'HQ992964.1': [], 'I3': ['I4', 'I5'], 'I4': ['HQ992967.1', 'AF291586.2'], 'HQ992967.1': [], 'AF291586.2': [], 'I5': ['I6', 'I46'], 'I6': ['I7', 'I9'], 'I7': ['HQ992975.1', 'I8'], 'HQ992975.1': [], 'I8': ['AY849734.1', 'AY849716.1'], 'AY849734.1': [], 'AY849716.1': [], 'I9': ['I10', 'I33'], 'I10': ['I11', 'I12'], 'I11': ['MT891307.1', 'HQ992970.1'], 'MT891307.1': [], 'HQ992970.1': [], 'I12': ['I13', 'I20'], 'I13': ['I14', 'I18'], 'I14': ['I15', 'I17'], 'I15': ['HQ992978.1', 'I16'], 'HQ992978.1': [], 'I16': ['AF291581.2', 'HQ992965.1'], 'AF291581.2': [], 'HQ992965.1': [], 'I17': ['HQ992969.1', 'AF291585.2'], 'HQ992969.1': [], 'AF291585.2': [], 'I18': ['HQ992966.1', 'I19'], 'HQ992966.1': [], 'I19': ['HQ992974.1', 'AY849721.1'], 'HQ992974.1': [], 'AY849721.1': [], 'I20': ['I21', 'I23'], 'I21': ['MT891296.1', 'I22'], 'MT891296.1': [], 'I22': ['MT891298.1', 'MT891300.1'], 'MT891298.1': [], 'MT891300.1': [], 'I23': ['I24', 'I27'], 'I24'

In [63]:
def read_map_txt(filename):
    '''Reads a two-column, whitespace-separated mapping file.

    Every non-blank line is split on whitespace; the SECOND field becomes the
    key and the FIRST field the value. With lines such as "H1 MT891293.1"
    this yields accession -> haplotype label ("MT891293.1" -> "H1").

    Blank or whitespace-only lines are skipped (they previously raised
    IndexError). A non-blank line with fewer than two fields still raises
    IndexError.

    input:  filename: filename which has the mapping between haplotypes and IDs
    output: map_hap: a dictionary which maps ID -> haplotype
    '''
    map_hap = {}
    with open(filename, "r") as f:
        # Iterate the file lazily instead of materialising it with readlines().
        for line in f:
            fields = line.split()
            if not fields:
                continue
            map_hap[fields[1]] = fields[0]
    return map_hap
    
def read_rev_map_txt(filename):
    '''Reads a two-column, whitespace-separated mapping file, first column as key.

    Every non-blank line is split on whitespace; the FIRST field becomes the
    key and the SECOND field the value. With lines such as "H1 MT891293.1"
    this yields haplotype label -> accession ("H1" -> "MT891293.1").

    Blank or whitespace-only lines are skipped (they previously raised
    IndexError). A non-blank line with fewer than two fields still raises
    IndexError.

    input:  filename: filename which has the mapping between haplotypes and IDs
    output: map_hap: a dictionary which maps first column -> second column
    '''
    map_hap = {}
    with open(filename, "r") as f:
        # Iterate the file lazily instead of materialising it with readlines().
        for line in f:
            fields = line.split()
            if not fields:
                continue
            map_hap[fields[0]] = fields[1]
    return map_hap

In [49]:
# Lookup from the second column of the mapping file to the first; it is used below
# to turn the leaf names found in the tree into "H<k>" labels.
map_hap = read_map_txt("mapping_haplotypes.txt")

def get_kids(key, t_dict, map_hap):
    '''Returns the IDs of a node's first two children.

    A child whose label starts with 'I' is returned as-is; any other child
    label is translated through map_hap. Only entries [0] and [1] of the
    child list are read, so further children (if any) are ignored.

    input:  key: the name of the parent node (string)
            t_dict: the dictionary mapping nodes to their children
            map_hap: the dictionary mapping haplotypes to their ID
    output: (kid1, kid2): tuple with the two child IDs
    '''
    children = t_dict[key]
    resolved = []
    for child in (children[0], children[1]):
        resolved.append(child if child[0] == 'I' else map_hap[child])
    return resolved[0], resolved[1]

def kid_node(kid, I, H):
    ''' Returns the node for the given ID

    The ID's first character selects the table ('I' -> I, anything else -> H)
    and the remaining characters, read as an integer, give the index into it.

    inputs: kid: the ID of a node (string)
            I: a list of internal nodes
            H: a list of haplotype leaf nodes
    output: the node corresponding to that ID
    '''
    index = int(kid[1:])
    table = I if kid[0] == 'I' else H
    return table[index]

# Lookup tables indexed by label number ("H7" -> H[7], "I12" -> I[12]); slot 0 stays None.
# NOTE: this rebinds H, replacing the placeholder list built earlier in the notebook.
H = [None] * 54
# Size I from the largest internal label rather than the magic `len(nj_dict) - 51`.
I = [None] * (max((int(k[1:]) for k in nj_dict if k[0] == 'I'), default=0) + 1)

# Tree.create_tree registers a parent before its children, so walking the keys
# in reverse builds every child Node before the parent that refers to it.
for key in list(reversed(nj_dict.keys())):
    print(key)
    if key[0] == 'I':
        # get_kids / kid_node take their lookup tables as explicit arguments;
        # the one-argument calls used here before raise TypeError on a fresh kernel.
        kid1, kid2 = get_kids(key, nj_dict, map_hap)
        I[int(key[1:])] = Node(key, kid_node(kid1, I, H), kid_node(kid2, I, H))
    else:
        # Leaf: translate the name found in the tree to its "H<k>" label.
        hap = map_hap[key]
        H[int(hap[1:])] = Node(hap, None, None)


HQ992981.1
AY849725.1
HQ992977.1
I51
HQ992976.1
I50
MT891310.1
I49
AY849729.1
I48
AY849733.1
HQ992983.1
I47
I46
HQ992973.1
AF291584.2
I45
AF291582.2
AF291579.2
I44
AY849715.1
I43
HQ992971.1
HQ992968.1
I42
I41
HQ992982.1
I40
MT891309.1
HQ992984.1
I39
AY849731.1
HQ992980.1
I38
I37
I36
MT891308.1
I35
I34
AF291583.2
I33
MT891303.1
MT891301.1
I32
MT891302.1
I31
MT891305.1
I30
MT891299.1
I29
MT891304.1
I28
MT891306.1
I27
MT891294.1
MT891297.1
I26
HQ992985.1
I25
MT891295.1
I24
I23
MT891300.1
MT891298.1
I22
MT891296.1
I21
I20
AY849721.1
HQ992974.1
I19
HQ992966.1
I18
AF291585.2
HQ992969.1
I17
HQ992965.1
AF291581.2
I16
HQ992978.1
I15
I14
I13
I12
HQ992970.1
MT891307.1
I11
I10
I9
AY849716.1
AY849734.1
I8
HQ992975.1
I7
I6
I5
AF291586.2
HQ992967.1
I4
I3
HQ992964.1
I2
I1


In [56]:
# List the internal nodes that were built; slots that were never filled are still None.
for x in I:
    if x != None:
        print(x.name)

I1
I2
I3
I4
I5
I6
I7
I8
I9
I10
I11
I12
I13
I14
I15
I16
I17
I18
I19
I20
I21
I22
I23
I24
I25
I26
I27
I28
I29
I30
I31
I32
I33
I34
I35
I36
I37
I38
I39
I40
I41
I42
I43
I44
I45
I46
I47
I48
I49
I50
I51


In [57]:
# List the leaf nodes that were built; slots that were never filled are still None.
# NOTE(review): a label missing from the output presumably means its name is not in the tree.
for x in H:
    if x != None:
        print(x.name)

H2
H3
H4
H5
H6
H7
H8
H9
H10
H11
H12
H13
H14
H15
H16
H17
H18
H19
H20
H21
H22
H23
H24
H25
H26
H27
H28
H29
H30
H31
H32
H33
H34
H35
H36
H37
H38
H39
H40
H41
H42
H43
H44
H45
H46
H47
H48
H49
H50
H51
H52
H53


In [58]:
# Total number of labelled nodes (leaves plus internal nodes) in nj_dict.
len(nj_dict.keys())

103

In [67]:
def generate_newick(root, mapping = None):
    ''' Writes the binary tree under `root` as a Newick string (topology only).

    Leaves are written as str(name), or as mapping[name] when a mapping is
    given. A node with two children becomes "(left, right)". A node with
    exactly one child is collapsed: it is written as that child's subtree.
    Names of internal nodes and branch lengths are not written.

    input:  root: root node exposing `.name`, `.left` and `.right`
            mapping: optional dict from leaf name to the text to write for it
    output: the Newick string, terminated by ';'
    raises: KeyError if mapping is given and a leaf name is missing from it
    '''
    def display(node): #this is from discussion
        if node.right is None and node.left is None:
            if mapping is None:
                return str(node.name)
            return mapping[node.name]
        if node.right is None or node.left is None:
            # Was `generate_newick(display[root][0])`: subscripting the function
            # raised TypeError, and the outer call would also have added a ';'.
            only_child = node.right if node.left is None else node.left
            return display(only_child)
        return '(%s, %s)' % (display(node.left), display(node.right))
    return display(root) + ';'

In [68]:
# First column -> second column of the mapping file, passed as the leaf-name mapping
# so that leaves are written with the mapped names instead of their "H<k>" labels.
rev_map_hap = read_rev_map_txt("mapping_haplotypes.txt")
nj_nwk_res = generate_newick(I[1], rev_map_hap)  # I[1] holds the node labelled "I1"

In [69]:
# Show the Newick string regenerated from the Node tree.
print(nj_nwk_res)

((HQ992964.1, ((HQ992967.1, AF291586.2), (((HQ992975.1, (AY849734.1, AY849716.1)), (((MT891307.1, HQ992970.1), ((((HQ992978.1, (AF291581.2, HQ992965.1)), (HQ992969.1, AF291585.2)), (HQ992966.1, (HQ992974.1, AY849721.1))), ((MT891296.1, (MT891298.1, MT891300.1)), ((MT891295.1, (HQ992985.1, (MT891297.1, MT891294.1))), (MT891306.1, (MT891304.1, (MT891299.1, (MT891305.1, (MT891302.1, (MT891301.1, MT891303.1)))))))))), (AF291583.2, ((MT891308.1, (((HQ992980.1, AY849731.1), (HQ992984.1, MT891309.1)), (HQ992982.1, ((HQ992968.1, HQ992971.1), (AY849715.1, (AF291579.2, AF291582.2)))))), (AF291584.2, HQ992973.1))))), ((HQ992983.1, AY849733.1), (AY849729.1, (MT891310.1, (HQ992976.1, (HQ992977.1, AY849725.1)))))))), HQ992981.1);


In [66]:
# Inspect the mapping returned by read_rev_map_txt.
rev_map_hap

{'H1': 'MT891293.1',
 'H2': 'MT891294.1',
 'H3': 'MT891295.1',
 'H4': 'MT891296.1',
 'H5': 'MT891297.1',
 'H6': 'MT891298.1',
 'H7': 'MT891299.1',
 'H8': 'MT891300.1',
 'H9': 'MT891307.1',
 'H10': 'MT891308.1',
 'H11': 'MT891301.1',
 'H12': 'MT891302.1',
 'H13': 'MT891303.1',
 'H14': 'MT891304.1',
 'H15': 'MT891309.1',
 'H16': 'MT891305.1',
 'H17': 'MT891306.1',
 'H18': 'MT891310.1',
 'H19': 'HQ992985.1',
 'H20': 'HQ992977.1',
 'H21': 'AY849729.1',
 'H22': 'HQ992978.1',
 'H23': 'HQ992973.1',
 'H24': 'AY849725.1',
 'H25': 'AF291584.2',
 'H26': 'HQ992976.1',
 'H27': 'AF291581.2',
 'H28': 'HQ992984.1',
 'H29': 'HQ992982.1',
 'H30': 'HQ992980.1',
 'H31': 'HQ992975.1',
 'H32': 'HQ992974.1',
 'H33': 'HQ992970.1',
 'H34': 'HQ992969.1',
 'H35': 'HQ992965.1',
 'H36': 'AY849734.1',
 'H37': 'AY849715.1',
 'H38': 'AF291583.2',
 'H39': 'HQ992983.1',
 'H40': 'HQ992968.1',
 'H41': 'AY849721.1',
 'H42': 'AY849716.1',
 'H43': 'AF291585.2',
 'H44': 'AF291579.2',
 'H45': 'HQ992967.1',
 'H46': 'HQ992966.1

In [71]:
weigh_nwk = "((HQ992975.1:0.012469,(HQ992967.1:0.011902,(AF291586.2:0.014078,(HQ992964.1:0.019344,(HQ992966.1:0.024529,(HQ992981.1:0.022613,(AY849734.1:0.006718,AY849716.1:0.007107):0.025716):0.026652):0.021807):0.019590):0.017275):0.020218):0.023156,(((MT891294.1:0.011633,(HQ992985.1:0.019944,AF291583.2:0.019909):0.012843):0.009883,(MT891293.1:0.007192,(MT891297.1:0.008317,MT891295.1:0.007838):0.007820):0.009190):0.016433,((MT891302.1:0.012191,(HQ992965.1:0.017702,(MT891301.1:0.020946,MT891307.1:0.021319):0.018557):0.015039):0.015340,(AY849721.1:0.012695,(MT891299.1:0.011433,MT891310.1:0.011753):0.015219):0.011444):0.014000):0.016587,(((HQ992982.1:0.012611,(AF291579.2:0.014543,(HQ992980.1:0.015653,((MT891303.1:0.025992,AY849731.1:0.025998):0.019992,(MT891304.1:0.025837,HQ992971.1:0.026152):0.019389):0.018918):0.017743):0.016960):0.016260,((HQ992984.1:0.011533,(HQ992968.1:0.013680,(MT891309.1:0.016582,(AF291582.2:0.020567,HQ992970.1:0.021698):0.017300):0.015087):0.013468):0.012842,(AY849715.1:0.009327,MT891308.1:0.009164):0.010623):0.014810):0.021668,(((HQ992983.1:0.011208,(MT891300.1:0.015839,(HQ992973.1:0.017817,AY849733.1:0.017237):0.016830):0.013562):0.016322,(MT891306.1:0.006630,(MT891305.1:0.011847,(MT891296.1:0.005127,MT891298.1:0.004062):0.013701):0.010325):0.012812):0.014971,((AY849729.1:0.007568,(HQ992976.1:0.007984,(HQ992977.1:0.010446,(AF291584.2:0.012761,AY849725.1:0.012783):0.010421):0.009738):0.010929):0.015098,((HQ992978.1:0.004693,(AF291581.2:0.007693,HQ992974.1:0.008461):0.005660):0.006709,(HQ992969.1:0.002925,AF291585.2:0.003956):0.005185):0.014433):0.014086):0.014676):0.017828);"

In [72]:
# Repeat the parse and labelling steps for the second Newick string (weigh_nwk).
weigh_tree = loads(weigh_nwk)
weigh_post_order = post_order(weigh_tree[0])
print(weigh_post_order)  # unnamed internal nodes appear as None

# Label every node; unnamed internal nodes become "I1", "I2", ...
tw = Tree(1, weigh_tree[0])
tw.create_tree(weigh_tree[0])
weigh_dict = tw.format_dict
print(weigh_dict)

['AF291585.2', 'HQ992969.1', None, 'HQ992974.1', 'AF291581.2', None, 'HQ992978.1', None, None, 'AY849725.1', 'AF291584.2', None, 'HQ992977.1', None, 'HQ992976.1', None, 'AY849729.1', None, None, 'MT891298.1', 'MT891296.1', None, 'MT891305.1', None, 'MT891306.1', None, 'AY849733.1', 'HQ992973.1', None, 'MT891300.1', None, 'HQ992983.1', None, None, None, 'MT891308.1', 'AY849715.1', None, 'HQ992970.1', 'AF291582.2', None, 'MT891309.1', None, 'HQ992968.1', None, 'HQ992984.1', None, None, 'HQ992971.1', 'MT891304.1', None, 'AY849731.1', 'MT891303.1', None, None, 'HQ992980.1', None, 'AF291579.2', None, 'HQ992982.1', None, None, None, 'MT891310.1', 'MT891299.1', None, 'AY849721.1', None, 'MT891307.1', 'MT891301.1', None, 'HQ992965.1', None, 'MT891302.1', None, None, 'MT891295.1', 'MT891297.1', None, 'MT891293.1', None, 'AF291583.2', 'HQ992985.1', None, 'MT891294.1', None, None, None, 'AY849716.1', 'AY849734.1', None, 'HQ992981.1', None, 'HQ992966.1', None, 'HQ992964.1', None, 'AF291586.2', Non

In [73]:
# Lookup tables indexed by label number ("H7" -> Hw[7], "I12" -> Iw[12]); slot 0 stays None.
Hw = [None] * 54
# Size Iw from the largest internal label rather than the magic `len(weigh_dict) - 51`.
Iw = [None] * (max((int(k[1:]) for k in weigh_dict if k[0] == 'I'), default=0) + 1)

# Tree.create_tree registers a parent before its children, so walking the keys
# in reverse builds every child Node before the parent that refers to it.
# NOTE(review): get_kids only returns the first two children of a node. The root of
# weigh_nwk appears to have three top-level groups, so its third subtree would be
# left out of the Node tree -- TODO confirm and handle if so.
for key in list(reversed(weigh_dict.keys())):
    print(key)
    if key[0] == 'I':
        # get_kids / kid_node take their lookup tables as explicit arguments;
        # the one-argument calls used here before raise TypeError on a fresh kernel.
        kid1, kid2 = get_kids(key, weigh_dict, map_hap)
        Iw[int(key[1:])] = Node(key, kid_node(kid1, Iw, Hw), kid_node(kid2, Iw, Hw))
    else:
        # Leaf: translate the name found in the tree to its "H<k>" label.
        hap = map_hap[key]
        Hw[int(hap[1:])] = Node(hap, None, None)

AF291585.2
HQ992969.1
I51
HQ992974.1
AF291581.2
I50
HQ992978.1
I49
I48
AY849725.1
AF291584.2
I47
HQ992977.1
I46
HQ992976.1
I45
AY849729.1
I44
I43
MT891298.1
MT891296.1
I42
MT891305.1
I41
MT891306.1
I40
AY849733.1
HQ992973.1
I39
MT891300.1
I38
HQ992983.1
I37
I36
I35
MT891308.1
AY849715.1
I34
HQ992970.1
AF291582.2
I33
MT891309.1
I32
HQ992968.1
I31
HQ992984.1
I30
I29
HQ992971.1
MT891304.1
I28
AY849731.1
MT891303.1
I27
I26
HQ992980.1
I25
AF291579.2
I24
HQ992982.1
I23
I22
I21
MT891310.1
MT891299.1
I20
AY849721.1
I19
MT891307.1
MT891301.1
I18
HQ992965.1
I17
MT891302.1
I16
I15
MT891295.1
MT891297.1
I14
MT891293.1
I13
AF291583.2
HQ992985.1
I12
MT891294.1
I11
I10
I9
AY849716.1
AY849734.1
I8
HQ992981.1
I7
HQ992966.1
I6
HQ992964.1
I5
AF291586.2
I4
HQ992967.1
I3
HQ992975.1
I2
I1


In [75]:
# List the internal nodes built for the second tree; unfilled slots are still None.
for x in Iw:
    if x != None:
        print(x.name)

I1
I2
I3
I4
I5
I6
I7
I8
I9
I10
I11
I12
I13
I14
I15
I16
I17
I18
I19
I20
I21
I22
I23
I24
I25
I26
I27
I28
I29
I30
I31
I32
I33
I34
I35
I36
I37
I38
I39
I40
I41
I42
I43
I44
I45
I46
I47
I48
I49
I50
I51


/Users/anabellafalk/cs4775/cs4775_final/cs4775_final_project_red_pandas/parsimony


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]
