# This notebook takes the simulated data from NestedBD and formats it as CAFE5 input files (all_states.txt with every node, leaves_states.txt with the leaves only)

In [1]:
import re

import numpy as np
import pandas as pd

In [2]:
# Load the raw NestedBD states table (tab-delimited) and preview the first rows.
states_path = "actual_states_raw.txt"
df = pd.read_table(states_path)
df.head()

Unnamed: 0,CHR,START,END,leaf0,leaf1,leaf2,leaf3,leaf4,leaf5,leaf6,...,leaf92,leaf93,leaf94,leaf95,leaf96,leaf97,leaf98,leaf99,leaf100,leaf101
0,chr1,1,594516,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
1,chr1,594517,866156,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
2,chr1,866157,1046273,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
3,chr1,1046274,1226548,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
4,chr1,1226549,1409181,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2


In [3]:
# Load the ground-truth tree (Newick text); its node labels are matched against
# the state-table columns in the following cells.

with open('gt_unscaled.nw') as f:
    lines = f.read()
# Remove surrounding whitespace and the terminating ";" explicitly. Slicing off a
# fixed two characters is only right when the file ends in exactly ";\n"; without
# the trailing newline it would also cut the last character of the final label.
tree = lines.strip().rstrip(";")

In [4]:
### Now, let's figure out which "leaves" are actually internal nodes.

# Internal nodes appear in the Newick text as "node<ID>". `tree` was loaded in the
# previous cell, so the file is not read a second time here.
# A regex captures the full run of digits after "node", which the old fixed
# 6-character window could not do: it truncated IDs of three or more digits, kept a
# stray "," or ")" after a short ID, and fired on any "n" in the text.
nodes_list = re.findall(r"node(\d+)", tree)

In [5]:
### Internal nodes arrive labelled "leaf#"; relabel them as "node#".

# The raw headers still carry a trailing space at this point, hence the
# "leaf<id> " keys; the new "node<id>" names are written without it.
new_col_names1 = {
    "leaf" + node_id + " ": "node" + node_id
    for node_id in nodes_list
}

df.rename(columns = new_col_names1, inplace=True)

In [6]:
### For formatting reasons, we should also delete the space after leafNumber ("leaf101 " --> "leaf101")

# rstrip() removes only trailing whitespace. Dropping the last character
# unconditionally would eat a digit ("leaf101" -> "leaf10") whenever this cell is
# re-run or a header happens to have no trailing space.
# The first two columns are skipped, exactly as before.
new_col_names2 = {
    name: name.rstrip()
    for name in df.columns[2:]
    if "leaf" in name
}

df.rename(columns = new_col_names2, inplace=True)

In [7]:
### CAFE5 input file uses Desc and Family ID
### The three coordinate columns are replaced by these two leading columns

# Family ID is "<CHR>_<START>_<END>". Note the END header still carries its
# trailing space ("END ") at this point.
family_id = df["CHR"].astype(str) + '_' + df["START"].astype(str) + '_' + df["END "].astype(str)

# The coordinates now live inside Family ID, so the original columns are dropped.
df.drop(columns=["CHR", "START", "END "], inplace=True)

# Insert Family ID first and Desc second, both at position 0, so the final column
# order is: Desc, Family ID, <state columns...>.
df.insert(0, 'Family ID', family_id)

# Desc is a constant placeholder. It is filled directly instead of first copying
# an arbitrary state column ("leaf9"), which raised KeyError when that column
# was missing and was overwritten immediately anyway.
df.insert(0, 'Desc', "(null)")

In [8]:
# Preview the reformatted table (Desc and Family ID now lead the state columns).
df.head()

Unnamed: 0,Desc,Family ID,node0,leaf1,node2,node3,node4,node5,node6,node7,...,leaf92,leaf93,node94,node95,leaf96,leaf97,leaf98,leaf99,leaf100,leaf101
0,(null),chr1_1_594516,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
1,(null),chr1_594517_866156,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
2,(null),chr1_866157_1046273,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
3,(null),chr1_1046274_1226548,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
4,(null),chr1_1226549_1409181,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2


In [9]:
# Write the full table (internal-node and leaf columns) as tab-separated text.
# `lineterminator` is the current spelling of this keyword: pandas 1.5 deprecated
# `line_terminator` and pandas 2.0 removed it, so the old name raises TypeError.
df.to_csv("all_states.txt", sep='\t', index=False, lineterminator="\n")

In [10]:
### Keep only the leaves: remove every internal-node column from the dataframe
internal_columns = ['node' + node_id for node_id in nodes_list]
for column in internal_columns:
    # Raises KeyError if an expected node column is absent.
    del df[column]
df.head()

Unnamed: 0,Desc,Family ID,leaf1,leaf9,leaf11,leaf13,leaf14,leaf15,leaf17,leaf18,...,leaf89,leaf90,leaf92,leaf93,leaf96,leaf97,leaf98,leaf99,leaf100,leaf101
0,(null),chr1_1_594516,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
1,(null),chr1_594517_866156,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
2,(null),chr1_866157_1046273,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
3,(null),chr1_1046274_1226548,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
4,(null),chr1_1226549_1409181,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2


In [11]:
# Write the leaves-only table as tab-separated text.
# `lineterminator` is the current spelling of this keyword: pandas 1.5 deprecated
# `line_terminator` and pandas 2.0 removed it, so the old name raises TypeError.
df.to_csv("leaves_states.txt", sep='\t', index=False, lineterminator="\n")