# Testing of Size Matched Model

# Imports

In [1]:
import sys
import os
import time
import random
import copy
import math

import numpy as np
import scipy
import pandas as pd

%matplotlib inline
import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns

import ete3

In [2]:
# Configure graphics parameters

# Matplotlib overrides: a single text size (12) everywhere, Helvetica as the
# font family, font type 42 for PDF/PS output, and 150 dpi for inline figures.
params = {
    'font.size': 12,
    'axes.titlesize': 12,
    'axes.labelsize': 12,
    'legend.fontsize': 12,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'font.family': "Helvetica",
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
    'figure.dpi': 150
}

# Apply the seaborn style FIRST. seaborn's style definitions carry their own
# 'font.family' entry, so calling set_style() after rcParams.update() would
# replace the Helvetica setting above with seaborn's default.
sns.set_style("ticks")
mpl.rcParams.update(params)
mpl.rc('savefig', dpi=500)
# Defaults used by save_figure(); they are bound as default arguments when the
# function is defined, so re-run this cell after changing any of them.
output_dir = "outs"
output_suffix = ""
output_formats = [".png", ".pdf"]
savefig_args = {"dpi": 500, "bbox_inches": "tight", "pad_inches": 0.05}

def save_figure(fig, name, output_dir=output_dir, output_suffix=output_suffix, output_formats=output_formats, savefig_args=savefig_args):
    """Save a figure once per requested output format.

    Each file is written to ``<output_dir>/<name><output_suffix><format>``.

    Parameters
    ----------
    fig : object with a ``savefig(path, **kwargs)`` method (e.g. a matplotlib Figure)
    name : str
        Base file name, without extension.
    output_dir : str
        Target directory; created if it does not exist. An empty string
        saves relative to the current working directory.
    output_suffix : str
        Text inserted between ``name`` and the format extension.
    output_formats : iterable of str
        Extensions including the leading dot, e.g. ``".png"``.
    savefig_args : dict
        Keyword arguments forwarded unchanged to ``fig.savefig``.

    Returns
    -------
    None
    """
    # Create the target directory up front so savefig does not fail on a
    # fresh checkout. (Python 2 compatible: no exist_ok keyword.)
    if output_dir and not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    for output_format in output_formats:
        filename = name + output_suffix + output_format
        # os.path.join avoids producing "/<name>" (filesystem root) when
        # output_dir is empty, which plain "+" concatenation did.
        fig.savefig(os.path.join(output_dir, filename), **savefig_args)
    return None

# Raise pandas display limits: show up to 500 rows and 500 columns,
# and allow 1000 characters of width before wrapping.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

In [3]:
sys.path.append("../../jungle/") # specify path to jungle
import jungle as jg

# Testing

In [4]:
# Create size matched model: bin edges 0, 1, 5, 10 delimit three size bins,
# and each bin gets its own parameter tuple for scipy.stats.norm.
# The P values recorded in the following cells are consistent with each tuple
# being (loc, scale), i.e. N(0, 1), N(0, 2) and N(0, 3) for the three bins.
M = jg.SizeMatchedModel(bins = [0, 1, 5, 10], params = [(0, 1), (0, 2), (0, 3)], distribution=scipy.stats.norm)

In [5]:
# Compute P value for size=0.5, which lies between bin edges 0 and 1 (params (0, 1)).
# The recorded result (~0.00135) matches the upper tail 1 - cdf(3) of N(0, 1),
# so invert_cdf=True appears to return 1 - cdf(x).
M.pvalue(x=3, size=0.5, strict_bounds=False, invert_cdf=True)

0.0013498980316301035

In [6]:
# Compute P value for size=3, which lies between bin edges 1 and 5 (params (0, 2)).
# The recorded result (~0.0668) matches the upper tail of a normal with loc 0, scale 2 at x=3.
M.pvalue(x=3, size=3, strict_bounds=False, invert_cdf=True)

0.06680720126885809

In [7]:
# Compute P value for size=7, which lies between bin edges 5 and 10 (params (0, 3)).
# The recorded result (~0.1587) matches the upper tail of a normal with loc 0, scale 3 at x=3.
M.pvalue(x=3, size=7, strict_bounds=False, invert_cdf=True)

0.15865525393145707

In [8]:
# Compute P value for size=20, which is beyond the last bin edge (10).
# With strict_bounds=False no error is raised; the recorded result equals the
# size=7 result, so the out-of-range size appears to fall back to the last bin.
M.pvalue(x=3, size=20, strict_bounds=False, invert_cdf=True)

0.15865525393145707

In [9]:
# Compute P value with strict bounds (should raise error).
# size=20 is beyond the last bin edge (10), so strict_bounds=True must reject it.
# The expected ValueError is caught and printed so that Restart & Run All is
# not halted here; if no error is raised the cell fails explicitly instead.
try:
    M.pvalue(x=3, size=20, strict_bounds=True, invert_cdf=True)
except ValueError as err:
    print("ValueError: %s" % err)
else:
    raise AssertionError("pvalue did not raise ValueError for out-of-range size with strict_bounds=True")

ValueError: Size must be within bounds of bins (if strict_bounds=True)

In [10]:
# Compute model mean for an in-range size (size=3, between bin edges 1 and 5).
# NOTE(review): all three bins of M were created with first parameter 0, so the
# result 0.0 cannot show which bin was selected - consider distinct values per bin.
M.model_mean(size=3, strict_bounds=True)

0.0

In [11]:
# Compute model mean for an out-of-range size (size=20 > last bin edge 10).
# With strict_bounds=False no error is raised and a value is still returned.
# NOTE(review): every bin of M has first parameter 0, so 0.0 does not reveal which bin was used.
M.model_mean(size=20, strict_bounds=False)

0.0

In [12]:
# Compute model mean with strict bounds (should raise error).
# size=20 is beyond the last bin edge (10), so strict_bounds=True must reject it.
# The expected ValueError is caught and printed so that Restart & Run All is
# not halted here; if no error is raised the cell fails explicitly instead.
try:
    M.model_mean(size=20, strict_bounds=True)
except ValueError as err:
    print("ValueError: %s" % err)
else:
    raise AssertionError("model_mean did not raise ValueError for out-of-range size with strict_bounds=True")

ValueError: Size must be within bounds of bins (if strict_bounds=True)

In [13]:
# Write to JSON. The path is relative, so test.json lands in the current
# working directory. NOTE(review): the file is never removed by this notebook.
M.to_json("test.json")

In [14]:
# Load from JSON (round-trip of the file written by M.to_json above).
# NOTE(review): the recorded run of this cell fails with
# "ValueError: malformed string", so M2 is never bound and the comparison
# cell that follows raises NameError. The failure originates inside
# SizeMatchedModel.from_json / to_json - needs a fix in jungle, not here.
M2 = jg.SizeMatchedModel.from_json("test.json")

ValueError: malformed string

In [15]:
# Compare results after loading: both models should give the same P value
# for x=3, size=9 if the JSON round-trip preserved bins and parameters.
# A single pre-formatted argument is passed to print so the line produces the
# same text under Python 2 (print statement) and Python 3 (print function).
print("M %s" % M.pvalue(3, 9))
print("M2 %s" % M2.pvalue(3, 9))

M 0.8413447460685429
M2

NameError: name 'M2' is not defined

# Testing annotation and calculating P value

In [16]:
# Generate tree: a forest holding a single tree with 100 leaves, produced by
# the generator registered under the name "kingman" with alpha=2.0.
# NOTE(review): no random seed is set in this notebook; if generation is
# stochastic the tree (and every table below) will differ between runs -
# confirm whether jungle exposes a seed and set it here.

n_trees = 1
n_leaves = 100

F_kingman = jg.Forest.generate(n_trees=n_trees, name="kingman", params={"n_leaves": n_leaves, "alpha": 2.0})




In [17]:
# Annotate standard features on the nodes of the forest.
# NOTE(review): presumably the source of columns such as depth,
# num_descendants and num_leaf_descendants seen later in node_features() - confirm.
F_kingman.annotate_standard_node_features()

In [18]:
# Annotate colless on the nodes of the forest.
# This provides the "colless" feature that the pvalue("colless", ...) calls below refer to
# (presumably the Colless imbalance index per node - confirm against jungle).
F_kingman.annotate_colless()

In [24]:
# Create size matched model for the generated tree: six bin edges give five
# size bins, each with its own parameter tuple for scipy.stats.norm.
# The name "kingman" shows up as the suffix of the result columns
# (colless_pvalue_kingman, colless_model_mean_kingman) in the feature table below.
# NOTE(review): the parameter tuples look like placeholders rather than fitted
# values - every recorded colless P value in the tables below is 1.0.
# NOTE(review): execution counts jump from 18 to 24 here; re-run the notebook top to bottom.
model = jg.SizeMatchedModel(bins = [0, 5, 10, 20, 50, 101],
                            params = [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5)],
                            distribution=scipy.stats.norm,
                            name="kingman")

In [25]:
# Compute P value of the "colless" feature for every node under the size matched model.
# No suffix is given; judging from the feature table below, the result columns
# are then named after the model (colless_pvalue_kingman, colless_model_mean_kingman).
# strict_bounds=False: sizes outside the model's bins do not raise.
F_kingman.pvalue("colless", model, strict_bounds=False)

In [26]:
# Get node features as a table (one row per node) and preview the first rows.
# NOTE(review): the recorded output of this cell already lists the
# *_my_suffix columns, which are only created by the later
# pvalue(..., suffix="my_suffix") cell - the notebook was executed out of
# order. Restart the kernel and run all cells to refresh the outputs.
node_features = F_kingman.node_features()
# Parenthesised single argument: prints the same text on Python 2 and Python 3.
print(node_features.shape)
node_features.head()

(199, 16)


Unnamed: 0_level_0,Unnamed: 1_level_0,dist,num_descendants,difference_num_descendants,colless_model_mean_kingman,colless_pvalue_kingman,colless_pvalue_my_suffix,name,colless,support,depth_rank,depth_normalized,depth,num_children,num_leaf_descendants,colless_model_mean_my_suffix,is_leaf
name_tree,id_node,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
,0,0.0,198,68.0,4.0,1.0,1.0,,552,1.0,0,0.0,0.0,2,100,4.0,False
,1,2.32722,64,14.0,3.0,1.0,1.0,_0,72,1.0,1,0.901045,2.32722,2,33,3.0,False
,2,0.80036,132,94.0,4.0,1.0,1.0,_1,412,1.0,2,0.309881,0.80036,2,67,4.0,False
,3,0.06589,24,6.0,2.0,1.0,1.0,_2,24,1.0,3,0.926556,2.39311,2,13,2.0,False
,4,0.12681,38,4.0,3.0,1.0,1.0,_3,34,1.0,4,0.950143,2.45403,2,20,3.0,False


In [27]:
# Compute P value again, adding custom suffix.
# Judging from the feature table below, the results are stored in
# colless_pvalue_my_suffix / colless_model_mean_my_suffix, alongside the
# model-named *_kingman columns.
F_kingman.pvalue("colless", model, suffix="my_suffix", strict_bounds=False)

In [28]:
# Get node features again to confirm the custom-suffix columns
# (colless_pvalue_my_suffix, colless_model_mean_my_suffix) are present.
node_features = F_kingman.node_features()
# Parenthesised single argument: prints the same text on Python 2 and Python 3.
print(node_features.shape)
node_features.head()

(199, 16)


Unnamed: 0_level_0,Unnamed: 1_level_0,dist,num_descendants,difference_num_descendants,colless_model_mean_kingman,colless_pvalue_kingman,colless_pvalue_my_suffix,name,colless,support,depth_rank,depth_normalized,depth,num_children,num_leaf_descendants,colless_model_mean_my_suffix,is_leaf
name_tree,id_node,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
,0,0.0,198,68.0,4.0,1.0,1.0,,552,1.0,0,0.0,0.0,2,100,4.0,False
,1,2.32722,64,14.0,3.0,1.0,1.0,_0,72,1.0,1,0.901045,2.32722,2,33,3.0,False
,2,0.80036,132,94.0,4.0,1.0,1.0,_1,412,1.0,2,0.309881,0.80036,2,67,4.0,False
,3,0.06589,24,6.0,2.0,1.0,1.0,_2,24,1.0,3,0.926556,2.39311,2,13,2.0,False
,4,0.12681,38,4.0,3.0,1.0,1.0,_3,34,1.0,4,0.950143,2.45403,2,20,3.0,False
