In [38]:
%magic

In [1]:
"""
@author: Abtin Khodadadi
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
def get_df_info(df):
    df_info = pd.DataFrame(columns = ['distinct_count', 'nan_count', 'min_val', 'max_val'])    
    for att in list(df):
        #we give the tuple (att, 'nan_count') as the index for at
        df_info.at[att,'nan_count'] = sum(df[att].isnull())
        df_info.at[att,'distinct_count'] = len(df[att].unique())
        df_info.at[att,'min_val'] = min(df[att])
        df_info.at[att,'max_val'] = max(df[att])    
    df_info = df_info.sort_values(['distinct_count', 'nan_count'], ascending = [0,0])
    return df_info

def test_script():
    dft  = pd.DataFrame([[np.nan, 1.0, 'a', True],['-2', np.nan, 'b',False],['3-',3.0,np.nan, True],['+4',4.0,'c',np.nan]],\
                   columns = ['A','B','C','D'])
    
    print(dft)
test_script()

print('************************************************************')
df_raw = pd.read_csv('Biomarker_Data_Fern_8_29_16.csv', na_values=[".m", ".e", ".a", ".t"])
print('({0}r, {1}c) all data'.format(df_raw.shape[0], df_raw.shape[1]))
df = df_raw.dropna(subset = ['sbp', 'ttodeath', 'death']).round(
            {
                'ttostroke':1,
                'ttohf':1,
                'ttomi':1,
                'pa_std':1,
                'crp':1,
                'ttodeath': 1,
                'bmi':1,                
                'mmse':0,
                'height':0,
                'weight':0,
            }).copy()
print('({0}r, {1}c) removed rows with NaN for sbp, ttodeath or death'.format(df.shape[0], df.shape[1]))
x = set(list(df))-set(['sbp', 'ttodeath', 'death'])
no_nan_df = df.dropna(subset=x)
print('({0}r, {1}c) removed rows with NaN values'.format(no_nan_df.shape[0], no_nan_df.shape[1]))
# pd.isnull(df).any()
df_info = get_df_info(df)
df_info.drop('id',axis=0, inplace=True)

print(sum(df_info['distinct_count']))
print(df_info)


     A    B    C      D
0  NaN  1.0    a   True
1   -2  NaN    b  False
2   3-  3.0  NaN   True
3   +4  4.0    c    NaN
************************************************************
(10752r, 50c) all data
(10599r, 50c) removed rows with NaN for sbp, ttodeath or death
(2573r, 50c) removed rows with NaN values


  result = _values_from_object(self).round(decimals)


6762
            distinct_count nan_count min_val max_val
ldl                    746       236     4.1  336.76
chol                   657       107   58.76     439
pkyrs                  591      1700     NaN     NaN
trig                   511       112     6.8    1529
ttohf                  493       275       0    2368
ins                    425       518     0.1     584
crp                    395       216     0.1   157.5
bmi                    343        37    14.6    82.8
glucose                296       129      20     657
cystatin               266       943    0.38     8.7
weight                 222        31      72     341
ttomi                  192       561       0      19
ttostroke              192       249       0      19
ttodeath               192         0       0    19.1
gait                   190      2135     NaN     NaN
sbp                    182         0      76     236
creat                  127       128    0.29    10.6
hdl                    121       118     

In [None]:
# %load AgingData.py

"""
Created on Tue Jun 28 14:09:08 2016

@author: Chad Kunde
"""

from __future__ import print_function, division

import os, sys
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.model_selection import ShuffleSplit, cross_val_score, train_test_split, cross_val_predict
from sklearn.externals.six import StringIO
import pydot_ng as pydot
import Splitters as spl
import functools
import time
from CustomTree import Tree as CustomTree
from datetime import datetime
#import cPickle as pickle
import pickle

class DTtest:
    def __init__(self):
        pd.set_option('display.max_columns', 10)
        pd.set_option('precision', 3)
        np.set_printoptions(precision=3,suppress=True)

        self.raw = pd.read_csv("Biomarker_Data_Fern_8_29_16.csv", na_values=[".m", ".e", ".a", ".t"])

        # Drop all rows with missing values
        self.data = self.raw.dropna(subset=["sbp", "death", "ttodeath"]).round(2).round(
            {
                'mmse':0,
                'height':1,
                'weight':1,
            }).copy()

        self.train, self.test = train_test_split(self.data, train_size=0.80, random_state=7)
        self.labels = list(self.data)

        return


    def MakeTree(self, crit, name, depth=None, leaf_samples=1, leaf_nodes=None):
        rf = tree.DecisionTreeClassifier(criterion=crit,
                                    max_depth=depth,
                                    min_samples_leaf=leaf_samples,
                                    max_leaf_nodes=leaf_nodes)

        rf.fit(self.train[:,2:-2], self.train[:,-2])

        print("Test:", name)
        print("5-way cross-val scores:", cross_val_score( rf, self.data.iloc[:, 2:-2], self.data.iloc[:,-2], cv = 5 ))
        print("Training score:", rf.score( self.train[:,2:-2], self.train[:,-2] ))
        print("Testing score:", rf.score( self.test[:,2:-2],  self.test[:,-2]  ))

        dot_data = StringIO()
        tree.export_graphviz(rf, out_file=dot_data,
                             feature_names=list(self.data)[2:-2],
                             class_names=["Alive", "Deceased"],
                             filled=True, rounded=True,
                             special_characters=True)
        graph = pydot.graph_from_dot_data(dot_data.getvalue())
        graph.write_pdf(name+".pdf")
        graph.write_png(name+".png")

        return

def Summary(trNode, dpth, trNum, side):
     ret = pd.DataFrame.from_dict({ "Feature": [trNode.node.splitr.feature],
                                         "Value": [trNode.node.splitr.value],
                                         "Level": [dpth],
                                         "TrainLeft": [trNode.node.left.splitr.coef],
                                         "TrainRight": [trNode.node.right.splitr.coef],
                                         "TrainDiff": [trNode.node.left.splitr.coef - trNode.node.right.splitr.coef],
                                         "TestLeft": [trNode.left.val],
                                         "TestRight": [trNode.right.val],
                                         "TestDiff": [trNode.left.val - trNode.right.val],
                                         "TrainBase": [trNode.node.splitr.coef],
                                         "TestBase": [trNode.val],
                                         "TreeNum": [trNum],
                                         "Side": [side],
                                         })
     ret["Direction"] = ~((ret.TestDiff<0) ^ (ret.TrainDiff<0))
     ret["Difference"] = abs(ret.TestDiff/ret.TrainDiff)
     ret["LeftScale"] = abs(ret.TestLeft/ret.TrainLeft)
     ret["RightScale"] = abs(ret.TestRight/ret.TrainRight)
     return ret
    
def Testing():
    test = DTtest()
    data = test.data[list(test.data)[3:-6]]
    from Splitters import SplitCoef_statsmod as smod
    
    from CustomTree import TreeNode, Tree as CustomTree

    filler = np.zeros(1)
    
    tr = CustomTree(smod, smod, max_depth=2)
    tr.root = TreeNode(0, None, smod, filler, filler, filler)
    tr.root.splitr = smod(filler, filler, filler)
    tr.root.left = TreeNode(0,tr.root, smod, filler,filler,filler)
    tr.root.left.splitr = smod(filler, filler, filler)
    tr.root.right = TreeNode(0,tr.root, smod, filler,filler,filler)
    tr.root.right.splitr = smod(filler, filler, filler)
    return
    
    train, test = train_test_split(data, train_size=0.60, random_state=7)
    tr.fit(train[~np.isnan(train.dsst)], train.death[~np.isnan(train.dsst)])
    result = tr.test(test[~np.isnan(test.dsst)], test.death[~np.isnan(test.dsst)])

    summaries = pd.DataFrame()

    summaries = summaries.append(Summary(result,1))
    summaries = summaries.append(Summary(result.left,2))
    summaries = summaries.append(Summary(result.right,2))

    print(summaries)

def cross_val(idxs, data, pool, min_split):
    n, (train_idx, test_idx) = idxs
    tr = CustomTree(spl.SplitCoef_statsmod, spl.SplitCoef_statsmod, max_depth=2, treeNum=n)
    tr.fit(data.iloc[train_idx], data.death.iloc[train_idx], pool=pool, min_split=min_split)
    t = tr.test(data.iloc[test_idx], data.death.iloc[test_idx])
    print(datetime.now().time())
    return t

def finish(res, dest):
    if not os.path.exists(dest):
        os.makedirs(dest)
    summaries = pd.DataFrame()
    trees = list()
    for t in res:
        try:
            graph = pydot.graph_from_dot_data(repr(t.tree))
            graph.write_svg("{0}/CrossVal{1}_train.svg".format(dest, t.tree.treeNum))
            graph = pydot.graph_from_dot_data(repr(t))
            graph.write_svg("{0}/CrossVal{1}_test.svg".format(dest, t.tree.treeNum))
        except:
            print("Unexpected error:", t.tree.treeNum, sys.exc_info()[0])
        summaries = summaries.append(Summary(t, 1, t.tree.treeNum, "Root"))
        summaries = summaries.append(Summary(t.left, 2, t.tree.treeNum, "Left"))
        summaries = summaries.append(Summary(t.right, 2, t.tree.treeNum, "Right"))
        trees.append(t)
 
    print(summaries)
    summaries.to_csv("./{0}/summary.csv".format(dest))
    with open("{0}/trees.pkl".format(dest), 'wb') as f:
        pickle.dump(trees, f, pickle.HIGHEST_PROTOCOL)


def process(args):

    test = DTtest()
    data = test.data[list(test.data)[3:-6]]

    if args.sub is not None:
        data = data[:args.sub]

    if not os.path.exists(args.dest):
        os.makedirs(args.dest)

    ss = ShuffleSplit(n_splits=args.nsplits, test_size=args.test, random_state=args.seed)
    build_tree = functools.partial(cross_val, data=data, min_split=args.min_split)


    if args.dask is not None:
        from distributed import Client
        client = Client(args.dask)
        print(client)
        build_tree = functools.partial(cross_val, data=data, pool=client, min_split=args.min_split)
    elif args.nprocs is not None:
        from multiprocessing import Pool
        pool = Pool(processes=args.nprocs)
        build_tree = functools.partial(cross_val, data=data, pool=pool, min_split=args.min_split)
    else:
        from multiprocessing import Pool
        pool = Pool(processes=1)
        build_tree = functools.partial(cross_val, data=data, pool=pool, min_split=args.min_split)
        print(hasattr(pool, "gather"))
        
    print(datetime.now().time())
    cv_scores = [build_tree(idx) for idx in enumerate(ss.split(data))]
    finish(cv_scores, args.dest)
    return
    
    
if __name__ == "__main__":
#    Testing()
#    sys.exit(0)

    import argparse
    parser = argparse.ArgumentParser(description='Build decision trees using Cox Proportional Hazard models.')
    parser.add_argument('--dest', help='Output folder prefix for trees and summary.', default='CrossVal')
    parser.add_argument('--dask', help='Dask scheduler (ip:port)', default=None)
    parser.add_argument('--nprocs', help='Parallel processes (for local only).', default=None, type=int)
    parser.add_argument('--sub', help='Subset of data to process.', default=None, type=int) 
    parser.add_argument('--nsplits', help='Number of random subsets.', default=2, type=int)
    parser.add_argument('--test', help='Portion of data in test set.', default=0.4, type=float)
    parser.add_argument('--seed', help='Random seed for splits.', default=7, type=int)
    parser.add_argument('--min_split', help='Minimum portion of data in a split. (Default = 0.25)', default=0.25, type=float)
    args = parser.parse_args()
    
    args.dest = (args.dest + "_n" + str(args.nsplits) +
                     "_s" + str(args.seed) +
                     "_" + datetime.now().strftime('%Y%m%d_%H%M%Z'))
    type(args)
    print(args.dest)
    raw_input()
    
    process(args)
            



In [None]:
# %load CustomTree.py
"""
Created on Wed July 06 09:32:06 2016

@author: Chad Kunde
"""

from __future__ import print_function

from itertools import count
from string import Template
import numpy as np
from functools import partial

np.set_printoptions(precision=6, suppress=True)

class Tree:
    def __init__(self, stump_func, leaf_func=None, max_depth=2, classes=["Alive", "Deceased"], treeNum=None):
        self.stump_func = stump_func
        self.stump_f = stump_func
        self.leaf_func = leaf_func if leaf_func is not None else stump_func
        self.leaf_f = leaf_func
        self.max_depth = max_depth
        self.count = count()
        self.classes = classes
        self.root = None
        self.labels = []
        self.treeNum = treeNum

    def get_params(self, deep=None):
        return { "stump_func" : self.stump_func,
                 "leaf_func" : self.leaf_func,
                 "max_depth" : self.max_depth,
                 "classes"   : self.classes  }

    def fit(self, data, class_lbl, min_split=None, labels=None, weights=None, pool=None):
        if min_split is not None:
            self.stump_func = partial(self.stump_f, min_split=min_split)
            self.leaf_func = partial(self.leaf_f, min_split=min_split)
        if weights is None:
            self.root = TreeNode(self, next(self.count), None, self.stump_func, data, class_lbl, np.ones(len(data)))
        else:
            self.root = TreeNode(self, next(self.count), None, self.stump_func, data, class_lbl, weights) 
        self.labels = labels
        self.fit_node(self.root, pool=pool)

    def fit_node(self, node, depth=1, pool=None):
        if depth > self.max_depth:
            return
        if depth == self.max_depth:
            node.split(self.count, pool, split_func=self.leaf_func)
            return
        left, right = node.split(self.count, pool, split_func=self.stump_func)
        self.fit_node(left, depth+1, pool)
        self.fit_node(right, depth+1, pool)
        return

    def test(self, data, class_lbl=None, weights=None):
        countr = count()
        if weights is None:
            root = TestNode(self, next(countr), None, self.root, data, class_lbl, np.ones(len(data)))
        else:
            root = TestNode(self, next(countr), None, self.root, data, class_lbl, weights)
        self.test_node(root, countr)
        return root
    
    def test_node(self, test_node, countr):
        if test_node.node.left is None:
            return
        left, right = test_node.split(countr)
        self.test_node(left, countr)
        self.test_node(right, countr)
        return

    def score(self, data, class_lbl):
        node = self.test(data, class_lbl)
        return node.test_score()

    def conf_matrix(self, node=None, mat=None):
        if mat is None:
            mat = np.zeros((2,2))
            if node is None:
                return self.conf_matrix(self.root, mat)
            else:
                return self.conf_matrix(node, mat)
        if node.left is None:
            mat[:,node.node_lbl] += node.counts
            return mat
        return self.conf_matrix(node.right, self.conf_matrix(node.left, mat))

    def print_tree(self, node=None):
        if node is None:
            node = self.root
        ret = '{0}'.format(node.desc(self.labels, self.classes))
        if node.parent is not None and node.parent.parent is None:
            if node == node.parent.left:
                ret += '{0} -> {1} [labeldistance=2.5, labelangle=45, headlabel="True"];\n'.format(node.parent.node_id, node.node_id)
            else:
                ret += '{0} -> {1} [labeldistance=2.5, labelangle=-45, headlabel="False"];\n'.format(node.parent.node_id, node.node_id)
        elif node.parent is not None:
            ret += "{0} -> {1};\n".format(node.parent.node_id, node.node_id)
        if node.left:
            ret += self.print_tree(node.left)
        if node.right:
            ret += self.print_tree(node.right)
        return ret
    def __repr__(self):
        return Template('''digraph Tree {
node [shape=box, style="rounded", color="black", fontname=helvetica] ;
edge [fontname=helvetica] ;
$tree
}''').substitute(tree=self.print_tree(self.root))


class TreeNode:
    def __init__(self, tree, node_id, parent, split_func, data, class_lbl, weights):
        print(tree.treeNum, data.shape, weights.shape, weights[weights==1].sum())
        self.node_id = node_id
        self.parent = parent
        self.data = data
        self.class_lbl = class_lbl
        self.left = None
        self.right = None
        self.tree = tree

        if split_func is not None:
            self.split_func = split_func
        else:
            self.split_func = parent.split_func

        if len(data) == 0:
            self.counts = [0,0]
            self.node_lbl = 0
            self.class_props = 0
            self.weights = np.ones(0)
            self.splitr = None
            self.val = None
        else:
            self.counts = np.bincount(np.array(class_lbl, dtype=np.int32))
            self.node_lbl = np.argmax(self.counts)
            self.class_props = self.counts[self.node_lbl] / float(self.counts.sum())
            self.splitr = split_func(data, class_lbl, weights)
            self.val=self.splitr.score()
            self.weights = np.ones(len(data))if weights is None else weights
            
    def split(self, countr, pool, split_func=None):
        if split_func is not None:
            self.split_func = split_func
            self.splitr = split_func(self.data, self.class_lbl, self.weights)
            self.val=self.splitr.score()
        self.splitr.fit(pool)
        self.left = TreeNode(self.tree, next(countr), self, self.split_func,
                                 *self.splitr.mask_true(self.data, self.class_lbl, np.copy(self.weights)))
        self.right = TreeNode(self.tree, next(countr), self, self.split_func,
                                  *self.splitr.mask_false(self.data, self.class_lbl, np.copy(self.weights)))
        return self.left, self.right
    def label(self):
        return self.node_lbl
    def test_score(self, data, class_lbl):
        return self.splitr.test_score(data, class_lbl)
    def score(self):
        return self.class_props
    def desc(self, labels, classes):
        if labels is not None:
            lbl = classes[self.node_lbl]
        else:
            lbl = self.node_lbl
        return "{0} [label=<{1} <br/> {2} ({3}) <br/> {4:.4f}>];\n".format(self.node_id,
                                                                         self.splitr.full_desc(labels),
                                                                         self.counts,
                                                                         sum(self.counts),
                                                                         self.class_props)

class TestNode:
    def __init__(self, tree, node_id, parent, node, data, class_lbl, weights):
        self.tree = tree
        self.node_id = node_id
        self.parent = parent
        self.node = node
        self.node_lbl = node.node_lbl
        self.splitr = node.splitr
        self.data = data
        self.class_lbl = class_lbl
        self.weights = np.ones(len(data)) if weights is None else weights
        self.val = node.splitr.test_score(data, class_lbl, self.weights)
        if len(data) == 0:
            self.counts = np.zeros(2)
            self.class_props = np.nan
        else:
            self.counts = np.bincount(np.array(class_lbl, dtype=np.int64))
            self.counts.resize(2)
            self.class_props = self.counts[self.node_lbl] / float(self.counts.sum())
        self.left = None
        self.right = None

    def split(self, countr, split_func=None):
        if split_func is not None:
            self.split_func = split_func
            self.splitr = split_func(self.data, self.class_lbl)
        self.left = TestNode(self.tree, next(countr), self, self.node.left,
                                 *self.splitr.mask_true(self.data, self.class_lbl, self.weights))
        self.right = TestNode(self.tree, next(countr), self, self.node.right,
                                  *self.splitr.mask_false(self.data, self.class_lbl, self.weights))
        return self.left, self.right

    def score(self):
        return self.class_props
    def test_score(self):
        try:
            score = np.nan_to_num(self.splitr.score())
            if self.left is None:
                return abs(score-self.val)
            return abs(score-self.val)+self.left.test_score()+self.right.test_score()
        except:
            return 0
    def desc(self, labels, classes):
        if labels is not None:
            lbl = classes[self.node_lbl]
        else:
            lbl = self.node_lbl
        score = self.splitr.score()
        val = score-self.val if score is not None else self.val
        return "{0} [label=<{1} <br/> Diff: {2:.6f} <br/> {3} ({4}) <br/> {5:.4f}>];\n".format(
            self.node_id,
            self.splitr.full_desc(labels, self.val),
            val,
            self.counts,
            self.counts.sum(),
            self.class_props)
    def __repr__(self):
        return Template('''digraph Tree {
node [shape=box, style="rounded", color="black", fontname=helvetica] ;
edge [fontname=helvetica] ;
$tree
}''').substitute(tree=self.tree.print_tree(self))


<pre>
---
| This | is   |
|------|------|
|   a  | table|
---
</pre>