In [None]:
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn import tree
from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingRegressor

import time
from numba import jit
import itertools
import random   
import time
import warnings
import gc
import math

warnings.filterwarnings("ignore")
from sklearn.linear_model import lasso_path

def get_node_depths(tree1):
    """
    Return the depth of every node of a fitted decision tree, in pre-order.

    ``tree1`` may be a fitted estimator (any object exposing a ``tree_``
    attribute) or the low-level tree object itself (any object exposing
    ``children_left`` and ``children_right``). The root has depth 0.
    A node is expanded only when both of its child entries differ from -1.

    The traversal is iterative, so very deep trees cannot hit Python's
    recursion limit.

    >>> d = DecisionTreeClassifier().fit([[1,2,3],[4,5,6],[7,8,9]], [1,2,3])
    >>> get_node_depths(d)
    array([0, 1, 1, 2, 2])
    >>> get_node_depths(d.tree_)
    array([0, 1, 1, 2, 2])
    """
    # Accept both the estimator and its underlying tree structure.
    low_level = getattr(tree1, "tree_", tree1)
    left = low_level.children_left
    right = low_level.children_right

    depths = []
    pending = [(0, 0)]  # (node id, depth) pairs still to visit
    while pending:
        node, depth = pending.pop()
        depths.append(depth)
        if left[node] != -1 and right[node] != -1:
            # Push right first so the left subtree is visited first (pre-order).
            pending.append((right[node], depth + 1))
            pending.append((left[node], depth + 1))
    return np.array(depths)

def get_node_count(tree_list,best_vars):
    """Count the nodes retained across the ensemble after depth pruning.

    The depth cutoff of tree ``i`` is the sum of row ``i`` of ``best_vars``.
    A tree with a non-positive cutoff contributes nothing; otherwise it
    contributes the number of its nodes whose depth is at most the cutoff.
    """
    cutoffs = np.sum(best_vars, axis=1)
    kept = 0
    for idx in range(len(best_vars)):
        depths_of_tree = get_node_depths(tree_list[idx])
        cutoff = cutoffs[idx]
        if not cutoff > 0:
            continue
        kept = kept + sum(depths_of_tree <= cutoff)
    return kept


def difference_array_list(X,tree_list):
    """Stack the ``difference_array`` of every tree in ``tree_list`` into one array."""
    return np.array([difference_array(X, member) for member in tree_list])

def difference_array(X, tree_learner):
    """Per-depth value increments along each sample's decision path.

    For every row of ``X`` the node values on its decision path are read
    from ``tree_learner.tree_.value`` and flattened; the consecutive
    differences are written left-aligned into a zero-initialised row of
    length ``tree_learner.max_depth``.

    Returns an array with one row per sample and one column per depth
    level. Because the differences telescope, a row's sum equals the last
    path value minus the first path value.
    """
    paths = tree_learner.decision_path(X)
    node_values = tree_learner.tree_.value

    rows = []
    for sample in range(len(X)):
        start = paths.indptr[sample]
        stop = paths.indptr[sample + 1]
        # Values of the nodes visited by this sample, root first.
        path_values = node_values[paths.indices[start:stop]].flatten()
        steps = np.diff(path_values)
        padded = np.zeros(tree_learner.max_depth)
        padded[:len(steps)] = steps
        rows.append(padded)

    return np.array(rows)

@jit(nopython=True)
def evaluate_test_error(difference_array_list,Y,vars_z,learning_rate):
    """Mean squared error of the pruned-ensemble prediction against ``Y``.

    The prediction is the sum over trees ``i`` of
    ``learning_rate * dot(difference_array_list[i], vars_z[i])``.
    Compiled with numba in nopython mode.
    """
    pred = np.zeros(len(Y))
    for i in range(len(vars_z)):
        pred += np.dot(difference_array_list[i],vars_z[i])*learning_rate     
    return np.square(np.subtract(Y, pred)).mean()

from numba import jit
import itertools
import random    
@jit(nopython=True)
def precompute_predictions(diff_array_list,temp_vars,learning_rate,cycle_ind):
    """Ensemble prediction with the tree at ``cycle_ind`` left out.

    Sums ``learning_rate * dot(diff_array_list[i], temp_vars[i])`` over every
    tree index ``i`` except ``cycle_ind``. The result has one entry per row of
    ``diff_array_list[0]``.
    """
    
    precompute_pred = np.zeros(len(diff_array_list[0]))    
    for i in range(len(diff_array_list)):
        if i != cycle_ind:
            precompute_pred += np.dot(diff_array_list[i],temp_vars[i])*learning_rate 
   
    return precompute_pred

@jit(nopython=True)
def evaluate_candidates(diff_array_list,temp_vars,learning_rate,cycle_ind,candidates,
                        precompute_pred,Y,alpha,W_array, normalization):
    """Score every candidate vector for the tree at ``cycle_ind``.

    For each candidate the score is the mean squared residual of
    ``precompute_pred + learning_rate * dot(diff_array_list[cycle_ind], candidate)``
    against ``Y``, plus ``(alpha / normalization) * sum(dot(W_array[cycle_ind], candidate))``.

    Side effect: ``temp_vars[cycle_ind]`` is overwritten with each candidate
    in turn, so it holds the last candidate on return.

    Returns a list of scores in the order of ``candidates``.
    """
    scores = []
    for candidate in candidates:
        temp_vars[cycle_ind] = candidate
        pred_candidate = np.dot(diff_array_list[cycle_ind],candidate)*learning_rate
        pred = np.add(precompute_pred,pred_candidate)
        err = np.sum((Y-pred)**2)/len(Y) + (alpha/normalization)*np.sum(np.dot(W_array[cycle_ind],candidate))
        scores.append(err)
    return scores

@jit(nopython=True)
def eval_obj(Y,diff_array_list,vars_z,learning_rate,alpha,W_array,normalization):
    """Evaluate the penalised objective for the current solution ``vars_z``.

    Returns the mean squared residual of the ensemble prediction against ``Y``
    plus ``alpha / normalization`` times the sum over trees of
    ``sum(dot(W_array[i], vars_z[i]))``.
    """
    pred = np.zeros(len(Y))
    regularization = 0
    for i in range(len(vars_z)):
        pred+= learning_rate*np.dot(diff_array_list[i],vars_z[i])
        regularization += np.sum(np.dot(W_array[i],vars_z[i]))
    
    # Despite its name, `bias` is the mean squared residual (the data-fit term).
    bias = np.sum((Y-pred)**2)/len(Y)
    
    return bias + regularization*alpha/normalization

@jit(nopython=True)
def converge_test(sequence, threshold,tail_length):
    """Report whether ``sequence`` has flattened out.

    Returns False while fewer than ``tail_length + 1`` successive differences
    are available. Otherwise returns True exactly when the largest absolute
    value among the last ``tail_length`` differences is below ``threshold``.
    """
    diff = np.diff(sequence)
    if len(diff) < (tail_length+1):
        return False
    else:
        return (np.max(np.abs(diff[-tail_length:])) < threshold)


def solve_weighted(Y,tree_list,diff_array_list,alpha,learning_rate,
                                          W_array,normalization,warm_start=None):
    """Cyclic coordinate descent with random local search over per-tree depth vectors.

    Each tree ``i`` owns a vector ``vars_z[i]`` of length ``max_depth``. On
    every step one tree is selected (cycling through the ensemble) and its
    vector is replaced by the best-scoring candidate: the all-zero vector or
    one of the rows of a lower-triangular matrix of ones (i.e. "keep the
    first k levels"). Scores come from ``evaluate_candidates``.

    Once ``converge_test`` reports that the objective has flattened, a local
    search step removes one randomly chosen non-zero tree and restarts the
    descent from the smallest all-zero tree index. The loop stops when the
    converged objective is worse than the one recorded at the previous
    local-search step, or when no swap is possible.

    Args:
        Y: response; a pandas Series (``.values`` is used) or an array-like.
        tree_list: fitted trees; ``tree_list[0].max_depth`` sets the vector length.
        diff_array_list: per-tree difference arrays (see ``difference_array``).
        alpha: regularisation strength.
        learning_rate: weight applied to every tree's contribution.
        W_array: per-tree penalty matrices.
        normalization: divisor applied to the penalty term.
        warm_start: optional initial ``vars_z``; ignored when None or empty.

    Returns:
        ``(vars_z, total_inds)``: the final depth vectors and the number of
        coordinate updates performed.

    Note:
        The local search draws from Python's ``random`` module, so call
        ``random.seed`` beforehand for reproducible results.
    """
    max_depth = tree_list[0].max_depth
    Y = np.array(getattr(Y, "values", Y))

    if warm_start is not None and len(warm_start) > 0:
        # Float copy: the jitted helpers take dot products with float arrays.
        vars_z = np.array(warm_start, dtype=float)
    else:
        vars_z = np.zeros((len(tree_list), max_depth))

    # Row 0: drop the tree; row k (k >= 1): keep the first k levels.
    candidates = np.vstack([np.zeros(max_depth), np.tril(np.ones((max_depth, max_depth)))])

    convergence_scores = []
    converged = False
    ind_counter = 0
    local_best = np.inf  # objective recorded at the previous local-search step
    total_inds = 0
    while not converged:

        cycle_ind = ind_counter % len(vars_z)

        temp_vars = vars_z.copy()
        precompute_pred = precompute_predictions(diff_array_list, temp_vars, learning_rate, cycle_ind)
        scores = evaluate_candidates(diff_array_list, temp_vars, learning_rate, cycle_ind,
                                     candidates, precompute_pred, Y, alpha, W_array, normalization)

        vars_z[cycle_ind] = candidates[np.argmin(scores)]
        convergence_scores.append(eval_obj(Y, diff_array_list, vars_z, learning_rate,
                                           alpha, W_array, normalization))
        converged = converge_test(np.array(convergence_scores), 10**-6, 3)

        ind_counter = ind_counter + 1
        total_inds = total_inds + 1

        # Local search: only perturb when a restart will actually follow, so a
        # converged solution is never returned with a tree randomly removed.
        if converged:
            row_is_zero = np.all(vars_z == 0, axis=1)
            support_indicies = np.where(~row_is_zero)[0]
            zero_indicies = np.where(row_is_zero)[0]

            got_worse = convergence_scores[-1] > local_best
            if (not got_worse) and len(support_indicies) > 0 and len(zero_indicies) > 0:
                local_ind = random.choice(support_indicies)
                vars_z[local_ind] = np.zeros(max_depth)
                ind_counter = min(zero_indicies)
                converged = False
                local_best = convergence_scores[-1]

    return vars_z , total_inds

# Weight Penalties

def nodes_per_layer(tree_list):
    """Build one diagonal penalty matrix per tree from its per-depth node counts.

    For each tree the number of nodes at every depth is counted, the root
    level is dropped, and the remaining counts fill the leading diagonal
    entries of a ``max_depth x max_depth`` matrix (``max_depth`` is taken
    from the first tree). Unused diagonal entries stay zero.
    """
    width = tree_list[0].max_depth
    layers = []
    for member in tree_list:
        _, per_depth = np.unique(get_node_depths(member), return_counts=True)
        below_root = per_depth[1:]
        weights = np.zeros(width)
        weights[:len(below_root)] = below_root
        layers.append(np.diag(weights))

    return np.array(layers)

def total_nodes(tree_list):
    """Total number of non-root nodes in the ensemble.

    Sums ``tree_.node_count`` over ``tree_list`` and subtracts one (the root)
    per tree. Uses the built-in ``sum``: passing a generator to ``np.sum`` is
    deprecated in NumPy.
    """
    return sum(member.tree_.node_count for member in tree_list) - len(tree_list)

def prune_polish(difference_array_list,Y,vars_z,learning_rate):
    """Refit per-tree weights for the retained trees with a ridge regression.

    A tree is retained when the sum of its ``vars_z`` row is positive. The
    scaled predictions of the retained trees form the columns of a design
    matrix, and a no-intercept ``Ridge(alpha=0.01)`` is fitted against ``Y``.

    Returns the fitted coefficients (one per retained tree), or a zero vector
    of length ``len(vars_z)`` when no tree is retained.
    """
    retained_preds = [
        np.dot(difference_array_list[k], vars_z[k]) * learning_rate
        for k in range(len(vars_z))
        if sum(vars_z[k]) > 0
    ]

    if not retained_preds:
        return np.zeros(len(vars_z))

    design = np.transpose(retained_preds)
    fitted = Ridge(alpha = 0.01, fit_intercept = False).fit(design, Y)
    return fitted.coef_

@jit(nopython=True)
def evaluate_test_error_polished(difference_array_list,Y,vars_z,coef,learning_rate):
    """Mean squared error and predictions of the polished, pruned ensemble.

    Only trees whose ``vars_z`` row has a positive sum contribute. The j-th
    such tree (in index order) is weighted by ``coef[j]``, so ``coef`` is
    indexed over retained trees, not over all trees.

    Returns ``(mse, pred)``.
    """
    pred = np.zeros(len(Y))
    j = 0  # position in `coef`; advances only for retained trees
    for i in range(len(vars_z)):
        if sum(vars_z[i])>0:
            pred += np.dot(difference_array_list[i],vars_z[i])*learning_rate*coef[j]  
            j+=1
    return np.square(np.subtract(Y, pred)).mean(), pred

import time
def subensemble_predict(X,tree_list,learning_rate,ntrees):
    """Predict with the first ``ntrees`` trees, each scaled by ``learning_rate``."""
    total = np.zeros(len(X))
    leading_trees = tree_list[:ntrees]
    for member in leading_trees:
        total += member.predict(X) * learning_rate
    return total

def get_node_count_all(tree_list):
    """Total ``tree_.node_count`` over every tree in ``tree_list`` (roots included)."""
    return sum(member.tree_.node_count for member in tree_list)

def lasso_predict(X,tree_list,coef):
    """Weighted ensemble prediction: tree ``i`` contributes ``predict(X) * coef[i]``."""
    total = np.zeros(len(X))
    for position, member in enumerate(tree_list):
        total += member.predict(X) * coef[position]
    return total

import gurobipy as gp
from gurobipy import GRB
from itertools import product

def miqp(features, response, non_zero, warm_up=None, verbose=False, time_limit=180):
    """
    Deploy and optimize the MIQP formulation of L0-Regression.

    Minimises ``0.5 * ||response - features @ beta||^2`` with free (signed)
    weights, subject to exactly ``dim - non_zero`` indicator variables being
    switched on, each of which forces its weight to zero. No intercept column
    is added; ``features`` is used as given.

    Args:
        features: design matrix of shape ``(samples, dim)``.
        response: target vector with ``samples`` entries.
        non_zero: integer budget; must not exceed ``dim``.
        warm_up: optional length-``dim`` vector; entries with absolute value
            below 1e-6 start with their zero-indicator set.
        verbose: when False, solver output is suppressed.
        time_limit: solver time limit in seconds (default 180, the value that
            was previously hard-coded).

    Returns:
        numpy array with the ``dim`` fitted weights.
    """
    assert isinstance(non_zero, (int, np.integer))
    regressor = gp.Model()
    samples, dim = features.shape
    assert samples == response.shape[0]
    assert non_zero <= dim

    # The feature matrix is used unchanged (no intercept column is appended).
    X = features

    # Decision variables
    beta = regressor.addVars(dim, lb=-GRB.INFINITY, name="beta") # Weights

    # iszero[i] = 1 if beta[i] = 0
    iszero = regressor.addVars(dim, vtype=GRB.BINARY, name="iszero")

    # Objective Function (OF): minimize 1/2 * RSS using the fact that
    # if x* is a minimizer of f(x), it is also a minimizer of k*f(x) iff k > 0
    Quad = np.dot(X.T, X)
    lin = np.dot(response.T, X)
    obj = sum(0.5 * Quad[i,j] * beta[i] * beta[j]
              for i, j in product(range(dim), repeat=2))
    obj -= sum(lin[i] * beta[i] for i in range(dim))
    obj += 0.5 * np.dot(response, response)
    regressor.setObjective(obj, GRB.MINIMIZE)

    # Constraint sets
    for i in range(dim):
        # SOS1: at most one of (beta[i], iszero[i]) is non-zero,
        # so iszero[i] = 1 forces beta[i] = 0.
        regressor.addSOS(GRB.SOS_TYPE1, [beta[i], iszero[i]])
    regressor.addConstr(iszero.sum() == dim - non_zero) # Budget constraint

    # We may use the Lasso or prev solution with fewer features as warm start
    if warm_up is not None and len(warm_up) == dim:
        for i in range(dim):
            iszero[i].start = (abs(warm_up[i]) < 1e-6)

    if not verbose:
        regressor.params.OutputFlag = 0
    regressor.params.timelimit = time_limit
    regressor.params.mipgap = 0.001
    regressor.optimize()

    coeff = np.array([beta[i].X for i in range(dim)])
    return  coeff


def miqp_nneg(features, response, non_zero, warm_up=None, verbose=True,time_limit = 60):
    """
    Deploy and optimize the MIQP formulation of L0-Regression with
    non-negative weights.

    Minimises ``0.5 * ||response - features @ beta||^2`` with ``beta >= 0``,
    subject to exactly ``dim - non_zero`` zero-indicators being switched on,
    each of which forces its weight to zero. No intercept column is added.

    Args:
        features: design matrix of shape ``(samples, dim)``.
        response: target vector with ``samples`` entries.
        non_zero: integer budget; must not exceed ``dim``.
        warm_up: optional length-``dim`` vector; entries with absolute value
            below 1e-6 start with their zero-indicator set.
        verbose: when False, solver output is suppressed.
        time_limit: solver time limit in seconds.

    Returns:
        numpy array with the ``dim`` fitted weights.
    """
    assert isinstance(non_zero, (int, np.integer))
    regressor = gp.Model()
    samples, dim = features.shape
    assert samples == response.shape[0]
    assert non_zero <= dim

    # The feature matrix is used unchanged (no intercept column is appended).
    X = features

    
    # Decision variables
    beta = regressor.addVars(dim, lb=0, name="beta") # Weights

    # iszero[i] = 1 if beta[i] = 0  
    iszero = regressor.addVars(dim, vtype=GRB.BINARY, name="iszero") 
    
    # Objective Function (OF): minimize 1/2 * RSS using the fact that
    # if x* is a minimizer of f(x), it is also a minimizer of k*f(x) iff k > 0
    Quad = np.dot(X.T, X)
    lin = np.dot(response.T, X)
    obj = sum(0.5 * Quad[i,j] * beta[i] * beta[j]
              for i, j in product(range(dim ), repeat=2))
    obj -= sum(lin[i] * beta[i] for i in range(dim))
    obj += 0.5 * np.dot(response, response)
    regressor.setObjective(obj, GRB.MINIMIZE)
    
    # Constraint sets
    for i in range(dim):
        # SOS1: at most one of (beta[i], iszero[i]) is non-zero,
        # so iszero[i] = 1 forces beta[i] = 0.
        regressor.addSOS(GRB.SOS_TYPE1, [beta[i], iszero[i]])
    regressor.addConstr(iszero.sum() == dim - non_zero) # Budget constraint

    # We may use the Lasso or prev solution with fewer features as warm start
    if warm_up is not None and len(warm_up) == dim:
        for i in range(dim):
            iszero[i].start = (abs(warm_up[i]) < 1e-6)
    
    if not verbose:
        regressor.params.OutputFlag = 0
    regressor.params.timelimit = time_limit
    regressor.params.mipgap = 0.001
    regressor.optimize()

    # NOTE(review): reading `.X` presumably requires that the solver found a
    # solution within the time limit — confirm behaviour when it did not.
    coeff = np.array([beta[i].X for i in range(dim)])
    return  coeff     



def prune_polish_l0(difference_array_list,Y,vars_z,learning_rate, K, time_limit):
    """Refit weights for the retained trees with the cardinality-constrained solver.

    A tree is retained when the sum of its ``vars_z`` row is positive. The
    scaled predictions of the retained trees form the columns of the design
    matrix passed to ``miqp_nneg`` together with ``Y``, the budget ``K`` and
    ``time_limit``.

    Returns the solver's coefficients (one per retained tree), or a zero
    vector of length ``len(vars_z)`` when no tree is retained.
    """
    retained_preds = [
        np.dot(difference_array_list[k], vars_z[k]) * learning_rate
        for k in range(len(vars_z))
        if sum(vars_z[k]) > 0
    ]

    if not retained_preds:
        return np.zeros(len(vars_z))

    design = np.transpose(retained_preds)
    return miqp_nneg(design, Y, K, time_limit = time_limit)


In [None]:
import os
import pandas as pd
import numpy as np
import pickle as pk
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, FunctionTransformer
import matplotlib.pyplot as plt



"""
load data from  https://www2.census.gov/adrm/PDB/2019/ [pdb2019trv3_us]
"""

def load_data(load_directory='',
              filename='pdb2019trv3_us.csv',
              remove_margin_of_error_variables=False): 
    """Loads Census data, and retrieves covariates and responses.
    
    Args:
        load_directory: Data directory for loading Census file, str.
        filename: file to load, default is 'pdb2019trv3_us.csv'.
        remove_margin_of_error_variables: whether to remove margin of error
            variables (columns whose name contains 'MOE'), bool scalar.
        
    Returns:
        df_X: covariates for rows with a non-missing target, pandas dataframe.
        df_y: target response for the same rows, pandas dataframe.
        df_full: covariates and target for all rows, including those with a
            missing target, pandas dataframe.
    """
    file = os.path.join(load_directory, filename)
    df = pd.read_csv(file, encoding = "ISO-8859-1")
    df = df.set_index('GIDTR')
    
    # Drop location variables
    drop_location_variables = ['State', 'State_name', 'County', 'County_name', 'Tract', 'Flag', 'AIAN_LAND']
    df = df.drop(drop_location_variables, axis=1)
    
    target_response = 'Self_Response_Rate_ACS_13_17'
    # Remove the other response-like variables so they cannot leak into the
    # covariates. This includes 'FRST_FRMS_CEN_2010' (addresses in a 2010
    # Census Mailout/Mailback area where the first form mailed was completed
    # and returned) and 'RPLCMNT_FRMS_CEN_2010' (same, for the replacement form).
    extra_response_variables = [
        'Census_Mail_Returns_CEN_2010',
        'Mail_Return_Rate_CEN_2010',
        'pct_Census_Mail_Returns_CEN_2010',
        'Low_Response_Score',
        'Self_Response_Rate_ACSMOE_13_17',
        'BILQ_Frms_CEN_2010',
        'FRST_FRMS_CEN_2010',
        'RPLCMNT_FRMS_CEN_2010',
        'pct_FRST_FRMS_CEN_2010',
        'pct_RPLCMNT_FRMS_CEN_2010']
    df = df.drop(extra_response_variables, axis=1)
    
    if remove_margin_of_error_variables:
        df = df[[c for c in df.columns if 'MOE' not in c]]

    # Columns read as text carry dollar signs / thousands separators
    # (e.g. income, housing price): strip them and convert to float.
    # Raw string: '\$' is not a valid escape in a normal string literal.
    text_columns = df.select_dtypes('object').columns
    df[text_columns] = df[text_columns].replace(r'[\$,]', '', regex=True).astype(np.float64)

    # Keep a copy with every row, then drop rows whose target is missing
    df_full = df.copy()
    df = df.dropna(subset=[target_response])

    df_y = df[[target_response]]
    df_X = df.drop([target_response], axis=1)

    return df_X, df_y, df_full

def process_data(df_X,
                 df_y,
                 val_ratio=0.1, 
                 test_ratio=0.1, 
                 seed=None,
                 standardize_response=False):
    """Preprocesses covariates and response and generates training, validation and testing sets.
    
      Features are processed as follows:
      Missing values are imputed using the training-set mean. After imputation,
      all features are standardized (fitted on the training set) and rounded
      to 6 decimals.
      Responses are processed as follows:
      Either standardized or passed through unchanged, as selected by
      standardize_response.

    Args:
        df_X: covariates, pandas dataframe of shape (N, p).
        df_y: target response, pandas dataframe with N rows.
        val_ratio: Fraction of the N samples to be used for validation, float scalar.
        test_ratio: Fraction of the N samples to be used for testing, float scalar.
        seed: for reproducibility of results, int scalar.
        standardize_response: whether to standardize target response or not, bool scalar.
        
    Returns:
        X_train: Training processed covariates, float numpy array.
        y_train: Training (processed) responses, numpy array.
        X_val: Validation processed covariates, float numpy array.
        y_val: Validation (processed) responses, numpy array.
        X_test: Test processed covariates, float numpy array.
        y_test: Test (processed) responses, numpy array.
        (x_preprocessor, x_scaler): tuple with the fitted imputing
            ColumnTransformer and the fitted StandardScaler for covariates.
        y_preprocessor: fitted processor for responses, sklearn transformer.
    """        
        
    N, p = df_X.shape
    # Both split sizes are absolute counts derived from the full sample size N.
    df_X_temp, df_X_test, df_y_temp, df_y_test = train_test_split(df_X, df_y, test_size=int(test_ratio*N), random_state=seed)
    df_X_train, df_X_val, df_y_train, df_y_val = train_test_split(df_X_temp, df_y_temp, test_size=int(val_ratio*N), random_state=seed)
    
    print("Number of training samples:", df_X_train.shape[0])
    print("Number of validation samples:", df_X_val.shape[0])
    print("Number of test samples:", df_X_test.shape[0])
    print("Number of covariates:", p)
        
    # Processing covariates: every column is treated as continuous.
    continuous_features = df_X.columns
    continuous_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean'))])

    x_preprocessor = ColumnTransformer(
        transformers=[
            ('continuous', continuous_transformer, continuous_features)])

    X_train = x_preprocessor.fit_transform(df_X_train)
    X_val = x_preprocessor.transform(df_X_val)
    X_test = x_preprocessor.transform(df_X_test)
    
    x_scaler = StandardScaler()
    X_train = x_scaler.fit_transform(X_train)
    X_val = x_scaler.transform(X_val)
    X_test = x_scaler.transform(X_test)    
    X_train = np.round(X_train, decimals=6)
    X_val = np.round(X_val, decimals=6)
    X_test = np.round(X_test, decimals=6)
    
    # Processing target responses
    if standardize_response:
        y_preprocessor = StandardScaler()
    else:
        def identity_func(x):
            return np.array(x)
        y_preprocessor = FunctionTransformer(identity_func) # acts as identity

    y_train = y_preprocessor.fit_transform(df_y_train)
    y_val = y_preprocessor.transform(df_y_val)
    y_test = y_preprocessor.transform(df_y_test)
                
    return X_train, y_train, X_val, y_val, X_test, y_test, (x_preprocessor, x_scaler), y_preprocessor

In [None]:
# Load the Census covariates and response without margin-of-error columns.
df_X, df_y, _ = load_data(filename='pdb2019trv3_us.csv',
                          remove_margin_of_error_variables=True)

# One seed drives both numpy's global RNG and the train/val/test split.
seed = 10
np.random.seed(seed)
X, Y, Xval, Yval, Xtest, Ytest, _, y_scaler = process_data(
    df_X, df_y,
    val_ratio=0.01, test_ratio=0.2,
    seed=seed, standardize_response=True)

# Wrap the processed arrays back into labelled frames / flat response vectors.
colnames = df_X.columns
xTrain = pd.DataFrame(X, columns=df_X.columns)
xTest = pd.DataFrame(Xtest, columns=df_X.columns)
yTrain = np.ravel(Y.copy())
yTest = np.ravel(Ytest.copy())

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import multiprocessing as mp
# Show the number of CPUs reported by the system (the forests below are fit with n_jobs = -1).
mp.cpu_count()

### Random Forest

In [None]:
np.random.seed(seed)

# Copies of the train/test frames used by the models below.
xTrain_sub = xTrain.copy()
xTest_sub = xTest.copy()

rf = RandomForestRegressor(
    n_estimators = 500,
    max_depth = 6,
    max_features = 'sqrt',
    n_jobs = -1,
).fit(xTrain_sub, yTrain)
pred = rf.predict(xTest_sub)
# Test MSE on the scale the model was trained on.
print(mean_squared_error(yTest, pred))

# Test RMSE after mapping response and predictions back through y_scaler.
yTest_org = y_scaler.inverse_transform(yTest.reshape(-1, 1)).flatten()
pred_org = y_scaler.inverse_transform(pred.reshape(-1, 1)).flatten()
print(np.sqrt(mean_squared_error(yTest_org, pred_org)))


In [None]:
from sklearn.tree import DecisionTreeRegressor
# Baseline: one depth-4 regression tree.
tree1 = DecisionTreeRegressor(max_depth = 4)
tree1.fit(xTrain, yTrain)
pred = tree1.predict(xTest_sub)
print(mean_squared_error(yTest, pred))

# Test RMSE after mapping response and predictions back through y_scaler.
yTest_org = y_scaler.inverse_transform(yTest.reshape(-1, 1)).flatten()
pred_org = y_scaler.inverse_transform(pred.reshape(-1, 1)).flatten()
print(np.sqrt(mean_squared_error(yTest_org, pred_org)))


In [None]:
# Build the frame column by column so 'imp' stays numeric; stacking floats and
# strings with np.column_stack coerces both columns to object dtype.
feature_importance = pd.DataFrame({'imp': rf.feature_importances_,
                                   'feats': xTrain_sub.columns})
# Label each bar with its feature name instead of the positional index.
feature_importance.sort_values('imp', ascending = False).plot.bar(x = 'feats', y = 'imp')

In [None]:
# Display the full importance table (one row per covariate).
feature_importance

In [None]:
# A deliberately small forest (10 trees of depth 2) for comparison.
rf_small = RandomForestRegressor(
    n_estimators = 10,
    max_depth = 2,
    max_features = 'sqrt',
    n_jobs = -1,
).fit(xTrain_sub, yTrain)

pred = rf_small.predict(xTest_sub)
print(mean_squared_error(yTest, pred))

# Test RMSE after mapping response and predictions back through y_scaler.
yTest_org = y_scaler.inverse_transform(yTest.reshape(-1, 1)).flatten()
pred_org = y_scaler.inverse_transform(pred.reshape(-1, 1)).flatten()
print(np.sqrt(mean_squared_error(yTest_org, pred_org)))

### Single Plot

In [None]:
tree_list = np.array(rf.estimators_)
tree1 = tree_list[0]

import sys
from EnsemblePlot.EnsemblePlot import EnsemblePlot
import networkx as nx
from networkx.drawing.nx_agraph import graphviz_layout

# `path` (the figure output directory) is not defined anywhere in this
# notebook; default to the working directory so Restart & Run All succeeds.
if 'path' not in globals():
    path = '.'

# Map feature index -> shortened display name for the plot legend.
L1 = np.arange(len(xTrain_sub.columns))
L2 = []
for s in xTrain.columns:
    s = s.replace('_CEN_2010', '').replace('_ACS_13_17','').replace('_','-').replace('pct','Pct')
    L2.append(s)
feature_names = {k:v for k,v in zip(L1,L2)}
feature_names[-2] = 'leaf'
ep = EnsemblePlot([],[],[],[])

# Plot the first forest tree with all six depth levels kept
# (the forest was fit with max_depth = 6).
G_all, graphs = ep.prune_ensemble_graph([tree1], [[1,1,1,1,1,1]],feature_names, method = 'layers')

node_colors,cmap, legend_array = ep.get_colors(G_all, plot_legend = False)
mapping = legend_array[0]
scalarMap = legend_array[1]
pos1 = ep.pos_grid_layout(G_all, 2)

fig = plt.figure(figsize = (18,14))

n1 = nx.draw_networkx_nodes(G_all,pos1, node_color = node_colors, cmap = cmap,
                            node_size = 800, edgecolors = 'black' )
e1 = nx.draw_networkx_edges(G_all,pos1)
# One dummy line per legend entry so matplotlib builds a colour legend.
for label in mapping:
    plt.plot([0],[0],color=scalarMap.to_rgba(mapping[label]),label=label,linewidth = 10)
plt.legend( prop={'size': 18},handlelength = .5,bbox_to_anchor=(1, 1.05))

plt.savefig(path+'/census_single_tree.pdf', bbox_inches='tight')

In [None]:
# Number of features with non-zero importance in tree1.
print('features_used:', np.count_nonzero(tree1.feature_importances_ > 0))

### Forest Prune

In [None]:
tree_list = np.array(rf.estimators_)
# Per-tree penalty matrices and the normaliser for the penalty term.
W_array = nodes_per_layer(tree_list)
normalization = total_nodes(tree_list)

# Every tree contributes equally, as in the forest average.
learning_rate = 1/len(tree_list)

# Reference numbers for the unpruned forest.
base_err = mean_squared_error(yTest,rf.predict(xTest_sub))
# Use the shared helper rather than a loop that rebinds the global `tree1`.
base_rf_nodes = get_node_count_all(tree_list)

# Per-tree, per-depth prediction increments on train and test data.
diff_array_list = difference_array_list(xTrain_sub,tree_list)
diff_test_array_list = difference_array_list(xTest_sub,tree_list)


In [None]:
%%time
np.random.seed(seed)
# solve_weighted's local search draws from Python's `random` module, which
# np.random.seed does not affect; seed it too so the run is reproducible.
random.seed(seed)
best_alpha = 15
vars_best,_ = solve_weighted(pd.Series(yTrain),tree_list,diff_array_list,best_alpha,learning_rate,
                            W_array,normalization,)


In [None]:
# Budget passed to the L0 polishing step.
K = 10
coef_best = prune_polish_l0(
    diff_array_list, yTrain, vars_best, learning_rate, K,
    time_limit = 60,
)


In [None]:
# Test MSE and predictions of the pruned, polished ensemble.
pruned_err, pred_polish = evaluate_test_error_polished(
    diff_test_array_list, yTest, vars_best, coef_best, learning_rate)
print(pruned_err)

# Test RMSE after mapping the predictions back through y_scaler.
pred_polish_org = y_scaler.inverse_transform(pred_polish.reshape(-1, 1)).flatten()
print(np.sqrt(mean_squared_error(yTest_org, pred_polish_org)))

In [None]:
import sys
from EnsemblePlot.EnsemblePlot import EnsemblePlot
import networkx as nx
from networkx.drawing.nx_agraph import graphviz_layout

# Map feature index -> shortened display name for the plot legends.
L1 = np.arange(len(xTrain_sub.columns))
L2 = []
for s in xTrain.columns:
    s = s.replace('_CEN_2010', '').replace('_ACS_13_17','').replace('_','-').replace('pct','Pct')
    L2.append(s)
feature_names = dict(zip(L1, L2))
feature_names[-2] = 'leaf'
ep = EnsemblePlot([],[],[],[])

In [None]:
# Retained trees (non-zero depth vector), ordered by decreasing |coefficient|.
active_mask = [sum(i) != 0 for i in vars_best]
order = np.argsort(-np.abs(coef_best))
trees_ordered = tree_list[active_mask][order]
vars_ordered = vars_best[active_mask][order]
coefs = -np.sort(-np.abs(coef_best))

In [None]:
import matplotlib
# Global matplotlib style for the ensemble figures: LaTeX text rendering,
# 16 pt text and a Times-style font for both regular and math text.
# NOTE(review): 'text.usetex': True presumably needs a working LaTeX install,
# and "Times Roman" must resolve to an installed font — confirm on this machine.
new_rc_params = {'text.usetex': True,
         'svg.fonttype': 'none',
         'font.size': 16,
         'font.family': "Times Roman",
         'mathtext.fontset': 'custom',
         'mathtext.rm': 'Times Roman',
         'mathtext.it': 'Times Roman:italic',
         'mathtext.bf': 'Times Roman:bold'
         }
matplotlib.rcParams.update(new_rc_params)


In [None]:
# Build a graph for the first K ordered trees, passing each tree's depth vector.
G_all, graphs = ep.prune_ensemble_graph(trees_ordered[:K], vars_ordered[:K],feature_names, method = 'layers')


node_colors,cmap, legend_array = ep.get_colors(G_all, plot_legend = False)

# legend_array[0] is iterated for legend labels and indexed for the value passed to to_rgba.
mapping = legend_array[0]


# legend_array[1] is the object whose to_rgba() gives each legend colour.
scalarMap = legend_array[1]
pos1 = ep.pos_grid_layout(G_all, 2)



fig = plt.figure(figsize = (16,8))
ax = plt.subplot(111)
nx.draw(G_all,pos1,node_color = node_colors, 
        cmap = cmap,node_size = 400 , edgecolors = 'black' ,  with_labels = False)
# One dummy line per legend entry so matplotlib builds a colour legend.
for label in mapping:
    plt.plot([0],[0],color=scalarMap.to_rgba(mapping[label]),label=label,linewidth = 10)
    
#plt.legend( prop={'size': 12})
ax.legend(bbox_to_anchor=(.9, 1.))
plt.show()

In [None]:
fig = plt.figure(figsize = (16,8))
ax = plt.subplot(111)

n1 = nx.draw_networkx_nodes(G_all,pos1, node_color = node_colors, cmap = cmap,
                            node_size = 400, edgecolors = 'black' )
e1 = nx.draw_networkx_edges(G_all,pos1)
# One dummy line per legend entry so matplotlib builds a colour legend.
for label in mapping:
    plt.plot([0],[0],color=scalarMap.to_rgba(mapping[label]),label=label,linewidth = 10)
# A single legend call: a second call on the same axes replaces the first,
# which previously discarded the requested font size.
ax.legend(prop={'size': 12}, bbox_to_anchor=(1.25, 1.))

### Analyze Trees

In [None]:
import matplotlib
# Switch the global matplotlib style for the per-tree figures:
# LaTeX rendering off and a much larger (40 pt) font.
new_rc_params = {'text.usetex': False,
         'svg.fonttype': 'none',
         'font.size': 40,
         'font.family': "Times Roman",
         'mathtext.fontset': 'custom',
         'mathtext.rm': 'Times Roman',
         'mathtext.it': 'Times Roman:italic',
         'mathtext.bf': 'Times Roman:bold'
         }
matplotlib.rcParams.update(new_rc_params)

In [None]:
# Position (in the |coefficient| ordering) of the tree to inspect.
ind = 7

tree1 = trees_ordered[ind]
vars1 = vars_ordered[ind]
G_all, graphs = ep.prune_ensemble_graph([tree1], [vars1],feature_names, method = 'layers')
# Absolute polished coefficient of the selected tree.
print(coefs[ind])

node_colors,cmap, legend_array = ep.get_colors(G_all, plot_legend = False)

mapping = legend_array[0]


scalarMap = legend_array[1]
pos1 = ep.pos_grid_layout(G_all, 2)



fig = plt.figure(figsize = (16,8))
ax = plt.subplot(111)
nx.draw(G_all,pos1,node_color = node_colors, 
        cmap = cmap,node_size = 400 , edgecolors = 'black' ,  with_labels = False)
# One dummy line per legend entry so matplotlib builds a colour legend.
for label in mapping:
    plt.plot([0],[0],color=scalarMap.to_rgba(mapping[label]),label=label,linewidth = 10)
    
#plt.legend( prop={'size': 12})
ax.legend(bbox_to_anchor=(.9, 1.))
plt.show()

In [None]:
plt.figure(figsize = (32,12))
# Draw only the retained levels of the selected tree. plot_tree expects an
# integer depth and a sequence of names, so convert the numpy float sum and
# the pandas Index explicitly.
a = sklearn.tree.plot_tree(tree1, max_depth = int(sum(vars1)),
                           feature_names = list(xTrain.columns))

### Additive Model From Ensemble



# Full Model

In [None]:
# Plot the first `ntrees` ordered trees with all-ones depth vectors of length 6
# (the forest was fit with max_depth = 6), for comparison with the pruned figures.
ntrees = 6
G_all_full, graphs_full = ep.prune_ensemble_graph(trees_ordered[:ntrees], 
                        [np.ones(6) for i in range(ntrees)] ,feature_names, method = 'layers')

node_colors1,cmap1, legend_array = ep.get_colors(G_all_full, plot_legend = False)
mapping1 = legend_array[0]

pos2  = ep.pos_grid_layout(G_all_full, 3)

fig = plt.figure(figsize = (16,8))
ax = plt.subplot(111)

# Small markers and thin edges.
n1 = nx.draw_networkx_nodes(G_all_full,pos2,  node_color = node_colors1, cmap = cmap1,
                            node_size = 90, edgecolors = 'black' , linewidths = .15)

e1 = nx.draw_networkx_edges(G_all_full,pos2, width = 0.5, arrowsize = 2)