# In [None]:
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn import tree
from sklearn.linear_model import lasso_path
from sklearn.linear_model import Ridge

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
import time
from numba import jit
import itertools
import random   
import time
import warnings
import gc
import random
import pickle

warnings.filterwarnings("ignore")


# Experiment Functions
def load_openml(data_id,ordinal = True, y_label = ''):
    """Load a dataset by id from OpenML and return an all-numeric feature table.

    Parameters
    ----------
    data_id : int
        OpenML dataset id, forwarded to ``sklearn.datasets.fetch_openml``.
    ordinal : bool, default True
        If truthy, every non-numeric column is replaced by integer category
        codes. Otherwise non-numeric columns are expanded into dummy
        variables with ``pd.get_dummies``.
    y_label : str, default ''
        Name of the target column to use when the fetched target has
        several columns. An empty string uses the target as returned.

    Returns
    -------
    X : pandas.DataFrame
        Shuffled, encoded features with remaining NaNs replaced by the
        column median.
    y : pandas.Series
        Target cast to float, in the same (shuffled) row order as ``X``.
    name : str
        Dataset name reported by OpenML.
    """
    dataset1 = sklearn.datasets.fetch_openml(data_id = data_id,as_frame = True)
    name = dataset1.details['name']
    X, y = dataset1.data, dataset1.target
    data = pd.DataFrame(X,columns = dataset1.feature_names)

    data['y'] = y[y_label] if y_label else y

    # Shuffle rows (draws from numpy's global random state).
    data = data.sample(frac = 1)
    y = data['y'].astype(float)
    X = data.drop('y',axis = 1)

    # Non-numeric columns, kept in their original column order. Building this
    # list from a set difference made the order (and therefore the order of
    # the dummy columns) depend on string hashing, i.e. vary between runs.
    numeric_cols = set(X.select_dtypes(include=np.number).columns)
    cat = [col for col in X.columns if col not in numeric_cols]

    if ordinal:
        for col in cat:
            codes = X[col].astype('category').cat.codes.astype('int64')
            # ``cat.codes`` marks missing values with -1 (never NaN), so a
            # fillna() here would do nothing. Send missing values to their
            # own category one past the largest observed code instead.
            X[col] = codes.where(codes >= 0, codes.max() + 1)
    else:
        X = pd.get_dummies(X,columns = cat)

    # Impute remaining missing values with the column median.
    X = X.fillna(X.median())
    return X,y,name
  
def get_node_depths(tree1):
    """Return the depth of every node of a fitted decision tree.

    Parameters
    ----------
    tree1 : fitted tree estimator or its ``tree_`` object
        Anything exposing ``children_left`` / ``children_right`` arrays,
        either directly or through a ``tree_`` attribute. Both
        ``get_node_depths(d)`` and ``get_node_depths(d.tree_)`` work for a
        fitted ``DecisionTreeClassifier`` / ``DecisionTreeRegressor`` ``d``.

    Returns
    -------
    numpy.ndarray
        One depth per visited node, in pre-order (node, left subtree, right
        subtree); the root has depth 0. For a tree whose root has a leaf on
        the left and a two-leaf subtree on the right the result is
        ``array([0, 1, 1, 2, 2])``.
    """
    # The other helpers in this file pass the estimator; the original
    # docstring example passed ``d.tree_``. Accept both.
    structure = getattr(tree1, 'tree_', tree1)
    left = structure.children_left
    right = structure.children_right

    depths = []
    # Explicit stack instead of recursion, so very deep trees cannot hit the
    # interpreter's recursion limit.
    pending = [(0, 0)]
    while pending:
        node, depth = pending.pop()
        depths.append(depth)
        # A node is expanded only when both children exist (-1 marks "none").
        if left[node] != -1 and right[node] != -1:
            # Push right first so the left subtree is visited first.
            pending.append((right[node], depth + 1))
            pending.append((left[node], depth + 1))
    return np.array(depths)

def get_node_count_pruned(tree_list,best_vars):
    """Count the nodes that remain after cutting each tree at its kept depth.

    ``best_vars`` holds one row per tree; the row sum is used as that tree's
    depth cutoff. A tree whose cutoff is not positive contributes nothing;
    otherwise every node with depth <= cutoff (root included) is counted.
    """
    cutoffs = np.sum(best_vars, axis=1)
    total = 0
    for idx, cutoff in enumerate(cutoffs):
        depths_of_nodes = get_node_depths(tree_list[idx])
        if cutoff > 0:
            total = total + np.sum(depths_of_nodes <= cutoff)
    return total

def get_node_count(tree_list):
    """Return the summed ``tree_.node_count`` over every tree in ``tree_list``."""
    return sum(member.tree_.node_count for member in tree_list)

#Experiment Helper Functions
def lasso_predict(X,tree_list,coef):
    """Predict with a weighted sum of trees: sum_i coef[i] * tree_list[i].predict(X)."""
    total = np.zeros(len(X))
    for position, member in enumerate(tree_list):
        total += coef[position] * member.predict(X)
    return total

def subensemble_predict(X,tree_list,learning_rate,ntrees):
    """Predict with the first ``ntrees`` trees, each scaled by ``learning_rate``."""
    total = np.zeros(len(X))
    leading_trees = tree_list[:ntrees]
    for member in leading_trees:
        total += learning_rate * member.predict(X)
    return total

from numba import jit
import itertools
import random    

# Initialization and evaluation functions
def difference_array_list(X,tree_list):
    """Stack ``difference_array(X, tree)`` for every tree into one array."""
    return np.array([difference_array(X, member) for member in tree_list])

def difference_array(X, tree_learner):
    """Per-sample, per-level change in node value along each decision path.

    Returns an array with one row per sample of ``X`` and
    ``tree_learner.max_depth`` columns. Entry ``[s, k]`` is the value of the
    (k+1)-th node on sample ``s``'s decision path minus the value of the k-th
    node; paths shorter than ``max_depth`` are zero-padded on the right.
    Each row therefore sums to (last path value - first path value).

    Assumes ``decision_path`` lists each sample's nodes in root-to-leaf
    order and that every node carries a single scalar value -- TODO confirm
    for the estimator in use.
    """
    paths = tree_learner.decision_path(X)
    node_values = tree_learner.tree_.value
    offsets = paths.indptr

    rows = []
    for sample in range(len(X)):
        # Node ids visited by this sample (CSR row slice).
        visited = paths.indices[offsets[sample]:offsets[sample + 1]]
        along_path = node_values[visited].ravel()
        deltas = np.diff(along_path)
        padded = np.zeros(tree_learner.max_depth)
        padded[:len(deltas)] = deltas
        rows.append(padded)
    return np.array(rows)

def nodes_per_layer(tree_list):
    """Build, for each tree, a diagonal matrix of node counts per depth level.

    The diagonal has ``tree_list[0].max_depth`` entries; entry k is the
    number of nodes at depth k+1 (the root level is dropped), zero where the
    tree has no nodes at that depth. Returns the matrices stacked into one
    array, one per tree.
    """
    n_levels = tree_list[0].max_depth
    layer_matrices = []
    for member in tree_list:
        # Depths in a tree are contiguous from 0, so bincount gives one count
        # per occupied level; [1:] discards the root.
        per_level = np.bincount(get_node_depths(member))[1:]
        weights = np.zeros(n_levels)
        weights[:len(per_level)] = per_level
        layer_matrices.append(np.diag(weights))
    return np.array(layer_matrices)

def total_nodes(tree_list):
    """Return the number of non-root nodes over all trees in ``tree_list``.

    Sums ``tree_.node_count`` over the trees and subtracts one per tree.
    Uses the builtin ``sum``: passing a generator to ``np.sum`` is
    deprecated in NumPy and only works through a fallback.
    """
    return sum(tree1.tree_.node_count for tree1 in tree_list) - len(tree_list)

@jit(nopython=True)
def evaluate_test_error(difference_array_list,Y,vars_z,learning_rate):
    """Mean squared error of the masked ensemble against ``Y``.

    The prediction is the sum over trees of
    ``learning_rate * difference_array_list[i] @ vars_z[i]``.
    """
    fitted = np.zeros(len(Y))
    n_trees = len(vars_z)
    for t in range(n_trees):
        fitted += learning_rate * np.dot(difference_array_list[t], vars_z[t])
    resid = Y - fitted
    return np.mean(resid * resid)


from numba import jit
import itertools
import random    
@jit(nopython=True)
def precompute_predictions(diff_array_list,temp_vars,learning_rate,cycle_ind):
    """Ensemble prediction with tree ``cycle_ind`` left out.

    Sums ``learning_rate * diff_array_list[i] @ temp_vars[i]`` over every
    tree except ``cycle_ind``.
    """
    n_samples = len(diff_array_list[0])
    partial = np.zeros(n_samples)
    for t in range(len(diff_array_list)):
        if t == cycle_ind:
            continue
        partial += learning_rate * np.dot(diff_array_list[t], temp_vars[t])
    return partial

@jit(nopython=True)
def evaluate_candidates(diff_array_list,temp_vars,learning_rate,cycle_ind,candidates,
                        precompute_pred,Y,alpha,W_array, normalization):
    """Score every candidate mask for tree ``cycle_ind``.

    For each row of ``candidates`` the score is the mean squared error of
    ``precompute_pred`` plus this tree's contribution, plus
    ``(alpha / normalization)`` times the sum of
    ``W_array[cycle_ind] @ candidate``. Returns the scores as a list in
    candidate order.

    Side effect: ``temp_vars[cycle_ind]`` is overwritten with each candidate
    in turn and is left holding the last one.
    """
    penalty_scale = alpha / normalization
    n_obs = len(Y)
    tree_diffs = diff_array_list[cycle_ind]
    layer_weights = W_array[cycle_ind]

    scores = []
    for k in range(len(candidates)):
        candidate = candidates[k]
        temp_vars[cycle_ind] = candidate
        fitted = precompute_pred + np.dot(tree_diffs, candidate) * learning_rate
        mse = np.sum((Y - fitted) ** 2) / n_obs
        penalty = penalty_scale * np.sum(np.dot(layer_weights, candidate))
        scores.append(mse + penalty)
    return scores

@jit(nopython=True)
def eval_obj(Y,diff_array_list,vars_z,learning_rate,alpha,W_array,normalization):
    """Penalised objective for the full mask set ``vars_z``.

    Mean squared error of the masked ensemble plus
    ``alpha / normalization`` times the summed ``W_array[i] @ vars_z[i]``
    over all trees.
    """
    n_obs = len(Y)
    fitted = np.zeros(n_obs)
    penalty = 0.0
    for t in range(len(vars_z)):
        mask = vars_z[t]
        fitted += learning_rate * np.dot(diff_array_list[t], mask)
        penalty += np.sum(np.dot(W_array[t], mask))

    mse = np.sum((Y - fitted) ** 2) / n_obs
    return mse + penalty * alpha / normalization

@jit(nopython=True)
def converge_test(sequence, threshold,tail_length):
    """Return True once the last ``tail_length`` steps of ``sequence`` are all small.

    A step is the difference between consecutive entries. The test is False
    until there are more than ``tail_length`` steps; after that it is True
    when the largest absolute step among the last ``tail_length`` is below
    ``threshold``.
    """
    steps = np.diff(sequence)
    if len(steps) <= tail_length:
        return False
    recent = np.abs(steps[-tail_length:])
    return np.max(recent) < threshold


def solve_weighted(Y,tree_list,diff_array_list,alpha,learning_rate,
                                          W_array,normalization,warm_start= []):
    """Choose a depth mask per tree by cyclic search with a random local-search step.

    One tree is updated at a time: every candidate mask is scored with
    ``evaluate_candidates`` and the lowest-scoring one is kept. After each
    update the full objective (``eval_obj``) is recorded, and the loop is
    considered converged when ``converge_test`` reports that the last 3
    objective changes are all below 1e-6. On convergence a local-search step
    may zero one randomly chosen active tree and restart the cycle.

    Parameters
    ----------
    Y : object with a ``.values`` attribute (e.g. pandas Series)
        Training target.
    tree_list : sequence of fitted trees
        Only ``tree_list[0].max_depth`` and ``len(tree_list)`` are read here.
    diff_array_list, W_array, alpha, learning_rate, normalization
        Forwarded unchanged to the scoring helpers.
    warm_start : sequence, default []
        If non-empty, used as the initial masks instead of all zeros.
        The default list is never mutated in this function.

    Returns
    -------
    vars_z : ndarray, one mask row per tree
    total_inds : int, number of single-tree updates performed
    """
    max_depth = tree_list[0].max_depth
    Y = np.array(Y.values)
    
    vars_z = np.zeros((len(tree_list),max_depth))
    if len(warm_start) > 0:
        vars_z = np.array(warm_start)
    
    # Candidate masks: the all-zero row (tree removed), then rows
    # [1,0,..,0], [1,1,0,..,0], ..., [1,..,1] (keep the first k levels).
    candidates = np.vstack([np.zeros(max_depth),np.tril(np.ones((max_depth,max_depth)))])
    
    convergence_scores = np.array([])
    converged = False
    ind_counter = 0
    local_best = 9999  # objective recorded at the previous local-search restart
    total_inds = 0
    while converged == False:
        
        # Tree updated in this iteration (cycles through all trees).
        cycle_ind = ind_counter % len(vars_z)   

        # Work on a copy: evaluate_candidates overwrites row cycle_ind.
        temp_vars= vars_z.copy()
        precompute_pred = precompute_predictions(diff_array_list,temp_vars,learning_rate,cycle_ind)
        scores = evaluate_candidates(diff_array_list,temp_vars,learning_rate,cycle_ind,
                                     candidates,precompute_pred,Y,alpha,W_array,normalization)
        
        vars_z[cycle_ind] = candidates[np.argmin(scores)]
        convergence_scores = np.append(convergence_scores,eval_obj(Y,diff_array_list,
                                                                   vars_z,learning_rate,alpha,W_array,normalization))
        converged = converge_test(np.array(convergence_scores),10**-6,3)
        
        ind_counter = ind_counter + 1
        total_inds = total_inds + 1
        
        #local search
        if converged == True:
            # Trees with at least one kept level vs. trees that are fully removed.
            support_indicies = np.where(~np.all(vars_z == 0, axis=1))[0]
            zero_indicies = np.where(np.all(vars_z == 0, axis=1))[0]
            
            # Objective is worse than at the last restart: stop.
            # NOTE(review): vars_z is returned as it stands; the state from
            # the last restart is not restored -- confirm this is intended.
            if convergence_scores[-1] > local_best:
                converged = True
            
            elif len(support_indicies)> 0:
                # Perturb: remove one active tree at random.
                # NOTE(review): uses the stdlib `random` module, which
                # np.random.seed does not seed.
                local_ind = random.choice(support_indicies)
                vars_z[local_ind] = np.zeros(max_depth)
                
                if len(zero_indicies) > 0:
                    # Resume cycling from the first tree that was already
                    # all-zero before the perturbation.
                    ind_counter = min(zero_indicies)
                    converged = False
                    local_best = convergence_scores[-1]
                
                else:
                    # NOTE(review): when no tree was all-zero beforehand the
                    # loop ends here with the randomly chosen tree left
                    # zeroed and not re-optimised -- confirm this is intended.
                    converged = True
        
        # Hard cap on the number of single-tree updates.
        if total_inds > 10000:
            break
     
    return vars_z , total_inds

def prune_polish(difference_array_list,Y,vars_z,learning_rate):
    """Refit one weight per surviving tree with a ridge regression.

    A tree survives when its mask row in ``vars_z`` has a positive sum. Each
    surviving tree's masked, learning-rate-scaled prediction becomes one
    column of the design matrix, and a no-intercept ``Ridge(alpha=0.01)`` is
    fitted against ``Y``. Returns the fitted coefficients, one per surviving
    tree in tree order. If no tree survives, returns ``len(vars_z)`` zeros.
    """
    active_columns = [
        np.dot(difference_array_list[idx], mask) * learning_rate
        for idx, mask in enumerate(vars_z)
        if sum(mask) > 0
    ]

    if not active_columns:
        return np.zeros(len(vars_z))

    design = np.transpose(active_columns)
    ridge = Ridge(alpha = 0.01, fit_intercept = False)
    return ridge.fit(design, Y).coef_

@jit(nopython=True)
def evaluate_test_error_polished(difference_array_list,Y,vars_z,coef,learning_rate):
    """Mean squared error of the masked ensemble with per-tree weights.

    Only trees whose mask row has a positive sum contribute; the j-th such
    tree (in tree order) is weighted by ``coef[j]``.
    """
    fitted = np.zeros(len(Y))
    slot = 0  # position in coef of the next surviving tree
    for t in range(len(vars_z)):
        mask = vars_z[t]
        if sum(mask) > 0:
            fitted += np.dot(difference_array_list[t], mask) * learning_rate * coef[slot]
            slot += 1
    resid = Y - fitted
    return np.mean(resid * resid)

import gurobipy as gp
from gurobipy import GRB
from itertools import product

def l0_ensemble_select(features, response,node_count, node_limit, warm_up=None, verbose=False, time_limit = 60):
    """
    Build and solve a Gurobi mixed-integer quadratic program that fits
    least-squares weights on the columns of ``features`` subject to a budget
    on the total node count of the columns with non-zero weight.

    Parameters
    ----------
    features : 2-D array, shape (samples, dim)
        One column per candidate (here: per-tree predictions).
    response : 1-D array-like of length ``samples``
    node_count : sequence
        Cost of each column; ``sum(node_count[i] for selected i)`` must not
        exceed ``node_limit``.
    node_limit : int
    warm_up : sequence of length ``dim`` or None
        Optional previous coefficients; entries with magnitude below 1e-6
        are used to set the start value of the ``iszero`` indicators.
    verbose : bool
        If False, Gurobi output is switched off.
    time_limit : number
        Solver time limit passed to Gurobi.

    Returns
    -------
    coeff : ndarray of length ``dim``
    elapsed : float, wall-clock seconds for model build plus solve

    NOTE(review): ``beta[i].X`` is read unconditionally after ``optimize()``;
    presumably this fails if the solver stops without any solution -- verify.
    """
    t1 = time.time()
    assert isinstance(node_limit, (int, np.integer))
    regressor = gp.Model()
    samples, dim = features.shape
    assert samples == response.shape[0]


    # Append a column of ones to the feature matrix.
    # NOTE(review): every loop below runs over range(dim), so the extra
    # (intercept) column is never referenced and no intercept is fitted.
    X = np.concatenate([features, np.ones((samples, 1))], axis=1)  
    
    # Decision variables
    beta = regressor.addVars(dim, lb=-GRB.INFINITY, name="beta") # Weights
 
    # iszero[i] = 1 forces beta[i] = 0 (enforced by the SOS1 constraints below)
    iszero = regressor.addVars(dim, vtype=GRB.BINARY, name="iszero") 
    
    # Objective Function (OF): minimize 1/2 * RSS using the fact that
    # if x* is a minimizer of f(x), it is also a minimizer of k*f(x) iff k > 0
    # Expanded as 1/2 b'Qb - lin'b + 1/2 y'y over the first `dim` columns.
    Quad = np.dot(X.T, X)
    lin = np.dot(response.T, X)
    obj = sum(0.5 * Quad[i,j] * beta[i] * beta[j]
              for i, j in product(range(dim), repeat=2))
    obj -= sum(lin[i] * beta[i] for i in range(dim))
    obj += 0.5 * np.dot(response, response)
    regressor.setObjective(obj, GRB.MINIMIZE)
    
    # Constraint sets
    for i in range(dim):
        # If iszero[i]=1, then beta[i] = 0
        # (SOS type 1: at most one of the two variables may be non-zero)
        regressor.addSOS(GRB.SOS_TYPE1, [beta[i], iszero[i]])
        
    # Columns with iszero[i] = 0 count as selected and pay node_count[i].
    regressor.addConstr(sum([node_count[i]*(1-iszero[i]) \
                             for i in range(len(node_count))]) <= node_limit) # Budget constraint
    
    # We may use the Lasso or prev solution with fewer features as warm start
    if warm_up is not None and len(warm_up) == dim:
        for i in range(dim):
            iszero[i].start = (abs(warm_up[i]) < 1e-6)    
    if not verbose:
        regressor.params.OutputFlag = 0
    regressor.params.timelimit = time_limit
    regressor.params.mipgap = 0.001
  
    regressor.optimize()

    coeff = np.array([beta[i].X for i in range(dim)])
    t2 = time.time()
    return  coeff, (t2 - t1)

# In [None]:
# Cross-validation folds and gradient-boosting hyperparameters.
n_splits, max_depth, n_estimators = 5, 5, 250
learning_rate = 0.1
subsample = 0.25

# Ensemble sizes tried for the truncated-ensemble baseline: 1, 11, 21, ...
ntree_range = [*range(1, n_estimators, 10)]

# Number of alphas on the lasso path.
n_alphas = 100
# Penalty grid for pruning, from largest (10**2.5) to smallest (10**-6).
alpha_range = np.logspace(-6, 2.5, 100)[::-1]

# OpenML dataset ids to run.
ids = [
    196, 547, 531, 223, 541, 41021, 315, 512, 507, 183, 42570,
    405, 287, 503, 189, 227, 308, 558, 201, 216, 537, 574,
]




# In [None]:
import os

sparse_results = []

# Create the output directory up front so a missing folder cannot throw away
# a finished dataset's results when they are pickled.
os.makedirs('ESLR-Results-Revisions-Range-V3/', exist_ok=True)

for i in ids:
    # Seed both generators: np.random.seed does not seed the stdlib `random`
    # module, which solve_weighted uses for its local-search perturbation.
    np.random.seed(41)
    random.seed(41)
    X,y,name = load_openml(i,False)
    y = pd.Series(y)
    y.index = X.index

    kf = KFold(n_splits=n_splits)
    print(name, i)

    # Per-fold result frames, concatenated once per dataset.
    # (DataFrame.append was removed in pandas 2.0.)
    base_frames = []
    pruned_frames = []
    lasso_frames = []
    competing_frames = []

    for fold, (train_index, test_index) in enumerate(kf.split(X)):

        xTrain, xTest = X.iloc[train_index], X.iloc[test_index]
        yTrain, yTest = y.iloc[train_index], y.iloc[test_index]
        # Halve the held-out fold: (xTest, yTest) is used for tuning below,
        # (xTest1, yTest1) for the final comparison in the BSTS section.
        xTest, xTest1, yTest, yTest1 = train_test_split(xTest, yTest, test_size=0.5)

        # NOTE(review): every split is standardised with its own statistics
        # rather than with the scaler fitted on the training split. The 1.0
        # fallback scores below rely on the targets being standardised per
        # split, so this is left unchanged -- confirm it is intended for X.
        xTrain = preprocessing.StandardScaler().fit_transform(xTrain)
        xTrain = pd.DataFrame(xTrain,columns = X.columns)

        xTest = preprocessing.StandardScaler().fit_transform(xTest)
        xTest = pd.DataFrame(xTest,columns = X.columns)

        xTest1 = preprocessing.StandardScaler().fit_transform(xTest1)
        xTest1 = pd.DataFrame(xTest1,columns = X.columns)

        yTrain = preprocessing.StandardScaler().fit_transform(yTrain.values.reshape(-1, 1))
        yTest = preprocessing.StandardScaler().fit_transform(yTest.values.reshape(-1, 1))
        yTest1 = preprocessing.StandardScaler().fit_transform(yTest1.values.reshape(-1, 1))

        yTrain = pd.Series(yTrain.flatten())
        yTrain.index = xTrain.index
        yTest = pd.Series(yTest.flatten())
        yTest.index = xTest.index

        yTest1 = pd.Series(yTest1.flatten())
        yTest1.index = xTest1.index

        GBR = GradientBoostingRegressor(max_depth = max_depth ,n_estimators = n_estimators,
                    learning_rate = learning_rate,subsample = subsample).fit(xTrain,yTrain)

        tree_list = np.ndarray.flatten(GBR.estimators_)

        # --- Base tuning: truncate the boosted ensemble after ntrees trees ---
        base_results = []
        for ntrees in ntree_range:
            pred_test = subensemble_predict(xTest,tree_list,learning_rate,ntrees)
            score = mean_squared_error(yTest,pred_test)
            nnodes = get_node_count(tree_list[:ntrees])
            base_results.append([ntrees,score,nnodes])
        base_results = pd.DataFrame(base_results,columns = ['ntrees','score','nnodes'])
        base_results['fold'] = fold
        base_frames.append(base_results)

        # --- Pruning: depth masks over the alpha grid ---
        pruned_results = []
        W_array = nodes_per_layer(tree_list)
        normalization = total_nodes(tree_list)

        diff_array_list = difference_array_list(xTrain,tree_list)
        diff_test_array_list = difference_array_list(xTest,tree_list)
        diff_test_array_list1 = difference_array_list(xTest1,tree_list)

        for alpha in alpha_range:
            # Each alpha is solved from a cold start (no warm start is carried over).
            vars1,_ = solve_weighted(yTrain,tree_list,diff_array_list,
                                     alpha,learning_rate,W_array,normalization, warm_start = [])
            score_prune = evaluate_test_error(diff_test_array_list,yTest.values,vars1,learning_rate)
            nodes_prune = get_node_count_pruned(tree_list,vars1)
            ntrees_prune = sum([sum(v) != 0 for v in vars1])

            coef = prune_polish(diff_array_list,yTrain,vars1,learning_rate)
            score_polished = evaluate_test_error_polished(diff_test_array_list,yTest.values,vars1,
                                                          coef,learning_rate)

            pruned_results.append([ntrees_prune,score_prune,nodes_prune,score_polished])
            gc.collect()

        pruned_results = pd.DataFrame(pruned_results,columns = ['ntrees','score','nnodes','score_polished'])
        pruned_results['arange'] = alpha_range
        pruned_results['fold'] = fold
        pruned_frames.append(pruned_results)

        # --- Lasso over per-tree predictions ---
        pred_array = []
        for tree1 in tree_list:
            pred_array.append(tree1.predict(xTrain))
        pred_array = np.transpose(pred_array)
        alphas, coef_path, _ = lasso_path(pred_array, yTrain ,n_alphas = n_alphas, eps = 10**-10)
        coef_path = np.transpose(coef_path)

        lasso_results = []
        for coef in coef_path:
            # Only trees with a strictly positive coefficient are kept.
            subforest = tree_list[coef > 0]
            lasso_pred = lasso_predict(xTest,subforest,coef[coef>0])
            lasso_score = mean_squared_error(yTest,lasso_pred)
            lasso_nnodes = get_node_count(subforest)
            lasso_ntrees = sum(coef>0)
            lasso_results.append([lasso_ntrees,lasso_score,lasso_nnodes])
        lasso_results = pd.DataFrame(lasso_results,columns = ['ntrees','score','nnodes'])
        lasso_results['arange'] = alphas
        lasso_results['fold'] = fold
        lasso_frames.append(lasso_results)

        # --- BSTS: compare all methods under fixed node budgets on the second test half ---
        best_node_budget = pruned_results.iloc[np.argmin(pruned_results['score_polished'])]['nnodes']
        node_count = np.array([tree1.tree_.node_count for tree1 in tree_list])
        pred_test = np.transpose([tree1.predict(xTest1) for tree1 in tree_list])

        competing_algo_results = []

        for nu in [50,100,1000,5000,best_node_budget]:
            l0_coef,l0_time = l0_ensemble_select(pred_array,yTrain,node_count,node_limit = int(nu),
                                                 time_limit = 60)
            l0_score = mean_squared_error(yTest1,pred_test@l0_coef)

            # Smallest penalty / largest ensemble that fits the budget; these
            # are NaN when no tuned configuration fits.
            alpha_fp = pruned_results.loc[pruned_results['nnodes'] <= nu]['arange'].min()
            alpha_l1 = lasso_results.loc[lasso_results['nnodes'] <= nu]['arange'].min()
            ntree_param = base_results.loc[base_results['nnodes']<= nu]['ntrees'].max()

            if not np.isnan(alpha_fp):
                vars_f,_ = solve_weighted(yTrain,tree_list,diff_array_list,
                                          alpha_fp,learning_rate,W_array,normalization, warm_start = [])
                coef_f = prune_polish(diff_array_list,yTrain,vars_f,learning_rate)
                prune_score = evaluate_test_error_polished(diff_test_array_list1,yTest1.values,vars_f,
                                                           coef_f,learning_rate)
            else:
                # Nothing fits the budget: fall back to a score of 1.0.
                prune_score = 1.0

            if not np.isnan(alpha_l1):
                lasso_f  = sklearn.linear_model.Lasso(alpha = alpha_l1,fit_intercept=False ).fit(pred_array,yTrain)
                coef_f = lasso_f.coef_
                subforest_f = tree_list[coef_f > 0]
                lasso_pred_f = lasso_predict(xTest1,subforest_f,coef_f[coef_f>0])
                lasso_score = mean_squared_error(yTest1,lasso_pred_f)
            else:
                lasso_score = 1.0

            if not np.isnan(ntree_param):
                pred_baseline_f = subensemble_predict(xTest1,tree_list,learning_rate,ntree_param)
                baseline_score = mean_squared_error(yTest1,pred_baseline_f)
            else:
                baseline_score = 1.0

            competing_algo_results.append([nu,l0_score,prune_score,lasso_score,baseline_score])

        competing_algo_results1 = pd.DataFrame(competing_algo_results, columns = ['budget','l0_score',
                                                                        'prune_score','lasso_score','baseline_score'])
        competing_algo_results1['fold'] = fold
        competing_frames.append(competing_algo_results1)
        print(fold + 1)

    to_save = {}
    to_save['base_results_df'] = pd.concat(base_frames)
    to_save['pruned_results_df'] = pd.concat(pruned_frames)
    to_save['lasso_results_df'] = pd.concat(lasso_frames)
    to_save['competing_results_df'] = pd.concat(competing_frames)

    with open('ESLR-Results-Revisions-Range-V3/' + name + '.pickle', 'wb') as handle:
        pickle.dump(to_save, handle, protocol=pickle.HIGHEST_PROTOCOL)
        
    
    