In [96]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statistics
import sys
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.metrics import mean_squared_error
sys.path.append("../")
from honest_trees import *
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Implement Honest Trees  

In [59]:
def sample_X(n,d):
    '''
    Draw a random design matrix.
    Parameters:
    n: number of rows (observations)
    d: number of columns (features)
    Returns:
    numpy array of shape (n, d) with entries drawn independently and
    uniformly from [0, 1) using numpy's global random state
    '''
    shape = (n, d)
    return np.random.uniform(low=0, high=1.0, size=shape)

In [60]:
def linear_model(X,s,beta,sigma):

    '''
    Create responses from a linear model with hard sparsity: only the first
    s features of X contribute to the signal.
    Parameters:
    X: X matrix, numpy array of shape (n, d)
    s: sparsity, i.e. number of leading columns of X that are active;
       values <= 0 give a pure-noise response
    beta: coefficient(s) of the active features. A scalar is applied to every
          active feature. A vector must have at least s entries; its first s
          entries are used as the per-feature coefficients.
    sigma: s.d. of the added Gaussian noise
    Returns:
    numpy array of shape (n)
    Raises:
    IndexError: if s exceeds the number of columns of X
    ValueError: if beta is a vector with fewer than s entries
    '''

    X = np.asarray(X)
    n_samples, n_features = X.shape
    # A non-positive sparsity means "no active feature" (empty sum).
    n_active = max(s, 0)
    if n_active > n_features:
        raise IndexError(
            "sparsity s=%d exceeds the number of features d=%d" % (s, n_features)
        )

    coef = np.asarray(beta, dtype=float)
    if coef.ndim == 0:
        # Scalar beta: the same coefficient for every active feature.
        coef = np.full(n_active, float(coef))
    else:
        coef = coef.ravel()
        if coef.size < n_active:
            raise ValueError(
                "beta has %d entries but s=%d coefficients are required"
                % (coef.size, n_active)
            )
        coef = coef[:n_active]

    # Vectorized signal: for each row, sum_i X[row, i] * coef[i] over the active features.
    signal = X[:, :n_active] @ coef
    # The noise is always drawn (even when sigma == 0) so that the global random
    # stream advances by the same amount regardless of the noise level.
    return signal + sigma * np.random.randn(n_samples)

In [63]:
# Simulation configuration.
SEED = 0            # fixed seed so that Restart & Run All reproduces the same data
np.random.seed(SEED)

n = 1000            # number of training observations
n_h = 1000          # number of observations in the separate "honest" sample
n_test = 500        # number of test observations
d = 50              # number of features
beta = 1            # coefficient applied to each active feature
sigma = 0.1         # s.d. of the response noise
s = 5               # sparsity: number of active (leading) features

In [80]:
def _draw_dataset(n_rows, noise_sd):
    """Sample a design matrix, then its linear-model responses (X first, y second)."""
    features = sample_X(n_rows, d)
    responses = linear_model(features, s, beta, noise_sd)
    return features, responses


# Training sample, a separate "honest" sample, and a test sample.
# The test responses are generated with zero noise.
X_train, y_train = _draw_dataset(n, sigma)
X_honest, y_honest = _draw_dataset(n_h, sigma)
X_test, y_test = _draw_dataset(n_test, 0)


In [81]:
# Baseline: a regression tree with default hyper-parameters, fitted on the training sample.
CART = DecisionTreeRegressor()
CART.fit(X=X_train, y=y_train)


DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [90]:
# Test MSE computed by the helper imported from honest_trees.
# Here the training sample (X_train, y_train) is passed to the helper, and the
# value shown below is identical to the plain CART test MSE of the next cell.
# NOTE(review): presumably the 2nd/3rd arguments are the honest sample, so passing
# X_honest / y_honest would evaluate a genuinely honest tree — confirm against
# the helper's signature.
get_honest_test_MSE(CART,X_train,y_train,X_test,y_test)

0.1365041045970644

In [91]:
# Test MSE of the fitted CART tree. Arguments follow scikit-learn's documented
# (y_true, y_pred) order; the squared error is symmetric, so the value is unchanged.
mean_squared_error(y_test, CART.predict(X_test))

0.1365041045970644

In [None]:
# Prototype of honest leaf re-estimation: keep the partition learned by CART and
# recompute each leaf's value as the mean response of the "honest" observations
# that fall into that leaf.
# NOTE: this rebinds X_honest / y_honest (drawn in the data-generation cell) to the
# training sample; re-run that cell to get the separate honest sample back.
X_honest = X_train
y_honest = y_train  # was `y`, a name that is not defined anywhere in the visible notebook

# Leaf id that each honest observation is routed to by the fitted tree.
X_honest_leaf_ids = CART.apply(X_honest)
unique_leaf_ids = np.unique(X_honest_leaf_ids)

# Row index -> leaf id.
X_honest_leaf_node_ids = dict(enumerate(X_honest_leaf_ids))

# Leaf id -> mean honest response. A boolean mask selects the rows of each leaf,
# replacing a full Python-level scan of the row->leaf dict per leaf.
leaf_id_to_honest_av = {
    leaf_id: y_honest[X_honest_leaf_ids == leaf_id].mean()
    for leaf_id in unique_leaf_ids
}


In [None]:
# Preview the first few (row index, leaf id) pairs instead of dumping one entry per observation.
list(X_honest_leaf_node_ids.items())[:10]

In [None]:
# Preview a handful of leaf id -> honest mean entries rather than the whole mapping.
dict(list(leaf_id_to_honest_av.items())[:10])

In [None]:
# Preview the row index -> leaf id mapping (first 10 entries only).
dict(list(X_honest_leaf_node_ids.items())[:10])

In [97]:
# Regularised tree: at least 5 samples per leaf, plus cost-complexity pruning
# with strength sigma**2 / n.
# NOTE: this rebinds CART to a new estimator on which no .fit() is called in this cell.
CART = DecisionTreeRegressor(
    min_samples_leaf=5,
    ccp_alpha=sigma ** 2 / n,
)
# 5-fold cross-validation on the training sample; `scoring` is left at its default.
scores = cross_val_score(estimator=CART, X=X_train, y=y_train, cv=5)

In [98]:
# Display the per-fold scores returned by the 5-fold cross_val_score call above.
scores

array([0.64959426, 0.62939412, 0.6897648 , 0.59933809, 0.53755791])