In [27]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
import pandas as pd

### boston

In [28]:
from sklearn.datasets import load_boston
boston = load_boston(return_X_y = False)

print(boston['data'].shape)

X_train, X_test, y_train, y_test = train_test_split(boston['data'], boston['target'], test_size = 0.2)

per_sub = [0.2, 0.4, 0.6, 0.8, 1.0]
rep = 20
n = len(X_train)
p = len(X_train[0])

(506, 13)


In [119]:
U_train = []
U_test = []

for sub in per_sub:
    temp_train = []
    temp_test = []
    for r in range(rep):
        model = RandomForestRegressor(n_estimators = 100, max_features = int(p / 3))
        model.fit(X_train, y_train, n_subsamples = int(n * sub), replace = False)
        temp_train.append(np.sqrt(mean_squared_error(y_train, model.predict(X_train))))
        temp_test.append(np.sqrt(mean_squared_error(y_test, model.predict(X_test))))
    U_train.append(temp_train)
    U_test.append(temp_test)     

In [120]:
print("rmse: " + str(np.mean(U_test, axis = 1)))
print("rmse(std): " + str(np.std(U_test, axis = 1)))

rmse: [3.96961079 3.57541841 3.32254591 3.21026343 3.05876659]
rmse(std): [0.11123226 0.07883159 0.11311473 0.10497909 0.07870243]


In [121]:
V_train = []
V_test = []

for sub in per_sub:
    temp_train = []
    temp_test = []
    for r in range(rep):
        model = RandomForestRegressor(n_estimators = 100, max_features = int(p / 3))
        model.fit(X_train, y_train, n_subsamples = int(n * sub), replace = True)
        temp_train.append(np.sqrt(mean_squared_error(y_train, model.predict(X_train))))
        temp_test.append(np.sqrt(mean_squared_error(y_test, model.predict(X_test))))
    V_train.append(temp_train)
    V_test.append(temp_test)     

In [122]:
print("rmse: " + str(np.mean(V_test, axis = 1)))
print("rmse(std): " + str(np.std(V_test, axis = 1)))

rmse: [4.03583206 3.67994177 3.47046104 3.37382652 3.29436911]
rmse(std): [0.06563793 0.07579034 0.09356659 0.1056693  0.08871229]


### iris

In [197]:
from sklearn.datasets import load_iris
iris = load_iris(return_X_y = False)

print(iris['data'].shape)

X_train, X_test, y_train, y_test = train_test_split(iris['data'], iris['target'], test_size = 0.2)

per_sub = [0.2, 0.4, 0.6, 0.8, 1.0]
rep = 20
n = len(X_train)
p = len(X_train[0])

(150, 4)


In [198]:
U_train = []
U_test = []

for sub in per_sub:
    temp_train = []
    temp_test = []
    for r in range(rep):
        model = RandomForestClassifier(n_estimators = 100)
        model.fit(X_train, y_train, n_subsamples = int(n * sub), replace = False)
        temp_train.append(accuracy_score(y_train, model.predict(X_train)))
        temp_test.append(accuracy_score(y_test, model.predict(X_test)))
    U_train.append(temp_train)
    U_test.append(temp_test)     

In [199]:
print("accuracy: " + str(np.mean(U_test, axis = 1)))
print("accuracy(std): " + str(np.std(U_test, axis = 1)))

accuracy: [0.96666667 0.96666667 0.96333333 0.95       0.93333333]
accuracy(std): [2.22044605e-16 2.22044605e-16 1.00000000e-02 1.66666667e-02
 0.00000000e+00]


In [200]:
V_train = []
V_test = []

for sub in per_sub:
    temp_train = []
    temp_test = []
    for r in range(rep):
        model = RandomForestClassifier(n_estimators = 100)
        model.fit(X_train, y_train, n_subsamples = int(n * sub), replace = True)
        temp_train.append(accuracy_score(y_train, model.predict(X_train)))
        temp_test.append(accuracy_score(y_test, model.predict(X_test)))
    V_train.append(temp_train)
    V_test.append(temp_test)     

In [201]:
print("accuracy: " + str(np.mean(V_test, axis = 1)))
print("accuracy(std): " + str(np.std(V_test, axis = 1)))

accuracy: [0.965      0.96666667 0.96166667 0.96666667 0.96333333]
accuracy(std): [7.26483157e-03 2.22044605e-16 1.19023807e-02 2.22044605e-16
 1.00000000e-02]


### diabetes

In [38]:
from sklearn.datasets import load_diabetes
diabetes = load_diabetes(return_X_y = False)

print(diabetes['data'].shape)

X_train, X_test, y_train, y_test = train_test_split(diabetes['data'], diabetes['target'], test_size = 0.2)

per_sub = [0.2, 0.4, 0.6, 0.8, 1.0]
rep = 20
n = len(X_train)
p = len(X_train[0])

(442, 10)


In [129]:
U_train = []
U_test = []

for sub in per_sub:
    temp_train = []
    temp_test = []
    for r in range(rep):
        model = RandomForestRegressor(n_estimators = 100, max_features = int(p / 3))
        model.fit(X_train, y_train, n_subsamples = int(n * sub), replace = False)
        temp_train.append(np.sqrt(mean_squared_error(y_train, model.predict(X_train))))
        temp_test.append(np.sqrt(mean_squared_error(y_test, model.predict(X_test))))
    U_train.append(temp_train)
    U_test.append(temp_test)     

In [130]:
print("rmse: " + str(np.mean(U_test, axis = 1)))
print("rmse(std): " + str(np.std(U_test, axis = 1)))

rmse: [54.61365047 54.6289104  55.51395925 56.90173789 58.74941694]
rmse(std): [0.69364211 0.51640017 0.60971521 0.56813848 0.81870382]


In [131]:
V_train = []
V_test = []

for sub in per_sub:
    temp_train = []
    temp_test = []
    for r in range(rep):
        model = RandomForestRegressor(n_estimators = 100, max_features = int(p / 3))
        model.fit(X_train, y_train, n_subsamples = int(n * sub), replace = True)
        temp_train.append(np.sqrt(mean_squared_error(y_train, model.predict(X_train))))
        temp_test.append(np.sqrt(mean_squared_error(y_test, model.predict(X_test))))
    V_train.append(temp_train)
    V_test.append(temp_test)     

In [132]:
print("rmse: " + str(np.mean(V_test, axis = 1)))
print("rmse(std): " + str(np.std(V_test, axis = 1)))

rmse: [54.38298276 54.42818446 54.52198443 55.07816249 55.41301178]
rmse(std): [0.70996321 0.63472758 0.77394448 0.56420299 0.8876921 ]


### digits

In [158]:
from sklearn.datasets import load_digits
digits = load_digits(return_X_y = False)

print(digits['data'].shape)

X_train, X_test, y_train, y_test = train_test_split(digits['data'], digits['target'], test_size = 0.2)

per_sub = [0.2, 0.4, 0.6, 0.8, 1.0]
rep = 20
n = len(X_train)
p = len(X_train[0])

(1797, 64)


In [159]:
U_train = []
U_test = []

for sub in per_sub:
    temp_train = []
    temp_test = []
    for r in range(rep):
        model = RandomForestClassifier(n_estimators = 100)
        model.fit(X_train, y_train, n_subsamples = int(n * sub), replace = False)
        temp_train.append(accuracy_score(y_train, model.predict(X_train)))
        temp_test.append(accuracy_score(y_test, model.predict(X_test)))
    U_train.append(temp_train)
    U_test.append(temp_test)     

In [160]:
print("accuracy: " + str(np.mean(U_test, axis = 1)))
print("accuracy(std): " + str(np.std(U_test, axis = 1)))

accuracy: [0.95069444 0.96597222 0.96875    0.97347222 0.97305556]
accuracy(std): [0.00533413 0.00338786 0.00429209 0.00476897 0.00448764]


In [161]:
V_train = []
V_test = []

for sub in per_sub:
    temp_train = []
    temp_test = []
    for r in range(rep):
        model = RandomForestClassifier(n_estimators = 100)
        model.fit(X_train, y_train, n_subsamples = int(n * sub), replace = True)
        temp_train.append(accuracy_score(y_train, model.predict(X_train)))
        temp_test.append(accuracy_score(y_test, model.predict(X_test)))
    V_train.append(temp_train)
    V_test.append(temp_test)     

In [162]:
print("rmse: " + str(np.mean(V_test, axis = 1)))
print("rmse(std): " + str(np.std(V_test, axis = 1)))

rmse: [0.94805556 0.95875    0.96513889 0.96805556 0.96888889]
rmse(std): [0.00535038 0.00577851 0.00492811 0.00477099 0.00357892]


### wine

In [167]:
from sklearn.datasets import load_wine
wine = load_wine(return_X_y = False)

print(wine['data'].shape)

X_train, X_test, y_train, y_test = train_test_split(wine['data'], wine['target'], test_size = 0.2)

per_sub = [0.2, 0.4, 0.6, 0.8, 1.0]
rep = 20
n = len(X_train)
p = len(X_train[0])

(178, 13)


In [170]:
U_train = []
U_test = []

for sub in per_sub:
    temp_train = []
    temp_test = []
    for r in range(rep):
        model = RandomForestClassifier(n_estimators = 100)
        model.fit(X_train, y_train, n_subsamples = int(n * sub), replace = False)
        temp_train.append(accuracy_score(y_train, model.predict(X_train)))
        temp_test.append(accuracy_score(y_test, model.predict(X_test)))
    U_train.append(temp_train)
    U_test.append(temp_test)     

In [171]:
print("accuracy: " + str(np.mean(U_test, axis = 1)))
print("accuracy(std): " + str(np.std(U_test, axis = 1)))

accuracy: [0.96527778 0.99722222 1.         1.         1.        ]
accuracy(std): [0.02129126 0.00833333 0.         0.         0.        ]


In [172]:
V_train = []
V_test = []

for sub in per_sub:
    temp_train = []
    temp_test = []
    for r in range(rep):
        model = RandomForestClassifier(n_estimators = 100)
        model.fit(X_train, y_train, n_subsamples = int(n * sub), replace = True)
        temp_train.append(accuracy_score(y_train, model.predict(X_train)))
        temp_test.append(accuracy_score(y_test, model.predict(X_test)))
    V_train.append(temp_train)
    V_test.append(temp_test)     

In [173]:
print("rmse: " + str(np.mean(V_test, axis = 1)))
print("rmse(std): " + str(np.std(V_test, axis = 1)))

rmse: [0.97777778 0.99722222 0.99722222 0.99861111 0.99722222]
rmse(std): [0.02078699 0.00833333 0.00833333 0.00605403 0.00833333]


### breast_cancer

In [176]:
from sklearn.datasets import load_breast_cancer
breast_cancer = load_breast_cancer(return_X_y = False)

print(breast_cancer['data'].shape)

X_train, X_test, y_train, y_test = train_test_split(breast_cancer['data'], breast_cancer['target'], test_size = 0.2)

per_sub = [0.2, 0.4, 0.6, 0.8, 1.0]
rep = 20
n = len(X_train)
p = len(X_train[0])

(569, 30)


In [177]:
U_train = []
U_test = []

for sub in per_sub:
    temp_train = []
    temp_test = []
    for r in range(rep):
        model = RandomForestClassifier(n_estimators = 100)
        model.fit(X_train, y_train, n_subsamples = int(n * sub), replace = False)
        temp_train.append(accuracy_score(y_train, model.predict(X_train)))
        temp_test.append(accuracy_score(y_test, model.predict(X_test)))
    U_train.append(temp_train)
    U_test.append(temp_test)     

In [178]:
print("accuracy: " + str(np.mean(U_test, axis = 1)))
print("accuracy(std): " + str(np.std(U_test, axis = 1)))

accuracy: [0.97719298 0.9745614  0.975      0.97412281 0.9745614 ]
accuracy(std): [0.00701754 0.0038236  0.00637098 0.0019118  0.00263158]


In [179]:
V_train = []
V_test = []

for sub in per_sub:
    temp_train = []
    temp_test = []
    for r in range(rep):
        model = RandomForestClassifier(n_estimators = 100)
        model.fit(X_train, y_train, n_subsamples = int(n * sub), replace = True)
        temp_train.append(accuracy_score(y_train, model.predict(X_train)))
        temp_test.append(accuracy_score(y_test, model.predict(X_test)))
    V_train.append(temp_train)
    V_test.append(temp_test)     

In [180]:
print("rmse: " + str(np.mean(V_test, axis = 1)))
print("rmse(std): " + str(np.std(V_test, axis = 1)))

rmse: [0.97412281 0.97368421 0.9745614  0.97587719 0.9754386 ]
rmse(std): [0.00705854 0.00392293 0.0038236  0.00470342 0.00350877]


### MARS

In [192]:
n = 500
p = 5

x1 = np.random.uniform(size = n).reshape((n, 1))
x2 = np.random.uniform(size = n).reshape((n, 1))
x3 = np.random.uniform(size = n).reshape((n, 1))
x4 = np.random.uniform(size = n).reshape((n, 1))
x5 = np.random.uniform(size = n).reshape((n, 1))

X = np.concatenate((x1, x2, x3, x4, x5), axis = 1)

epi = np.random.randn(n).reshape((n, 1))

y = 10 * np.sin(np.pi * x1 * x2) + 20 * (x3 - 0.05) ** 2 + 10 * x4 + 5 * x5 + epi

y = y.ravel()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [193]:
n = len(X_train)

U_train = []
U_test = []

for sub in per_sub:
    temp_train = []
    temp_test = []
    for r in range(rep):
        model = RandomForestRegressor(n_estimators = 100, max_features = int(p / 3))
        model.fit(X_train, y_train, n_subsamples = int(n * sub), replace = False)
        temp_train.append(np.sqrt(mean_squared_error(y_train, model.predict(X_train))))
        temp_test.append(np.sqrt(mean_squared_error(y_test, model.predict(X_test))))
    U_train.append(temp_train)
    U_test.append(temp_test)     

In [194]:
print("rmse: " + str(np.mean(U_test, axis = 1)))
print("rmse(std): " + str(np.std(U_test, axis = 1)))

rmse: [3.2105685  2.88504208 2.72109538 2.63288321 2.54646483]
rmse(std): [0.06476772 0.09801547 0.0844877  0.04823382 0.04843217]


In [195]:
V_train = []
V_test = []

for sub in per_sub:
    temp_train = []
    temp_test = []
    for r in range(rep):
        model = RandomForestRegressor(n_estimators = 100, max_features = int(p / 3))
        model.fit(X_train, y_train, n_subsamples = int(n * sub), replace = True)
        temp_train.append(np.sqrt(mean_squared_error(y_train, model.predict(X_train))))
        temp_test.append(np.sqrt(mean_squared_error(y_test, model.predict(X_test))))
    V_train.append(temp_train)
    V_test.append(temp_test)     

In [196]:
print("rmse: " + str(np.mean(V_test, axis = 1)))
print("rmse(std): " + str(np.std(V_test, axis = 1)))

rmse: [3.29075517 2.97063161 2.82012377 2.74983429 2.69654239]
rmse(std): [0.10586008 0.06443378 0.06829424 0.05909813 0.04248608]


### Retinopathy

In [32]:
data = pd.read_csv("messidor_features.data", header = None)
y = data[19].values
X = data[range(19)].values
print(X.shape)

(1151, 19)


In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

per_sub = [0.2, 0.4, 0.6, 0.8, 1.0]
rep = 20
n = len(X_train)
p = len(X_train[0])

In [34]:
U_train = []
U_test = []

for sub in per_sub:
    temp_train = []
    temp_test = []
    for r in range(rep):
        model = RandomForestClassifier(n_estimators = 100)
        model.fit(X_train, y_train, n_subsamples = int(n * sub), replace = False)
        temp_train.append(accuracy_score(y_train, model.predict(X_train)))
        temp_test.append(accuracy_score(y_test, model.predict(X_test)))
    U_train.append(temp_train)
    U_test.append(temp_test)     

In [35]:
print("accuracy: " + str(np.mean(U_test, axis = 1)))
print("accuracy(std): " + str(np.std(U_test, axis = 1)))

accuracy: [0.69350649 0.68246753 0.67662338 0.67077922 0.66558442]
accuracy(std): [0.01558442 0.01122832 0.0113796  0.0081536  0.01253731]


In [36]:
V_train = []
V_test = []

for sub in per_sub:
    temp_train = []
    temp_test = []
    for r in range(rep):
        model = RandomForestClassifier(n_estimators = 100)
        model.fit(X_train, y_train, n_subsamples = int(n * sub), replace = True)
        temp_train.append(accuracy_score(y_train, model.predict(X_train)))
        temp_test.append(accuracy_score(y_test, model.predict(X_test)))
    V_train.append(temp_train)
    V_test.append(temp_test)     

In [37]:
print("rmse: " + str(np.mean(V_test, axis = 1)))
print("rmse(std): " + str(np.std(V_test, axis = 1)))

rmse: [0.68917749 0.69458874 0.68679654 0.68463203 0.68203463]
rmse(std): [0.01345478 0.01265632 0.01256717 0.0108894  0.01165419]
