In [1]:
import pickle

import numpy as np
from sklearn.ensemble import RandomForestRegressor
# make_scorer comes from the public sklearn.metrics namespace: the private
# sklearn.metrics.scorer module was deprecated in scikit-learn 0.22 and
# removed in 0.24, so importing from it fails on current releases.
from sklearn.metrics import make_scorer, mean_squared_log_error
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split



In [2]:
# Load the preprocessed feature matrix and target vector from the .npz archive.
data_npz = np.load('./preprocessed/preprocessed_data_grouped_industry.npz')
# np.float was only an alias for the builtin float (i.e. float64) and was
# removed in NumPy 1.24; np.float64 yields the same dtype on every version.
X = data_npz['inputs'].astype(np.float64)
y = data_npz['targets'].astype(np.float64)

In [3]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
def custom_rmsle_scorer(y_true, y_predicted):
    """Return the root mean squared logarithmic error (RMSLE).

    Negative predictions are clamped to zero before scoring, because
    ``mean_squared_log_error`` does not accept negative values.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_predicted : array-like
        Predicted target values. This argument is NOT modified; clamping is
        done on a copy so the caller's array keeps its original contents.

    Returns
    -------
    float
        Square root of the mean squared logarithmic error.
    """
    # np.clip allocates a new array, so the caller's predictions stay intact
    # (the previous in-place masking silently overwrote them).
    clipped = np.clip(y_predicted, 0, None)
    return np.sqrt(mean_squared_log_error(y_true, clipped))

In [5]:
# Seed the base estimator like every other forest in this notebook so that
# repeated grid searches build the same trees and give comparable scores.
model = RandomForestRegressor(random_state=42)
# greater_is_better=False: the scorer reports negated RMSLE, so the search
# still maximises the score while minimising the error.
scorer = make_scorer(custom_rmsle_scorer, greater_is_better=False)
# NOTE(review): a node can only be split when it holds at least
# 2 * min_samples_leaf samples, so many (min_samples_split, min_samples_leaf)
# pairs below are presumably equivalent -- consider pruning the grid.
params = {
    'n_estimators': [10],
    'min_samples_split': range(10, 100, 10),
    'min_samples_leaf': range(10, 100, 10)
}
# 3-fold search over the grid, using every available core.
grid_search_cv = GridSearchCV(model, params, cv=3, n_jobs=-1, verbose=3, scoring=scorer)

In [None]:
# Tune on the training split only: fitting the search on the full data set
# would let the held-out rows influence hyper-parameter selection and make
# later test-set scores optimistically biased.
grid_search_cv.fit(X_train, y_train)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed: 19.7min


In [None]:
# Show the GridSearchCV object's repr as this cell's output.
grid_search_cv

In [None]:
# Forest of 10 trees whose leaves each hold at least 10 samples.
reg_10 = RandomForestRegressor(
    n_estimators=10,
    min_samples_leaf=10,
    random_state=42,
    n_jobs=7,
    verbose=2,
)
reg_10.fit(X_train, y_train)

# Report RMSLE on the training split first, then on the held-out split.
for _label, _features, _targets in (
    ('Training Loss:', X_train, y_train),
    ('Testing Loss:', X_test, y_test),
):
    pred_10 = reg_10.predict(_features)
    # Clamp negative predictions to zero before computing the log error.
    pred_10[pred_10 < 0] = 0
    print(_label)
    print(np.sqrt(mean_squared_log_error(_targets, pred_10)))

# Result noted down from a previous run:
# Training Loss:
# 0.2374216809169934

[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.


building tree 1 of 10building tree 2 of 10

building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10building tree 7 of 10



In [12]:
# Forest of 100 trees whose leaves each hold at least 20 samples.
reg_100 = RandomForestRegressor(
    n_estimators=100,
    min_samples_leaf=20,
    random_state=42,
    n_jobs=7,
    verbose=2,
)
reg_100.fit(X_train, y_train)

# Report RMSLE on the training split first, then on the held-out split.
for _label, _features, _targets in (
    ('Training Loss:', X_train, y_train),
    ('Testing Loss:', X_test, y_test),
):
    pred_100 = reg_100.predict(_features)
    # Clamp negative predictions to zero before computing the log error.
    pred_100[pred_100 < 0] = 0
    print(_label)
    print(np.sqrt(mean_squared_log_error(_targets, pred_100)))

[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  27 tasks      | elapsed:    0.9s
[Parallel(n_jobs=7)]: Done 100 out of 100 | elapsed:    3.3s finished


Training Loss:
0.47415543059837073


[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  27 tasks      | elapsed:    0.4s


Testing Loss:
0.4888781421665288


[Parallel(n_jobs=7)]: Done 100 out of 100 | elapsed:    1.0s finished


In [14]:
# Forest of 300 trees whose leaves each hold at least 20 samples.
reg_300 = RandomForestRegressor(
    n_estimators=300,
    min_samples_leaf=20,
    random_state=42,
    n_jobs=7,
    verbose=2,
)
reg_300.fit(X_train, y_train)

# Report RMSLE on the training split first, then on the held-out split.
for _label, _features, _targets in (
    ('Training Loss:', X_train, y_train),
    ('Testing Loss:', X_test, y_test),
):
    pred_300 = reg_300.predict(_features)
    # Clamp negative predictions to zero before computing the log error.
    pred_300[pred_300 < 0] = 0
    print(_label)
    print(np.sqrt(mean_squared_log_error(_targets, pred_300)))

[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.


building tree 1 of 300building tree 2 of 300
building tree 3 of 300
building tree 4 of 300
building tree 5 of 300

building tree 6 of 300
building tree 7 of 300
building tree 8 of 300
building tree 9 of 300
building tree 10 of 300
building tree 11 of 300
building tree 12 of 300
building tree 13 of 300
building tree 14 of 300
building tree 15 of 300
building tree 16 of 300
building tree 17 of 300
building tree 18 of 300
building tree 19 of 300
building tree 20 of 300
building tree 21 of 300
building tree 22 of 300
building tree 23 of 300
building tree 24 of 300
building tree 25 of 300
building tree 26 of 300
building tree 27 of 300
building tree 28 of 300
building tree 29 of 300
building tree 30 of 300
building tree 31 of 300
building tree 32 of 300
building tree 33 of 300
building tree 34 of 300


[Parallel(n_jobs=7)]: Done  27 tasks      | elapsed:  2.7min


building tree 35 of 300
building tree 36 of 300
building tree 37 of 300
building tree 38 of 300
building tree 39 of 300
building tree 40 of 300
building tree 41 of 300
building tree 42 of 300
building tree 43 of 300
building tree 44 of 300
building tree 45 of 300
building tree 46 of 300
building tree 47 of 300
building tree 48 of 300
building tree 49 of 300
building tree 50 of 300
building tree 51 of 300
building tree 52 of 300
building tree 53 of 300
building tree 54 of 300
building tree 55 of 300
building tree 56 of 300
building tree 57 of 300
building tree 58 of 300building tree 59 of 300

building tree 60 of 300
building tree 61 of 300
building tree 62 of 300
building tree 63 of 300
building tree 64 of 300
building tree 65 of 300
building tree 66 of 300
building tree 67 of 300
building tree 68 of 300
building tree 69 of 300
building tree 70 of 300
building tree 71 of 300
building tree 72 of 300
building tree 73 of 300
building tree 74 of 300
building tree 75 of 300
building tree 76

[Parallel(n_jobs=7)]: Done 148 tasks      | elapsed: 15.0min


building tree 155 of 300
building tree 156 of 300
building tree 157 of 300
building tree 158 of 300
building tree 159 of 300
building tree 160 of 300
building tree 161 of 300
building tree 162 of 300
building tree 163 of 300
building tree 164 of 300
building tree 165 of 300
building tree 166 of 300
building tree 167 of 300
building tree 168 of 300
building tree 169 of 300
building tree 170 of 300
building tree 171 of 300
building tree 172 of 300
building tree 173 of 300
building tree 174 of 300
building tree 175 of 300
building tree 176 of 300
building tree 177 of 300
building tree 178 of 300
building tree 179 of 300
building tree 180 of 300
building tree 181 of 300
building tree 182 of 300
building tree 183 of 300
building tree 184 of 300
building tree 185 of 300
building tree 186 of 300
building tree 187 of 300
building tree 188 of 300
building tree 189 of 300
building tree 190 of 300
building tree 191 of 300
building tree 192 of 300
building tree 193 of 300
building tree 194 of 300


[Parallel(n_jobs=7)]: Done 300 out of 300 | elapsed: 28.9min finished
[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  27 tasks      | elapsed:    0.4s
[Parallel(n_jobs=7)]: Done 148 tasks      | elapsed:    3.2s
[Parallel(n_jobs=7)]: Done 300 out of 300 | elapsed:    6.0s finished


Training Loss:
0.4741015677527492


[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  27 tasks      | elapsed:    0.0s
[Parallel(n_jobs=7)]: Done 148 tasks      | elapsed:    0.4s


Testing Loss:
0.48879376104830585


[Parallel(n_jobs=7)]: Done 300 out of 300 | elapsed:    1.0s finished


In [10]:
# 100-tree forest with no leaf-size constraint passed, evaluated by
# cross-validation on the full data set.
cv_reg_100 = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=6,
    verbose=2,
)
# greater_is_better=False makes the scorer return negated RMSLE values.
cv_scores_100 = cross_val_score(
    cv_reg_100,
    X,
    y,
    scoring=make_scorer(custom_rmsle_scorer, greater_is_better=False),
)

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:  4.7min
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed: 18.7min finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:    3.5s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    9.2s finished
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:  5.7min
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed: 20.4min finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:    4.5s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   12.9s finished
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:  6.8min
[Parallel(n_jobs=6)]: Done 100

In [11]:
# Per-fold cross-validation scores. They are negative because the scorer was
# built with greater_is_better=False, i.e. each value is the negated RMSLE.
cv_scores_100

# Presumably the output of an earlier run, kept for comparison:
# array([-0.51784263, -0.5187038 , -0.51876523, -0.52485249, -0.52453937])

array([-0.53218955, -0.52676988, -0.53277349, -0.5391297 , -0.53694207])

In [8]:
# Forest of 500 trees whose leaves each hold at least 20 samples; n_jobs=-1
# lets joblib use every available core.
reg_500 = RandomForestRegressor(
    n_estimators=500,
    min_samples_leaf=20,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)
# Kept as the last expression so the fitted estimator's repr is displayed.
reg_500.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


building tree 1 of 500building tree 2 of 500

building tree 3 of 500
building tree 4 of 500
building tree 5 of 500
building tree 6 of 500building tree 7 of 500
building tree 8 of 500

building tree 9 of 500
building tree 10 of 500
building tree 11 of 500
building tree 12 of 500
building tree 13 of 500
building tree 14 of 500
building tree 15 of 500
building tree 16 of 500
building tree 17 of 500
building tree 18 of 500
building tree 19 of 500
building tree 20 of 500
building tree 21 of 500
building tree 22 of 500
building tree 23 of 500
building tree 24 of 500
building tree 25 of 500
building tree 26 of 500
building tree 27 of 500
building tree 28 of 500
building tree 29 of 500
building tree 30 of 500
building tree 31 of 500
building tree 32 of 500


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  3.6min


building tree 33 of 500
building tree 34 of 500
building tree 35 of 500
building tree 36 of 500
building tree 37 of 500
building tree 38 of 500
building tree 39 of 500
building tree 40 of 500
building tree 41 of 500
building tree 42 of 500
building tree 43 of 500
building tree 44 of 500
building tree 45 of 500
building tree 46 of 500
building tree 47 of 500
building tree 48 of 500
building tree 49 of 500
building tree 50 of 500
building tree 51 of 500
building tree 52 of 500
building tree 53 of 500
building tree 54 of 500
building tree 55 of 500
building tree 56 of 500
building tree 57 of 500
building tree 58 of 500
building tree 59 of 500
building tree 60 of 500
building tree 61 of 500
building tree 62 of 500
building tree 63 of 500
building tree 64 of 500
building tree 65 of 500
building tree 66 of 500
building tree 67 of 500
building tree 68 of 500
building tree 69 of 500
building tree 70 of 500
building tree 71 of 500
building tree 72 of 500
building tree 73 of 500
building tree 74

[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 17.1min


building tree 154 of 500
building tree 155 of 500
building tree 156 of 500
building tree 157 of 500
building tree 158 of 500
building tree 159 of 500
building tree 160 of 500
building tree 161 of 500
building tree 162 of 500
building tree 163 of 500
building tree 164 of 500
building tree 165 of 500
building tree 166 of 500
building tree 167 of 500
building tree 168 of 500
building tree 169 of 500
building tree 170 of 500
building tree 171 of 500
building tree 172 of 500
building tree 173 of 500
building tree 174 of 500
building tree 175 of 500
building tree 176 of 500
building tree 177 of 500
building tree 178 of 500
building tree 179 of 500
building tree 180 of 500
building tree 181 of 500
building tree 182 of 500
building tree 183 of 500
building tree 184 of 500
building tree 185 of 500
building tree 186 of 500
building tree 187 of 500
building tree 188 of 500
building tree 189 of 500
building tree 190 of 500
building tree 191 of 500
building tree 192 of 500
building tree 193 of 500


[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed: 40.9min


building tree 357 of 500
building tree 358 of 500
building tree 359 of 500
building tree 360 of 500
building tree 361 of 500
building tree 362 of 500
building tree 363 of 500
building tree 364 of 500
building tree 365 of 500
building tree 366 of 500
building tree 367 of 500
building tree 368 of 500
building tree 369 of 500
building tree 370 of 500
building tree 371 of 500
building tree 372 of 500
building tree 373 of 500
building tree 374 of 500
building tree 375 of 500
building tree 376 of 500
building tree 377 of 500
building tree 378 of 500
building tree 379 of 500
building tree 380 of 500
building tree 381 of 500
building tree 382 of 500
building tree 383 of 500
building tree 384 of 500
building tree 385 of 500
building tree 386 of 500
building tree 387 of 500
building tree 388 of 500
building tree 389 of 500
building tree 390 of 500
building tree 391 of 500
building tree 392 of 500
building tree 393 of 500
building tree 394 of 500
building tree 395 of 500
building tree 396 of 500


[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 58.1min finished


RandomForestRegressor(min_samples_leaf=20, n_estimators=500, n_jobs=-1,
                      random_state=42, verbose=2)

In [9]:
# Report RMSLE of the 500-tree forest on the training split first, then on
# the held-out split.
for _label, _features, _targets in (
    ('Training Loss:', X_train, y_train),
    ('Testing Loss:', X_test, y_test),
):
    pred_500 = reg_500.predict(_features)
    # Clamp negative predictions to zero before computing the log error.
    pred_500[pred_500 < 0] = 0
    print(_label)
    print(np.sqrt(mean_squared_log_error(_targets, pred_500)))

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:    4.3s
[Parallel(n_jobs=8)]: Done 349 tasks      | elapsed:    9.5s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:   12.8s finished


Training Loss:
0.4740789710278843


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 349 tasks      | elapsed:    1.6s


Testing Loss:
0.48876738883936305


[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    2.4s finished


In [10]:
# Persist the fitted 500-tree forest. The context manager closes the file
# handle (flushing buffered bytes) even if pickling raises, instead of leaving
# an unclosed handle behind.
with open('./models/rf_regressor_with_grouped_industry_500.pkl', 'wb') as model_file:
    pickle.dump(reg_500, model_file)