In [1]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_log_error
from sklearn.metrics.scorer import make_scorer
from sklearn.model_selection import cross_val_score, GridSearchCV
import pickle



In [2]:
data_npz = np.load('./preprocessed/preprocessed_data_with_industry.npz')
X = data_npz['inputs'].astype(np.float)
y = data_npz['targets'].astype(np.float)

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
def custom_rmsle_scorer(y_true,y_predicted):
    y_predicted[y_predicted < 0] = 0
    return np.sqrt(mean_squared_log_error(y_true, y_predicted))

In [None]:
model = RandomForestRegressor()
params = {
    'n_estimators': [100,300,500]
}
grid_search_cv = GridSearchCV(model, params, n_jobs=7, verbose=3, scoring=make_scorer(custom_rmsle_scorer, greater_is_better=False))

In [None]:
grid_search_cv.fit(X, y)

In [5]:
reg_10 = RandomForestRegressor(n_estimators=10, random_state=42, n_jobs=6, verbose=2)
reg_10.fit(X_train, y_train)

pred_10 = reg_10.predict(X_train)
pred_10[pred_10 < 0] = 0
print('Training Loss:')
print(np.sqrt(mean_squared_log_error(y_train, pred_10)))

pred_10 = reg_10.predict(X_test)
pred_10[pred_10 < 0] = 0
print('Testing Loss:')
print(np.sqrt(mean_squared_log_error(y_test, pred_10)))

# Training Loss:
# 0.2374216809169934

[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.


building tree 1 of 10building tree 2 of 10building tree 3 of 10building tree 4 of 10

building tree 5 of 10building tree 6 of 10



building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10


[Parallel(n_jobs=6)]: Done   5 out of  10 | elapsed: 20.8min remaining: 20.8min
[Parallel(n_jobs=6)]: Done  10 out of  10 | elapsed: 32.2min finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   5 out of  10 | elapsed:    1.9s remaining:    1.9s
[Parallel(n_jobs=6)]: Done  10 out of  10 | elapsed:    3.4s finished


Training Loss:
0.23560031078248306


[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   5 out of  10 | elapsed:    0.5s remaining:    0.5s
[Parallel(n_jobs=6)]: Done  10 out of  10 | elapsed:    0.9s finished


Testing Loss:
0.5186870890019066


In [None]:
reg_100 = RandomForestRegressor(n_estimators=10, random_state=42, n_jobs=-1, verbose=2)
reg_100.fit(X_train, y_train)

pred_100 = reg_100.predict(X_train)
pred_100[pred_100 < 0] = 0
print('Training Loss:')
print(np.sqrt(mean_squared_log_error(y_train, pred_100)))

pred_100 = reg_100.predict(X_test)
pred_100[pred_100 < 0] = 0
print('Testing Loss:')
print(np.sqrt(mean_squared_log_error(y_test, pred_100)))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


building tree 1 of 10building tree 2 of 10building tree 3 of 10building tree 4 of 10building tree 5 of 10

building tree 6 of 10building tree 7 of 10

building tree 8 of 10





In [None]:
cv_reg_10 = RandomForestRegressor(n_estimators=10, random_state=42, n_jobs=6, verbose=2)
cv_scores_10 = cross_val_score(cv_reg_10, X, y, scoring=make_scorer(custom_rmsle_scorer, greater_is_better=False))

In [None]:
cv_scores_10

# array([-0.51784263, -0.5187038 , -0.51876523, -0.52485249, -0.52453937])

In [None]:
reg_100 = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=7, verbose=2)
reg_100.fit(X, y)
pred_100 = reg_100.predict(X)
pred_100[pred_100 < 0] = 0
print('Training Loss:')
print(np.sqrt(mean_squared_log_error(y, pred_100)))

In [None]:
pickle.dump(reg_10, open('./models/rf_regressor_with_industry.pkl', 'wb'))