In [2]:
from sklearn.model_selection import train_test_split, ShuffleSplit, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
from pickle import dump

# Load Data

In [3]:
dataset = []
with open("to_train_bak.txt", "r") as f:
    for line in f.readlines():
        dataset.append([float(a) for a in line.strip().split(" ")])
dataset = np.array(dataset, dtype=np.float32)

In [25]:
g_x = dataset[:, :3]
g_y = dataset[:, 3]
a_x = dataset[:, 4:7]
a_y = dataset[:, 7]

In [26]:
g_x_train, g_x_test, g_y_train, g_y_test = train_test_split(g_x, g_y, shuffle=True, test_size=0.2)
a_x_train, a_x_test, a_y_train, a_y_test = train_test_split(a_x, a_y, shuffle=True, test_size=0.2)

# Prepare Training

In [27]:
params = {
 'criterion': ['mse'],
 'max_features': ['auto', 'sqrt'],
    'max_depth': [10, 21],
 'min_samples_leaf': [2, 4, 8],
 'min_samples_split': [3, 6, 9],
 'n_estimators': [10, 50, 100],
 'random_state': [42],}

In [28]:
cv_g = ShuffleSplit(g_x_train.shape[0], test_size=0.2, random_state=42)
cv_a = ShuffleSplit(a_x_train.shape[0], test_size=0.2, random_state=42)

In [29]:
regressor_g = GridSearchCV(estimator=RandomForestRegressor(), cv=cv_g, param_grid=params, verbose=2, n_jobs=50)
regressor_a = GridSearchCV(estimator=RandomForestRegressor(), cv=cv_a, param_grid=params, verbose=2, n_jobs=50)

# Training

In [30]:
regressor_g.fit(g_x_train, g_y_train)
regressor_a.fit(a_x_train, a_y_train)

Fitting 1325 folds for each of 108 candidates, totalling 143100 fits


[Parallel(n_jobs=50)]: Using backend LokyBackend with 50 concurrent workers.
[Parallel(n_jobs=50)]: Done  62 tasks      | elapsed:    3.2s
[Parallel(n_jobs=50)]: Done 265 tasks      | elapsed:    4.9s
[Parallel(n_jobs=50)]: Done 548 tasks      | elapsed:    6.1s
[Parallel(n_jobs=50)]: Done 913 tasks      | elapsed:    7.7s
[Parallel(n_jobs=50)]: Done 1358 tasks      | elapsed:   10.2s
[Parallel(n_jobs=50)]: Done 1885 tasks      | elapsed:   20.5s
[Parallel(n_jobs=50)]: Done 2492 tasks      | elapsed:   32.0s
[Parallel(n_jobs=50)]: Done 3181 tasks      | elapsed:   54.9s
[Parallel(n_jobs=50)]: Done 3950 tasks      | elapsed:  1.4min
[Parallel(n_jobs=50)]: Done 4801 tasks      | elapsed:  1.5min
[Parallel(n_jobs=50)]: Done 5732 tasks      | elapsed:  1.6min
[Parallel(n_jobs=50)]: Done 6745 tasks      | elapsed:  2.0min
[Parallel(n_jobs=50)]: Done 7838 tasks      | elapsed:  2.7min
[Parallel(n_jobs=50)]: Done 9013 tasks      | elapsed:  2.8min
[Parallel(n_jobs=50)]: Done 10268 tasks      

Fitting 1325 folds for each of 108 candidates, totalling 143100 fits


[Parallel(n_jobs=50)]: Done  62 tasks      | elapsed:    0.4s
[Parallel(n_jobs=50)]: Done 430 tasks      | elapsed:    2.1s
[Parallel(n_jobs=50)]: Done 996 tasks      | elapsed:    4.5s
[Parallel(n_jobs=50)]: Done 1663 tasks      | elapsed:   12.9s
[Parallel(n_jobs=50)]: Done 2108 tasks      | elapsed:   22.0s
[Parallel(n_jobs=50)]: Done 2635 tasks      | elapsed:   32.8s
[Parallel(n_jobs=50)]: Done 3242 tasks      | elapsed:   56.7s
[Parallel(n_jobs=50)]: Done 3931 tasks      | elapsed:  1.4min
[Parallel(n_jobs=50)]: Done 4700 tasks      | elapsed:  1.5min
[Parallel(n_jobs=50)]: Done 5551 tasks      | elapsed:  1.6min
[Parallel(n_jobs=50)]: Done 6482 tasks      | elapsed:  1.9min
[Parallel(n_jobs=50)]: Done 7495 tasks      | elapsed:  2.5min
[Parallel(n_jobs=50)]: Done 8588 tasks      | elapsed:  2.9min
[Parallel(n_jobs=50)]: Done 9763 tasks      | elapsed:  3.1min
[Parallel(n_jobs=50)]: Done 11018 tasks      | elapsed:  3.6min
[Parallel(n_jobs=50)]: Done 12355 tasks      | elapsed:  

GridSearchCV(cv=ShuffleSplit(n_splits=1325, random_state=42, test_size=0.2, train_size=None),
             error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_we...
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jobs=50,
 

# Testing

In [45]:
regressor_g_best_est = regressor_g.best_estimator_
regressor_a_best_est = regressor_a.best_estimator_

In [46]:
g_y_predict = regressor_g_best_est.predict(g_x_test)
mean_squared_error(g_y_test, g_y_predict)

0.00016762060546507158

In [47]:
a_y_predict = regressor_a_best_est.predict(a_x_test)
mean_squared_error(a_y_test, a_y_predict)

0.00016189402244890286

# Save Model

In [48]:
with open("gamma_model.pkl", "wb") as f:
    dump(regressor_g_best_est, f)
with open("alpha_model.pkl", "wb") as f:
    dump(regressor_a_best_est, f)