In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, PredefinedSplit
import lightgbm as lgb
from funs import rmse, clip20, clip40
import pickle

## Data

In [2]:
# Feature frames for the training and validation splits (parquet files under data2/).
X_train, X_val = (
    pd.read_parquet('data2/X_train.parquet'),
    pd.read_parquet('data2/X_val.parquet'),
)
# Label arrays for the same two splits, stored as .npy files.
y_train, y_val = (
    np.load('data2/y_train.npy'),
    np.load('data2/y_val.npy'),
)

## Models

Grid Search for Hyperparameter tuning

### Light Gradient Boosting

In [None]:
# Learning rates to evaluate. Each one trains a fresh booster on the training split
# and is scored by RMSE on the validation split.
learning_rates = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1]

# Parameters shared by every run; only 'learning_rate' changes inside the loop.
base_lgb_params = {
    'feature_fraction': 0.75,
    'metric': 'rmse',
    'nthread': 1,
    'min_data_in_leaf': 2**7,
    'bagging_fraction': 0.75,
    'objective': 'mse',
    'bagging_seed': 2**7,
    'num_leaves': 2**7,
    'bagging_freq': 1,
    'verbose': 0,
}

# Start from +inf so any finite score wins, and keep best_lr / best_lgb defined
# (as None) even if no run produces a comparable score.
best_rmse = float('inf')
best_lr = None
best_lgb = None

for lr in learning_rates:
    print(lr)
    lgb_params = {**base_lgb_params, 'learning_rate': lr}

    # The number of boosting rounds is tied to the learning rate (100 rounds at lr = 0.03).
    # NOTE(review): smaller learning rates get fewer rounds here - confirm this is intended.
    num_rounds = int(100 * (lr / 0.03))

    lgb_model = lgb.train(lgb_params, lgb.Dataset(X_train, label=clip40(y_train)), num_rounds)
    pred_lgb_val = lgb_model.predict(X_val)
    # Training labels go through clip40; scoring applies clip20 to both targets and predictions.
    score = rmse(clip20(y_val), clip20(pred_lgb_val))

    if score < best_rmse:
        best_rmse = score
        best_lr = lr
        best_lgb = lgb_model

In [None]:
# Learning rate that achieved the lowest validation RMSE in the search loop.
best_lr

We retrain the best model on all the data (training and validation sets combined).

In [3]:
# Stack the training and validation splits (in that order) for the final fit.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
X = pd.concat([X_train, X_val])
y = np.append(y_train, y_val)

In [4]:
# Same configuration as the search loop, with the learning rate fixed to the winner.
best_lgb_params = {
    'feature_fraction': 0.75,
    'metric': 'rmse',
    'nthread': 1,
    'min_data_in_leaf': 2**7,
    'bagging_fraction': 0.75,
    'learning_rate': best_lr,
    'objective': 'mse',
    'bagging_seed': 2**7,
    'num_leaves': 2**7,
    'bagging_freq': 1,
    'verbose': 0,
}

# Train with best_lgb_params / best_lr. The loop variables lgb_params / lr must not be
# used here: after the search they hold the LAST learning rate tried, not the best one.
best_num_rounds = int(100 * (best_lr / 0.03))
best_lgb = lgb.train(best_lgb_params, lgb.Dataset(X, label=clip40(y)), best_num_rounds)

NameError: name 'best_lr' is not defined

Save the model

In [None]:
# Serialize the trained LightGBM model to disk with pickle.
lgb_filename = "models/new_best_lgb.sav"
with open(lgb_filename, mode='wb') as model_file:
    pickle.dump(best_lgb, model_file)

In [None]:
# Reload the pickled model and predict on the validation features.
# The context manager closes the file handle; a bare open(...) inside pickle.load leaves it open.
# NOTE: only unpickle files you produced yourself - pickle.load can execute arbitrary code.
with open('models/new_best_lgb.sav', 'rb') as model_file:
    best_lgbb = pickle.load(model_file)
best_lgbb.predict(X_val)

### Random Forest

CV Iterator

In [5]:
# Combined features and targets, training rows first, validation rows after.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
X = pd.concat([X_train, X_val])
Y = np.concatenate([y_train, y_val])

# PredefinedSplit convention: -1 marks rows that are never placed in a test fold
# (the training rows); 0 marks rows that form test fold 0 (the validation rows).
train_ind = np.zeros(X.shape[0])
train_ind[:len(X_train)] = -1
ps = PredefinedSplit(test_fold=train_ind)

In [None]:
# Write the combined feature frame to CSV, keeping the index as the first column.
# DataFrame.to_csv returns None when given a path, so the result is not bound to a name.
X.to_csv('X_data.csv', index=True)

In [None]:
# np.save appends the '.npy' extension when the file name lacks it, so this writes 'Y.npy'.
np.save('Y', Y)

In [None]:
from memory_profiler import profile
from funs import clip40

def rf():
    """Grid-search a random forest on the predefined split and pickle the best estimator.

    Reads the notebook globals ``X``, ``Y`` and ``X_train``. The first
    ``len(X_train)`` rows of ``X`` are marked -1 (never used as a test fold) and
    the remaining rows form the single test fold. The best estimator found by
    ``GridSearchCV`` is written to ``models/best_rf.sav``.

    Returns:
        None.
    """
    #X = pd.read_csv('X_data.csv')
    #Y = np.load('Y')
    test_fold = np.zeros(X.shape[0])
    test_fold[:len(X_train)] = -1
    split = PredefinedSplit(test_fold=test_fold)

    # `bootstrap` is a boolean switch on RandomForestRegressor. Values such as 0.7 and
    # 0.8 are both truthy, so they describe the same model twice, and scikit-learn
    # releases with parameter validation reject non-boolean values outright.
    # NOTE(review): if a per-tree sample fraction was intended, that is `max_samples`.
    param_grid = {
        'bootstrap': [True],
        'max_features': [4, 6, 8],
        'max_depth': [None, 4, 6, 8, 10, 12],
    }
    gs = GridSearchCV(
        cv=split,
        estimator=RandomForestRegressor(n_estimators=300, n_jobs=4),
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
    )
    gs.fit(X, clip40(Y))
    best_rf = gs.best_estimator_

    filename = 'models/best_rf.sav'
    # Context manager guarantees the file is flushed and closed after the dump.
    with open(filename, 'wb') as model_file:
        pickle.dump(best_rf, model_file)
#rf()

Grid Search for Hyperparameter tuning

In [6]:
# Candidate random-forest settings for the grid search.
# `bootstrap` is a boolean switch on RandomForestRegressor. Values such as 0.7 and 0.8
# are both truthy, so they describe the same model twice, and scikit-learn releases
# with parameter validation reject non-boolean values outright.
# NOTE(review): if a per-tree sample fraction was intended, that is `max_samples`.
param_grid = {
    'bootstrap': [True],
    'max_features': [4, 6, 8],
    'max_depth': [None, 4, 6, 8, 10, 12],
}

# `ps` is the predefined split, so each candidate is scored on its single test fold.
gs = GridSearchCV(
    cv=ps,
    estimator=RandomForestRegressor(n_estimators=300, n_jobs=4),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
)

In [None]:
import time

from scitime import Estimator

# Example: estimate the fit time of a random forest regressor with scitime.
estimator = Estimator(meta_algo='RF', verbose=3)
# Dedicated name so the `rf` function defined in this notebook is not shadowed.
rf_example = RandomForestRegressor()

# The synthetic inputs get their own names. Binding them to the globals X / y would
# replace the real data that the grid-search and final-fit cells read.
X_synth, y_synth = np.random.rand(10000000, 10), np.random.rand(10000000, 1)
# run the estimation
estimation, lower_bound, upper_bound = estimator.time(rf_example, X_synth, y_synth)

In [None]:
# Run the grid search: every parameter combination is fit and scored using the
# cv splitter that `gs` was constructed with. Targets are passed through clip40 first.
gs.fit(X, clip40(Y))

In [None]:
# Best cross-validation score. With scoring='neg_mean_squared_error' this is a
# negated MSE, so values closer to 0 are better.
gs.best_score_

In [None]:
# Keep the estimator built with the best parameter combination and display it.
best_rf = gs.best_estimator_
best_rf

We retrain the best model on all the data (training and validation sets combined).

In [None]:
#best_rf = pickle.load(open('models/best_rf.sav', 'rb'))

In [None]:
# Fit on the combined data. Use `Y`, the target array built together with `X` in the
# CV-iterator cell, so features and targets always come from the same concatenation.
best_rf.fit(X, clip40(Y))

Save the model

In [None]:
# Serialize the fitted random forest to disk with pickle.
filename = 'models/best_rf.sav'
# Context manager guarantees the file is flushed and closed after the dump.
with open(filename, 'wb') as model_file:
    pickle.dump(best_rf, model_file)