In [None]:
from exp.features import create_train_features
from exp.run import run_experiment
from exp.mappings import alg_map
from exp.train import train_model
import pandas as pd
import json
import numpy as np
import os
import pickle

### Create Training Features

In [None]:
# File names of the cached feature artifacts (resolved against the working directory).
X_save = "X_tr.csv"
y_save = "y_tr.csv"
X_save_scaled = "X_tr_scaled.csv"
scale_params_pickle = "scale_params.pickle"
other_params_json = "other.json"

# Raw training data consumed by create_train_features when nothing is cached.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
train_csv = r'C:\Users\arvin\dev\lanl\train.csv'

# These stay None unless one of the branches below assigns them:
#   * fully cached run (scaled features + targets on disk): all three stay None
#   * partially cached run (unscaled features on disk): only tr_scaler is set
#   * from-scratch run: all three are set
tr_scaler = None
classic_sta_lta5_mean_fill = None
classic_sta_lta7_mean_fill = None

if not (os.path.exists(X_save_scaled) and os.path.exists(y_save)):
    if os.path.exists(X_save) and os.path.exists(y_save):
        # Unscaled features are cached: reload them and re-apply the saved scaler.
        X_tr = pd.read_csv(X_save, index_col=0)
        y_tr = pd.read_csv(y_save, index_col=0)

        # `with` guarantees the handle is closed even if unpickling raises.
        # CAUTION: pickle.load executes arbitrary code; only load a pickle that
        # this notebook wrote itself.
        with open(scale_params_pickle, "rb") as scaler_file:
            tr_scaler = pickle.load(scaler_file)

        X_train_scaled = pd.DataFrame(tr_scaler.transform(X_tr), columns=X_tr.columns)
        X_train_scaled.to_csv(X_save_scaled)
    else:
        # Nothing usable is cached: build the features from the raw data and
        # persist every artifact so later runs can take a cached branch.
        (X_tr, X_train_scaled, y_tr, tr_scaler,
         classic_sta_lta5_mean_fill, classic_sta_lta7_mean_fill) = create_train_features(train_csv)
        X_tr.to_csv(X_save)
        y_tr.to_csv(y_save)
        X_train_scaled.to_csv(X_save_scaled)

        with open(scale_params_pickle, "wb") as scaler_file:
            pickle.dump(tr_scaler, scaler_file)

        with open(other_params_json, 'w') as fp:
            json.dump({"classic_sta_lta5_mean_fill": classic_sta_lta5_mean_fill,
                       "classic_sta_lta7_mean_fill": classic_sta_lta7_mean_fill}, fp)
else:
    # Scaled features and targets are both cached: load them directly.
    X_train_scaled = pd.read_csv(X_save_scaled, index_col=0)
    y_tr = pd.read_csv(y_save, index_col=0)

### Hyper-parameter Experiments

In [None]:
"""
Example of Cartesian Product of Hyper-parameters for Linear Regression

"lr": {"fit_intercept": [False, True], "normalize": [False, True]}

Cartesian Product: {fit_intercept} x {normalize}

Hyper-parameter choices:
"fit_intercept": False, "normalize": False
"fit_intercept": True, "normalize": False
"fit_intercept": False, "normalize": True
"fit_intercept": True, "normalize": True
"""
# Search grid: algorithm key -> {hyper-parameter name -> list of candidate values}.
# Some candidate lists repeat a value on purpose.
# NOTE(review): presumably the repeats bias the random search toward that value —
# confirm against run_experiment.
params = dict(
    lgb=dict(
        n_estimators=[5000, 10000, 1000, 2000, 3000],
        num_leaves=[5, 10, 25, 50, 75, 100],
        min_data_in_leaf=[10, 20, 40, 80, 160, 200],
        objective=["mse", "mae", "huber"],
        max_depth=[-1] * 4 + [5, 10, 20],
        learning_rate=[0.01, 0.1, 0.001],
        boosting=["gbdt"] * 3 + ["dart"],
        bagging_freq=[5] * 3 + [0, 2, 10],
        bagging_fraction=[0.8, 1.0, 0.5],
        verbosity=[-1],
        reg_alpha=[0.15, 0.01, 0.4, 0.001, 0.0],
        reg_lambda=[0.35, 0.1, 0.2, 0.0, 0.01, 0.4],
    ),
    xgb=dict(
        num_boost_round=[20000, 10000, 5000, 50000, 30000, 40000],
        eta=[0.05, 0.01, 0.1],
        max_depth=[10, 3, 5, 20],
        subsample=[0.9, 0.5, 1.0, 0.75, 0.1],
        objective=["reg:linear"],
        silent=[True],
        nthread=[4],
    ),
    cat=dict(
        iterations=[10000, 20000, 30000, 40000, 50000],
        loss_function=["MAE", "RMSE"],
    ),
)

### Run Experiment

In [None]:
# Settings for the experiment run; these names are passed to run_experiment
# (and n_fold to train_model) in the cells below.
num_searches, n_fold = 20, 10
search_type = "random"
# CSV file name handed to run_experiment and read back later in the notebook.
save_results = "exp3.csv"

In [None]:
# Run one hyper-parameter search per algorithm in the grid.
# The search strategy now comes from the `search_type` setting instead of a
# hard-coded "random", so changing the configuration actually takes effect.
# NOTE(review): score_df is rebound on every iteration, so after the loop it
# holds whatever run_experiment returned for the LAST algorithm — confirm
# whether that return value already accumulates earlier results.
for alg, alg_params in params.items():
    print(alg)
    score_df = run_experiment(
        X=X_train_scaled,
        Y=y_tr,
        n_fold=n_fold,
        alg=alg,
        alg_params=alg_params,
        search_type=search_type,
        num_searches=num_searches,
        save_results=save_results,
    )

### Display models ranked by CV scores

In [None]:
# Order the rows by the "score" column (ascending) and render the table.
score_df = score_df.sort_values("score")
display(score_df)

### Load results from CSV file and reproduce models ranked by CV scores

In [None]:
# Reload the saved experiment results and order them by ascending "score".
score_df = pd.read_csv(save_results).sort_values("score")

In [None]:
# Render the full results table, sorted by "score" in the previous cell.
display(score_df)

### Load best model from CSV File

In [None]:
# Retrieve the top-ranked row. score_df was sorted ascending by "score", so
# position 0 is the first row of the ranking. (The previous code read position
# 1, i.e. the second row, which contradicted the stated intent.)
# NOTE(review): assumes a lower score is better — confirm against run_experiment.
best = score_df.iloc[0]
display(best)

# Retrieve the algorithm key and its JSON-encoded parameters from the row.
alg = best["alg"]
params_json = best["params_json"]
print("alg: {}".format(alg))
print("params_json: {}".format(params_json))

# Resolve the model class and decode its parameters.
# The decoded dict is bound to `best_params` rather than `params`, so the
# hyper-parameter search grid named `params` is not overwritten and the
# experiment cell can still be re-run afterwards.
alg_cls = alg_map[alg]
best_params = json.loads(params_json)

# Initialize the model with the selected parameters.
model = alg_cls(**best_params)

# Train the selected model on the scaled training features.
train_model(X=X_train_scaled, Y=y_tr, n_fold=n_fold, model=model)