In [None]:
import json
import os
import pickle

import pandas as pd

from exp.features import create_train_features
from exp.mappings import alg_map
from exp.run import run_experiment
from exp.train import train_model

### Create Training Features

In [None]:
# Cache files for the training artefacts (relative to the working directory).
X_save = "X_tr.csv"
y_save = "y_tr.csv"
X_save_scaled = "X_tr_scaled.csv"
scale_params_pickle = "scale_params.pickle"
other_params_json = "other.json"

# Raw training data; only read when the features have to be built from scratch.
# NOTE(review): machine-specific absolute path -- consider a configurable data directory.
train_csv = r'C:\Users\arvin\dev\lanl\train.csv'

# These remain None unless the branch executed below assigns them.
tr_scaler = None
classic_sta_lta5_mean_fill = None
classic_sta_lta7_mean_fill = None

if os.path.exists(X_save_scaled) and os.path.exists(y_save):
    # Fast path: scaled features and targets are already cached on disk.
    X_train_scaled = pd.read_csv(X_save_scaled, index_col=0)
    y_tr = pd.read_csv(y_save, index_col=0)
elif (os.path.exists(X_save) and os.path.exists(y_save)
        and os.path.exists(scale_params_pickle)):
    # Unscaled features and the fitted scaler are cached: only redo the scaling.
    X_tr = pd.read_csv(X_save, index_col=0)
    y_tr = pd.read_csv(y_save, index_col=0)

    # SECURITY: pickle.load can execute arbitrary code; only load a file that
    # this notebook wrote itself.
    with open(scale_params_pickle, "rb") as scaler_file:
        tr_scaler = pickle.load(scaler_file)

    # NOTE(review): the frame gets a fresh default index rather than X_tr.index;
    # confirm that this matches what create_train_features returns.
    X_train_scaled = pd.DataFrame(tr_scaler.transform(X_tr), columns=X_tr.columns)
    X_train_scaled.to_csv(X_save_scaled)
else:
    # Nothing usable is cached (or the scaler pickle is missing, so the cached
    # raw features cannot be rescaled): build everything and persist it.
    (X_tr, X_train_scaled, y_tr, tr_scaler,
     classic_sta_lta5_mean_fill,
     classic_sta_lta7_mean_fill) = create_train_features(train_csv)

    X_tr.to_csv(X_save)
    y_tr.to_csv(y_save)
    X_train_scaled.to_csv(X_save_scaled)

    with open(scale_params_pickle, "wb") as scaler_file:
        pickle.dump(tr_scaler, scaler_file)

    with open(other_params_json, 'w') as fp:
        json.dump({"classic_sta_lta5_mean_fill": classic_sta_lta5_mean_fill,
                   "classic_sta_lta7_mean_fill": classic_sta_lta7_mean_fill}, fp)

### Example hyper-parameter experiments for different algorithms

In [None]:
# Hyper-parameter search space per algorithm key.  Each inner dict maps a
# constructor argument name to its list of candidate values, so every key must
# match the estimator's keyword argument exactly (no stray whitespace).
# Repeated entries (e.g. "best" twice, None three times) are kept as-is --
# presumably they weight the random search towards those values; TODO confirm.
_alphas = [.000001, .00001, .0001, .001, .01, .1]

params = {
    "lr": {"fit_intercept": [False, True], "normalize": [False, True]},
    "ridge": {"alpha": list(_alphas),
              "fit_intercept": [False, True], "normalize": [False, True]},
    "lasso": {"alpha": list(_alphas),
              "fit_intercept": [False, True], "normalize": [False, True]},
    "elastic": {"alpha": list(_alphas),
                "fit_intercept": [False, True], "normalize": [False, True],
                "l1_ratio": [.01, .99, .2, .4, .6, .8]},
    "dtreg": {"criterion": ["mse", "friedman_mse", "mae"],
              "splitter": ["best", "best", "random"],
              "max_depth": [None, None, None, 5, 10, 20, 50]},
    # NOTE(review): "splitter" is a decision-tree argument; verify that the
    # estimator alg_map["rfreg"] resolves to actually accepts it.
    "rfreg": {"n_estimators": [5, 10, 50, 100, 200, 100],
              "criterion": ["mse", "friedman_mse", "mae"],
              "splitter": ["best", "best", "random"],
              "max_depth": [None, None, None, 5, 10, 20, 50]},
    # Key was "learning_rate " (trailing space), which cannot match the
    # estimator's `learning_rate` argument.
    "abreg": {"n_estimators": [5, 10, 50, 100, 200, 100],
              "learning_rate": [1, .9, .5, .1],
              "loss": ["linear", "square", "exponential"]},
    "gbreg": {"n_estimators": [5, 10, 50, 100, 200, 100],
              "learning_rate": [1, .9, .5, .1],
              "loss": ["ls", "lad", "huber", "quantile"],
              "subsample": [.1, .2, .5, 1.0]},
}

### Run Experiment (for Linear Regression algorithm)

In [None]:
# Settings shared with the later cells: the key into `params`, the fold count
# handed to run_experiment, and the CSV file the results are saved to.
alg = "lr"
n_fold = 10
save_results = "results.csv"

# Random search with two draws from the grid registered for `alg`.
score_df = run_experiment(
    X=X_train_scaled,
    Y=y_tr,
    n_fold=n_fold,
    alg=alg,
    alg_params=params[alg],
    search_type="random",
    num_searches=2,
    save_results=save_results,
)

In [None]:
# Render the results object returned by the run_experiment call above.
display(score_df)

### Run Experiment (for Elastic algorithm) and append to CSV results

In [None]:
# Switch to the elastic grid.  `score_df` receives the results CSV path here,
# presumably so the new rows are appended to the saved ones -- verify against
# run_experiment.
alg = "elastic"
score_df = run_experiment(
    score_df=save_results,
    X=X_train_scaled,
    Y=y_tr,
    n_fold=n_fold,
    alg=alg,
    alg_params=params[alg],
    search_type="random",
    num_searches=2,
    save_results=save_results,
)

In [None]:
# Render the results object returned by the elastic run_experiment call above.
display(score_df)

### Display models ranked by CV scores

In [None]:
# Order the rows by the "score" column, ascending (the pandas default), so the
# smallest score comes first -- presumably lower is better; TODO confirm.
score_df = score_df.sort_values("score")
display(score_df)

### Load results from CSV file and reproduce models ranked by CV scores

In [None]:
# Rebuild the ranked table from the saved CSV instead of the in-memory results:
# read it back and sort by "score", ascending.
score_df = pd.read_csv(save_results).sort_values("score")
display(score_df)

### Load best model from CSV file and train it

In [None]:
# Take the first row of the sorted table (the smallest "score").
best = score_df.iloc[0]
display(best)

# Pull the algorithm key and its JSON-encoded parameters out of that row.
alg = best["alg"]
params_json = best["params_json"]
print("alg: {}".format(alg))
print("params_json: {}".format(params_json))

# Resolve the estimator class and decode its constructor arguments.  They are
# kept in `best_params` rather than `params` so the hyper-parameter grid
# defined earlier is not overwritten; otherwise re-running an experiment cell
# afterwards would fail on `params[alg]`.
alg_cls = alg_map[alg]
best_params = json.loads(params_json)

# Instantiate the model with the stored parameters.
model = alg_cls(**best_params)

# Train it on the full scaled training set, using the same fold count as the
# experiments.
train_model(X=X_train_scaled, Y=y_tr, n_fold=n_fold, model=model)