In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import joblib
import sklearn
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVR
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
import yaml
import time

In [2]:
sklearn. __version__

'1.0.1'

In [29]:
def read_data(params):
    x_train = joblib.load(params['DUMP_TRAIN'])
    y_train = joblib.load(params['Y_PATH_TRAIN'])
    x_valid = joblib.load(params['DUMP_VALID'])
    y_valid = joblib.load(params['Y_PATH_VALID'])

    return x_train, y_train, x_valid, y_valid


def model_lasso():
    param_dist = {'alpha': np.random.uniform(0.01,1,3)}
    base_model = Lasso(random_state=42, selection='random')
    return param_dist, base_model


def model_rf():
    param_dist = {"n_estimators": [100, 250, 500, 1000]}
    base_model = RandomForestRegressor(random_state=0, n_jobs=-1)
    return param_dist, base_model


def model_svr():
    param_dist = {'C': [0.25, 0.5, 1, 1.25]}
    base_model = LinearSVR(loss = 'squared_epsilon_insensitive', dual=False, max_iter=10000)
    return param_dist, base_model


def random_search_cv(model, param, scoring, n_iter, x, y, verbosity=0):
    random_fit = RandomizedSearchCV(estimator=model,
                                    param_distributions=param,
                                    scoring=scoring,
                                    n_iter=n_iter,
                                    cv=5,
                                    random_state=0,
                                    verbose=verbosity, refit=scoring[0])
    random_fit.fit(x, y)
    return random_fit


def evaluate(true, predicted):
    mae = metrics.mean_absolute_error(true, predicted)
    mse = metrics.mean_squared_error(true, predicted)
    rmse = np.sqrt(metrics.mean_squared_error(true, predicted))
    r2_square = metrics.r2_score(true, predicted)
    mape = metrics.mean_absolute_percentage_error(true, predicted)
    exp_var = metrics.explained_variance_score(true, predicted)
    return mae, mse, rmse, r2_square, mape, exp_var


def fit(x_train, y_train, model, model_param, general_params):
    """
    Fit model

    Args:
        - model(callable): Sklearn / imblearn model
        - model_param(dict): sklearn's RandomizedSearchCV param_distribution
        - general_params(dict):x general parameters for the function
            - target(str) : y column to be used   
            - scoring(str) : sklearn cross-val scoring scheme
            - n_iter_search : RandomizedSearchCV number of iteration
    """
    #print( general_params['scoring'])

    model_fitted = random_search_cv(model, model_param,
                                    general_params['scoring'],
                                    general_params['n_iter_search'],
                                    x_train, y_train,
                                    general_params['verbosity'])

    
    print(
        f'Model: {model_fitted.best_estimator_}, {general_params["scoring"][0]}: {model_fitted.best_score_}')

    return model_fitted, model_fitted.best_estimator_


def validation_score(x_valid, y_valid, model_fitted):
    
    # Report default
    y_predicted = model_fitted.predict(x_valid)
    mae, mse, rmse, r2_square, mape, exp_var = evaluate(y_valid, y_predicted)
    score = {'mae':mae, 'mse':mse, 'rmse':rmse, 'r2': r2_square, 'mape': mape, 'exp_var': exp_var}

    return score

def select_model(train_log_dict):
    temp = []
    for score in train_log_dict['model_score']:
        temp.append(score['rmse'])
    #print(temp)
    best_model = train_log_dict['model_fit'][temp.index(min(temp))]
    best_parameter = train_log_dict['model_report'][temp.index(min(temp))]
    best_report = train_log_dict['model_score'][temp.index(min(temp))]
    
    return best_model, best_parameter, best_report

In [30]:
def main(params):

    lasso = model_lasso
    rf = model_rf
    lsvr = model_svr

    train_log_dict = {'model': [lasso, rf, lsvr],
                      'model_name': [],
                      'model_fit': [],
                      'model_report': [],
                      'model_score': [],
                      'fit_time': []}

    x_train, y_train, x_valid, y_valid  = read_data(params)

    for model in train_log_dict['model']:
        param_model, base_model = model()
        train_log_dict['model_name'].append(base_model.__class__.__name__)
        print(
           f'Fitting {base_model.__class__.__name__}')

        # Train
        t0 = time.time()
        fitted_model,best_estimator = fit(
            x_train, y_train, base_model, param_model, params)
        elapsed_time = time.time() - t0
        print(f'elapsed time: {elapsed_time} s \n')
        train_log_dict['fit_time'].append(elapsed_time)
        train_log_dict['model_fit'].append(best_estimator.__class__.__name__)
        train_log_dict['model_report'].append(best_estimator)

        fitted_model.fit(x_train, y_train)
        # Validate
        score = validation_score( x_valid, y_valid, fitted_model)
        #train_log_dict['model_score'].append(
        #    report['f1-score']['macro avg'])
        train_log_dict['model_score'].append(
            score)


    best_model, best_parameter, best_report = select_model(
        train_log_dict)
    print(
        f"Model: {best_model}, Score: {best_report}, Parameter: {best_parameter}")
    joblib.dump(best_model, 'output/isrelated_model.pkl')
    joblib.dump(best_parameter, 'output/isrelated_parameter.pkl')
    joblib.dump(train_log_dict, 'output/isrelated_train_log.pkl')
    

In [27]:
f = open("src/params/preprocess_params.yaml", "r")
params = yaml.load(f, Loader=yaml.SafeLoader)
f.close()

In [28]:
main(params)

Fitting Lasso
Fitting 5 folds for each of 3 candidates, totalling 15 fits


  model = cd_fast.enet_coordinate_descent(


[CV] END ............................alpha=0.901620047026447; total time=   0.3s


  model = cd_fast.enet_coordinate_descent(


[CV] END ............................alpha=0.901620047026447; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ............................alpha=0.901620047026447; total time=   0.3s


  model = cd_fast.enet_coordinate_descent(


[CV] END ............................alpha=0.901620047026447; total time=   0.3s


  model = cd_fast.enet_coordinate_descent(


[CV] END ............................alpha=0.901620047026447; total time=   0.3s


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=0.9341182081296724; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=0.9341182081296724; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=0.9341182081296724; total time=   0.3s


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=0.9341182081296724; total time=   0.3s


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=0.9341182081296724; total time=   0.3s


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=0.6153319202642722; total time=   0.3s


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=0.6153319202642722; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=0.6153319202642722; total time=   0.3s


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=0.6153319202642722; total time=   0.3s


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=0.6153319202642722; total time=   0.3s


  model = cd_fast.enet_coordinate_descent(


Model: Lasso(alpha=0.9341182081296724, random_state=42, selection='random'), neg_root_mean_squared_error: -45943.54429395886
elapsed time: 7.045105934143066 s 

Fitting RandomForestRegressor
Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] END ...................................n_estimators=500; total time=   5.5s
[CV] END ...................................n_estimators=500; total time=   5.1s
[CV] END ...................................n_estimators=500; total time=   5.3s
[CV] END ...................................n_estimators=500; total time=   6.2s
[CV] END ...................................n_estimators=500; total time=   5.2s
[CV] END ..................................n_estimators=1000; total time=   9.4s
[CV] END ..................................n_estimators=1000; total time=  10.5s
[CV] END ..................................n_estimators=1000; total time=  10.4s
[CV] END ..................................n_estimators=1000; total time=  10.5s
[CV] END ...........