In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import joblib
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVR
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
import random
import yaml

In [2]:
# Load the preprocessing parameters from YAML. The context manager guarantees
# the file handle is closed even if parsing raises.
# SafeLoader only builds plain Python objects (no arbitrary object construction).
with open("src/params/preprocess_params.yaml", "r") as f:
    params = yaml.load(f, Loader=yaml.SafeLoader)

In [3]:
def read_data():
    """
    Load the persisted train/validation objects from the ``output`` directory.

    Returns:
        tuple: (x_train, y_train, x_valid, y_valid), each loaded with
        ``joblib.load`` in that order.
    """
    # NOTE(review): joblib.load unpickles the files; only use with trusted artifacts.
    artifact_paths = (
        'output/x_train.pkl',
        'output/y_train.pkl',
        'output/x_valid.pkl',
        'output/y_valid.pkl',
    )
    return tuple(joblib.load(path) for path in artifact_paths)

In [4]:
# Bind the four objects returned by read_data() to notebook-level names.
x_train, y_train, x_valid, y_valid = read_data()

In [2]:
def model_lasso():
    """
    Build the search space and base estimator for Lasso regression.

    Returns:
        tuple: (param_dist, base_model) where
            - param_dist(dict): ``{'alpha': ndarray}`` with 1000 candidate
              values drawn uniformly from [0.01, 3)
            - base_model: unfitted ``Lasso`` estimator
    """
    # Draw the candidates from a locally seeded generator so the search space
    # is identical on every run (the global NumPy RNG is never seeded here).
    rng = np.random.default_rng(42)
    param_dist = {'alpha': rng.uniform(0.01, 3, 1000)}
    base_model = Lasso(random_state=42, selection='random')
    return param_dist, base_model


def model_rf():
    """
    Build the search space and base estimator for a random forest regressor.

    Returns:
        tuple: (param_dist, base_model) where
            - param_dist(dict): candidate values for ``n_estimators``
            - base_model: unfitted ``RandomForestRegressor`` estimator
    """
    param_dist = {"n_estimators": [100, 250, 500, 1000]}
    # Use the regressor: it is the class this notebook imports, and the
    # evaluation helpers compute regression metrics (MAE/MSE/RMSE/R2).
    base_model = RandomForestRegressor(random_state=0, n_jobs=-1)
    return param_dist, base_model


def model_svr():
    """
    Build the search space and base estimator for a linear support vector regressor.

    Returns:
        tuple: (param_dist, base_model) where
            - param_dist(dict): candidate values for the regularisation parameter ``C``
            - base_model: unfitted ``LinearSVR`` estimator
    """
    param_dist = {'C': [0.25, 0.5, 1, 1.25]}
    # dual=False (primal solver) is only supported together with the squared
    # epsilon-insensitive loss; with the default 'epsilon_insensitive' loss
    # LinearSVR raises ValueError at fit time.
    base_model = LinearSVR(loss='squared_epsilon_insensitive',
                           dual=False,
                           max_iter=10000)
    return param_dist, base_model

In [3]:
def random_search_cv(model, param, scoring, n_iter, x, y, verbosity=0):
    """
    Run a 5-fold randomized hyper-parameter search and return the fitted search.

    Args:
        - model: estimator to tune
        - param(dict): RandomizedSearchCV param_distributions
        - scoring(str): sklearn cross-val scoring scheme
        - n_iter(int): number of parameter settings to sample
        - x, y: data passed to ``fit``
        - verbosity(int): forwarded as RandomizedSearchCV ``verbose``

    Returns:
        The fitted RandomizedSearchCV object.
    """
    search_settings = dict(
        estimator=model,
        param_distributions=param,
        scoring=scoring,
        n_iter=n_iter,
        cv=5,
        random_state=0,
        verbose=verbosity,
    )
    searcher = RandomizedSearchCV(**search_settings)
    # fit() returns the search object itself.
    return searcher.fit(x, y)

In [4]:
def evaluate(true, predicted):
    """
    Compute regression metrics for a set of predictions.

    Args:
        - true: ground-truth target values
        - predicted: predicted target values

    Returns:
        tuple: (mae, mse, rmse, r2_square)
    """
    mae = metrics.mean_absolute_error(true, predicted)
    mse = metrics.mean_squared_error(true, predicted)
    # RMSE is derived from the MSE already computed instead of recomputing it.
    rmse = np.sqrt(mse)
    r2_square = metrics.r2_score(true, predicted)
    return mae, mse, rmse, r2_square

In [5]:
# NOTE(review): evaluate() is also defined in the preceding cell; this later
# definition shadows it. Keep only one copy.
def evaluate(true, predicted):
    """
    Compute regression metrics for a set of predictions.

    Args:
        - true: ground-truth target values
        - predicted: predicted target values

    Returns:
        tuple: (mae, mse, rmse, r2_square)
    """
    mae = metrics.mean_absolute_error(true, predicted)
    mse = metrics.mean_squared_error(true, predicted)
    # Reuse the MSE computed above rather than calling the metric a second time.
    rmse = np.sqrt(mse)
    r2_square = metrics.r2_score(true, predicted)
    return mae, mse, rmse, r2_square


def fit(x_train, y_train, model, model_param, general_params):
    """
    Tune and fit a model with randomized search, then print the best result.

    Args:
        - x_train: training features passed to the search
        - y_train: training target passed to the search
        - model: unfitted sklearn-style estimator to tune
        - model_param(dict): sklearn's RandomizedSearchCV param_distribution
        - general_params(dict): general parameters for the function
            - scoring(str) : sklearn cross-val scoring scheme
            - n_iter_search : RandomizedSearchCV number of iteration
            - verbosity(int, optional) : search verbosity, defaults to 0

    Returns:
        The fitted search object returned by ``random_search_cv``.
    """
    # 'verbosity' is optional: fall back to the same default that
    # random_search_cv uses instead of raising KeyError when it is absent.
    verbosity = general_params.get('verbosity', 0)

    model_fitted = random_search_cv(model, model_param,
                                    general_params['scoring'],
                                    general_params['n_iter_search'],
                                    x_train, y_train,
                                    verbosity)

    print(
        f'Model: {model_fitted.best_estimator_}, {general_params["scoring"]}: {model_fitted.best_score_}')

    return model_fitted


def validation_score(x_valid, y_valid, model_fitted):
    """
    Score a fitted model on the validation set.

    Args:
        - x_valid: validation features passed to ``model_fitted.predict``
        - y_valid: validation target compared against the predictions
        - model_fitted: fitted object exposing ``predict``

    Returns:
        dict: metrics keyed by 'mae', 'mse', 'rmse' and 'r2'.
    """
    predictions = model_fitted.predict(x_valid)
    metric_names = ('mae', 'mse', 'rmse', 'r2')
    # evaluate() returns its values in exactly this order.
    return dict(zip(metric_names, evaluate(y_valid, predictions)))