In [None]:
def evaluate_model(df, target, standardize, thr_bins = [-np.inf, 0.01, 0.52, 6.57, 1090.84], model_name='ridgeRegression', skew=False, plot = False):
    """Cross-validate a named regressor with folds stratified on the binned target.

    Parameters:
    - df: DataFrame, forwarded to preproc_df.
    - target: str, target variable, forwarded to preproc_df.
    - standardize: bool, forwarded to preproc_df.
    - thr_bins: list, bin edges used to discretise the target for stratification.
      Read-only here (the shared default list is never mutated).
    - model_name: str, one of 'randomForest', 'ridgeRegression', 'MLPRegressor'.
      Each estimator is built with its default hyperparameters.
    - skew: bool, forwarded to preproc_df.
    - plot: bool, if True draw a learning curve instead of plain cross-validation.

    Returns:
    (plot=False):
        - array of per-fold RMSE scores.
    (plot=True):
        - array of training RMSE scores, one row per training-set size and one
          column per fold (the learning curve itself is shown as a side effect).

    Raises:
    - ValueError: if model_name is not recognised, or if any target value falls
      outside thr_bins (it would have no stratum).
    """
    print("model: ", model_name)

    # Resolve the estimator before any preprocessing so that a misspelled name
    # fails fast with a clear message instead of an UnboundLocalError later on.
    if model_name == 'randomForest':
        model = RandomForestRegressor()
    elif model_name == 'ridgeRegression':
        model = Ridge()
    elif model_name == 'MLPRegressor':
        model = MLPRegressor()
    else:
        raise ValueError(
            "Unknown model_name %r; expected 'randomForest', 'ridgeRegression' "
            "or 'MLPRegressor'." % (model_name,)
        )

    # preprocessing pipeline, no shuffling
    X, y = preproc_df(df, target, standardize, skew)

    # Bin the continuous target so it can be used as class labels for
    # stratified sampling. One label per interval: [1, 2, 3, 4] for the
    # default five edges, and still valid for any other number of edges.
    strata_labels = list(range(1, len(thr_bins)))
    bins = pd.cut(y, bins=thr_bins, labels=strata_labels)
    if pd.isna(bins).any():
        # pd.cut yields NaN for values outside the outer edges (or NaN targets);
        # such rows cannot be assigned to a stratum.
        raise ValueError(
            "Some target values fall outside thr_bins (or are NaN) and cannot "
            "be assigned to a stratum; widen thr_bins or clean the target."
        )

    # StratifiedKFold is meant for classification targets, but .split(X, labels)
    # only needs labels to stratify on, so the binned target works. The splits
    # are materialised because the generator returned by .split() is one-shot.
    strat_kf = list(
        StratifiedKFold(n_splits=5, shuffle=True, random_state=35007).split(X, bins)
    )
    # what about hyperparameter tuning within each fold?

    if not plot:
        scores = cross_val_score(model, X, y, cv=strat_kf, scoring='neg_mean_squared_error')
        # 'neg_mean_squared_error' returns -MSE, so negate before the square root.
        rmse_scores = np.sqrt(-scores)
        # model should be returned too ! for testing
        return rmse_scores

    # Scores for 10% to 100% of each training split, in 10% steps.
    train_sizes, train_scores, test_scores = learning_curve(
        model, X, y, cv=strat_kf, scoring='neg_mean_squared_error',
        train_sizes=np.linspace(0.1, 1.0, 10)
    )

    # Convert -MSE to RMSE per fold, then average across folds (axis=1).
    train_rmse = np.sqrt(-train_scores)
    train_scores_mean = np.mean(train_rmse, axis=1)
    test_scores_mean = np.mean(np.sqrt(-test_scores), axis=1)

    plt.figure()
    plt.title("Learning Curve")
    plt.xlabel("Size of Training Set")
    plt.ylabel("Mean RMSE Score")
    plt.grid()

    # x-axis: actual size of the training set; y-axis: mean RMSE
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")

    plt.legend(loc="best")
    plt.show()
    return train_rmse


In [None]:
def get_model(model_name, ridge_alpha=1.0, ridge_fit_intercept=True):
    """Build an unfitted regressor for the given model name.

    Parameters:
    - model_name: str, one of 'randomForest', 'ridgeRegression', 'MLPRegressor'.
    - ridge_alpha: passed as `alpha` to Ridge; ignored for the other models.
    - ridge_fit_intercept: passed as `fit_intercept` to Ridge; ignored for the
      other models.

    Returns:
    - a new estimator instance; the random forest and MLP use their defaults.

    Raises:
    - ValueError: if model_name is not one of the supported names.
    """
    if model_name == 'randomForest':
        return RandomForestRegressor()
    if model_name == 'ridgeRegression':
        print('ridgeRegression')
        return Ridge(alpha=ridge_alpha, fit_intercept=ridge_fit_intercept)
    if model_name == 'MLPRegressor':
        return MLPRegressor()
    # Falling through used to return None implicitly, which only surfaced later
    # as an obscure failure inside scikit-learn; fail loudly here instead.
    raise ValueError(
        "Unknown model_name %r; expected 'randomForest', 'ridgeRegression' "
        "or 'MLPRegressor'." % (model_name,)
    )

In [None]:
def evaluate_model2(df, target, model, standardize, thr_bins = [-np.inf, 0.01, 0.52, 6.57, 1090.84], bin_label=[1,2,3,4], kfolds=5, skew=False, plot = False):
    """
    Cross-validate a regressor with folds stratified on the binned target.

    Parameters:
    - df: DataFrame
    - target: str, target variable.
    - model: scikit-learn model (regression)
    - standardize: bool, standardize (from preproc_df).
    - thr_bins: list, thresholds for binning target variable.
    - bin_label: list, one label per interval defined by thr_bins
      (so len(bin_label) == len(thr_bins) - 1).
    - kfolds: int, number of stratified folds.
    - skew: bool, apply skewness correction for RH (from preproc_df, if standardize=False).
    - plot: bool

    Returns:
    (plot=False):
        - RMSE scores, one per fold
    (plot=True):
        - learning curve (shown as a side effect)
        - train_scores RMSE scores from cross-validation, one row per
          training-set size and one column per fold

    Raises:
    - ValueError: if any target value falls outside thr_bins (it would have no
      stratum).
    """

    # preprocessing pipeline, no shuffling
    X, y = preproc_df(df, target, standardize, skew)

    # Bin the continuous target so it can be used as class labels for
    # stratified sampling.
    bins = pd.cut(y, bins=thr_bins, labels=bin_label)
    if pd.isna(bins).any():
        # pd.cut yields NaN for values outside the outer edges (or NaN targets);
        # such rows cannot be assigned to a stratum.
        raise ValueError(
            "Some target values fall outside thr_bins (or are NaN) and cannot "
            "be assigned to a stratum; widen thr_bins or clean the target."
        )

    # StratifiedKFold is meant for classification targets, but .split(X, labels)
    # only needs labels to stratify on, so the binned target works. The splits
    # are materialised because the generator returned by .split() is one-shot.
    strat_kf = list(
        StratifiedKFold(n_splits=kfolds, shuffle=True, random_state=35007).split(X, bins)
    )
    # what about hyperparameter tuning within each fold?

    if not plot:
        scores = cross_val_score(model, X, y, cv=strat_kf, scoring='neg_mean_squared_error')
        # 'neg_mean_squared_error' returns -MSE, so negate before the square root.
        rmse_scores = np.sqrt(-scores)
        # model should be returned too ! for testing
        return rmse_scores

    # Scores for 10% to 100% of each training split, in 10% steps.
    train_sizes, train_scores, test_scores = learning_curve(
        model, X, y, cv=strat_kf, scoring='neg_mean_squared_error',
        train_sizes=np.linspace(0.1, 1.0, 10)
    )

    # Convert -MSE to RMSE per fold, then average across folds (axis=1).
    train_rmse = np.sqrt(-train_scores)
    train_scores_mean = np.mean(train_rmse, axis=1)
    test_scores_mean = np.mean(np.sqrt(-test_scores), axis=1)

    plt.figure()
    plt.title("Learning Curve")
    plt.xlabel("Size of Training Set")
    plt.ylabel("Mean RMSE Score")
    plt.grid()

    # x-axis: actual size of the training set; y-axis: mean RMSE
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")

    plt.legend(loc="best")
    plt.show()
    return train_rmse
