In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from numpyencoder import NumpyEncoder
import lightgbm as lgb
import json

In [None]:
# Load the raw table and the list of feature columns (one column name per line).
df = pd.read_csv('multi_final.csv')
with open('features_multy.txt', 'r') as file:
    # Drop only the line terminator (handles '\n' and '\r\n') and skip blank
    # lines, which would otherwise become an empty column name and raise KeyError.
    features = [name for name in (line.rstrip('\r\n') for line in file) if name]

# .copy() makes Vmax_df an independent frame, so adding a column below does not
# write into a slice of `df` (pandas SettingWithCopyWarning).
Vmax_df = df[features].copy()
Vmax_df['Vmax'] = df['Vmax']

# log10 is only defined for strictly positive values; a NaN also fails this
# comparison, so missing targets are reported here instead of during training.
if not (Vmax_df['Vmax'] > 0).all():
    raise ValueError("'Vmax' must be strictly positive and non-missing before taking log10")
Vmax_target = np.log10(Vmax_df['Vmax'])

# Fill missing feature values from the 5 nearest neighbours.
Vmax_features = Vmax_df.drop(['Vmax'], axis = 1)
imputer = KNNImputer(n_neighbors=5)
df_filled = imputer.fit_transform(Vmax_features)
# Re-attach column names and the original index so rows stay aligned with Vmax_target.
Vmax_features = pd.DataFrame(df_filled, columns=Vmax_features.columns, index=Vmax_features.index)

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.ensemble import HistGradientBoostingRegressor
import joblib
from sklearn.metrics import r2_score


def HistGradientBoostingRegressor_vae(features_, target_,
                                scaler = MinMaxScaler(),
                                use_scaler=True,
                                random_st = 20,
                                n_cv = 5,
                                shuffle_on = True,
                                objective_st='binary',
                                learning_rat=0.1,
                                n_boost_round=500
                                ):
    """Grid-search HistGradientBoostingRegressor hyper-parameters with K-fold CV.

    Parameters
    ----------
    features_ : array-like
        Feature matrix.
    target_ : array-like
        Regression target.
    scaler : transformer
        Fitted in place on ``features_`` when ``use_scaler`` is true.
    use_scaler : bool
        When false, the features are passed to the search unscaled.
    random_st : int
        Seed for the fold shuffling and for the estimator.
    n_cv : int
        Number of cross-validation folds.
    shuffle_on : bool
        Whether KFold shuffles the rows before splitting.
    objective_st, learning_rat, n_boost_round
        Unused; kept so existing callers keep working.

    Returns
    -------
    tuple
        ``(best_params_, best_score_)`` of the fitted GridSearchCV.
    """
    param_gr = {
        'learning_rate': [0.01, 0.08, 0.05, 0.1, 0.23, 0.34],
        # The original dict listed 'min_samples_leaf' twice; Python keeps only
        # the last value, so this is the grid that was effectively searched.
        'min_samples_leaf': [10, 20, 25],
        # HistGradientBoostingRegressor expects None (not -1) for "no depth limit".
        'max_depth': [4, 8, 12, None],
        'max_bins': [100, 200, 255],
    }

    # NOTE: the scaler is fitted on the full data set before cross-validation,
    # so validation folds are not strictly unseen by the scaler.
    if use_scaler:
        sc_feature = scaler.fit_transform(features_)
    else:
        sc_feature = features_

    # KFold rejects a random_state when shuffle is False, so only pass the seed
    # when it is actually used.
    folds = KFold(n_splits=n_cv, shuffle=shuffle_on,
                  random_state=random_st if shuffle_on else None)

    hgb_estimator = HistGradientBoostingRegressor(random_state=random_st)

    gsearch = GridSearchCV(estimator=hgb_estimator, param_grid=param_gr, cv=folds, verbose = 2)
    fitted_search = gsearch.fit(sc_feature, target_)

    return fitted_search.best_params_, fitted_search.best_score_


def HistGradientBoostingRegressor_Learning(features, target, target_name, best_grid,
                            scaler = MinMaxScaler(),
                            use_scaler=True,
                            random_st = 20,
                            size_of_test=0.2,
                            filename_for_wght = '.pkl',
                            filename_for_scaler = '.pkl'
                            ):
    """Train a HistGradientBoostingRegressor on a hold-out split and report R2.

    Parameters
    ----------
    features, target : array-like
        Full feature matrix and regression target; split internally.
    target_name : str
        Unused; kept for interface compatibility.
    best_grid : dict
        Keyword arguments for HistGradientBoostingRegressor.
    scaler : transformer
        Fitted in place on the training split when ``use_scaler`` is true.
    use_scaler : bool
        When false, the model is trained on unscaled data and no scaler is saved.
    random_st : int
        Seed for the split and, unless ``best_grid`` sets one, for the model.
    size_of_test : float
        Test fraction passed to train_test_split.
    filename_for_wght, filename_for_scaler : str
        Paths for the joblib dumps of the model and the scaler.

    Returns
    -------
    float
        R2 score on the test split (also printed).
    """
    if use_scaler and filename_for_scaler == filename_for_wght:
        import warnings
        warnings.warn("filename_for_scaler and filename_for_wght are identical; "
                      "the model dump will overwrite the scaler dump.")

    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = size_of_test, random_state = random_st)

    if use_scaler:
        # Fit on the training split only so the test split stays unseen.
        x_train = scaler.fit_transform(X_train)
        x_test = scaler.transform(X_test)
        joblib.dump(scaler, filename_for_scaler)
    else:
        x_train, x_test = X_train, X_test

    # Seed the model for reproducibility; an explicit value in best_grid wins.
    model_kwargs = {'random_state': random_st, **best_grid}
    hgb_model = HistGradientBoostingRegressor(**model_kwargs)
    hgb_model.fit(x_train, y_train)
    joblib.dump(hgb_model, filename_for_wght)

    # Score the reloaded artifact so the persisted file is what gets evaluated.
    hgb_model = joblib.load(filename_for_wght)
    y_pred = hgb_model.predict(x_test)
    score = r2_score(y_test, y_pred)
    print("R2__score")
    print(score)
    return score

In [None]:
def LGBMRegressor_grid_cv(features, target,
                                scaler = MinMaxScaler(),
                                use_scaler=True,
                                random_st = 20,
                                n_cv = 5,
                                shuffle_on = True,
                                n_boost_round=500
                                ):
    """Grid-search LGBMRegressor hyper-parameters with K-fold CV.

    Parameters
    ----------
    features, target : array-like
        Feature matrix and regression target.
    scaler : transformer
        Fitted in place on ``features`` when ``use_scaler`` is true.
    use_scaler : bool
        When false, the features are passed to the search unscaled.
    random_st : int
        Seed for the fold shuffling and for the estimator.
    n_cv : int
        Number of cross-validation folds.
    shuffle_on : bool
        Whether KFold shuffles the rows before splitting.
    n_boost_round : int
        Default ``n_estimators`` of the base estimator; every grid point
        overrides it with its own ``n_estimators`` value.

    Returns
    -------
    tuple
        ``(best_params_, best_score_)`` of the fitted GridSearchCV.
    """
    # 'class_weight' was removed from the grid: LightGBM documents it for
    # classification only, and "balanced" on a continuous target would weight
    # samples by how often their exact target value occurs.
    param_gr = {
        'num_leaves': [4, 8, 16, 20, 25, 30],
        'n_estimators': [50, 100, 200, 500, 700],
        'learning_rate': [0.01, 0.08, 0.05, 0.1, 0.23, 0.34],
        'max_depth': [4, 8, 12, -1],
        'reg_alpha': [0, 0.1, 1],
        'reg_lambda': [0, 0.1, 1],
    }

    # NOTE: the scaler is fitted on the full data set before cross-validation,
    # so validation folds are not strictly unseen by the scaler.
    if use_scaler:
        sc_feature = scaler.fit_transform(features)
    else:
        sc_feature = features

    # KFold rejects a random_state when shuffle is False.
    folds = KFold(n_splits=n_cv, shuffle=shuffle_on,
                  random_state=random_st if shuffle_on else None)

    # Do not pass num_boost_round here: it is an alias of n_estimators that
    # takes precedence inside LightGBM, which would make the n_estimators grid
    # a no-op (every fit would run n_boost_round iterations).
    lgb_estimator = lgb.LGBMRegressor(boosting_type='gbdt',
                                      n_estimators=n_boost_round,
                                      random_state=random_st)

    gsearch = GridSearchCV(estimator=lgb_estimator, param_grid=param_gr, cv=folds)
    fitted_search = gsearch.fit(sc_feature, target)

    return fitted_search.best_params_, fitted_search.best_score_

def LGBMRegressor_Learning(features, target, target_name, best_grid,
                            scaler = MinMaxScaler(),
                            use_scaler=True,
                            random_st = 20,
                            size_of_test=0.2,
                            filename_for_wght = '.pkl',
                            filename_for_scaler = '.pkl'
                            ):
    """Train an LGBMRegressor on a hold-out split and report R2.

    Parameters
    ----------
    features, target : array-like
        Full feature matrix and regression target; split internally.
    target_name : str
        Unused; kept for interface compatibility.
    best_grid : dict
        Keyword arguments for ``lgb.LGBMRegressor``.
    scaler : transformer
        Fitted in place on the training split when ``use_scaler`` is true.
    use_scaler : bool
        When false, the model is trained on unscaled data and no scaler is
        saved (previously the function silently did nothing in that case).
    random_st : int
        Seed for the train/test split.
    size_of_test : float
        Test fraction passed to train_test_split.
    filename_for_wght, filename_for_scaler : str
        Paths for the joblib dumps of the model and the scaler.

    Returns
    -------
    float
        R2 score on the test split (also printed).
    """
    if use_scaler and filename_for_scaler == filename_for_wght:
        import warnings
        warnings.warn("filename_for_scaler and filename_for_wght are identical; "
                      "the model dump will overwrite the scaler dump.")

    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = size_of_test, random_state = random_st)

    if use_scaler:
        # Fit on the training split only so the test split stays unseen.
        x_train = scaler.fit_transform(X_train)
        x_test = scaler.transform(X_test)
        joblib.dump(scaler, filename_for_scaler)
    else:
        x_train, x_test = X_train, X_test

    lgb_model = lgb.LGBMRegressor(**best_grid)
    lgb_model.fit(x_train, y_train)
    joblib.dump(lgb_model, filename_for_wght)

    # Score the reloaded artifact so the persisted file is what gets evaluated.
    lgb_model = joblib.load(filename_for_wght)
    y_pred = lgb_model.predict(x_test)
    score = r2_score(y_test, y_pred)
    print("R2__score")
    print(score)
    return score

In [None]:
def RandomForestRegressor_grid_cv(features, target,
                                scaler = MinMaxScaler(),
                                use_scaler=True,
                                random_st = 20,
                                n_cv = 5,
                                shuffle_on = True,
                                learning_rat=0.1,
                                ):
    """Grid-search RandomForestRegressor hyper-parameters with K-fold CV.

    Parameters
    ----------
    features, target : array-like
        Feature matrix and regression target.
    scaler : transformer
        Fitted in place on ``features`` when ``use_scaler`` is true.
    use_scaler : bool
        When false, the search runs on unscaled features (previously the
        function returned None in that case, breaking tuple unpacking).
    random_st : int
        Seed for the fold shuffling and for the forest.
    n_cv : int
        Number of cross-validation folds.
    shuffle_on : bool
        Whether KFold shuffles the rows before splitting.
    learning_rat : float
        Unused; kept for interface compatibility.

    Returns
    -------
    tuple
        ``(best_params_, best_score_)`` of the fitted GridSearchCV.
    """
    param_gr = {
        "n_estimators": list(range(100, 701, 100)),
        "min_samples_split": [2, 5, 10, 20, 30, 50],
        "min_samples_leaf": [2, 5, 10, 20, 30, 50],
        "max_depth": [5, 10, 16, 32, 64, 80, 100],
        "max_features": ["sqrt", "log2", None],
    }

    # NOTE: the scaler is fitted on the full data set before cross-validation,
    # so validation folds are not strictly unseen by the scaler.
    if use_scaler:
        sc_feature = scaler.fit_transform(features)
    else:
        sc_feature = features

    # KFold rejects a random_state when shuffle is False.
    folds = KFold(n_splits=n_cv, shuffle=shuffle_on,
                  random_state=random_st if shuffle_on else None)

    # Seed the forest so grid points are compared on equal footing across runs.
    rf_estimator = RandomForestRegressor(random_state=random_st)

    gsearch = GridSearchCV(estimator=rf_estimator, param_grid=param_gr, cv=folds, verbose = 2)
    fitted_search = gsearch.fit(sc_feature, target)

    return fitted_search.best_params_, fitted_search.best_score_

def RandomForestRegressor_Learning(features, target, target_name, best_grid,
                            scaler = MinMaxScaler(),
                            use_scaler=True,
                            random_st = 20,
                            size_of_test=0.2,
                            filename_for_wght = '.pkl',
                            filename_for_scaler = '.pkl'
                            ):
    """Train a RandomForestRegressor on a hold-out split and report R2.

    Parameters
    ----------
    features, target : array-like
        Full feature matrix and regression target; split internally.
    target_name : str
        Unused; kept for interface compatibility.
    best_grid : dict
        Keyword arguments for RandomForestRegressor.
    scaler : transformer
        Fitted in place on the training split when ``use_scaler`` is true.
    use_scaler : bool
        When false, the model is trained on unscaled data and no scaler is
        saved (previously the function silently did nothing in that case).
    random_st : int
        Seed for the split and, unless ``best_grid`` sets one, for the model.
    size_of_test : float
        Test fraction passed to train_test_split.
    filename_for_wght, filename_for_scaler : str
        Paths for the joblib dumps of the model and the scaler.

    Returns
    -------
    float
        R2 score on the test split (also printed).
    """
    if use_scaler and filename_for_scaler == filename_for_wght:
        import warnings
        warnings.warn("filename_for_scaler and filename_for_wght are identical; "
                      "the model dump will overwrite the scaler dump.")

    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = size_of_test, random_state = random_st)

    if use_scaler:
        # Fit on the training split only so the test split stays unseen.
        x_train = scaler.fit_transform(X_train)
        x_test = scaler.transform(X_test)
        joblib.dump(scaler, filename_for_scaler)
    else:
        x_train, x_test = X_train, X_test

    # Seed the model for reproducibility; an explicit value in best_grid wins.
    model_kwargs = {'random_state': random_st, **best_grid}
    model = RandomForestRegressor(**model_kwargs)
    model.fit(x_train, y_train)
    joblib.dump(model, filename_for_wght)

    # Score the reloaded artifact so the persisted file is what gets evaluated.
    model = joblib.load(filename_for_wght)
    y_pred = model.predict(x_test)
    score = r2_score(y_test, y_pred)
    print("R2__score")
    print(score)
    return score

In [None]:
def ExtraTreesRegressor_grid_cv(features, target,
                                scaler = MinMaxScaler(),
                                use_scaler=True,
                                random_st = 20,
                                n_cv = 5,
                                shuffle_on = True,
                                learning_rat=0.1,
                                ):
    """Grid-search ExtraTreesRegressor hyper-parameters with K-fold CV.

    Parameters
    ----------
    features, target : array-like
        Feature matrix and regression target.
    scaler : transformer
        Fitted in place on ``features`` when ``use_scaler`` is true.
    use_scaler : bool
        When false, the search runs on unscaled features (previously the
        function returned None in that case, breaking tuple unpacking).
    random_st : int
        Seed for the fold shuffling and for the estimator.
    n_cv : int
        Number of cross-validation folds.
    shuffle_on : bool
        Whether KFold shuffles the rows before splitting.
    learning_rat : float
        Unused; kept for interface compatibility.

    Returns
    -------
    tuple
        ``(best_params_, best_score_)`` of the fitted GridSearchCV.
    """
    param_gr = {
        "n_estimators": list(range(100, 701, 100)),
        "min_samples_split": [2, 5, 10, 20, 30, 50],
        "min_samples_leaf": [2, 5, 10, 20, 30, 50],
        "max_depth": [5, 10, 16, 32, 64, 80, 100],
        "max_features": ["sqrt", "log2", None],
        "ccp_alpha": [0, 0.01, 0.1],
    }

    # NOTE: the scaler is fitted on the full data set before cross-validation,
    # so validation folds are not strictly unseen by the scaler.
    if use_scaler:
        sc_feature = scaler.fit_transform(features)
    else:
        sc_feature = features

    # KFold rejects a random_state when shuffle is False.
    folds = KFold(n_splits=n_cv, shuffle=shuffle_on,
                  random_state=random_st if shuffle_on else None)

    # Seed the estimator so grid points are compared on equal footing across runs.
    et_estimator = ExtraTreesRegressor(random_state=random_st)

    gsearch = GridSearchCV(estimator=et_estimator, param_grid=param_gr, cv=folds, verbose = 2)
    fitted_search = gsearch.fit(sc_feature, target)

    return fitted_search.best_params_, fitted_search.best_score_

def ExtraTreesRegressor_Learning(features, target, target_name, best_grid,
                            scaler = MinMaxScaler(),
                            use_scaler=True,
                            random_st = 20,
                            size_of_test=0.2,
                            filename_for_wght = '.pkl',
                            filename_for_scaler = '.pkl'
                            ):
    """Train an ExtraTreesRegressor on a hold-out split and report R2.

    Parameters
    ----------
    features, target : array-like
        Full feature matrix and regression target; split internally.
    target_name : str
        Unused; kept for interface compatibility.
    best_grid : dict
        Keyword arguments for ExtraTreesRegressor.
    scaler : transformer
        Fitted in place on the training split when ``use_scaler`` is true.
    use_scaler : bool
        When false, the model is trained on unscaled data and no scaler is
        saved (previously the function silently did nothing in that case).
    random_st : int
        Seed for the split and, unless ``best_grid`` sets one, for the model.
    size_of_test : float
        Test fraction passed to train_test_split.
    filename_for_wght, filename_for_scaler : str
        Paths for the joblib dumps of the model and the scaler.

    Returns
    -------
    float
        R2 score on the test split (also printed).
    """
    if use_scaler and filename_for_scaler == filename_for_wght:
        import warnings
        warnings.warn("filename_for_scaler and filename_for_wght are identical; "
                      "the model dump will overwrite the scaler dump.")

    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = size_of_test, random_state = random_st)

    if use_scaler:
        # Fit on the training split only so the test split stays unseen.
        x_train = scaler.fit_transform(X_train)
        x_test = scaler.transform(X_test)
        joblib.dump(scaler, filename_for_scaler)
    else:
        x_train, x_test = X_train, X_test

    # Seed the model for reproducibility; an explicit value in best_grid wins.
    model_kwargs = {'random_state': random_st, **best_grid}
    model = ExtraTreesRegressor(**model_kwargs)
    model.fit(x_train, y_train)
    joblib.dump(model, filename_for_wght)

    # Score the reloaded artifact so the persisted file is what gets evaluated.
    model = joblib.load(filename_for_wght)
    y_pred = model.predict(x_test)
    score = r2_score(y_test, y_pred)
    print("R2__score")
    print(score)
    return score

In [None]:
def CatBoostRegressor_grid_cv(features, target,
                                scaler = MinMaxScaler(),
                                random_st = 20,
                                n_cv = 5,
                                shuffle_on = True,
                                learning_rat=0.1,
                                ):
    """Grid-search CatBoostRegressor hyper-parameters with K-fold CV.

    Parameters
    ----------
    features, target : array-like
        Feature matrix and regression target.
    scaler : transformer
        Fitted in place on ``features`` before the search.
    random_st : int
        Seed for the fold shuffling and for the estimator.
    n_cv : int
        Number of cross-validation folds.
    shuffle_on : bool
        Whether KFold shuffles the rows before splitting.
    learning_rat : float
        Unused; kept for interface compatibility.

    Returns
    -------
    tuple
        ``(best_params_, best_score_)`` of the fitted GridSearchCV.
    """
    param_gr = {
        'depth'         : [3, 6, 8, 10, 12, 15],
        'learning_rate' : [0.01, 0.05, 0.1, 0.5, 1],
        'iterations'    : [30, 50, 100, 200, 500],
        'l2_leaf_reg': [1, 3, 5, 7, 9, 13, 15],
    }

    # NOTE: the scaler is fitted on the full data set before cross-validation,
    # so validation folds are not strictly unseen by the scaler.
    sc_feature = scaler.fit_transform(features)

    # KFold rejects a random_state when shuffle is False.
    folds = KFold(n_splits=n_cv, shuffle=shuffle_on,
                  random_state=random_st if shuffle_on else None)

    # Seed the estimator so repeated searches are reproducible.
    catboost_estimator = CatBoostRegressor(random_seed=random_st)

    gsearch = GridSearchCV(estimator=catboost_estimator, param_grid=param_gr, cv=folds, verbose = 2)
    fitted_search = gsearch.fit(sc_feature, target)

    return fitted_search.best_params_, fitted_search.best_score_

def CatBoostRegressor_Learning(features, target, target_name, best_grid,
                            scaler = MinMaxScaler(),
                            use_scaler=True,
                            random_st = 20,
                            size_of_test=0.2,
                            filename_for_wght = '.pkl',
                            filename_for_scaler = '.pkl'
                            ):
    """Train a CatBoostRegressor on a hold-out split and report R2.

    Parameters
    ----------
    features, target : array-like
        Full feature matrix and regression target; split internally.
    target_name : str
        Unused; kept for interface compatibility.
    best_grid : dict
        Keyword arguments for CatBoostRegressor.
    scaler : transformer
        Fitted in place on the training split when ``use_scaler`` is true.
    use_scaler : bool
        When false, the model is trained on unscaled data and no scaler is
        saved (previously the function silently did nothing in that case).
    random_st : int
        Seed for the split and, unless ``best_grid`` sets one, for the model.
    size_of_test : float
        Test fraction passed to train_test_split.
    filename_for_wght, filename_for_scaler : str
        Paths for the joblib dumps of the model and the scaler.

    Returns
    -------
    float
        R2 score on the test split (also printed).
    """
    if use_scaler and filename_for_scaler == filename_for_wght:
        import warnings
        warnings.warn("filename_for_scaler and filename_for_wght are identical; "
                      "the model dump will overwrite the scaler dump.")

    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = size_of_test, random_state = random_st)

    if use_scaler:
        # Fit on the training split only so the test split stays unseen.
        x_train = scaler.fit_transform(X_train)
        x_test = scaler.transform(X_test)
        joblib.dump(scaler, filename_for_scaler)
    else:
        x_train, x_test = X_train, X_test

    # Seed the model unless the caller already chose a seed under either name;
    # passing both spellings at once is avoided on purpose.
    model_kwargs = dict(best_grid)
    if 'random_seed' not in model_kwargs and 'random_state' not in model_kwargs:
        model_kwargs['random_seed'] = random_st
    model = CatBoostRegressor(**model_kwargs)
    model.fit(x_train, y_train)
    joblib.dump(model, filename_for_wght)

    # Score the reloaded artifact so the persisted file is what gets evaluated.
    model = joblib.load(filename_for_wght)
    y_pred = model.predict(x_test)
    score = r2_score(y_test, y_pred)
    print("R2__score")
    print(score)
    return score

In [None]:
def AdaBoostRegressor_grid_cv(features, target,
                                scaler = MinMaxScaler(),
                                random_st = 20,
                                n_cv = 5,
                                shuffle_on = True,
                                learning_rat=0.1,
                                ):
    """Grid-search AdaBoostRegressor hyper-parameters with K-fold CV.

    Parameters
    ----------
    features, target : array-like
        Feature matrix and regression target.
    scaler : transformer
        Fitted in place on ``features`` before the search.
    random_st : int
        Seed for the fold shuffling and for the estimator.
    n_cv : int
        Number of cross-validation folds.
    shuffle_on : bool
        Whether KFold shuffles the rows before splitting.
    learning_rat : float
        Unused; kept for interface compatibility.

    Returns
    -------
    tuple
        ``(best_params_, best_score_)`` of the fitted GridSearchCV.
    """
    param_gr = {
        'n_estimators': [10, 30, 50, 70, 100, 300, 500],
        'learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1],
        'loss': ['linear', 'square', 'exponential'],
    }

    # NOTE: the scaler is fitted on the full data set before cross-validation,
    # so validation folds are not strictly unseen by the scaler.
    sc_feature = scaler.fit_transform(features)

    # KFold rejects a random_state when shuffle is False.
    folds = KFold(n_splits=n_cv, shuffle=shuffle_on,
                  random_state=random_st if shuffle_on else None)

    # Seed the estimator so grid points are compared on equal footing across runs.
    adaboost_estimator = AdaBoostRegressor(random_state=random_st)

    gsearch = GridSearchCV(estimator=adaboost_estimator, param_grid=param_gr, cv=folds, verbose = 2)
    fitted_search = gsearch.fit(sc_feature, target)

    return fitted_search.best_params_, fitted_search.best_score_

def AdaBoostRegressor_Learning(features, target, target_name, best_grid,
                            scaler = MinMaxScaler(),
                            use_scaler=True,
                            random_st = 20,
                            size_of_test=0.2,
                            filename_for_wght = '.pkl',
                            filename_for_scaler = '.pkl'
                            ):
    """Train an AdaBoostRegressor on a hold-out split and report R2.

    Parameters
    ----------
    features, target : array-like
        Full feature matrix and regression target; split internally.
    target_name : str
        Unused; kept for interface compatibility.
    best_grid : dict
        Keyword arguments for AdaBoostRegressor.
    scaler : transformer
        Fitted in place on the training split when ``use_scaler`` is true.
    use_scaler : bool
        When false, the model is trained on unscaled data and no scaler is
        saved (previously the function silently did nothing in that case).
    random_st : int
        Seed for the split and, unless ``best_grid`` sets one, for the model.
    size_of_test : float
        Test fraction passed to train_test_split.
    filename_for_wght, filename_for_scaler : str
        Paths for the joblib dumps of the model and the scaler.

    Returns
    -------
    float
        R2 score on the test split (also printed).
    """
    if use_scaler and filename_for_scaler == filename_for_wght:
        import warnings
        warnings.warn("filename_for_scaler and filename_for_wght are identical; "
                      "the model dump will overwrite the scaler dump.")

    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = size_of_test, random_state = random_st)

    if use_scaler:
        # Fit on the training split only so the test split stays unseen.
        x_train = scaler.fit_transform(X_train)
        x_test = scaler.transform(X_test)
        joblib.dump(scaler, filename_for_scaler)
    else:
        x_train, x_test = X_train, X_test

    # Seed the model for reproducibility; an explicit value in best_grid wins.
    model_kwargs = {'random_state': random_st, **best_grid}
    model = AdaBoostRegressor(**model_kwargs)
    model.fit(x_train, y_train)
    joblib.dump(model, filename_for_wght)

    # Score the reloaded artifact so the persisted file is what gets evaluated.
    model = joblib.load(filename_for_wght)
    y_pred = model.predict(x_test)
    score = r2_score(y_test, y_pred)
    print("R2__score")
    print(score)
    return score

In [None]:
# Search HistGradientBoostingRegressor hyper-parameters for the log10(Vmax) target.
params_hgb, score_hgb = HistGradientBoostingRegressor_vae(
    features_=Vmax_features,
    target_=Vmax_target,
)

# Persist the selected hyper-parameters as JSON, using NumpyEncoder as the encoder class.
with open("Vmax_params_hgb_multi.json", 'w', encoding="utf8") as fp:
    json.dump(params_hgb, fp, ensure_ascii=False, indent=4, cls=NumpyEncoder)

# Fit with the selected hyper-parameters, save model and scaler, print the hold-out R2.
HistGradientBoostingRegressor_Learning(
    Vmax_features,
    Vmax_target,
    'Vmax',
    params_hgb,
    random_st=20,
    filename_for_wght='Vmax_model_multi_hgb.pkl',
    filename_for_scaler='Vmax_scaler_multi_hgb.pkl',
)

In [None]:
# Search LGBMRegressor hyper-parameters for the log10(Vmax) target.
params_lgb, score_lgb = LGBMRegressor_grid_cv(
    Vmax_features,
    Vmax_target,
)

# Persist the selected hyper-parameters as JSON, using NumpyEncoder as the encoder class.
with open("Vmax_params_lgb_multi.json", 'w', encoding="utf8") as fp:
    json.dump(params_lgb, fp, ensure_ascii=False, indent=4, cls=NumpyEncoder)

# Fit with the selected hyper-parameters, save model and scaler, print the hold-out R2.
LGBMRegressor_Learning(
    Vmax_features,
    Vmax_target,
    'Vmax',
    params_lgb,
    random_st=20,
    filename_for_wght='Vmax_model_multi_lgb.pkl',
    filename_for_scaler='Vmax_scaler_multi_lgb.pkl',
)

In [None]:
# Search RandomForestRegressor hyper-parameters for the log10(Vmax) target.
params_rfr, score_rfr = RandomForestRegressor_grid_cv(
    Vmax_features,
    Vmax_target,
)

# Persist the selected hyper-parameters as JSON, using NumpyEncoder as the encoder class.
with open("Vmax_params_rfr_multi.json", 'w', encoding="utf8") as fp:
    json.dump(params_rfr, fp, ensure_ascii=False, indent=4, cls=NumpyEncoder)

# Fit with the selected hyper-parameters, save model and scaler, print the hold-out R2.
RandomForestRegressor_Learning(
    Vmax_features,
    Vmax_target,
    'Vmax',
    params_rfr,
    random_st=20,
    filename_for_wght='Vmax_model_multi_rfr.pkl',
    filename_for_scaler='Vmax_scaler_multi_rfr.pkl',
)

In [None]:
# Search ExtraTreesRegressor hyper-parameters for the log10(Vmax) target.
params_etr, score_etr = ExtraTreesRegressor_grid_cv(
    Vmax_features,
    Vmax_target,
)

# Persist the selected hyper-parameters as JSON, using NumpyEncoder as the encoder class.
with open("Vmax_params_etr_multi.json", 'w', encoding="utf8") as fp:
    json.dump(params_etr, fp, ensure_ascii=False, indent=4, cls=NumpyEncoder)

# Fit with the selected hyper-parameters, save model and scaler, print the hold-out R2.
ExtraTreesRegressor_Learning(
    Vmax_features,
    Vmax_target,
    'Vmax',
    params_etr,
    random_st=20,
    filename_for_wght='Vmax_model_multi_etr.pkl',
    filename_for_scaler='Vmax_scaler_multi_etr.pkl',
)

In [None]:
# Search CatBoostRegressor hyper-parameters for the log10(Vmax) target.
params_cbr, score_cbr = CatBoostRegressor_grid_cv(
    Vmax_features,
    Vmax_target,
)

# Persist the selected hyper-parameters as JSON, using NumpyEncoder as the encoder class.
with open("Vmax_params_cbr_multi.json", 'w', encoding="utf8") as fp:
    json.dump(params_cbr, fp, ensure_ascii=False, indent=4, cls=NumpyEncoder)

# Fit with the selected hyper-parameters, save model and scaler, print the hold-out R2.
CatBoostRegressor_Learning(
    Vmax_features,
    Vmax_target,
    'Vmax',
    params_cbr,
    random_st=20,
    filename_for_wght='Vmax_model_multi_cbr.pkl',
    filename_for_scaler='Vmax_scaler_multi_cbr.pkl',
)

In [None]:
# Search AdaBoostRegressor hyper-parameters for the log10(Vmax) target.
params_abr, score_abr = AdaBoostRegressor_grid_cv(
    Vmax_features,
    Vmax_target,
)

# Persist the selected hyper-parameters as JSON, using NumpyEncoder as the encoder class.
with open("Vmax_params_abr_multi.json", 'w', encoding="utf8") as fp:
    json.dump(params_abr, fp, ensure_ascii=False, indent=4, cls=NumpyEncoder)

# Fit with the selected hyper-parameters, save model and scaler, print the hold-out R2.
AdaBoostRegressor_Learning(
    Vmax_features,
    Vmax_target,
    'Vmax',
    params_abr,
    random_st=20,
    filename_for_wght='Vmax_model_multi_abr.pkl',
    filename_for_scaler='Vmax_scaler_multi_abr.pkl',
)