In [7]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import StackingRegressor
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import model_selection
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import train_test_split
import re
from sklearn.linear_model import ElasticNet
import pickle
import warnings
warnings.filterwarnings('ignore')

In [8]:
reduced_df = pd.read_csv('data/fs_2.csv')
df_t = pd.read_csv('data/df_transform.csv')

In [9]:
X_train, X_test, y_train, y_test = train_test_split(reduced_df, 
                                                    df_t['site_eui'], 
                                                    test_size=0.2, 
                                                    random_state=42)

In [11]:
reg0 = ElasticNet(random_state=42)
reg1 = XGBRegressor(random_state=42)
reg2 = SVR()
reg3 = GradientBoostingRegressor(random_state=42)
#reg4 = CatBoostRegressor(random_state=42)
reg5 = LGBMRegressor(random_state=42)
#reg6 = get_stacking()

# Initiaze the hyperparameters for each dictionary

param0 = {}
param0['regressor__max_iter'] = [1,5,10,20,50,100,200,500]
param0['regressor__alpha'] = np.logspace(-5, 5, 100, endpoint=True)
param0['regressor__l1_ratio'] = np.arange(0.0, 1.0, 0.1)
param0['regressor'] = [reg0]

param1 = {}
param1['regressor__n_estimators'] = [50, 100, 250]
param1['regressor__max_depth'] = [3, 6, 10]
param1['regressor__learning_rate'] = [0.01, 0.05, 0.1]
param1['regressor__colsample_bytree']: [0.3, 0.7]
param1['regressor'] = [reg1]

param2 = {}
param2['regressor__kernel'] = ["rbf"]
param2['regressor__C'] = [0.1, 1, 10, 100, 1000]
param2['regressor'] = [reg2]

param3 = {}
param3['regressor__max_depth'] = [3,4,5]
param3['regressor__n_estimators'] = [100, 200, 300]
param3['regressor__learning_rate'] = [0.01, 0.05, 0.1]
param3['regressor'] = [reg3]

# param4 = {}
# param4['regressor__n_estimators'] = [100,200,500]
# param4['regressor__learning_rate'] = [.001,0.01,.1]
# param4['regressor__max_depth'] = [1,2,4]
# param4['regressor'] = [reg4]

param5 = {}
param5['regressor__num_leaves'] = [200,300,500, 800]
param5['regressor__learning_rate'] = [.001,0.01,.1, 0.05]
param5['regressor__feature_fraction'] = [0.3,0.6,0.9, 1]
param5['regressor__bagging_freq'] = [30, 50,70,90, 100]
param5['regressor'] = [reg5]

params = [param1, param2, param3, param5]

def tune(reg, param):
    pipeline = Pipeline([('regressor', reg)])
    rs = RandomizedSearchCV(pipeline, param, cv=5, scoring='neg_mean_squared_error')
    rs.fit(X_train, y_train)
    return rs.best_params_, rs.best_score_

In [12]:
tune(reg0, param0)

({'regressor__max_iter': 20,
  'regressor__l1_ratio': 0.8,
  'regressor__alpha': 0.0005214008287999684,
  'regressor': ElasticNet(alpha=0.0005214008287999684, l1_ratio=0.8, max_iter=20,
             random_state=42)},
 -690.6822321628223)

In [13]:
tune(reg1, param1)

({'regressor__n_estimators': 250,
  'regressor__max_depth': 6,
  'regressor__learning_rate': 0.05,
  'regressor': XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
               colsample_bynode=None, colsample_bytree=None, gamma=None,
               gpu_id=None, importance_type='gain', interaction_constraints=None,
               learning_rate=0.05, max_delta_step=None, max_depth=6,
               min_child_weight=None, missing=nan, monotone_constraints=None,
               n_estimators=250, n_jobs=None, num_parallel_tree=None,
               random_state=42, reg_alpha=None, reg_lambda=None,
               scale_pos_weight=None, subsample=None, tree_method=None,
               validate_parameters=None, verbosity=None)},
 -568.5241628185347)

In [14]:
tune(reg2, param2)

({'regressor__kernel': 'rbf', 'regressor__C': 1000, 'regressor': SVR(C=1000)},
 -579.403972375451)

In [15]:
tune(reg3, param3)

({'regressor__n_estimators': 300,
  'regressor__max_depth': 5,
  'regressor__learning_rate': 0.1,
  'regressor': GradientBoostingRegressor(max_depth=5, n_estimators=300, random_state=42)},
 -572.5493533503784)

In [16]:
tune(reg5, param5)



({'regressor__num_leaves': 200,
  'regressor__learning_rate': 0.05,
  'regressor__feature_fraction': 1,
  'regressor__bagging_freq': 90,
  'regressor': LGBMRegressor(bagging_freq=90, feature_fraction=1, learning_rate=0.05,
                num_leaves=200, random_state=42)},
 -562.9816255350181)

In [36]:
pd.concat([X_train, y_train], axis=1).to_csv('data/train_model_input_3.csv', index = False)

In [37]:
pd.concat([X_test, y_test], axis=1).to_csv('data/test_model_input_3.csv', index = False)