In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_percentage_error

import numpy as np
import pandas as pd

import time
from tqdm import tqdm
import random
import pickle
import os

import gurobipy as gp
from gurobipy import GRB

from helpers.rf import *

In [None]:
# n sizes the grid-search ranges used later in the notebook
# (max_features runs up to n, min_samples_leaf up to 20*n);
# presumably the number of input features -- TODO confirm against the data set.
n = 4
# number of repetitions of the experiment (one train/test split per repetition)
n_samples = 2

# Seed both global RNGs, then draw one integer seed in [1, 1000) per repetition.
global_seed = 777
random.seed(global_seed)
np.random.seed(global_seed)
seeds = np.random.randint(1, 1000, size=(1, n_samples)).ravel()

In [None]:
# Per-repetition result accumulators, one independent empty list each:
#   times        - solution times
#   objs         - objective function values
#   num_binvars  - number of binary variables
#   num_constrs  - number of constraints
#   best_models  - best trained model
#   mapes        - MAPE of the trained model
times, objs, num_binvars, num_constrs, best_models, mapes = ([] for _ in range(6))

In [None]:
# Read the data once: the file does not change between repetitions, so
# re-reading it on every loop iteration was redundant work.
file_name = './power_plant.csv'
df = pd.read_csv(file_name)

# every column but the last is an input, the last column is the target
X_variable_raw = df.iloc[:, :-1].values
Y_variable_raw = df.iloc[:, -1].values

# One repetition = split, scale, grid-search a random forest, then build and
# solve a Gurobi model that maximises the forest's (un-scaled) prediction.
# Results are appended to the module-level lists times / objs / num_binvars /
# num_constrs / best_models / mapes.
for run_idx in tqdm(range(n_samples)):

    # random.seed() rejects NumPy integer scalars on Python >= 3.11, so the
    # per-run seed is converted to a plain int once and reused everywhere.
    run_seed = int(seeds[run_idx])

    X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
        X_variable_raw, Y_variable_raw, random_state=run_seed, test_size=0.25)

    # scaling -- both scalers are fitted on the training split only
    sc_X = MinMaxScaler()
    sc_y = MinMaxScaler()

    X_variable = sc_X.fit_transform(X_train_raw)
    Y_variable = np.squeeze(sc_y.fit_transform(y_train_raw.reshape(-1, 1)))

    # training -- the forest is created without random_state, so its
    # randomness comes from the global NumPy RNG seeded right here
    random.seed(run_seed)
    np.random.seed(run_seed)
    gs_rf = GridSearchCV(RandomForestRegressor(),
                         param_grid={'min_samples_leaf': range(5, 20*n+1, 5),
                                     'n_estimators': [5, 10, 20, 50, 100],
                                     'max_features': range(1, n+1)},
                         return_train_score=True, cv=5,
                         scoring='neg_mean_squared_error',
                         verbose=0)
    gs_rf.fit(X_variable, Y_variable)
    rf = gs_rf.best_estimator_
    best_models.append(rf)

    # prediction accuracy on the held-out split, measured in original units
    y_predict = rf.predict(sc_X.transform(X_test_raw))
    y_predict_inversed = sc_y.inverse_transform(y_predict.reshape(-1, 1))
    mapes.append(mean_absolute_percentage_error(y_test_raw, y_predict_inversed))

    # model parameters
    # every tree contributes with the same weight
    weight_all = 1 / rf.n_estimators

    # get_input / leaves / splits / prediction / ... come from helpers.rf;
    # their exact return types are not visible here -- the comments below
    # describe how this cell uses them.
    trees = get_input(rf)
    ntrees = len(trees)
    settrees = range(ntrees)

    # p[t, leaf]: weighted, un-scaled prediction of leaf `leaf` in tree `t`.
    # Stored as a plain float (the original kept a (1, 1) ndarray), so the
    # objective coefficients handed to Gurobi are scalars.
    # The literal 1 passed to prediction() presumably selects "regression
    # tree" mode -- TODO confirm in helpers.rf.
    p = {}
    treeleaftuples = []
    for t in settrees:
        for leaf in leaves(trees, t):
            leaf_value = np.asarray(prediction(trees, t, leaf, 1)).reshape(-1, 1)
            p[t, leaf] = sc_y.inverse_transform(leaf_value).item() * weight_all
            treeleaftuples.append((t, leaf))

    treesplits = [(t, s) for t in settrees for s in splits(trees, t)]

    C_dict = split_values_new(trees)

    # create a new model
    m = gp.Model("tree_ensemble")
    m.Params.LogToConsole = 0
    m.setParam('TimeLimit', 1800)  # seconds

    # create variables
    # X_one[v, k]: one binary per (split variable v, index k in range(K(trees, v)))
    X_one = {}
    for v in total_split_variable(trees):
        for k in range(K(trees, v)):
            X_one[v, k] = m.addVar(vtype=GRB.BINARY, name='X_one' + str(v) + '_' + str(k))

    # y[t, leaf] >= 0: one continuous variable per (tree, leaf) pair.
    # No obj= coefficients are passed here because setObjective below
    # replaces the whole objective anyway.
    y = m.addVars(treeleaftuples, lb=0, name='y')

    # Maximization of the summed, weighted leaf predictions
    m.setObjective(gp.quicksum(y[t, leaf] * p[t, leaf] for t, leaf in treeleaftuples), GRB.MAXIMIZE)

    # Add constraints
    # the y variables of each tree sum to one
    m.addConstrs((gp.quicksum(y[t, leaf] for leaf in leaves(trees, t)) == 1 for t in settrees), name='oney')
    # y mass on the left_leaf / right_leaf sets of a split is bounded by the
    # split's binary variable and its complement, respectively
    m.addConstrs((gp.quicksum(y[t, leaf] for leaf in left_leaf(trees, t, s)) <= X_one[V(trees, t, s), C_new(C_dict, trees, t, s)] for t, s in treesplits), name='left')
    m.addConstrs((gp.quicksum(y[t, leaf] for leaf in right_leaf(trees, t, s)) <= 1 - X_one[V(trees, t, s), C_new(C_dict, trees, t, s)] for t, s in treesplits), name='right')

    # the binaries of one split variable are non-decreasing in their index
    for v in total_split_variable(trees):
        for k in range(K(trees, v) - 1):
            m.addConstr(X_one[v, k] - X_one[v, k + 1] <= 0)

    m.update()
    m.write("m.lp")  # overwritten on every repetition

    m.optimize()

    times.append(m.Runtime)
    # ObjVal can only be queried when a solution exists; if the time limit
    # is hit without an incumbent, record NaN so all result lists stay aligned.
    objs.append(m.ObjVal if m.SolCount > 0 else float('nan'))
    num_binvars.append(m.NumBinVars)
    num_constrs.append(m.NumConstrs)

In [None]:
# Dump every result list, one per line, in a fixed order.
for collected in (times, objs, num_binvars, num_constrs, mapes):
    print(collected)

In [None]:
def pickle_save(path, file, filename):
    """Serialize ``file`` to ``<path>/<filename>.pickle``.

    Args:
        path: destination directory, as a string.
        file: the object to pickle (despite the name, not a file handle).
        filename: destination file name without the ``.pickle`` extension.
    """
    destination = ''.join((path, '/', filename, '.pickle'))
    with open(destination, mode='wb') as sink:
        pickle.dump(file, sink, pickle.HIGHEST_PROTOCOL)

# create the directory to save the results
path = './results_opt_powerplant_rf'

# Only report "already exists" when a directory is really there. Catching
# FileExistsError from os.mkdir also fired when a regular file occupied the
# path, which printed a misleading message and deferred the failure to the
# first pickle_save call. makedirs(exist_ok=True) still raises in that case.
folder_was_present = os.path.isdir(path)
os.makedirs(path, exist_ok=True)
if folder_was_present:
    print('Folder already exists')

In [None]:
# Persist each result list under its own name inside the results folder.
for stem, payload in (
    ('objs', objs),
    ('times', times),
    ('num_binvars', num_binvars),
    ('num_constrs', num_constrs),
    ('mapes', mapes),
):
    pickle_save(path, payload, stem)