In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.preprocessing import MinMaxScaler

import numpy as np
import pandas as pd


from tqdm import tqdm
import random
import pickle
import os

import gurobipy as gp
from gurobipy import GRB

from helpers.pwl_tree import tree_to_code

In [None]:
# --- experiment size -------------------------------------------------------
n = 4           # dimension of the input vector (features are named x0 .. x{n-1})
n_samples = 10  # number of replications; one random seed is drawn for each

# --- reproducibility -------------------------------------------------------
# Seed both Python's and NumPy's global generators, then draw one integer
# seed in [1, 999] per replication.
global_seed = 777
random.seed(global_seed)
np.random.seed(global_seed)
seeds = np.random.randint(1, 1000, size=(1, n_samples))[0]
print(seeds)

# --- aggregated result containers ------------------------------------------
# NOTE(review): these dicts are not populated in the cells shown here;
# presumably they are meant to be keyed by problem dimension - confirm.
times_n = dict()        # solution times
objs_n = dict()         # objective function values
num_binvars_n = dict()  # number of binary variables
num_constrs_n = dict()  # number of constraints

In [None]:
# Result accumulators filled by the experiment loop (one entry per replication).
times = list()        # solver runtimes
objs = list()         # objective function values
num_binvars = list()  # number of binary variables in each model
num_constrs = list()  # number of constraints in each model
best_models = list()  # best trained model of each replication
mapes = list()        # test-set MAPE of each trained model

In [None]:
# Experiment: for every replication, fit a decision tree on a fresh train/test
# split of the power-plant data, then solve a small MIP that picks the leaf
# (and a point inside its box) with the largest predicted output.

# The dataset is the same for every replication, so it is read and split into
# features / target once instead of re-parsing the CSV on each iteration.
file_name = './power_plant.csv'
df = pd.read_csv(file_name)

X_variable_raw = df.iloc[:, :-1].values   # every column but the last
Y_variable_raw = df.iloc[:, -1].values    # last column is the target

# Everything below (feature names, bound columns, w variables) assumes exactly
# n input features; fail early with a clear message if the CSV disagrees.
if X_variable_raw.shape[1] != n:
    raise ValueError(
        'expected %d feature columns in %s, found %d'
        % (n, file_name, X_variable_raw.shape[1])
    )

# Map the split labels 'x_j <=' and 'x_j >' to a column of tree_params:
# columns 0..n-1 hold upper bounds, columns n..2n-1 hold lower bounds.
# This depends only on n, so it is built once.
dict_names = {}
for j in range(n):
    dict_names['x' + str(j) + ' <='] = j
    dict_names['x' + str(j) + ' >'] = j + n

# set of input variable dimensions
rangen = range(n)

for i in tqdm(range(n_samples)):

    # random.seed() rejects numpy integers on Python >= 3.11, so convert the
    # numpy scalar to a built-in int once and use it everywhere.
    seed_i = int(seeds[i])

    X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
        X_variable_raw, Y_variable_raw, random_state=seed_i, test_size=0.25)

    # scaling (fitted on the training split only)
    sc_X = MinMaxScaler()
    sc_y = MinMaxScaler()

    X_variable = sc_X.fit_transform(X_train_raw)
    Y_variable = np.squeeze(sc_y.fit_transform(y_train_raw.reshape(-1, 1)))

    # training: 5-fold grid search over min_samples_leaf
    random.seed(seed_i)
    np.random.seed(seed_i)
    gs = GridSearchCV(DecisionTreeRegressor(),
                      param_grid={'min_samples_leaf': range(5, 10 * n + 1, 5)},
                      return_train_score=True, cv=5,
                      scoring='neg_mean_squared_error')
    gs.fit(X_variable, Y_variable)
    best_tree = gs.best_estimator_
    # best_models is declared as the store for the trained models; fill it.
    best_models.append(best_tree)

    # NOTE(review): tree_to_code is a project helper; the code below assumes it
    # returns (per-leaf dict of 'x_j <=' / 'x_j >' -> threshold, per-leaf
    # predictions indexable by 0..n_leaves-1) - confirm against helpers.pwl_tree.
    tree_dict, predictions = tree_to_code(
        best_tree, feature_names=['x' + str(j) for j in range(n)])

    # prediction accuracy on the held-out split, measured in original units
    y_predict = gs.predict(sc_X.transform(X_test_raw))
    y_predict_inversed = sc_y.inverse_transform(y_predict.reshape(-1, 1))
    mapes.append(mean_absolute_percentage_error(y_test_raw, y_predict_inversed))

    # revert the scaling of the prediction at each leaf node
    for key in predictions:
        predictions[key] = sc_y.inverse_transform(
            np.asarray(predictions[key]).reshape(-1, 1)).item()

    # tree_params: one row per leaf; the first n columns are the upper bounds
    # and the last n columns the lower bounds of x_j. Defaults are 1.0 / 0.0,
    # i.e. the full range of the min-max scaled training features.
    n_leaves = len(tree_dict)
    tree_params = np.hstack([np.ones((n_leaves, n)), np.zeros((n_leaves, n))])

    for row, leaf in enumerate(tree_dict.values()):
        for key, value in leaf.items():
            tree_params[row, dict_names[key]] = value

    # tree_params_less: upper bound of x for a leaf
    # tree_params_large: lower bound of x for a leaf
    # Both are mapped back to the original feature units.
    tree_params_less = sc_X.inverse_transform(tree_params[:, :n])
    tree_params_large = sc_X.inverse_transform(tree_params[:, n:])

    # set of leaves
    rangen_leaves = range(n_leaves)

    # create a new model
    m = gp.Model("PWC")
    m.Params.LogToConsole = 0
    m.setParam('TimeLimit', 60)

    # create variables
    z = m.addVars(rangen_leaves, name='z', vtype=GRB.BINARY)  # leaf selector
    w = m.addVars(rangen, name='w')                           # input point
    y = m.addVar(obj=1, name='y')                             # predicted output

    m.setObjective(y, GRB.MAXIMIZE)

    # add constraints
    # exactly one leaf is selected
    m.addConstr((gp.quicksum(z[l] for l in rangen_leaves) == 1), name='onez')
    # y equals the prediction of the selected leaf
    m.addConstr((gp.quicksum(predictions[l] * z[l] for l in rangen_leaves) == y), name='calcy')
    # w lies between the lower and upper bounds of the selected leaf
    m.addConstrs((gp.quicksum(tree_params_large[l][j] * z[l] for l in rangen_leaves) <= w[j] for j in rangen), name='largerthan')
    m.addConstrs((gp.quicksum(tree_params_less[l][j] * z[l] for l in rangen_leaves) >= w[j] for j in rangen), name='lessthan')

    m.update()
    m.write("m.lp")

    m.optimize()

    times.append(m.Runtime)
    # objVal is only available when the solver holds at least one solution;
    # record NaN otherwise (e.g. time limit hit without an incumbent) so the
    # result lists stay aligned instead of the loop aborting.
    objs.append(m.objVal if m.SolCount > 0 else float('nan'))
    num_binvars.append(m.NumBinVars)
    num_constrs.append(m.NumConstrs)

In [None]:
# Echo the collected results, one list per line, in the order:
# solution times, objective values, binary-variable counts, constraint counts.
for collected in (times, objs, num_binvars, num_constrs):
    print(collected)

In [None]:
def pickle_save(path, file, filename):
    """Pickle an object to ``<path>/<filename>.pickle``.

    Parameters
    ----------
    path : str or os.PathLike
        Directory that receives the pickle file.
    file : object
        The Python object to serialize (despite the name, not a file handle).
    filename : str
        Base name of the output file; the ``.pickle`` extension is appended.

    Raises
    ------
    OSError
        Propagated from ``open`` when the target cannot be opened for writing.
    """
    # os.path.join accepts both plain strings and path-like objects, unlike
    # manual '+' concatenation, and inserts the separator only where needed.
    file_loc = os.path.join(path, filename + '.pickle')
    with open(file_loc, 'wb') as handle:
        pickle.dump(file, handle, protocol=pickle.HIGHEST_PROTOCOL)

# create the directory to save the results
path = './results_opt_powerplant_pwc'

try:
    # makedirs also creates any missing parent directories
    os.makedirs(path)
except FileExistsError:
    # The name may be taken by something that is not a directory (e.g. a
    # regular file); report "already exists" only for a real folder and let
    # the error surface otherwise, instead of failing later when saving.
    if not os.path.isdir(path):
        raise
    print('Folder already exists')

In [None]:
# Persist every result list as '<path>/<name>.pickle', in the order:
# objective values, solution times, binary-variable counts, constraint
# counts, MAPEs.
for result_name, result_values in (
    ('objs', objs),
    ('times', times),
    ('num_binvars', num_binvars),
    ('num_constrs', num_constrs),
    ('mapes', mapes),
):
    pickle_save(path, result_values, result_name)