# Comparing linear regression and DFL-IO

Steps for comparing the linear regression and DFL-IO (inverse) models:
1. Save two datasets
2. Split each dataset into train and test sets
3. Run DFL-IO with each training set, get weights $\theta^{\text{IO,close}}$ and $\theta^{\text{IO,wide}}$
4. Run linear regression on each training set, get weights $\theta^{\text{R,close}}$ and $\theta^{\text{R,wide}}$
5. Compare all models on each test set, for each model:
    1. Predict demand using weights and features
    2. Solve MCFND with demand, get design variables $\hat{y}$
    3. Solve MCF-Flow with fixed design variables $\hat{y}$, get $\hat{x}$
    4. Compare costs of $\hat{y}, \hat{x}$ with optimal $x^*, y^*$ 

## Imports and configurations

In [None]:
using JuMP
using Gurobi
using LinearAlgebra
using Distributions
using Random
using PDMats
using MLJ
using Tables
using DataFrames
using Plots, StatsPlots
using JLD
using CSV

# Seed the global RNG so that the notebook is reproducible from run to run.
Random.seed!(42)

In [None]:
using Revise

includet("../models/forward.jl")
import .Forward as Forward

includet("../models/inversedemand.jl")
import .InverseDemand as IODemand

includet("../models/inverselinreg.jl")
import .InverseLinReg as IOLinReg

includet("../datagen/data-generation.jl")
import .DataGeneration as DataGen

In [None]:
# Load the `LinearRegressor` model type from MLJLinearModels; it is instantiated as `Linear()`
# by the linear-regression training function further down.
Linear = @MLJ.load LinearRegressor pkg = "MLJLinearModels"

In [None]:
# Directories (relative to the notebook) used by `dataset_path` and `results_path`.
BASE_DATA_PATH = "../data/"
BASE_RESULTS_PATH = "../results/"

# Keys under which the two datasets are stored inside each JLD file.
CLOSE_DATA_NAME = "close"
WIDE_DATA_NAME = "wide"

"""
    dataset_path(n_points)

Return the path of the JLD file holding the datasets generated with `n_points` points,
located under `BASE_DATA_PATH`.
"""
dataset_path(n_points) = joinpath(BASE_DATA_PATH, "data_$n_points.jld")

"""
    results_path(n_points)

Return the path of the CSV file holding the comparison results for the datasets with
`n_points` points, located under `BASE_RESULTS_PATH`.
"""
results_path(n_points) = joinpath(BASE_RESULTS_PATH, "results_$n_points.csv")

## Problem parameters

Make a smaller problem with 1 commodity and 2 possible arcs: one with a low-ish capacity ($C$) and one with an effectively unlimited capacity ($\infty$).


In [None]:
# Forward (MCFND) problem: a single commodity that can be routed over two paths.
forward_params = Forward.Params(
    n_paths=2, 
    n_commodities=1,
    capacities=[100, 100000],  # low-ish capacity C = 100 and an effectively unbounded one
    design_costs=[10, 100],
    flow_costs=[10 100]',  # row literal + adjoint gives a 2×1 array (presumably paths × commodities — confirm)
    enabled_flows=ones(Bool, (2, 1))
)

# Ground-truth linear demand model used to generate the synthetic data.
datagen_params = DataGen.DataGenParams(
    weights=[1.5 -3 2], 
    noise_variance=[5.0^2]  # variance 25, i.e. a noise standard deviation of 5
)

# Inverse (DFL-IO) problem; the feature count is read from the data-generation parameters.
inverse_params = IOLinReg.Params(
    n_features=datagen_params.n_features, 
    forward_params=forward_params, 
    with_noise=true
)

## Data generation

Generate two datasets using fixed weights $\Theta$:
- $\mathcal{D}_{\text{close}}$ with $\mathbb{E}[d] = C$ 
- $\mathcal{D}_{\text{wide}}$ with $\mathbb{E}[d] \ll C$. 

Procedure for each data point in the dataset:
1. Draw $\phi_1, \ldots, \phi_{m-1} \sim U(a, b)$ for some $a, b$
2. Set $\phi_m$ such that $\sum_{i=1}^m \theta_i \phi_i = \mathbb{E}[d]$
3. Draw noise $\epsilon \sim \mathcal{N}(0, \sigma^2)$ and compute $d = \sum_{i=1}^m \theta_i \phi_i + \epsilon$
4. Solve MCFND for $d$ 
5. Datapoint $(\phi, d, x^*, y^*) \in \mathcal{D}$

Repeat the dataset creation for several dataset sizes (numbers of points).


In [None]:
# Dataset sizes for which the whole experiment is repeated.
n_points = [100, 1000, 3000, 10000]
# Expected demand of the "close" dataset; equal to the low capacity (100) set in `forward_params`.
close_target_demand = 100
# Expected demand of the "wide" dataset; well below that capacity.
wide_target_demand = 20

# One Gurobi environment, created once and passed to every solve in this notebook.
gurobi_env = Gurobi.Env()

In [None]:
# For every dataset size, generate the close and the wide dataset (in that order) and store
# both in a single JLD file under the keys CLOSE_DATA_NAME and WIDE_DATA_NAME.
for n in n_points
    close_dataset = DataGen.generate_dataset(forward_params, datagen_params, n_points=n, target_demand=close_target_demand, gurobi_env=gurobi_env)
    wide_dataset = DataGen.generate_dataset(forward_params, datagen_params, n_points=n, target_demand=wide_target_demand, gurobi_env=gurobi_env)

    save(dataset_path(n), CLOSE_DATA_NAME, close_dataset, WIDE_DATA_NAME, wide_dataset, compress=true)
end

## Model training

Define a training function and a prediction function for each model:
- `train_{model_type}_model` takes in a training dataset of `IOLinReg.SolutionPoint`s, and returns a trained model
- `predict_{model_type}_model` takes a model of the correct type and a test dataset of `IOLinReg.SolutionPoint`s, and returns a vector of predicted demands

Utility functions

In [None]:
"""
    load_dataset(n_points)

Load the JLD file written for `n_points` points and return the tuple
`(close_dataset, wide_dataset)` stored under `CLOSE_DATA_NAME` and `WIDE_DATA_NAME`.
"""
function load_dataset(n_points)
    stored = JLD.load(dataset_path(n_points))
    close_data = stored[CLOSE_DATA_NAME]
    wide_data = stored[WIDE_DATA_NAME]

    return close_data, wide_data
end

"""
    convert_dataset_to_mlj(dataset)

Convert a collection of solution points into the `(features, demands)` pair used by MLJ.

`features` is a `DataFrame` with auto-generated column names, built by stacking the
transposed `linreg_features` of every point; `demands` is the vertical concatenation of
every point's `actual_demands`.
"""
function convert_dataset_to_mlj(dataset)
    feature_rows = map(dataset) do point
        point.linreg_features'
    end
    demand_blocks = map(dataset) do point
        point.actual_demands
    end

    features = DataFrame(vcat(feature_rows...), :auto)
    demands = vcat(demand_blocks...)

    return features, demands
end

DFL-IO model training and prediction functions

In [None]:
"""
    train_inverse_model(training_dataset)

Build the DFL-IO problem for `training_dataset` from the notebook-level `inverse_params`
and `gurobi_env`, solve it, and return whatever `IOLinReg.solve_problem!` returns.
"""
function train_inverse_model(training_dataset)
    problem = IOLinReg.create_problem(inverse_params, training_dataset; gurobi_env=gurobi_env)
    solution = IOLinReg.solve_problem!(problem, inverse_params)

    return solution
end

"""
    predict_inverse_model(inverse_solution, test_dataset)

Apply `IOLinReg.predict_inverse_model` to the `linreg_features` of every point in
`test_dataset` and return the vertical concatenation of the per-point predictions.
"""
function predict_inverse_model(inverse_solution, test_dataset)
    feature_vectors = map(test_dataset) do point
        point.linreg_features
    end
    per_point_predictions = map(feature_vectors) do phi
        IOLinReg.predict_inverse_model(inverse_solution, phi)
    end

    return vcat(per_point_predictions...)
end

Linear regression model training and prediction functions

In [None]:
"""
    train_linear_model(training_dataset)

Fit the MLJ `Linear` regressor on the features and demands of `training_dataset` and
return the fitted machine.
"""
function train_linear_model(training_dataset)
    X, y = convert_dataset_to_mlj(training_dataset)

    mach = machine(Linear(), X, y)
    fit!(mach)

    return mach
end

"""
    predict_linear_model(linreg_machine, test_dataset)

Return the demands that the fitted machine `linreg_machine` predicts for the features of
`test_dataset`. The demands stored in `test_dataset` are ignored.
"""
function predict_linear_model(linreg_machine, test_dataset)
    X = first(convert_dataset_to_mlj(test_dataset))

    return predict(linreg_machine, X)
end

## Model evaluation

Define an evaluation procedure `evaluate_model_on_dataset` that implements the comparison step (the last step) of the introduction. It takes a train and a test dataset, plus a training function and a prediction function for a given model.

In [None]:
"""
    evaluate_model_on_dataset(train_data, test_data, train_model, make_predictions; gurobi_env=nothing)

Train a model, predict the test demands, design a network for those predictions, and score
that design against the actual test demands.

# Arguments
- `train_data`: dataset passed to `train_model`.
- `test_data`: dataset passed to `make_predictions` and used for the final flow evaluation.
- `train_model`: function `train_data -> trained_model`.
- `make_predictions`: function `(trained_model, test_data) -> predicted_demands`.

# Keywords
- `gurobi_env`: environment forwarded to both the design stage and the flow stage.

# Returns
The `DataFrame` produced by `compute_flow_problem_results`.
"""
function evaluate_model_on_dataset(train_data, test_data, train_model, make_predictions; gurobi_env=nothing)
    trained_model = train_model(train_data)
    predicted_demands = make_predictions(trained_model, test_data)
    designed_network = compute_predicted_network_design(forward_params, predicted_demands; gurobi_env=gurobi_env)

    # Forward the caller's environment so that both solve stages use the same one; previously
    # the flow stage ignored this keyword and always fell back to its own default.
    return compute_flow_problem_results(forward_params, test_data, designed_network; gurobi_env=gurobi_env)
end

"""
    compute_predicted_network_design(forward_params, predicted_demands; gurobi_env=nothing)

Solve one forward problem per entry of `predicted_demands` (silently) and return the
`z_sol` field of each solution, in the same order as the demands.
"""
function compute_predicted_network_design(forward_params, predicted_demands; gurobi_env=nothing)
    return map(predicted_demands) do demand
        solution = Forward.create_and_solve_problem(forward_params, demand; silent=true, gurobi_env=gurobi_env)
        solution.z_sol
    end
end

"""
    compute_flow_problem_results(forward_params, test_dataset, predicted_z_sols; gurobi_env=gurobi_env)

Solve the flow problem for every test point, pairing its `actual_demands` with the
corresponding entry of `predicted_z_sols`.

When the `gurobi_env` keyword is omitted, it defaults to the notebook-level `gurobi_env`.

# Returns
A `DataFrame` with one row per test point and the columns `task_losses` (the solution's
`objective_value`) and `recourse_flow`.
"""
function compute_flow_problem_results(forward_params, test_dataset, predicted_z_sols; gurobi_env=gurobi_env)
    actual_demands = map(test_dataset) do point
        point.actual_demands
    end

    flow_solutions = map(actual_demands, predicted_z_sols) do demand, z_sol
        Forward.create_and_solve_flow_problem(forward_params, demand, z_sol; silent=true, gurobi_env=gurobi_env)
    end

    losses = map(s -> s.objective_value, flow_solutions)
    recourse = map(s -> s.recourse_flow, flow_solutions)

    return DataFrame(task_losses=losses, recourse_flow=recourse)
end

### Model comparison pipeline and result cleaning

For a given full dataset of `n_points`, obtain the results of the DFL-IO and the Linear Regression model over the close and wide datasets

In [None]:
"""
    compare_models(n_points; test_train_split=0.7)

Evaluate the DFL-IO model and the linear-regression model on the close and the wide
dataset that were generated with `n_points` points.

# Keywords
- `test_train_split`: fraction passed to `partition`; the first part of each split is used
  for training and the remainder for testing.

# Returns
A single `DataFrame` stacking the four result tables (close/io, close/linreg, wide/io,
wide/linreg), each tagged with `dataset` and `model` columns.
"""
function compare_models(n_points; test_train_split=0.7)
    close_dataset, wide_dataset = load_dataset(n_points)

    close_train, close_test = partition(close_dataset, test_train_split)
    # The wide split must come from the wide dataset; it was previously taken from
    # `close_dataset`, which made the "wide" results a copy of the close experiment.
    wide_train, wide_test = partition(wide_dataset, test_train_split)

    close_io_results = evaluate_model_on_dataset(close_train, close_test, train_inverse_model, predict_inverse_model, gurobi_env=gurobi_env)
    close_linreg_results = evaluate_model_on_dataset(close_train, close_test, train_linear_model, predict_linear_model, gurobi_env=gurobi_env)
    wide_io_results = evaluate_model_on_dataset(wide_train, wide_test, train_inverse_model, predict_inverse_model, gurobi_env=gurobi_env)
    wide_linreg_results = evaluate_model_on_dataset(wide_train, wide_test, train_linear_model, predict_linear_model, gurobi_env=gurobi_env)

    return vcat(
        specify_model_and_data(close_io_results, "close", "io"),
        specify_model_and_data(close_linreg_results, "close", "linreg"),
        specify_model_and_data(wide_io_results, "wide", "io"),
        specify_model_and_data(wide_linreg_results, "wide", "linreg")
    )
end


"""
    specify_model_and_data(results_data, data_type, model_type)

Return `results_data` with two categorical columns appended: `dataset`, filled with
`data_type`, and `model`, filled with `model_type`.
"""
function specify_model_and_data(results_data, data_type, model_type)
    n_rows = nrow(results_data)

    labels = DataFrame(
        dataset=categorical(fill(data_type, n_rows)),
        model=categorical(fill(model_type, n_rows)),
    )

    return hcat(results_data, labels)
end

Generating and storing the results

In [None]:
# Run the full comparison for every dataset size and write each result table to its own CSV file.
for n in n_points
    results = compare_models(n)
    CSV.write(results_path(n), results)
end

## Analysis

In [None]:
# Read the stored result tables back into a Dict keyed by dataset size.
results = Dict(n => CSV.read(results_path(n), DataFrame) for n in n_points)

In [None]:
# Collect the task losses of every (dataset, model) combination as one column each.
# NOTE(review): `evaluate_on_flow_problem` and the `*_test_dataset` / `*_z_sols` variables are not
# defined anywhere in the visible part of this notebook — this cell looks like a leftover from an
# earlier version of the pipeline. TODO: confirm, and rebuild it from the `results` loaded above.
task_losses = DataFrame(
    close_io=evaluate_on_flow_problem(forward_params, close_test_dataset, close_io_z_sols), 
    close_linreg=evaluate_on_flow_problem(forward_params, close_test_dataset, close_linreg_z_sols),
    wide_io=evaluate_on_flow_problem(forward_params, wide_test_dataset, wide_io_z_sols),
    wide_linreg=evaluate_on_flow_problem(forward_params, wide_test_dataset, wide_linreg_z_sols))

first(task_losses, 5)

In [None]:
# Summary statistics for every column of `task_losses`.
describe(task_losses)

In [None]:
# Box plots of the task losses of both models on the close dataset.
@df task_losses boxplot([:close_io, :close_linreg])

In [None]:
# Box plots of the task losses of both models on the wide dataset.
@df task_losses boxplot([:wide_io, :wide_linreg])

In [None]:
# Threshold above which a task loss is counted as an outlier.
cutoff = 5000
# Count the entries of a loss column that exceed `cutoff`. The previous version called the
# non-existent `nrows` on a filtered vector, which raises an UndefVarError.
n_above = df -> count(>(cutoff), df)

# First line: DFL-IO model on the close dataset; second line: linear regression on the close dataset.
println("Losses above $cutoff: $(n_above(task_losses.close_io))")
println("Losses above $cutoff: $(n_above(task_losses.close_linreg))")

### General pipeline