## Dependencies and Setup

In [None]:
using Revise
using Paint
using Serialization
using Images, ImageShow
using Plots
using StaticArrays
using ImageFeatures
using IntervalSets
using ReinforcementLearning
using ReinforcementLearningBase
using ReinforcementLearningZoo
using Random
using Flux
using Flux.Losses
import Flux: params
using Distributions: Normal
using CUDA

In [None]:
# Load the target image and apply `float` to every pixel.
target = load("../lisa.png") .|> float
# End the cell on `nothing` (presumably so the notebook does not display the image).
nothing

## Environment

The state is a 10x10 downsampled image of the difference map between the current canvas and the target, flattened into a vector of length 100. The action is a vector of length 6 that defines one triangle.

In [None]:
# Value of `idx` at which an episode terminates (one triangle is drawn per step).
NTris = 10

# Length of the flattened state vector (10x10 grid).
ns = 10 * 10
# Length of an action vector.
na = 6

"""
    MyEnv(targetimg)

Environment holding a target image, the canvas currently being painted, and a
step counter. The one-argument constructor starts with a canvas filled with
`averagepixel(targetimg)` and the counter at zero.
"""
mutable struct MyEnv <: AbstractEnv
    # Image the agent tries to reproduce.
    target::Matrix{RGB{Float32}}
    # Canvas that is drawn onto at every step.
    img::Matrix{RGB{Float32}}
    # Number of steps taken in the current episode.
    idx::Int
end

function MyEnv(targetimg)
    canvas = zero(targetimg) .+ averagepixel(targetimg)
    return MyEnv(targetimg, canvas, 0)
end

In [None]:
# An action is `na` numbers, each bounded to the unit interval.
ReinforcementLearningBase.action_space(env::MyEnv) = Space([0.0..1.0 for _ in 1:na])
# A state is `ns` numbers, each bounded to the unit interval. The range `1:ns` is
# required here: iterating the bare integer `ns` yields a single element, which would
# declare a one-dimensional state space.
ReinforcementLearningBase.state_space(env::MyEnv) = Space([0.0..1.0 for _ in 1:ns])

"""
    state(env::MyEnv)

Return a `Vector{Float32}` of length 100: the grayscale absolute difference between
`env.img` and `env.target`, normalised by its maximum and averaged over a 10x10 grid
of equally sized blocks (row-major order).

The block size is derived from the image size, so a 200x200 image uses 20x20 blocks.
Rows/columns left over when the size is not a multiple of 10 are ignored.

# Throws
- `ArgumentError` if either image dimension is smaller than the grid.
"""
function ReinforcementLearningBase.state(env::MyEnv)
    diff = Gray.(abs.(env.img .- env.target))

    # Normalise by the peak difference. Skip this when the canvas already matches
    # the target everywhere: dividing 0 by 0 would fill the state with NaN.
    peak = maximum(diff)
    if !iszero(peak)
        diff = diff ./ peak
    end

    grid = 10
    bh = size(diff, 1) ÷ grid
    bw = size(diff, 2) ÷ grid
    (bh > 0 && bw > 0) || throw(
        ArgumentError("image of size $(size(diff)) is too small for a $(grid)x$(grid) grid"),
    )

    ret_state = zeros(Float32, grid * grid)
    for i in 1:grid
        rows = (bh * (i - 1) + 1):(bh * i)
        for j in 1:grid
            cols = (bw * (j - 1) + 1):(bw * j)
            # Mean of the block, stored row-major.
            ret_state[(i - 1) * grid + j] = sum(diff[rows, cols]) / Float32(bh * bw)
        end
    end
    return ret_state
end

# Reward is the negated `imloss` (with `SELoss`) between the canvas and the target.
ReinforcementLearningBase.reward(env::MyEnv) = -imloss(env.img, env.target, SELoss())

# The episode is over when the step counter equals `NTris`.
function ReinforcementLearningBase.is_terminated(env::MyEnv)
    return env.idx == NTris
end

# Start a new episode: replace the canvas with one filled with the target's average
# pixel, zero the step counter, and return the environment.
function ReinforcementLearningBase.reset!(env::MyEnv)
    blank = zero(env.target) .+ averagepixel(env.target)
    env.img = blank
    env.idx = 0
    return env
end

# Apply one action: build a triangle from the 6-element `action`, colour it with the
# average pixel of the environment's target under that triangle, draw it onto the
# canvas, and advance the step counter. Returns the environment.
function (env::MyEnv)(action)
    env.idx += 1
    tri = Triangle(SVector{6, Float32}(action))
    raster = RasterAlgorithmScanline()
    # Sample the colour from `env.target`, not the notebook-level `target` global, so
    # the environment is correct for whichever image it was constructed with.
    col = averagepixel(env.target, tri, raster)
    draw!(env.img, tri, col, raster)

    return env
end

## Model

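The agent is a Soft Actor-Critic (SAC) policy built from small MLPs: a Gaussian policy network, two Q-networks, and their two target Q-networks, all moved to the GPU with `gpu`.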

In [None]:
# Shared random source and weight initialiser for every network below.
rng = Random.GLOBAL_RNG
init = glorot_uniform(rng)

# Build the Gaussian policy network: a two-layer ReLU trunk feeding a mean head and a
# log-standard-deviation head (clamped to [-10, 2]), wrapped with an ADAM optimiser
# and passed through `gpu`.
function create_policy_net()
    trunk = Chain(
        Dense(ns, 30, relu; init=init),
        Dense(30, 30, relu; init=init),
    )
    mean_head = Chain(Dense(30, na; init=init))
    logstd_head = Chain(
        Dense(30, na, x -> clamp(x, typeof(x)(-10), typeof(x)(2)); init=init),
    )
    approximator = NeuralNetworkApproximator(;
        model=GaussianNetwork(; pre=trunk, μ=mean_head, logσ=logstd_head),
        optimizer=ADAM(0.003),
    )
    return gpu(approximator)
end

# Build one Q-network: an MLP taking a concatenated state and action (`ns + na`
# inputs) and producing a single value, wrapped with an ADAM optimiser and passed
# through `gpu`.
function create_q_net()
    critic = Chain(
        Dense(ns + na, 30, relu; init=init),
        Dense(30, 30, relu; init=init),
        Dense(30, 1; init=init),
    )
    return gpu(NeuralNetworkApproximator(; model=critic, optimizer=ADAM(0.003)))
end

# SAC agent: one policy network, two Q-networks with their target copies, and a
# circular replay buffer of 10000 transitions.
#
# NOTE(review): the SAC policy appears to squash sampled actions with `tanh`, i.e.
# into [-1, 1], while the environment declares actions in [0, 1] — confirm whether an
# action rescaling wrapper is needed.
agent = Agent(
    policy=SACPolicy(
        policy=create_policy_net(),
        qnetwork1=create_q_net(),
        qnetwork2=create_q_net(),
        target_qnetwork1=create_q_net(),
        target_qnetwork2=create_q_net(),
        γ=0.99f0,
        τ=0.005f0,
        α=0.2f0,
        batch_size=64,
        start_steps=1000,
        # Uniform random actions are used for the first `start_steps` steps.
        start_policy=RandomPolicy(Space([0.0 .. 1.0 for _ in 1:na]); rng=rng),
        update_after=1000,
        update_freq=1,
        automatic_entropy_tuning=true,
        lr_alpha=0.003f0,
        # Must be the action dimensionality: with automatic entropy tuning the target
        # entropy is derived from it, so the previous value of 1 mis-set the target
        # for this 6-dimensional action.
        action_dims=na,
        rng=rng,
        device_rng=CUDA.functional() ? CUDA.CURAND.RNG() : rng
    ),
    trajectory=CircularArraySARTTrajectory(
        capacity=10000,
        state=Vector{Float32} => (ns,),
        action=Vector{Float32} => (na,),
    ),
)

## Run the agent

In [None]:
# Build the environment around the loaded target image.
env = MyEnv(target)

In [None]:
# Run the agent in the environment until 10000 episodes have finished, with the
# `TotalRewardPerEpisode` hook attached.
res = run(agent, env, StopAfterEpisode(10_000), TotalRewardPerEpisode())