Project 2, AA228 
Mark Leone
11/11/24

In [2]:
using DataFrames
using CSV
using Dates
using Plots
using Statistics  

# Q-learning model and update rule, as defined in the book
"""
    QLearning(S, A, gamma, Q, alpha)

Container for the state of a tabular Q-learning model.

# Fields
- `S`: number of states.
- `A`: number of actions.
- `gamma`: discount factor.
- `Q`: Q table (S x A).
- `alpha`: learning rate.
"""
mutable struct QLearning
    # problem dimensions
    S::Int
    A::Int
    # discount factor
    gamma::Float64
    # Q table (S x A)
    Q::Matrix{Float64}
    # learning rate
    alpha::Float64
end

"""
    update!(model::QLearning, s, a, r, sp)

Apply one Q-learning update to `model.Q` for the transition `(s, a, r, sp)`.

The entry `Q[s, a]` is moved by a step of size `model.alpha` toward the target
`r + model.gamma * maximum(Q[sp, :])`. `model.Q` is modified in place.

# Arguments
- `s`, `a`: row and column index of the entry of `model.Q` to update.
- `r`: reward added to the discounted best next value.
- `sp`: row index of `model.Q` whose maximum is used as the next-state value.

# Returns
The updated value that was written to `Q[s, a]`.
"""
function update!(model::QLearning, s, a, r, sp)
    gamma, Q, alpha = model.gamma, model.Q, model.alpha
    # `Q[sp, :]` would copy the whole row on every call; a view reads it in place.
    best_next = maximum(view(Q, sp, :))
    updated = Q[s, a] + alpha * (r + gamma * best_next - Q[s, a])
    Q[s, a] = updated
    return updated
end

# train q learning
"""
    train_qlearning(data_path, S, A, gamma, alpha, num_episodes)

Fit a `QLearning` model by repeatedly sweeping over the transitions stored in a CSV file.

The file at `data_path` is read once into a `DataFrame`; its columns `s`, `a`, `r`
and `sp` are used. Each of the `num_episodes` passes applies `update!` to every row
in file order and then records `mean(model.Q)`.

# Arguments
- `data_path`: path of the CSV file to read.
- `S`, `A`: number of rows and columns of the Q table, which starts as `zeros(S, A)`.
- `gamma`: discount factor stored in the model.
- `alpha`: learning rate stored in the model.
- `num_episodes`: number of full passes over the data.

# Returns
A tuple `(model, avg_qvalues_per_episode)` where the second element holds one
average Q-value per pass.
"""
function train_qlearning(
    data_path::String,
    S::Int,
    A::Int,
    gamma::Float64,
    alpha::Float64,
    num_episodes::Int,
)
    df = CSV.read(data_path, DataFrame)

    # initialize Q-learning model
    model = QLearning(S, A, gamma, zeros(S, A), alpha)
    avg_qvalues_per_episode = Float64[]  # store average Q-value per episode for plotting

    # The element types of DataFrame columns are not known to the compiler here, so
    # the columns are pulled out once and handed to a helper that gets specialized
    # on their concrete types (function barrier) instead of iterating `eachrow`.
    states, actions, rewards, next_states = df.s, df.a, df.r, df.sp

    for _ in 1:num_episodes
        _sweep_transitions!(model, states, actions, rewards, next_states)
        # store avg q for episode
        push!(avg_qvalues_per_episode, mean(model.Q))
    end

    return model, avg_qvalues_per_episode
end

# Apply `update!` once to every (s, a, r, sp) transition, in order.
function _sweep_transitions!(model, states, actions, rewards, next_states)
    for (s, a, r, sp) in zip(states, actions, rewards, next_states)
        update!(model, s, a, r, sp)
    end
    return model
end

# extract deterministic policy
"""
    extract_policy(model::QLearning)

Return the greedy policy encoded by `model.Q`.

For each state index in `1:model.S`, the result holds the column index that
`argmax` selects in that state's row of the Q table.
"""
function extract_policy(model::QLearning)
    qtable = model.Q
    greedy_actions = map(1:model.S) do state
        action_values = qtable[state, :]
        argmax(action_values)
    end
    return greedy_actions
end

# save policy
"""
    save_policy(policy, filename)

Write every element of `policy` to `filename`, one per line.

The file is opened in `"w"` mode and closed when writing finishes.
"""
function save_policy(policy, filename)
    open(filename, "w") do out
        foreach(action -> println(out, action), policy)
    end
end

# train and save policies
"""
    main()

Train a Q-learning model for every configured dataset and write its results.

For each `(path, params)` entry the function trains with `train_qlearning`, prints
the elapsed training time, saves the greedy policy next to the dataset with a
`.policy` extension, and saves a plot of the average Q-value per episode with a
`_qvalue_plot.png` suffix.
"""
function main()
    # set parameters
    # Configurations for the other datasets (currently disabled):
    # datasets = [
    #     ("data/small.csv", (S=100, A=4, gamma=0.95, alpha=0.1, episodes=10)),
    #     ("data/medium.csv", (S=50000, A=7, gamma=1.0, alpha=0.1, episodes=1000)),
    #     ("data/large.csv", (S=302020, A=9, gamma=0.95, alpha=0.1, episodes=30))
    # ]
    datasets = [("data/trajectory.csv", (S=100, A=4, gamma=0.95, alpha=0.1, episodes=10))]

    for (dataset, params) in datasets
        println("Training model for $dataset")

        # Time the training with the monotonic nanosecond clock.
        start_ns = time_ns()
        # Fields are read by name so the order inside the NamedTuple does not matter.
        model, avg_qvalues_per_episode = train_qlearning(
            dataset, params.S, params.A, params.gamma, params.alpha, params.episodes
        )
        elapsed_seconds = (time_ns() - start_ns) / 1e9

        # print time spent training model
        println("Training time for $dataset: $(round(elapsed_seconds; digits=3)) seconds")

        # Derive output paths from the extension-less path. A substring replace of
        # ".csv" would leave the path unchanged when the extension differs, which
        # would make the policy overwrite the dataset itself.
        base_path, _ = splitext(dataset)

        # extract and save policy
        policy = extract_policy(model)
        save_policy(policy, base_path * ".policy")

        # plot avg q values; the handle is passed to savefig explicitly rather than
        # relying on the implicit "current plot".
        qvalue_plot = plot(
            1:params.episodes,
            avg_qvalues_per_episode;
            label="Average Q-Value",
            xlabel="Episode",
            ylabel="Average Q-Value",
        )
        savefig(qvalue_plot, base_path * "_qvalue_plot.png")
    end
    return nothing
end

# Entry point: run training, policy export and plotting when this cell/file is executed.
main()

Training model for data/trajectory.csv
Training time for data/trajectory.csv: 0.486 seconds
