# Reinforcement Learning, An Introduction - [*Sutton & Barto '18*](http://incompleteideas.net/book/the-book-2nd.html)

[![GitHubBadge]][GitHubLink] [![ColabBadge]][ColabLink]

## Chapter 2 - Multi-armed Bandits

### Exercise 5 - Non-Stationary Problem


 
Design and conduct an experiment to demonstrate the difficulties that sample-average methods have for non-stationary problems. Use a modified version of the 10-armed testbed in which all the $q_*(a)$ start out equal and then take independent random walks (say by adding a normally distributed increment with mean zero and standard deviation 0.01 to all the $q_*(a)$ on each step). Prepare plots like Figure 2.2 for an action-value method using sample averages, incrementally computed, and another action-value method using a constant step-size parameter, $\alpha$=0.1. Use $\epsilon$=0.1 and longer runs, say of 10,000 steps.



[GitHubBadge]: https://img.shields.io/badge/|-Edit_on_GitHub-green.svg?logo=github "Edit notebook's source code on GitHub"
[GitHubLink]: https://github.com/vojtamolda/reinforcement-learning-an-introduction/blob/swift/Chapter%202/Exercise%205.ipynb

[ColabBadge]: https://colab.research.google.com/assets/colab-badge.svg "Run notebook in Google Colab"
[ColabLink]: https://colab.research.google.com/github/vojtamolda/reinforcement-learning-an-introduction/blob/swift/Chapter%202/Exercise%205.ipynb


In [1]:
// Install Packages
// Notebook setup cell: the `%`-prefixed lines are Jupyter kernel directives, not Swift.
// The release-build flag below is left commented out, so it is not applied.
//%install-swiftpm-flags -c release
// Directory the packages below are installed into.
%install-location /swift/packages

// Fetch the book's companion package (tracking the `swift` branch, i.e. not pinned to
// a release) and the Plotly charting package (version 0.3.1 or newer).
%install '.package(url: "https://github.com/vojtamolda/reinforcement-learning-an-introduction", .branch("swift"))' ReinforcementLearning
%install '.package(url: "https://github.com/vojtamolda/Plotly.swift", from: "0.3.1")' Plotly

// Clear Output
// Prints the ANSI "erase display" escape sequence (ESC [ 2 J) to wipe the install log.
print("\u{001B}[2J")

// Enable Plotly Charts
// Pulls in the kernel's display helper; presumably this is what makes `.display()` on
// figures render inline — confirm against the kernel documentation.
%include "EnableIPythonDisplay.swift"




In [7]:
import MultiArmedBandit
import Plotly

### Bandit Testbed (Stationary and Non-stationary)

In [25]:
extension MultiArmedBandit.State {

    /// Violin chart of the rewards sampled from every arm, starting from this state.
    ///
    /// Each arm in `0..<n` is tried 1,000 times by applying it to this state and reading
    /// the utility of the resulting state for player 0. All samples go into a single
    /// violin trace, keyed on the x axis by the arm index rendered as a string.
    /// The y axis is fixed to the range -5...5 and the figure height to 400.
    var armRewardPlot: Figure {
        let sampleCount = 1_000
        let arms = 0..<n

        // `labels[i]` names the arm that produced `rewards[i]`; both arrays are laid out
        // arm by arm, with `sampleCount` consecutive entries per arm.
        let labels: [String] = arms.flatMap { arm in
            Array(repeating: String(arm), count: sampleCount)
        }
        let rewards: [Double] = arms.flatMap { arm in
            (0..<sampleCount).map { _ in applying(arm).utility(for: .player(0)) }
        }

        let violin = Violin(
            y: rewards,
            x: labels,
            points: .off,
            meanLine: .init(visible: true),
            xAxis: .init(title: "Action/Arm"),
            yAxis: .init(title: "Reward Distribution", range: [-5, 5])
        )

        // The title reports the configuration of the game this state belongs to.
        let title = "Multi-armed Bandit (stationary: \(game.stationary), armCount: \(game.armCount))"

        return Figure(
            data: [violin],
            layout: Layout(title: .init(text: title), height: 400)
        )
    }
}

In [26]:
/// Plays `game` from its initial state for a fixed number of steps, choosing every
/// action by sampling from the distribution that `policy` assigns to the current state.
///
/// - Parameters:
///   - game: Game that supplies the initial state.
///   - policy: Policy queried for action probabilities at every step.
///   - steps: Number of actions to sample and apply. Must not be negative.
/// - Returns: The state reached after applying `steps` sampled actions
///   (the initial state when `steps` is zero).
/// - Precondition: `steps >= 0`, and the policy must yield a sampleable action at
///   every visited state.
func play<Policy: StochasticPolicy>(_ game: Policy.Game, actingWith policy: Policy,
                                                for steps: Int) -> Policy.Game.State {
    // `0..<steps` would trap with an opaque range error; fail with a clear message instead.
    precondition(steps >= 0, "Number of steps must not be negative, got \(steps)")

    var currentState = game.initialState

    for step in 0..<steps {
        assert(!currentState.legalActions.isEmpty,
               "No legal actions available at step \(step)")

        let actionProbabilities = policy.actionProbabilities(forState: currentState)
        assert(!actionProbabilities.isEmpty,
               "Policy returned no action probabilities at step \(step)")

        // Asserts are compiled out of release builds, so the optional result of
        // `sample()` is checked explicitly rather than force-unwrapped.
        guard let sampledAction = actionProbabilities.sample() else {
            preconditionFailure("Policy produced no action to sample at step \(step)")
        }
        currentState = currentState.applying(sampledAction)
    }

    return currentState
}

In [27]:
// Stationary testbed: plot the per-arm reward distributions before and after play,
// so the two figures can be compared.
let bandit = MultiArmedBandit(stationary: true)
// NOTE(review): presumably picks arms uniformly at random — confirm in the package.
let randomPolicy = RandomGambler(bandit)

// Distributions at the initial state.
bandit.initialState.armRewardPlot.display()
// Advance the game by 10,000 sampled actions, then plot the distributions again.
let finalState = play(bandit, actingWith: randomPolicy, for: 10_000)
finalState.armRewardPlot.display()

In [28]:
// Non-stationary testbed: same procedure as for the stationary bandit, but with
// `stationary: false`. The names `bandit`, `randomPolicy` and `finalState` are
// redeclared here and shadow the values from the previous cell.
let bandit = MultiArmedBandit(stationary: false)
// NOTE(review): presumably picks arms uniformly at random — confirm in the package.
let randomPolicy = RandomGambler(bandit)

// Distributions at the initial state.
bandit.initialState.armRewardPlot.display()
// Advance the game by 10,000 sampled actions, then plot the distributions again.
let finalState = play(bandit, actingWith: randomPolicy, for: 10_000)
finalState.armRewardPlot.display()

### Reproduction of Figure 2.2

In [29]:
// Print a default-constructed bandit to inspect its configuration.
let z = MultiArmedBandit()
print(z)

// Print a default-constructed epsilon-greedy gambler for that bandit.
// NOTE(review): the printed defaults show ε: 0.01, whereas the exercise asks for
// ε = 0.1 — pass the value explicitly when running the actual experiment.
let a = EpsilonGreedyGambler(z)
print(a)

MultiArmedBandit(armCount: 10, stationary: true)
EpsilonGreedyGambler(game: MultiArmedBandit(armCount: 10, stationary: true), ε: 0.01, α: 0.1, Q: [8: 1.3585316062214604e-06, 0: 5.489261028857013e-06, 1: 2.07905930935866e-06, 9: 3.3179484018958718e-06, 3: 9.323453424676752e-06, 5: 9.362329467168794e-06, 4: 8.469755118549176e-06, 6: 7.5910474145918965e-06, 2: 3.7357204613571863e-06, 7: 7.650532576186737e-06])
