# Reinforcement Learning: An Introduction - *Sutton & Barto '18*

[![GitHubBadge]][GitHubLink] [![ColabBadge]][ColabLink]

## Chapter 2 - Multi-armed Bandits

### Exercise 5 - Non-Stationary Problem


 
Design and conduct an experiment to demonstrate the difficulties that sample-average methods have for non-stationary problems. Use a modified version of the 10-armed testbed in which all the $q_*(a)$ start out equal and then take independent random walks (say by adding a normally distributed increment with mean zero and standard deviation 0.01 to all the $q_*(a)$ on each step). Prepare plots like Figure 2.2 for an action-value method using sample averages, incrementally computed, and another action-value method using a constant step-size parameter, $\alpha = 0.1$. Use $\epsilon = 0.1$ and longer runs, say of 10,000 steps.



[GitHubBadge]: https://img.shields.io/badge/|-Edit_on_GitHub-green.svg?logo=github "Edit notebook's source code on GitHub"
[GitHubLink]: https://github.com/vojtamolda/reinforcement-learning-an-introduction/blob/swift/Chapter%202/Exercise%202.5.ipynb

[ColabBadge]: https://colab.research.google.com/assets/colab-badge.svg "Run notebook in Google Colab"
[ColabLink]: https://colab.research.google.com/github/vojtamolda/reinforcement-learning-an-introduction/blob/swift/Chapter%202/Exercise%202.5.ipynb


In [1]:
// Notebook setup cell: installs the SwiftPM dependencies, clears the build log
// and enables inline chart display.

// Install Packages
// The release-mode flag below is left commented out, so the packages are built
// with the default (empty) set of SwiftPM flags.
//%install-swiftpm-flags -c release
%install-location /swift/packages

// `ReinforcementLearning` is taken from the `swift` branch (not a tagged release),
// `Plotly` is resolved from version 0.3.1 upwards.
%install '.package(url: "https://github.com/vojtamolda/reinforcement-learning-an-introduction", .branch("swift"))' ReinforcementLearning
%install '.package(url: "https://github.com/vojtamolda/Plotly.swift", from: "0.3.1")' Plotly

// Clear Output
// Prints the ANSI escape sequence ESC[2J (erase display); intended to wipe the
// installation log emitted by the directives above.
print("\u{001B}[2J")

// Enable Plotly Charts
// NOTE(review): presumably this include supplies the display hooks that the
// `.display()` calls in later cells rely on — confirm.
%include "EnableIPythonDisplay.swift"

Installing packages:
	.package(url: "https://github.com/vojtamolda/reinforcement-learning-an-introduction", .branch("swift"))
		ReinforcementLearning
	.package(url: "https://github.com/vojtamolda/Plotly.swift", from: "0.3.1")
		Plotly
With SwiftPM flags: []
Working in: /tmp/tmpcmr7ckw9/swift-install
Updating https://github.com/vojtamolda/reinforcement-learning-an-introduction
Updating https://github.com/deepmind/open_spiel.git
Fetching https://github.com/vojtamolda/Plotly.swift
Cloning https://github.com/vojtamolda/Plotly.swift
Resolving https://github.com/vojtamolda/Plotly.swift at 0.3.1
[1/51] Compiling Plotly Funnel.swift
[2/51] Compiling Plotly FunnelArea.swift
[3/51] Compiling Plotly Heatmap.swift
[4/51] Compiling Plotly HeatmapGL.swift
[5/51] Compiling Plotly Histogram.swift
[6/51] Compiling Plotly Histogram2D.swift
[7/51] Compiling Plotly Histogram2DContour.swift
[8/51] Compiling Plotly Image.swift
[9/51] Compiling Plotly Indicator.swift
[10/51] Compiling Plotly Isosurface.swift

In [2]:
import OpenSpiel
import Plotly

import MultiArmedBandit

### Bandit Testbed (Stationary and Non-stationary)

In [3]:
// Number of bandit arms; used both to construct the testbeds (`armCount: n`)
// and as the range of arms sampled by `armRewardPlot`.
let n = 10

// Shorthand for `UniformRandomPolicy` specialized to `MultiArmedBandit`.
// Not referenced by any other cell visible in this notebook.
typealias RandomGambler = UniformRandomPolicy<MultiArmedBandit>

In [16]:
extension MultiArmedBandit.State {

    /// Violin plot of the reward distribution of every arm, estimated by sampling.
    ///
    /// Each arm index in `0..<n` is applied to this state 1,000 times; the utility
    /// that each resulting state reports for player 0 is recorded as one sample.
    /// Samples are grouped on the x axis by the arm index rendered as a string.
    var armRewardPlot: Figure {
        let sampleCount = 1_000
        let arms = 0..<n

        // Category label of every sample: each arm's label repeated once per draw.
        let labels: [String] = arms.flatMap { arm in
            Array(repeating: String(arm), count: sampleCount)
        }

        // Sampled rewards, laid out in the same arm-major order as `labels`.
        let rewards: [Double] = arms.flatMap { arm in
            (0..<sampleCount).map { _ in applying(arm).utility(for: .player(0)) }
        }

        // Individual points are hidden; only the density outline and mean line are drawn.
        let distributions = Violin(
            y: rewards,
            x: labels,
            points: .off,
            meanLine: .init(visible: true)
        )

        return Figure(data: [distributions], layout: Layout(height: 400))
    }
}

// extension ClosedRange: Plotable where Bound: Encodable {
//     public func encode(toPlotly encoder: Encoder) throws {
//         try self.encode(to: encoder)
//     }
// }

In [17]:
// Stationary testbed with `n` arms.
let stationary = MultiArmedBandit(stationary: true, armCount: n)

// Sampled reward distribution of each arm in the initial state.
stationary.initialState.armRewardPlot.display()

// Run an episode here
// TODO: placeholder — no episode is run yet, so `finalState` below is simply
// the initial state and the second plot samples the same state as the first.

let finalState = stationary.initialState

finalState.armRewardPlot.display()

In [9]:
// Non-stationary testbed with `n` arms.
let nonStationary = MultiArmedBandit(stationary: false, armCount: n)

// Sampled reward distribution of each arm in the initial state.
nonStationary.initialState.armRewardPlot.display()

// Run an episode here
// TODO: placeholder — no episode is run yet, so `finalState` below is simply
// the initial state and the second plot samples the same state as the first.

// NOTE(review): `finalState` is also declared in the stationary cell; this
// relies on the notebook allowing redeclaration across cells — confirm.
let finalState = nonStationary.initialState

finalState.armRewardPlot.display()

In [None]:

// Scratch cell (its prompt is `In [None]`, i.e. it has no recorded execution).
// Builds a bandit with default arguments and prints it.
let z = MultiArmedBandit()
print(z)


// NOTE(review): `EpsilonGreedyGambler` is not defined in the visible cells;
// presumably an epsilon-greedy agent built for the game `z` — confirm.
let a = EpsilonGreedyGambler(z)
print(a)