03-01-ReinforcementLearning.txt
-----------------------------
Object subclass: #RLGrid
instanceVariableNames: 'content'
classVariableNames: ''
package: 'ReinforcementLearning'
-----------------------------
RLGrid>>initialize
super initialize.
self setSize: 2
-----------------------------
RLGrid>>setSize: anInteger
"Set the grid as a square of size anInteger, containing the character $."
content := (1 to: anInteger) collect: [ :notUsed | Array new: anInteger withAll: $. ] as: Array.
self setExitBottomRight.
-----------------------------
RLGrid>>atPosition: aPoint
"Return the character located at a given position"
^ (content at: aPoint y) at: aPoint x
-----------------------------
RLGrid>>atPosition: aPoint put: aCharacter
"Set the aCharacter (value of a cell map) at a given position"
^ (content at: aPoint y) at: aPoint x put: aCharacter
-----------------------------
RLGrid>>setExitBottomRight
"Set the exit position at the bottom right of the grid"
self atPosition: self extent put: $e
-----------------------------
RLGrid>>extent
"Return a point that represents the extent of the grid"
^ content first size @ content size
-----------------------------
RLGrid>>setMonsters: numberOfMonstersToAdd
"Add the given number of monsters ($m) on random free cells; give up after 5 consecutive failed placements"
| random leftMonsters s pos nbTries |
random := Random seed: 42.
leftMonsters := numberOfMonstersToAdd.
nbTries := 0.
s := self extent.
[ leftMonsters > 0 ] whileTrue: [
pos := (random nextInteger: s x ) @ (random nextInteger: s y).
(self atPosition: pos) = $.
ifTrue: [
nbTries := 0.
self atPosition: pos put: $m.
leftMonsters := leftMonsters - 1 ]
ifFalse: [
nbTries := nbTries + 1.
nbTries > 5 ifTrue: [ ^ self ] ]
]
-----------------------------
RLGrid>>= anotherObject
"Return true if anotherObject is the same map than myself"
anotherObject class == self class ifFalse: [ ^ false ].
^ content = anotherObject content
-----------------------------
RLGrid>>content
"Return the grid content, as an array of array of characters"
^ content
-----------------------------
RLGrid>>hash
"The hash of a grid is the hash of its content"
^ content hash
-----------------------------
RLGrid>>postCopy
"A grid must be properly copied: the rows are copied as well, so a copy shares no cell with the original"
super postCopy.
content := content collect: [ :row | row copy ]
-----------------------------
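A quick playground check, not part of the original listing, that copies are independent: mutating a copy must not affect the original grid (the expected result is given as a comment).

| grid copiedGrid |
grid := RLGrid new setSize: 3.
copiedGrid := grid copy.
copiedGrid atPosition: 1 @ 1 put: $m.
grid atPosition: 1 @ 1 "$. : the original is untouched because the rows were copied as well"
-----------------------------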
RLGrid>>visualize
| canvas shapes |
canvas := RSCanvas new.
shapes := RSBox models: (self content flatCollect: #yourself) forEach: [ :s :o |
s size: 20.
o = $. ifTrue: [ s color: Color veryVeryLightGray ].
o = $m ifTrue: [ s color: Color lightRed ].
o = $e ifTrue: [ s color: Color lightYellow ].
].
canvas addAll: shapes.
RSGridLayout new gapSize: 0; lineItemsCount: (self extent x); on: shapes.
shapes translateTopLeftTo: 0 @ 0.
^ canvas
-----------------------------
RLGrid>>inspectorVisualization
<inspectorPresentationOrder: 90 title: 'Visualization'>
| canvas |
canvas := self visualize.
canvas @ RSCanvasController.
^ SpRoassal3InspectorPresenter new
canvas: canvas;
yourself
-----------------------------
RLGrid new setSize: 5; setMonsters: 5
-----------------------------
Object subclass: #RLState
instanceVariableNames: 'grid position'
classVariableNames: ''
package: 'ReinforcementLearning'
-----------------------------
RLState>>initialize
super initialize.
position := 1 @ 1
-----------------------------
RLState>>grid: aGrid
"Set the grid associated to the state"
grid := aGrid
-----------------------------
RLState>>grid
"Return the grid associated to the state"
^ grid
-----------------------------
RLState>>position: aPoint
"Set the knight position"
position := aPoint
-----------------------------
RLState>>position
"Return the knight position"
^ position
-----------------------------
RLState>>= anotherState
"Two states are identical if (i) they have the same class, (ii) the same grid, and (iii) the same position of the knight"
anotherState class == self class ifFalse: [ ^ false ].
^ grid = anotherState grid and: [ position = anotherState position ]
-----------------------------
RLState>>hash
"The hash of a state is a combination of the hash of the grid with the hash of the position"
^ grid hash bitXor: position hash
-----------------------------
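States will be used as keys of the Q-table dictionary, so equality and hashing have to be structural: two distinct state objects with equal grids and equal knight positions must be interchangeable. A small playground check, not from the original listing:

| grid s1 s2 |
grid := RLGrid new setSize: 3.
s1 := RLState new grid: grid.
s2 := RLState new grid: grid copy.
s1 = s2 "true : same grid content and same knight position"
-----------------------------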
RLState>>printOn: str
"Give a string representation of a state"
str nextPutAll: 'S<'.
str nextPutAll: self hash asString.
str nextPutAll: '>'.
-----------------------------
RLState>>visualize
"Visualize the grid and the knight position"
| c knightShape |
c := grid visualize.
knightShape := RSCircle new size: 15; color: Color blue lighter lighter.
c add: knightShape.
knightShape translateTo: self position - (0.5 @ 0.5) * 20.
^ c
-----------------------------
RLState>>inspectorVisualization
<inspectorPresentationOrder: 90 title: 'Visualization'>
| canvas |
canvas := self visualize.
canvas @ RSCanvasController.
^ SpRoassal3InspectorPresenter new
canvas: canvas;
yourself
-----------------------------
RLState new
grid: (RLGrid new setSize: 5; setMonsters: 5)
-----------------------------
Object subclass: #RL
instanceVariableNames: 'startState r numberOfEpisodes maxEpisodeSteps minAlpha gamma epsilon qTable rewards path stateConnections'
classVariableNames: ''
package: 'ReinforcementLearning'
-----------------------------
RL>>initialize
super initialize.
r := Random seed: 42.
numberOfEpisodes := 20.
maxEpisodeSteps := 100.
minAlpha := 0.02.
gamma := 1.0.
epsilon := 0.2.
qTable := Dictionary new.
rewards := OrderedCollection new.
path := OrderedCollection new.
stateConnections := OrderedCollection new.
-----------------------------
RL>>numberOfEpisodes: aNumber
"Set the number of exploration we need to perform"
numberOfEpisodes := aNumber
-----------------------------
RL>>epsilon: aFloat
"Set the probability to explore the world. The argument is between 0.0 and 1.0. A value close to 0.0 favors choosing an action that we know is a good one (thus reducing the exploration of the grid). A value close to 1.0 favors the world exploration instead."
epsilon := aFloat
-----------------------------
RL>>maxEpisodeSteps: anInteger
"Indicate how long an exploration can be"
maxEpisodeSteps := anInteger
-----------------------------
RL>>act: aState action: action
"Produce a new tuple {stable . reward . isDone}"
| reward newGrid p gridItem isDone newState |
p := self moveKnight: aState action: action.
gridItem := aState grid atPosition: p.
newGrid := aState grid copy.
gridItem = $m ifTrue: [ reward := -100. isDone := true ].
gridItem = $e ifTrue: [ reward := 1000. isDone := true ].
('me' includes: gridItem)
ifFalse: [ reward := -1. isDone := false ].
newState := RLState new grid: newGrid; position: p.
stateConnections add: aState -> newState.
^ { newState . reward . isDone }
-----------------------------
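The reward scheme can be checked directly from a playground (this snippet is not part of the original listing): stepping on a free cell costs -1, reaching a monster yields -100 and ends the episode, and reaching the exit yields 1000.

| rl state tuple |
rl := RL new.
state := RLState new grid: (RLGrid new setSize: 2).
tuple := rl act: state action: 2. "move the knight down, onto a free cell"
tuple second "-1 : an ordinary move is slightly penalized"
tuple third "false : the episode is not over"
-----------------------------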
RL>>actions
"Return the considered actions: 1 = up, 2 = down, 3 = left, 4 = right (see moveKnight:action:)"
^ #(1 2 3 4)
-----------------------------
RL>>chooseAction: state
"Choose an action for a given state"
^ r next < epsilon
ifTrue: [ self actions atRandom: r ]
ifFalse: [
"Return the best action"
(self qState: state) argmax ]
-----------------------------
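chooseAction: implements an epsilon-greedy policy: with probability epsilon a random action is picked, otherwise the action with the highest known Q-value is returned. Setting epsilon to 0 makes the choice fully greedy, as this playground snippet illustrates (not from the original listing; it assumes argmax breaks ties by returning the first index):

| rl state |
rl := RL new.
rl epsilon: 0. "never explore, always exploit"
state := RLState new grid: RLGrid new.
rl chooseAction: state "1 : all Q-values are still 0, so the first action wins"
-----------------------------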
RL>>moveKnight: state action: action
"Return the new position of the knight, as a point, after applying the given action. The action is a number from 1 to 4 (up, down, left, right); the result is clamped to the grid bounds"
| delta |
delta := { 0@ -1 . 0@1 . -1@0 . 1@0 }
at: action ifAbsent: [ self error: 'Unknown action' ].
^ ((state position + delta) min: state grid extent) max: 1 @ 1
-----------------------------
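Moves that would leave the grid are clamped to the border rather than rejected, so every action is always legal. For example (a playground snippet, not from the original listing):

| rl state |
rl := RL new.
state := RLState new grid: (RLGrid new setSize: 3).
rl moveKnight: state action: 1 "1 @ 1 : moving up from the top-left corner keeps the knight in place"
-----------------------------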
RL>>play
"Replay the learned policy from the start state and return the successive positions of the knight"
| currentState isDone actions tuple maxNumberOfSteps numberOfSteps |
currentState := startState.
isDone := false.
path := OrderedCollection new.
path add: currentState position.
maxNumberOfSteps := 100.
numberOfSteps := 0.
[ isDone not and: [ numberOfSteps < maxNumberOfSteps ] ] whileTrue: [
actions := self qState: currentState.
tuple := self act: currentState action: actions argmax.
currentState := tuple first.
path add: currentState position.
isDone := tuple third.
numberOfSteps := numberOfSteps + 1.
].
^ path asArray
-----------------------------
RL>>qState: state
"Return the rewards associated to a state. If the state is not in the qTable, we create it"
qTable at: state ifAbsentPut: [ Array new: self actions size withAll: 0 ].
^ qTable at: state
-----------------------------
RL>>qState: state action: action
"For a particular state, return the reward of an action. If the state is not in the qTable, we create an entry for it"
qTable at: state ifAbsentPut: [ Array new: self actions size withAll: 0 ].
^ (qTable at: state) at: action
-----------------------------
RL>>run
"This method is the core of the Q-Learning algorithm"
| alphas currentState totalReward alpha isDone currentAction tuple nextState currentReward |
alphas := (minAlpha to: 1.0 count: numberOfEpisodes) reversed.
rewards := OrderedCollection new.
1 to: numberOfEpisodes do: [ :e |
currentState := startState.
totalReward := 0.
alpha := alphas at: e.
isDone := false.
maxEpisodeSteps timesRepeat: [
isDone ifFalse: [
currentAction := self chooseAction: currentState.
tuple := self act: currentState action: currentAction.
nextState := tuple first.
currentReward := tuple second.
isDone := tuple third.
totalReward := totalReward + currentReward.
"The Bellman equation"
(self qState: currentState) at: currentAction put: (
(self qState: currentState action: currentAction) + (alpha * (currentReward + (gamma * (self qState: nextState) max) - (self qState: currentState action: currentAction)))).
currentState := nextState
]
].
rewards add: totalReward.
].
rewards := rewards asArray.
^ rewards
-----------------------------
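Written as a formula, the update performed in the inner loop above is the standard Q-Learning rule:

$$Q(s, a) \leftarrow Q(s, a) + \alpha \left[ r + \gamma \max_{a'} Q(s', a') - Q(s, a) \right]$$

where s' is the state reached by performing action a in state s, and r is the received reward. Note that alphas is reversed, so the learning rate alpha decays linearly from 1.0 down to minAlpha over the episodes: early episodes overwrite the Q-values aggressively, later ones only refine them.
-----------------------------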
RL>>setInitialGrid: aGrid
"Set the grid used in the initial state"
startState := RLState new grid: aGrid
-----------------------------
RL>>visualizeQTable
| c state values allBoxes sortedAssociations |
c := RSCanvas new.
c add: (RSLabel text: 'State').
c add: (RSLabel text: '^').
c add: (RSLabel text: 'V').
c add: (RSLabel text: '<').
c add: (RSLabel text: '>').
sortedAssociations := qTable associations reverseSortedAs: [ :assoc | assoc value average ].
sortedAssociations do: [ :assoc |
state := RSLabel model: assoc key.
values := RSBox
models: (assoc value collect: [ :v | v round: 2 ])
forEach: [ :s :m | s extent: 40 @ 20 ].
c add: state.
c addAll: values.
].
RSCellLayout new lineItemsCount: 5; gapSize: 1; on: c shapes.
allBoxes := c shapes select: [ :s | s class == RSBox ].
RSNormalizer color
shapes: allBoxes;
from: Color red darker darker; to: Color green darker darker;
normalize.
allBoxes @ RSLabeled middle.
^ c @ RSCanvasController
-----------------------------
RL>>inspectorQTable
<inspectorPresentationOrder: 90 title: 'QTable'>
^ SpRoassal3InspectorPresenter new
canvas: self visualizeQTable;
yourself
-----------------------------
RL>>inspectorQTableContext: aContext
aContext withoutEvaluator
-----------------------------
RL>>visualizeReward
| c plot |
c := RSChart new.
plot := RSLinePlot new.
plot y: rewards.
c addPlot: plot.
c addDecoration: (RSChartTitleDecoration new title: 'Reward evolution'; fontSize: 20).
c xlabel: 'Episode' offset: 0 @ 10.
c ylabel: 'Reward' offset: -20 @ 0.
c addDecoration: (RSHorizontalTick new).
c addDecoration: (RSVerticalTick new).
c build.
^ c canvas
-----------------------------
RL>>inspectorReward
<inspectorPresentationOrder: 90 title: 'Reward'>
^ SpRoassal3InspectorPresenter new
canvas: self visualizeReward;
yourself
-----------------------------
RL>>inspectorRewardContext: aContext
aContext withoutEvaluator
-----------------------------
RL>>visualizeResult
"Replay the learned policy (the method play is invoked) and draw the path followed by the knight on the initial grid"
| c s |
self play.
c := startState visualize.
path do: [ :p |
s := RSCircle new size: 5; color: Color blue lighter lighter.
c add: s.
s translateTo: p - (0.5 @ 0.5) * 20.
].
^ c @ RSCanvasController
-----------------------------
RL>>inspectorResult
<inspectorPresentationOrder: 90 title: 'Result'>
^ SpRoassal3InspectorPresenter new
canvas: self visualizeResult;
yourself
-----------------------------
RL>>inspectorResultContext: aContext
aContext withoutEvaluator
-----------------------------
RL>>visualizeGraph
| s allStates d m |
s := stateConnections asSet asArray.
d := Dictionary new.
s do: [ :assoc |
(d at: assoc key ifAbsentPut: [ OrderedCollection new ]) add: assoc value ].
allStates := qTable keys.
m := RSMondrian new.
m shape circle.
m nodes: allStates.
m line connectToAll: [ :aState | d at: aState ifAbsent: [ #() ] ].
m layout force.
m build.
^ m canvas.
-----------------------------
RL>>inspectorGraph
<inspectorPresentationOrder: 90 title: 'State graph'>
^ SpRoassal3InspectorPresenter new
canvas: self visualizeGraph;
yourself
-----------------------------
RL>>inspectorGraphContext: aContext
aContext withoutEvaluator
-----------------------------
RL>>visualizeWeightedGraph
| s allStates d m |
s := stateConnections asSet asArray.
d := Dictionary new.
s do: [ :assoc |
(d at: assoc key ifAbsentPut: [ OrderedCollection new ]) add: assoc value ].
allStates := qTable keys.
m := RSMondrian new.
m shape circle.
m nodes: allStates.
m line connectToAll: [ :aState | d at: aState ifAbsent: [ #() ] ].
m normalizeSize: [ :aState | (qTable at: aState) average ] from: 5 to: 30.
m normalizeColor: [ :aState | (qTable at: aState) max ] from: Color gray to: Color green.
m layout force.
m build.
^ m canvas.
-----------------------------
RL>>inspectorWeightedGraph
<inspectorPresentationOrder: 90 title: 'Weighted state graph'>
^ SpRoassal3InspectorPresenter new
canvas: self visualizeWeightedGraph;
yourself
-----------------------------
RL>>inspectorWeightedGraphContext: aContext
aContext withoutEvaluator
-----------------------------
rl := RL new.
rl setInitialGrid: RLGrid new.
rl run.
rl
-----------------------------
rl := RL new.
rl setInitialGrid: (RLGrid new setSize: 5; setMonsters: 2).
rl run.
rl
-----------------------------
rl := RL new.
rl setInitialGrid: (RLGrid new setSize: 5; setMonsters: 2).
rl epsilon: 0.01.
rl run.
rl
-----------------------------
rl := RL new.
rl setInitialGrid: (RLGrid new setSize: 5; setMonsters: 2).
rl epsilon: 0.01.
rl numberOfEpisodes: 7.
rl run.
rl
-----------------------------
rl := RL new.
rl setInitialGrid: (RLGrid new setSize: 5; setMonsters: 2).
rl run.
rl