From 7fe59d80831f3c8bbe207eadc97e9d6d23f74fc5 Mon Sep 17 00:00:00 2001 From: turnmanh <17703667+turnmanh@users.noreply.github.com> Date: Sun, 24 Mar 2024 14:44:27 +0100 Subject: [PATCH 1/2] added logos of new ci --- notebooks/_static/images/aai-institute-cover.png | 3 +++ notebooks/_static/images/aai-logo.png | 4 ++-- notebooks/_static/images/transferlab-logo-dark.svg | 4 ++-- notebooks/_static/images/transferlab-logo.svg | 4 ++-- 4 files changed, 9 insertions(+), 6 deletions(-) create mode 100644 notebooks/_static/images/aai-institute-cover.png diff --git a/notebooks/_static/images/aai-institute-cover.png b/notebooks/_static/images/aai-institute-cover.png new file mode 100644 index 00000000..250520d0 --- /dev/null +++ b/notebooks/_static/images/aai-institute-cover.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:039dd4e069b5f935e51653f14d695884fb3410e70cf1e29526d54f694d3edb5b +size 1921926 diff --git a/notebooks/_static/images/aai-logo.png b/notebooks/_static/images/aai-logo.png index 369a7f1c..303a30c5 100644 --- a/notebooks/_static/images/aai-logo.png +++ b/notebooks/_static/images/aai-logo.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e59237a483fe238a7cc3c203856f6477263e464726f9f4753aaa242c08f601ec -size 9913 +oid sha256:42c1f0640aa36d03f808b403eaba89195c6477784f6581d49005a3bd534eec86 +size 29915 diff --git a/notebooks/_static/images/transferlab-logo-dark.svg b/notebooks/_static/images/transferlab-logo-dark.svg index fe7e070c..3231a647 100644 --- a/notebooks/_static/images/transferlab-logo-dark.svg +++ b/notebooks/_static/images/transferlab-logo-dark.svg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:835d25242be2bdba03671c19b3ff8ca5471508d3df91bb3f3f048104aba83f91 -size 8527 +oid sha256:6de02f0d27af4d8c8284a2cf3fef3482a0f85acc7e6446dab4594b8605cecf2e +size 8422 diff --git a/notebooks/_static/images/transferlab-logo.svg b/notebooks/_static/images/transferlab-logo.svg index d9f3b928..3231a647 100644 --- a/notebooks/_static/images/transferlab-logo.svg +++ b/notebooks/_static/images/transferlab-logo.svg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f4b1b1ecad1c84bbaa48f53d9e6b3b2d5f0f75acbed4951a316ef28276f54161 -size 3972 +oid sha256:6de02f0d27af4d8c8284a2cf3fef3482a0f85acc7e6446dab4594b8605cecf2e +size 8422 From 001e2a85ac38d0bdb8a3b66bfd2f492fa80213e3 Mon Sep 17 00:00:00 2001 From: turnmanh <17703667+turnmanh@users.noreply.github.com> Date: Sun, 24 Mar 2024 14:51:55 +0100 Subject: [PATCH 2/2] adapted logos to institute ci --- notebooks/nb_20_IntroductionToControl.ipynb | 4 +- notebooks/nb_30_ControlAndPlanning.ipynb | 4 +- .../nb_40_RecentDevelopmentsInControl.ipynb | 4 +- notebooks/nb_50_IntroRL.ipynb | 4 +- notebooks/nb_70_TrainingRLAgents.ipynb | 4 +- notebooks/nb_75_EnvironmentEngineering.ipynb | 42 ++-- notebooks/nb_90_IntroOfflineRL.ipynb | 66 ++--- notebooks/nb_91_RLOpenSourceDatasets.ipynb | 34 +-- notebooks/nb_92_MinariOverview.ipynb | 58 ++--- notebooks/nb_93_imitation_learning.ipynb | 146 +++++------ notebooks/nb_94_Offline_RL_part_I.ipynb | 40 +-- ...olicy_distributional_shift_exercises.ipynb | 232 ++++++++--------- notebooks/nb_96_Offline_RL_part_II.ipynb | 114 ++++----- ...b_97_Offline_rl_algorithms_exercises.ipynb | 234 +++++++++--------- 14 files changed, 493 insertions(+), 493 deletions(-) diff --git a/notebooks/nb_20_IntroductionToControl.ipynb b/notebooks/nb_20_IntroductionToControl.ipynb index aa052edd..9751412e 100644 --- 
a/notebooks/nb_20_IntroductionToControl.ipynb +++ b/notebooks/nb_20_IntroductionToControl.ipynb @@ -106,7 +106,7 @@ } }, "source": [ - "\"Snow\"\n", + "\"Snow\"\n", "
Introduction to Control Theory
" ] }, @@ -4811,7 +4811,7 @@ ] }, "source": [ - "\"Snow\"\n", + "\"Snow\"\n", "
Thank you for your attention!
" ] }, diff --git a/notebooks/nb_30_ControlAndPlanning.ipynb b/notebooks/nb_30_ControlAndPlanning.ipynb index 8e024c04..cac76603 100644 --- a/notebooks/nb_30_ControlAndPlanning.ipynb +++ b/notebooks/nb_30_ControlAndPlanning.ipynb @@ -114,7 +114,7 @@ } }, "source": [ - "\"presentation\n", + "\"presentation\n", "
Control and Planning
" ] }, @@ -3825,7 +3825,7 @@ ] }, "source": [ - "\"Snow\"\n", + "\"Snow\"\n", "
Thank you for your attention!
" ] }, diff --git a/notebooks/nb_40_RecentDevelopmentsInControl.ipynb b/notebooks/nb_40_RecentDevelopmentsInControl.ipynb index 8f447112..6e6f0608 100644 --- a/notebooks/nb_40_RecentDevelopmentsInControl.ipynb +++ b/notebooks/nb_40_RecentDevelopmentsInControl.ipynb @@ -99,7 +99,7 @@ } }, "source": [ - "\"Snow\"\n", + "\"Snow\"\n", "
Recent Developments in Control Theory
" ] }, @@ -1443,7 +1443,7 @@ ] }, "source": [ - "\"Snow\"\n", + "\"Snow\"\n", "
Thank you for your attention!
" ] }, diff --git a/notebooks/nb_50_IntroRL.ipynb b/notebooks/nb_50_IntroRL.ipynb index 8b540251..ced47d26 100644 --- a/notebooks/nb_50_IntroRL.ipynb +++ b/notebooks/nb_50_IntroRL.ipynb @@ -307,7 +307,7 @@ } }, "source": [ - "\"Snow\"\n", + "\"Snow\"\n", "
Intro to Reinforcement Learning
" ] }, @@ -1340,7 +1340,7 @@ ] }, "source": [ - "\"Snow\"\n", + "\"Snow\"\n", "
Thank you for your attention!
" ] }, diff --git a/notebooks/nb_70_TrainingRLAgents.ipynb b/notebooks/nb_70_TrainingRLAgents.ipynb index 55d6a247..993bab46 100644 --- a/notebooks/nb_70_TrainingRLAgents.ipynb +++ b/notebooks/nb_70_TrainingRLAgents.ipynb @@ -288,7 +288,7 @@ } }, "source": [ - "\"Snow\"\n", + "\"Snow\"\n", "
Training RL Agents
" ] }, @@ -915,7 +915,7 @@ ] }, "source": [ - "\"Snow\"\n", + "\"Snow\"\n", "
Thank you for your attention!
" ] } diff --git a/notebooks/nb_75_EnvironmentEngineering.ipynb b/notebooks/nb_75_EnvironmentEngineering.ipynb index 0fd302ec..bdb99b38 100644 --- a/notebooks/nb_75_EnvironmentEngineering.ipynb +++ b/notebooks/nb_75_EnvironmentEngineering.ipynb @@ -3,6 +3,9 @@ { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "%%capture\n", @@ -11,14 +14,14 @@ "%autoreload 2\n", "%matplotlib inline\n", "%load_ext training_rl" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -192,27 +195,27 @@ ], "source": [ "%presentation_style" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "%%capture\n", "\n", "%set_random_seed 12" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -239,20 +242,17 @@ ], "source": [ "%load_latex_macros" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", - "source": [ - "\"Snow\"\n", - "
Environments and Feature Engineering
" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "\"Snow\"\n", + "
Environments and Feature Engineering
" + ] }, { "cell_type": "markdown", diff --git a/notebooks/nb_90_IntroOfflineRL.ipynb b/notebooks/nb_90_IntroOfflineRL.ipynb index bb016195..05bfea39 100644 --- a/notebooks/nb_90_IntroOfflineRL.ipynb +++ b/notebooks/nb_90_IntroOfflineRL.ipynb @@ -3,6 +3,13 @@ { "cell_type": "code", "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2023-12-16T17:57:00.726151Z", + "start_time": "2023-12-16T17:56:59.841417Z" + }, + "collapsed": false + }, "outputs": [], "source": [ "%%capture\n", @@ -11,61 +18,54 @@ "%matplotlib inline\n", "%load_ext training_rl\n", "%set_random_seed 12" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "start_time": "2023-12-16T17:56:59.841417Z", - "end_time": "2023-12-16T17:57:00.726151Z" - } - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2023-12-16T17:57:02.243879Z", + "start_time": "2023-12-16T17:57:02.213509Z" + }, + "collapsed": false + }, "outputs": [], "source": [ "%presentation_style" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "start_time": "2023-12-16T17:57:02.213509Z", - "end_time": "2023-12-16T17:57:02.243879Z" - } - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2023-12-16T17:57:03.983204Z", + "start_time": "2023-12-16T17:57:03.965114Z" + }, + "collapsed": false + }, "outputs": [], "source": [ "%load_latex_macros" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "start_time": "2023-12-16T17:57:03.965114Z", - "end_time": "2023-12-16T17:57:03.983204Z" - } - } + ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\"Snow\"\n", + "\"Snow\"\n", "
Introduction to Offline Reinforcement Learning
" ] }, { "cell_type": "markdown", - "source": [ - "# Introduction to Offline Reinforcement Learning" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "# Introduction to Offline Reinforcement Learning" + ] }, { "cell_type": "markdown", @@ -168,16 +168,16 @@ }, { "cell_type": "markdown", + "metadata": { + "collapsed": false + }, "source": [ "## Benchmarking\n", "\n", "In most cases, offline RL methods typically train a policy with specific hyperparameters for a fixed number of steps. They then assess their quality by using the policy from the last iteration in an online evaluation. We will follow this approach in this workshop as it will allow us to focus on understanding concepts rather than fine-tuning optimization.\n", "\n", "However, it's worth noting that alternative methods in the realm of offline RL, known as **Offline Policy Evaluation (OPE)**, exist. These approaches aim to evaluate policy performance directly from the data without any interaction with the environment, which can be a realistic scenario. Although we won't delve into these methods in detail, it's important to acknowledge their existence. **It's worth noting that these methods are still considered somewhat unreliable, and there is ongoing progress in this area**." - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", diff --git a/notebooks/nb_91_RLOpenSourceDatasets.ipynb b/notebooks/nb_91_RLOpenSourceDatasets.ipynb index 052c2ca3..51ba1f6d 100644 --- a/notebooks/nb_91_RLOpenSourceDatasets.ipynb +++ b/notebooks/nb_91_RLOpenSourceDatasets.ipynb @@ -3,6 +3,9 @@ { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "%%capture\n", @@ -11,42 +14,39 @@ "%matplotlib inline\n", "%load_ext training_rl\n", "%set_random_seed 12" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "%presentation_style" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "%load_latex_macros" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", - "source": [ - "\"Snow\"\n", - "
Offline RL open-source datasets
" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "\"Snow\"\n", + "
Offline RL open-source datasets
" + ] }, { "cell_type": "markdown", diff --git a/notebooks/nb_92_MinariOverview.ipynb b/notebooks/nb_92_MinariOverview.ipynb index b7cf9339..2a654999 100644 --- a/notebooks/nb_92_MinariOverview.ipynb +++ b/notebooks/nb_92_MinariOverview.ipynb @@ -3,6 +3,9 @@ { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "%%capture\n", @@ -11,36 +14,36 @@ "%matplotlib inline\n", "%load_ext training_rl\n", "%set_random_seed 12" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "%presentation_style" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "%load_latex_macros" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "import os\n", @@ -80,23 +83,23 @@ "register_grid_envs()\n", "\n", "render_mode = RenderMode.RGB_ARRAY_LIST if os.environ.get(\"DISPLAY\") else None" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", - "source": [ - "\"Snow\"\n", - "
Minari Overview
" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "\"Snow\"\n", + "
Minari Overview
" + ] }, { "cell_type": "markdown", + "metadata": { + "collapsed": false + }, "source": [ "# Minari overview\n", "\n", @@ -128,19 +131,16 @@ "2 - **[Mujoco](https://github.com/google-deepmind/mujoco)**\n", "\n", "MuJoCo, short for Multi-Joint dynamics with Contact, is a versatile physics engine designed to support research and development across various fields, including robotics, bio-mechanics, graphics, animation, machine learning, and more. Originally created by Robotic LLC, it was later acquired by DeepMind and made freely accessible to the public in October 2021. Furthermore, it was open-sourced in May 2022." - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", - "source": [ - "## Exercise I" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "## Exercise I" + ] }, { "cell_type": "markdown", diff --git a/notebooks/nb_93_imitation_learning.ipynb b/notebooks/nb_93_imitation_learning.ipynb index a9060988..a3d08cf5 100644 --- a/notebooks/nb_93_imitation_learning.ipynb +++ b/notebooks/nb_93_imitation_learning.ipynb @@ -3,6 +3,13 @@ { "cell_type": "code", "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2023-12-17T10:13:11.231023Z", + "start_time": "2023-12-17T10:13:10.356212Z" + }, + "collapsed": false + }, "outputs": [], "source": [ "%%capture\n", @@ -11,52 +18,45 @@ "%matplotlib inline\n", "%load_ext training_rl\n", "%set_random_seed 12" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "start_time": "2023-12-17T10:13:10.356212Z", - "end_time": "2023-12-17T10:13:11.231023Z" - } - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2023-12-17T10:13:11.245432Z", + "start_time": "2023-12-17T10:13:11.232473Z" + }, + "collapsed": false + }, "outputs": [], "source": [ "%presentation_style" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "start_time": "2023-12-17T10:13:11.232473Z", - "end_time": "2023-12-17T10:13:11.245432Z" - } - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2023-12-17T10:13:11.546984Z", + "start_time": "2023-12-17T10:13:11.500639Z" + }, + "collapsed": false + }, "outputs": [], "source": [ "%load_latex_macros" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "start_time": "2023-12-17T10:13:11.500639Z", - "end_time": "2023-12-17T10:13:11.546984Z" - } - } + ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { - "start_time": "2023-12-17T10:13:12.424161Z", - "end_time": "2023-12-17T10:13:12.958422Z" + "end_time": "2023-12-17T10:13:12.958422Z", + "start_time": "2023-12-17T10:13:12.424161Z" } }, "outputs": [], @@ -106,58 +106,61 @@ }, { "cell_type": "markdown", - "source": [ - "\"Snow\"\n", - "
Imitation Learning
" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "\"Snow\"\n", + "
Imitation Learning
" + ] }, { "cell_type": "markdown", - "source": [ - "# Imitation Learning" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "# Imitation Learning" + ] }, { "cell_type": "markdown", - "source": [ - "## Introduction" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "## Introduction" + ] }, { "cell_type": "markdown", + "metadata": { + "collapsed": false + }, "source": [ "**Imitation learning is a supervise learning approach focuses on learning policies or behaviors by observing and imitating expert demonstrations**. Instead of learning from trial and error, imitation learning leverages existing expert knowledge to train agents.\n", "\n", "This makes these algorithms appealing as **you don't need to create a reward function for your task** like in situations where the manual approach becomes essential because creating a reward function directly is not feasible, such as when training a self-driving vehicle." - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", + "metadata": { + "collapsed": false + }, "source": [ "The easiest imitation learning algorithm is call BC (Behavioral Cloning) and is just supervised learning on the collected expert data, i.e.:\n", "\n", "$$ D = \\{(s_0, a_0), (s_1, a_1), \\ldots, (s_T, a_T^o)\\} \\quad \\tag{Dataset} $$\n", "\n", "$$ L_{BC}(\\theta) = \\frac{1}{2} \\left(\\pi_\\theta(s_t) - a_t\\right)^2 \\tag{Cost function}$$" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", + "metadata": { + "collapsed": false + }, "source": [ "There are improve versions of BC like DAgger (Dataset Aggregation) where after BC the policy is being rollout and if new states appear a new feedback is to ask the human expert. This could produce a huge improvement, although it could be quite expensive.\n", "\n", @@ -166,13 +169,13 @@ "**pros**: If you have expert dataset, and you are not worry about safety (i.e. unexpected policy behavior in unknown states) this could be a fast approach.\n", "\n", "**cons**: In general we don't have access to expert data so this is one of the main issues, but even if we have we will have problems related with distributional shift between our clone policy and the provided dataset. We will see this in a moment in an exercise. Also, many of the properties of the Minari datasets (see exercise notebook) that could appear in reality cannot be handled with simple imitation learning approaches, like for instance the stitching property." - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", + "metadata": { + "collapsed": false + }, "source": [ "There are other interesting methods that combine imitation learning and the offline RL methods we will introduce later. Typically, they involve two steps:\n", "\n", @@ -181,10 +184,7 @@ "2 - Applying offline RL for planning.\n", "\n", "In the first step, they use more sophisticated techniques for cloning, such as Transformers to generate new trajectories or normalizing flows to fit the state-action data distribution." 
- ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", @@ -214,8 +214,8 @@ "execution_count": null, "metadata": { "ExecuteTime": { - "start_time": "2023-12-17T10:13:16.912486Z", - "end_time": "2023-12-17T10:13:16.944298Z" + "end_time": "2023-12-17T10:13:16.944298Z", + "start_time": "2023-12-17T10:13:16.912486Z" } }, "outputs": [], @@ -255,8 +255,8 @@ "execution_count": null, "metadata": { "ExecuteTime": { - "start_time": "2023-12-17T10:13:18.622404Z", - "end_time": "2023-12-17T10:13:19.334538Z" + "end_time": "2023-12-17T10:13:19.334538Z", + "start_time": "2023-12-17T10:13:18.622404Z" } }, "outputs": [], @@ -296,8 +296,8 @@ "execution_count": null, "metadata": { "ExecuteTime": { - "start_time": "2023-12-17T10:13:21.872463Z", - "end_time": "2023-12-17T10:13:23.245110Z" + "end_time": "2023-12-17T10:13:23.245110Z", + "start_time": "2023-12-17T10:13:21.872463Z" } }, "outputs": [], @@ -346,8 +346,8 @@ "execution_count": null, "metadata": { "ExecuteTime": { - "start_time": "2023-12-17T10:10:20.937274Z", - "end_time": "2023-12-17T10:10:59.186992Z" + "end_time": "2023-12-17T10:10:59.186992Z", + "start_time": "2023-12-17T10:10:20.937274Z" } }, "outputs": [], @@ -394,6 +394,9 @@ { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "offpolicy_rendering(\n", @@ -403,10 +406,7 @@ " env_2d_grid_initial_config=env_2D_grid_initial_config,\n", " num_frames=1000,\n", ")" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", @@ -603,6 +603,9 @@ }, { "cell_type": "markdown", + "metadata": { + "collapsed": false + }, "source": [ "### References\n", "\n", @@ -611,17 +614,14 @@ "[Janner et al. 2021 - Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039)\n", "\n", "[Prudencio et al. 2023 - A Survey on Offline Reinforcement Learning: Taxonomy, Review, and Open Problems ](https://arxiv.org/pdf/2203.01387.pdf)" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", - "source": [], "metadata": { "collapsed": false - } + }, + "source": [] } ], "metadata": { diff --git a/notebooks/nb_94_Offline_RL_part_I.ipynb b/notebooks/nb_94_Offline_RL_part_I.ipynb index febbc0f3..cb74271d 100644 --- a/notebooks/nb_94_Offline_RL_part_I.ipynb +++ b/notebooks/nb_94_Offline_RL_part_I.ipynb @@ -3,6 +3,9 @@ { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "%%capture\n", @@ -11,42 +14,39 @@ "%matplotlib inline\n", "%load_ext training_rl\n", "%set_random_seed 12" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "%presentation_style" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "%load_latex_macros" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", - "source": [ - "\"Snow\"\n", - "
Offline RL Part I
" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "\"Snow\"\n", + "
Offline RL Part I
" + ] }, { "cell_type": "markdown", @@ -159,11 +159,11 @@ { "cell_type": "code", "execution_count": null, - "outputs": [], - "source": [], "metadata": { "collapsed": false - } + }, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/notebooks/nb_95_Offpolicy_distributional_shift_exercises.ipynb b/notebooks/nb_95_Offpolicy_distributional_shift_exercises.ipynb index fb5de669..0dc32573 100644 --- a/notebooks/nb_95_Offpolicy_distributional_shift_exercises.ipynb +++ b/notebooks/nb_95_Offpolicy_distributional_shift_exercises.ipynb @@ -3,6 +3,9 @@ { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "%%capture\n", @@ -11,32 +14,29 @@ "%matplotlib inline\n", "%load_ext training_rl\n", "%set_random_seed 12" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "%presentation_style" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "%load_latex_macros" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", @@ -89,31 +89,31 @@ }, { "cell_type": "markdown", - "source": [ - "\"Snow\"\n", - "
Offline RL distributional shift exercises
" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "\"Snow\"\n", + "
Offline RL distributional shift exercises
" + ] }, { "cell_type": "markdown", - "source": [ - "# Off-Policy Distributional Shift" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "# Off-Policy Distributional Shift" + ] }, { "cell_type": "markdown", - "source": [ - "## Distributional Shift I:" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "## Distributional Shift I:" + ] }, { "cell_type": "markdown", @@ -488,15 +488,18 @@ }, { "cell_type": "markdown", - "source": [ - "## Distributional Shift II:" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "## Distributional Shift II:" + ] }, { "cell_type": "markdown", + "metadata": { + "collapsed": false + }, "source": [ "## Exercise I\n", "\n", @@ -515,23 +518,23 @@ "In this example we will use again as off-policy RL algorithm, the Deep Q-Network (DQN) algorithm.\n", "\n", "**Let's setup our configuration and create the environment**" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", - "source": [ - "### Environment" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "### Environment" + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "ENV_NAME = CustomEnv.Grid_2D_8x8_discrete\n", @@ -549,23 +552,23 @@ "\n", "env = InitialConfigCustom2DGridEnvWrapper(gym.make(ENV_NAME, render_mode=render_mode), env_config=env_2D_grid_initial_config)\n", "snapshot_env(env)" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", - "source": [ - "### Configure the two datasets" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "### Configure the two datasets" + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "DATA_SET_IDENTIFIER_I = \"_downwards_\"\n", @@ -575,23 +578,23 @@ "DATA_SET_IDENTIFIER_II = \"_optimal_\"\n", "BEHAVIOR_POLICY_II = BehaviorPolicyType.behavior_8x8_deterministic_4_0_to_7_7\n", "NUM_STEPS_II = 1000" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", - "source": [ - "### Create combined Minari dataset" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "### Create combined Minari dataset" + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "config_combined_data = create_combined_minari_dataset(\n", @@ -602,23 +605,23 @@ " combined_dataset_identifier = \"_stiching\",\n", " env_2d_grid_initial_config = env_2D_grid_initial_config,\n", ")" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", - "source": [ - "### Rendering behavioral policy" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "### Rendering behavioral policy" + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "# Suboptimal policy\n", @@ -630,14 +633,14 @@ " env_2d_grid_initial_config=env_2D_grid_initial_config,\n", " num_frames=1000,\n", ")" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "# Expert policy\n", @@ -648,23 +651,23 @@ " env_2d_grid_initial_config=env_2D_grid_initial_config,\n", " num_frames=1000,\n", ")" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", - "source": [ - "### State-action distribution" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "### State-action 
distribution" + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "name_combined_dataset = config_combined_data.data_set_name\n", @@ -678,23 +681,23 @@ "state_action_histogram(state_action_count_data, title=\"State-Action data distribution\", inset_pos_xy=(-0.1, -0.012))\n", "\n", "snapshot_env(env)" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", - "source": [ - "### Policy to train" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "### Policy to train" + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "POLICY_NAME = PolicyName.dqn\n", @@ -707,23 +710,23 @@ " render_mode=render_mode,\n", " device=\"cpu\"\n", ")" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", - "source": [ - "### Training" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "### Training" + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "# Run the training\n", @@ -744,23 +747,23 @@ " number_test_envs=NUMBER_TEST_ENVS,\n", " restore_training=False,\n", ")" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", - "source": [ - "### Restore policy" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "### Restore policy" + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "POLICY_FILE = \"policy_best_reward.pth\"\n", @@ -772,23 +775,23 @@ "log_name = os.path.join(name_expert_data, POLICY_NAME)\n", "log_path = get_trained_policy_path(log_name)\n", "policy.load_state_dict(torch.load(os.path.join(log_path, POLICY_FILE), map_location=\"cpu\"))" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", - "source": [ - "### Let's visualize the policy" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "### Let's visualize the policy" + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "offpolicy_rendering(\n", @@ -799,32 +802,29 @@ " num_frames=1000,\n", " imitation_policy_sampling=False\n", ")" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", + "metadata": { + "collapsed": false + }, "source": [ "### Questions:\n", "\n", "1 - What do you notice? What happens if you increase the expert data? Is it better?\n", "\n", "2 - Try again with the BCQ algorithm." 
- ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": null, - "outputs": [], - "source": [], "metadata": { "collapsed": false - } + }, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/notebooks/nb_96_Offline_RL_part_II.ipynb b/notebooks/nb_96_Offline_RL_part_II.ipynb index 287e461a..752f4dc1 100644 --- a/notebooks/nb_96_Offline_RL_part_II.ipynb +++ b/notebooks/nb_96_Offline_RL_part_II.ipynb @@ -3,6 +3,9 @@ { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "%%capture\n", @@ -11,42 +14,39 @@ "%matplotlib inline\n", "%load_ext training_rl\n", "%set_random_seed 12" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "%presentation_style" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "%load_latex_macros" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", - "source": [ - "\"Snow\"\n", - "
Addressing distributional shift
" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "\"Snow\"\n", + "
Addressing distributional shift
" + ] }, { "cell_type": "markdown", @@ -57,12 +57,12 @@ }, { "cell_type": "markdown", - "source": [ - "## Overview" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "## Overview" + ] }, { "cell_type": "markdown", @@ -251,24 +251,27 @@ }, { "cell_type": "markdown", - "source": [ - "## Short review of some popular offline RL algorithms" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "## Short review of some popular offline RL algorithms" + ] }, { "cell_type": "markdown", - "source": [ - "### Introduction" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "### Introduction" + ] }, { "cell_type": "markdown", + "metadata": { + "collapsed": false + }, "source": [ "In this notebook, we will explore several key algorithms that aim to address distributional shift issues within offline reinforcement learning. It's worth noting that the field of offline RL is evolving rapidly, and this list is by no means exhaustive. Many of the concepts and strategies employed by these algorithms find applications and improvements in various other approaches.\n", "\n", @@ -292,22 +295,22 @@ "$$\n", "\n", "So the main idea is to modify the Evaluation/Improvement steps to improve the distributional shift problems." - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", - "source": [ - "### Batch Constrained deep Q-learning (BCQ) algorithm" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "### Batch Constrained deep Q-learning (BCQ) algorithm" + ] }, { "cell_type": "markdown", + "metadata": { + "collapsed": false + }, "source": [ "BCQ algorithm tries to solve the problem of distributional shift, and in particular the issues mentioned before during the Q-value evaluation process, i.e.:\n", "\n", @@ -345,22 +348,22 @@ "could be important for the task to be solved.\n", "\n", "**cons**: As BCQ generated action from a VAE, if the dataset used to train it underrepresents some important actions it could be that the VAE is not able to generate meaningful actions around that state and so the discovery of new or unconventional actions could be hard. This is one of the limitation of constrained policy approaches!" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", - "source": [ - "### Conservative Q-Learning (CQL) algorithm" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "### Conservative Q-Learning (CQL) algorithm" + ] }, { "cell_type": "markdown", + "metadata": { + "collapsed": false + }, "source": [ "CQL follows a pessimistic approach by considering a lower bound of the Q-value. In the paper they show that the solution of:\n", "\n", @@ -373,28 +376,25 @@ "CQL Focuses on **conservative value estimation** to provide lower bounds on the expected return of a policy. Aims to reduce overestimation bias and ensure that the policy remains within a safe region of the state-action space. Achieves safe exploration by constructing action sets that cover a broader range of state-action pairs. Well-suited for scenarios where safety is a top priority, as it **reduces the risk of catastrophic actions**.\n", "\n", "Note that BCQ could be better to discover novel actions and to use the collected data more efficiently but may not guarantee complete safety!." 
- ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", - "source": [ - "### IMPLICIT Q-LEARNING (IQL):" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "### IMPLICIT Q-LEARNING (IQL):" + ] }, { "cell_type": "markdown", - "source": [ - "In this case another interesting lower bound to the Q-value is introduced to make it more pessimistic as in point 4. See [paper](https://openreview.net/pdf?id=68n2s9ZJWF8) for more details." - ], "metadata": { "collapsed": false - } + }, + "source": [ + "In this case another interesting lower bound to the Q-value is introduced to make it more pessimistic as in point 4. See [paper](https://openreview.net/pdf?id=68n2s9ZJWF8) for more details." + ] }, { "cell_type": "markdown", diff --git a/notebooks/nb_97_Offline_rl_algorithms_exercises.ipynb b/notebooks/nb_97_Offline_rl_algorithms_exercises.ipynb index 15dd53f7..d95c20d8 100644 --- a/notebooks/nb_97_Offline_rl_algorithms_exercises.ipynb +++ b/notebooks/nb_97_Offline_rl_algorithms_exercises.ipynb @@ -3,6 +3,9 @@ { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "%%capture\n", @@ -11,43 +14,40 @@ "%matplotlib inline\n", "%load_ext training_rl\n", "%set_random_seed 12" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "%presentation_style" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "%load_latex_macros" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "%autoreload" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", @@ -93,31 +93,31 @@ }, { "cell_type": "markdown", - "source": [ - "\"Snow\"\n", - "
Offline RL algorithms exercises
" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "\"Snow\"\n", + "
Offline RL algorithms exercises
" + ] }, { "cell_type": "markdown", - "source": [ - "# Offline RL algorithms exercises" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "# Offline RL algorithms exercises" + ] }, { "cell_type": "markdown", - "source": [ - "## Exercise I" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "## Exercise I" + ] }, { "cell_type": "markdown", @@ -376,38 +376,41 @@ }, { "cell_type": "markdown", - "source": [ - "## Exercise III" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "## Exercise III" + ] }, { "cell_type": "markdown", + "metadata": { + "collapsed": false + }, "source": [ "**In this exercise, we'll evaluate the distributional shift in the CQL and BCQ algorithms and how they deal with it.**\n", "\n", "As mentioned earlier, regularization methods such as CQL are a suitable choice when prioritizing safety in your agent's behavior. However, if your focus is primarily on achieving an optimal solution with fewer constraints on safety, methods like BCQ may be more suitable.\n", "\n", "In this exercise we will start from (0,0) and we will try to reach the target at (4,7) but the target is protected by a wall. We will collect data again from suboptimal policies as shown below (section 1.2)." - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", - "source": [ - "### Environment" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "### Environment" + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "ENV_NAME = CustomEnv.Grid_2D_8x8_discrete\n", @@ -426,23 +429,23 @@ "env = InitialConfigCustom2DGridEnvWrapper(gym.make(ENV_NAME, render_mode=render_mode),\n", " env_config=env_2D_grid_initial_config)\n", "snapshot_env(env)" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", - "source": [ - "### Configure the two datasets" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "### Configure the two datasets" + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "IDENTIFIER_COMBINED_DATASETS = \"_conservative_test\"\n", @@ -458,23 +461,23 @@ "BEHAVIOR_POLICY_II = BehaviorPolicyType.random#behavior_8x8_grid_deterministic_0_0_to_4_7\n", "DATA_SET_IDENTIFIER_II = \"_random\"\n", "NUM_STEPS_II = 8000" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", - "source": [ - "### Create Minari combined dataset" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "### Create Minari combined dataset" + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "config_combined_data = create_combined_minari_dataset(\n", @@ -487,23 +490,23 @@ ")\n", "buffer_data = load_buffer_minari(config_combined_data.data_set_name)\n", "data_size = len(buffer_data)" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", - "source": [ - "### Rendering behavioral policy" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "### Rendering behavioral policy" + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "# Policy I\n", @@ -514,23 +517,23 @@ " env_2d_grid_initial_config=env_2D_grid_initial_config,\n", " num_frames=1000,\n", ")" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", - "source": [ - "### Choose your policy" - ], "metadata": { 
"collapsed": false - } + }, + "source": [ + "### Choose your policy" + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "# The model policy to be trained.\n", @@ -545,23 +548,23 @@ " render_mode=render_mode,\n", " device=\"cpu\",\n", ")" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", - "source": [ - "### Training" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "### Training" + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "NUM_EPOCHS = 20\n", @@ -582,23 +585,23 @@ " step_per_epoch=STEP_PER_EPOCH,\n", " restore_training=False,\n", ")" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", - "source": [ - "### Restore policy" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "### Restore policy" + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "POLICY_FILE = \"policy_best_reward.pth\"\n", @@ -609,23 +612,23 @@ "log_name = os.path.join(name_expert_data, POLICY_NAME)\n", "log_path = get_trained_policy_path(log_name)\n", "policy.load_state_dict(torch.load(os.path.join(log_path, POLICY_FILE), map_location=\"cpu\"))" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", - "source": [ - "### Render trained policy" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "### Render trained policy" + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "offpolicy_rendering(\n", @@ -636,35 +639,35 @@ " num_frames=1000,\n", " imitation_policy_sampling=False\n", ")" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", + "metadata": { + "collapsed": false + }, "source": [ "## Exercise IV\n", "\n", "a) Remove the obstacle. What do you think are going to be the results?\n", "\n", "b) Modify the parameters related to distributional shift in BCQ and CQL, and observe their impact on out-of-distribution behavior." - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", - "source": [ - "## Final remarks" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "## Final remarks" + ] }, { "cell_type": "markdown", + "metadata": { + "collapsed": false + }, "source": [ "Offline RL proves valuable in various scenarios, especially when:\n", "\n", @@ -681,14 +684,14 @@ "... and many more.\n", "\n", "However, if you have access to an environment with abundant data, online Reinforcement Learning (RL) can be a powerful choice due to its potential for exploration and real-time feedback. Nevertheless, the landscape of RL is evolving, and a data-centric approach is gaining prominence, exemplified by vast datasets like X-Embodiment. It's becoming evident that robots trained with diverse data across various scenarios tend to outperform those solely focused on specific tasks. Furthermore, leveraging multitask trained agents for transfer learning can be a valuable strategy for addressing your specific task at hand." 
- ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "# Policy II\n", @@ -699,10 +702,7 @@ " env_2d_grid_initial_config=env_2D_grid_initial_config,\n", " num_frames=1000,\n", ")" - ], - "metadata": { - "collapsed": false - } + ] } ], "metadata": {