From bb9827845053ff47e3575103a1f51097ace9f396 Mon Sep 17 00:00:00 2001
From: Ignacio Talavera <ignaciotalaveracepeda@gmail.com>
Date: Sun, 31 Jul 2022 17:55:13 +0200
Subject: [PATCH] docs: spacy `DocBin` cookbook (#1642)

(cherry picked from commit 625d1532f041de9b1901b795bc1c6d32599ea62c)

- docs: Improve cookbook spacy docbin (#1691)
(cherry picked from commit 3f753235946be23ce288c469cf9960a7266fe30c)
---
 docs/guides/cookbook.ipynb | 44 +++++++++++++++++++++++++++++++++-----
 1 file changed, 39 insertions(+), 5 deletions(-)

diff --git a/docs/guides/cookbook.ipynb b/docs/guides/cookbook.ipynb
index 4b75332aa7..1a4e8c68b9 100644
--- a/docs/guides/cookbook.ipynb
+++ b/docs/guides/cookbook.ipynb
@@ -760,6 +760,38 @@
     "rb.log(records=record, name=\"lesmiserables-pos\")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Train a spaCy model by exporting to DocBin\n",
+    "With the examples above, we have been able to store predictions from spaCy models in Rubrix.\n",
+    "\n",
+    "In order to train models with spaCy, Rubrix provides you with a handy utility to prepare a dataset in the [spaCy DocBin format](https://spacy.io/api/docbin). With the example below, you can export your Rubrix dataset into a DocBin, save it to disk, and then use this file with the `spacy train` [command](https://spacy.io/api/cli#train)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import spacy\n",
+    "import rubrix as rb\n",
+    "\n",
+    "from datasets import load_dataset\n",
+    "\n",
+    "# Load the dataset that will be exported as a DocBin\n",
+    "dataset_raw = load_dataset(\"conll2003\", split=\"train\")\n",
+    "dataset_rubrix = rb.DatasetForTokenClassification.from_datasets(dataset_raw, tags=\"ner_tags\")\n",
+    "\n",
+    "# Load a blank spaCy language model to create the DocBin, as it is faster than loading a full pipeline\n",
+    "nlp = spacy.blank(\"en\")\n",
+    "\n",
+    "# After this line, the file will be stored on disk\n",
+    "dataset_rubrix.prepare_for_training(framework=\"spacy\", lang=nlp).to_disk(\"train.spacy\")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {
@@ -1556,11 +1588,8 @@
   }
  ],
  "metadata": {
-  "interpreter": {
-   "hash": "b709380ea7d1cb2eb4650c0f11ac7e002ec6a534602815725771481b4784238c"
-  },
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "Python 3",
    "language": "python",
    "name": "python3"
   },
@@ -1574,7 +1603,12 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.12"
+   "version": "3.8.5"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "d37ff9f4b2d8cea8fd3fc88a3420f8da5f414e64666a8add97ac41ea297827c4"
+   }
   }
  },
  "nbformat": 4,