From bb9827845053ff47e3575103a1f51097ace9f396 Mon Sep 17 00:00:00 2001 From: Ignacio Talavera Date: Sun, 31 Jul 2022 17:55:13 +0200 Subject: [PATCH] docs: spacy `DocBin` cookbook (#1642) (cherry picked from commit 625d1532f041de9b1901b795bc1c6d32599ea62c) - docs: Improve cookbook spacy docbin (#1691) (cherry picked from commit 3f753235946be23ce288c469cf9960a7266fe30c) --- docs/guides/cookbook.ipynb | 44 +++++++++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 5 deletions(-) diff --git a/docs/guides/cookbook.ipynb b/docs/guides/cookbook.ipynb index 4b75332aa7..1a4e8c68b9 100644 --- a/docs/guides/cookbook.ipynb +++ b/docs/guides/cookbook.ipynb @@ -760,6 +760,38 @@ "rb.log(records=record, name=\"lesmiserables-pos\")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Train a spaCy model by exporting to Docbin\n", + "With the examples above, we have been able to store predictions from spaCy models in Rubrix. \n", + "\n", + "In order to train models with spaCy, Rubrix provides an easy utility to prepare a dataset in the [spaCy Docbin format](https://spacy.io/api/docbin). With the example below, you can export your Rubrix dataset into a Docbin, save it to disk, and then use this file with the `spacy train` [command](https://spacy.io/api/cli#train)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import spacy\n", + "import rubrix as rb\n", + "\n", + "from datasets import load_dataset\n", + "\n", + "# Dataset loading to export it as a Docbin\n", + "dataset_raw = load_dataset(\"conll2003\", split=\"train\")\n", + "dataset_rubrix = rb.DatasetForTokenClassification.from_datasets(dataset_raw, tags=\"ner_tags\")\n", + "\n", + "# Loading an spaCy blank language model to create the Docbin, as it works faster\n", + "nlp = spacy.blank(\"en\")\n", + "\n", + "# After this line, the file will be stored in disk\n", + "dataset_rubrix.prepare_for_training(framework=\"spacy\", lang=nlp).to_disk(\"train.spacy\")" + ] + }, { "cell_type": "markdown", "metadata": { @@ -1556,11 +1588,8 @@ } ], "metadata": { - "interpreter": { - "hash": "b709380ea7d1cb2eb4650c0f11ac7e002ec6a534602815725771481b4784238c" - }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -1574,7 +1603,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.12" + "version": "3.8.5" + }, + "vscode": { + "interpreter": { + "hash": "d37ff9f4b2d8cea8fd3fc88a3420f8da5f414e64666a8add97ac41ea297827c4" + } } }, "nbformat": 4,