docs: spacy DocBin cookbook #1642

Merged
merged 2 commits on Jul 31, 2022
55 changes: 50 additions & 5 deletions docs/guides/cookbook.ipynb
@@ -760,6 +760,49 @@
"rb.log(records=record, name=\"lesmiserables-pos\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Export to spaCy Docbin\n",
"\n",
"With the examples above, we have been able to store data from spaCy models into Rubrix. But Rubrix datasets are not bound to stay always inside Rubrix, and we can export them in the [spaCy Docbin format](https://spacy.io/api/docbin). With the example below, you can export your Rubrix dataset into a Docbin and save it to disk.\n",
"\n",
"To create a Docbin, we need an spaCy language model to create the annotations."
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2022-07-28 13:25:14.367 | WARNING | datasets.builder:download_and_prepare:577 - Reusing dataset conll2003 (/Users/ignacio/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/63f4ebd1bcb7148b1644497336fd74643d4ce70123334431a3c053b7ee4e96ee)\n",
"2022-07-28 13:25:14.372 | WARNING | rubrix.client.datasets:_remove_unsupported_columns:252 - Following columns are not supported by the TokenClassificationRecord model and are ignored: ['pos_tags', 'chunk_tags']\n",
"2022-07-28 13:25:16.324 | WARNING | rubrix.client.datasets:from_datasets:761 - Ignoring row with no tokens.\n"
]
}
],
"source": [
"import spacy\n",
"import rubrix as rb\n",
"\n",
"from datasets import load_dataset\n",
"\n",
"# Dataset loading to export it as a Docbin\n",
"dataset_raw = load_dataset(\"conll2003\", split=\"train\")\n",
"dataset_rubrix = rb.DatasetForTokenClassification.from_datasets(dataset_raw, tags=\"ner_tags\")\n",
"\n",
"# Loading an spaCy blank language model to create the Docbin, as it works faster\n",
"nlp = spacy.blank(\"en\")\n",
"\n",
"# After this line, the file will be stored in disk\n",
"dataset_rubrix.prepare_for_training(framework=\"spacy\", lang=nlp).to_disk(\"docbin\")"
]
},
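{
"cell_type": "markdown",
"metadata": {},
"source": [
"To check the export, we can load the DocBin back from disk and inspect the annotated documents. This is a minimal sketch: it assumes the `docbin` path and the blank `nlp` model from the cell above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from spacy.tokens import DocBin\n",
"\n",
"# Load the DocBin saved above (assumes the \"docbin\" path from the previous cell)\n",
"doc_bin = DocBin().from_disk(\"docbin\")\n",
"\n",
"# Materialize the Docs with the vocab of the blank model used for the export\n",
"docs = list(doc_bin.get_docs(nlp.vocab))\n",
"\n",
"# Inspect the first document and its entity annotations\n",
"print(docs[0].text)\n",
"print([(ent.text, ent.label_) for ent in docs[0].ents])"
]
},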
{
"cell_type": "markdown",
"metadata": {
@@ -1556,11 +1599,8 @@
}
],
"metadata": {
"interpreter": {
"hash": "b709380ea7d1cb2eb4650c0f11ac7e002ec6a534602815725771481b4784238c"
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "Python 3.8.13 ('rubrix')",
"language": "python",
"name": "python3"
},
@@ -1574,7 +1614,12 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
"version": "3.8.13"
},
"vscode": {
"interpreter": {
"hash": "d37ff9f4b2d8cea8fd3fc88a3420f8da5f414e64666a8add97ac41ea297827c4"
}
}
},
"nbformat": 4,