diff --git a/changelog.md b/changelog.md index e8ef6dafa6..7db6414ad3 100644 --- a/changelog.md +++ b/changelog.md @@ -7,6 +7,8 @@ - Handling intra-word linebreak as pollution : adds a pollution pattern that detects intra-word linebreak, which can then be removed in the `get_text` method - Qualifiers can process `Span` or `Doc` : this feature especially makes it easier to nest qualifiers components in other components - New label_weights parameter in eds.span_classifier`, which allows the user to set per label-value loss weights during training +- New `edsnlp.data.converters.MarkupToDocConverter` to convert Markdown or XML-like markup to documents, which is particularly useful to create annotated documents from scratch (e.g., for testing purposes). +- New [Metrics](https://aphp.github.io/edsnlp/master/metrics/) documentation page to document the available metrics and how to use them. ### Fixed diff --git a/docs/assets/images/ner_metrics_example.png b/docs/assets/images/ner_metrics_example.png new file mode 100644 index 0000000000..09d50bd458 Binary files /dev/null and b/docs/assets/images/ner_metrics_example.png differ diff --git a/docs/assets/stylesheets/extra.css b/docs/assets/stylesheets/extra.css index da8fe9706e..90719615e9 100644 --- a/docs/assets/stylesheets/extra.css +++ b/docs/assets/stylesheets/extra.css @@ -189,6 +189,7 @@ a.discrete-link { .sourced-heading > a { font-size: 1rem; align-content: center; + white-space: nowrap; } .doc-param-details .subdoc { @@ -207,3 +208,101 @@ a.discrete-link { margin: 0; font-weight: normal; } + +/*.chip { + position: relative; + box-sizing: content-box; + display: inline-block; + padding: 2px 1px; + margin: 1px 0px 14px; + border-radius: 6px; + font-style: normal; + background: #dae8fc; + border: 1px solid #6c8ebf; + --border-color: #6c8ebf; + white-space: nowrap; +} + +.chip::after { + content: attr(data-chip); + position: absolute; + right: -1px; + background: white; + border: 1px solid var(--border-color); + border-radius: 3px; + line-height: 1; + top: calc(100% - 6px); + box-sizing: border-box; +}*/ + +.chip { + position: relative; + box-sizing: content-box; + display: inline-block; + padding: 0 0 0 2px; + margin: 1px 0px; + border-radius: 4px; + font-style: normal; + background: #dae8fc; + border: 1px solid #6c8ebf; + --border-color: #6c8ebf; + white-space: nowrap; +} + +.chip::after { + content: attr(data-chip); + display: inline-block; + right: -1px; + background: white; + border: 1px solid var(--border-color); + border-radius: 0px 3px 3px 0px; + padding: 0 1px; + margin: -2px -2px -2px 2px; + box-sizing: border-box; +} + +.chip.tp { + background-color: #cef8ce; + border-color: #50b950; + --border-color: #50b950; +} + +.chip-green { + display: inline-block; + padding: 2px 2px; + margin: 1px 1px; + border-radius: 6px; + font-style: normal; + background: #cef8ce; + border: 1px solid #50b950; + white-space: nowrap; +} + +.chip-red { + display: inline-block; + padding: 2px 2px; + margin: 1px 1px; + border-radius: 6px; + font-style: normal; + background: #f8cecc; + border: 1px solid #b95450; + white-space: nowrap +} + +.chip.fp, .chip.fn { + background-color: #f8cecc; + border-color: #b95450; + --border-color: #b95450; +} + +.chip.na { + display: inline-block; + padding: 2px 2px; + margin: 1px 1px; + border-radius: 6px; + font-style: normal; + background: #efefef; + border: 1px solid #bababa; + color: #bababa; + white-space: nowrap; +} diff --git a/docs/assets/termynal/termynal.css b/docs/assets/termynal/termynal.css deleted file mode 100644 
index affc90e34f..0000000000 --- a/docs/assets/termynal/termynal.css +++ /dev/null @@ -1,132 +0,0 @@ -/** - * termynal.js - * - * @author Ines Montani - * @version 0.0.1 - * @license MIT - * - * Modified version from https://github.com/tiangolo/typer - */ - -:root { - --color-bg: #252a33; - --color-text: #eee; - --color-text-subtle: #a2a2a2; -} - -[data-termynal] { - width: auto; - max-width: 100%; - background: var(--color-bg); - color: var(--color-text); - font-size: 18px; - /* font-family: 'Fira Mono', Consolas, Menlo, Monaco, 'Courier New', Courier, monospace; */ - font-family: 'Roboto Mono', 'Fira Mono', Consolas, Menlo, Monaco, 'Courier New', Courier, monospace; - border-radius: 4px; - padding: 75px 45px 35px; - position: relative; - -webkit-box-sizing: border-box; - box-sizing: border-box; -} - -[data-termynal]:before { - content: ''; - position: absolute; - top: 15px; - left: 15px; - display: inline-block; - width: 15px; - height: 15px; - border-radius: 50%; - /* A little hack to display the window buttons in one pseudo element. */ - background: #d9515d; - -webkit-box-shadow: 25px 0 0 #f4c025, 50px 0 0 #3ec930; - box-shadow: 25px 0 0 #f4c025, 50px 0 0 #3ec930; -} - -[data-termynal]:after { - content: 'bash'; - position: absolute; - color: var(--color-text-subtle); - top: 5px; - left: 0; - width: 100%; - text-align: center; -} - -a[data-terminal-control] { - text-align: right; - display: block; - color: #aebbff; -} - -[data-terminal-copy] { - text-align: right; - position: absolute; - top: 5px; - right: 5px; -} - -[data-terminal-copy].md-icon { - color: #aebbff; -} - -[data-ty] { - display: block; - line-height: 2; -} - -[data-ty]:before { - /* Set up defaults and ensure empty lines are displayed. */ - content: ''; - display: inline-block; - vertical-align: middle; -} - -[data-ty="input"]:before, -[data-ty-prompt]:before { - margin-right: 0.75em; - color: var(--color-text-subtle); -} - -[data-ty="input"]:before { - content: '$'; -} - -[data-ty][data-ty-prompt]:before { - content: attr(data-ty-prompt); -} - -[data-ty-cursor]:after { - content: attr(data-ty-cursor); - font-family: monospace; - margin-left: 0.5em; - -webkit-animation: blink 1s infinite; - animation: blink 1s infinite; -} - - -/* Cursor animation */ - -@-webkit-keyframes blink { - 50% { - opacity: 0; - } -} - -@keyframes blink { - 50% { - opacity: 0; - } -} - -/* tooltip */ - -[data-md-state="open"] { - transform: translateY(0); - opacity: 1; - transition: - transform 400ms cubic-bezier(0.075, 0.85, 0.175, 1), - opacity 400ms; - pointer-events: initial; -} diff --git a/docs/assets/termynal/termynal.js b/docs/assets/termynal/termynal.js deleted file mode 100644 index 8a572449ae..0000000000 --- a/docs/assets/termynal/termynal.js +++ /dev/null @@ -1,411 +0,0 @@ -/** - * termynal.js - * A lightweight, modern and extensible animated terminal window, using - * async/await. - * - * @author Ines Montani - * @version 0.0.1 - * @license MIT - * - * Modified version from https://github.com/tiangolo/typer - * - */ - -'use strict'; - -/** Generate a terminal widget. */ -class Termynal { - /** - * Construct the widget's settings. - * @param {(string|Node)=} container - Query selector or container element. - * @param {Object=} options - Custom settings. - * @param {string} options.prefix - Prefix to use for data attributes. - * @param {number} options.startDelay - Delay before animation, in ms. - * @param {number} options.typeDelay - Delay between each typed character, in ms. 
- * @param {number} options.lineDelay - Delay between each line, in ms. - * @param {number} options.progressLength - Number of characters displayed as progress bar. - * @param {string} options.progressChar – Character to use for progress bar, defaults to █. - * @param {number} options.progressPercent - Max percent of progress. - * @param {string} options.cursor – Character to use for cursor, defaults to ▋. - * @param {Object[]} lineData - Dynamically loaded line data objects. - * @param {boolean} options.noInit - Don't initialise the animation. - */ - constructor(container = '#termynal', options = {}) { - this.container = (typeof container === 'string') ? document.querySelector(container) : container; - this.pfx = `data-${options.prefix || 'ty'}`; - this.originalStartDelay = this.startDelay = options.startDelay - || parseFloat(this.container.getAttribute(`${this.pfx}-startDelay`)) || 600; - this.originalTypeDelay = this.typeDelay = options.typeDelay - || parseFloat(this.container.getAttribute(`${this.pfx}-typeDelay`)) || 50; - this.originalLineDelay = this.lineDelay = options.lineDelay - || parseFloat(this.container.getAttribute(`${this.pfx}-lineDelay`)) || 500; - this.progressLength = options.progressLength - || parseFloat(this.container.getAttribute(`${this.pfx}-progressLength`)) || 40; - this.progressChar = options.progressChar - || this.container.getAttribute(`${this.pfx}-progressChar`) || '█'; - this.progressPercent = options.progressPercent - || parseFloat(this.container.getAttribute(`${this.pfx}-progressPercent`)) || 100; - this.cursor = options.cursor - || this.container.getAttribute(`${this.pfx}-cursor`) || '▋'; - this.lineData = this.lineDataToElements(options.lineData || []); - this.loadLines() - if (!options.noInit) this.init() - } - - loadLines() { - // Load all the lines and create the container so that the size is fixed - // Otherwise it would be changing and the user viewport would be constantly - // moving as she/he scrolls - const finish = this.generateFinish() - finish.style.visibility = 'hidden' - this.container.appendChild(finish) - // Appends dynamically loaded lines to existing line elements. - this.lines = [...this.container.querySelectorAll(`[${this.pfx}]`)].concat(this.lineData); - for (let line of this.lines) { - line.style.visibility = 'hidden' - this.container.appendChild(line) - } - const restart = this.generateRestart() - restart.style.visibility = 'hidden' - this.container.appendChild(restart) - this.container.setAttribute('data-termynal', ''); - } - - /** - * Initialise the widget, get lines, clear container and start animation. - */ - init() { - /** - * Calculates width and height of Termynal container. - * If container is empty and lines are dynamically loaded, defaults to browser `auto` or CSS. - */ - const containerStyle = getComputedStyle(this.container); - this.container.style.width = containerStyle.width !== '0px' ? - containerStyle.width : undefined; - this.container.style.minHeight = containerStyle.height !== '0px' ? - containerStyle.height : undefined; - - this.container.setAttribute('data-termynal', ''); - this.container.innerHTML = ''; - for (let line of this.lines) { - line.style.visibility = 'visible' - } - this.start(); - } - - - /** - * Start the animation and rener the lines depending on their data attributes. 
- */ - async start() { - this.addCopy() - this.addFinish() - await this._wait(this.startDelay); - - for (let line of this.lines) { - const type = line.getAttribute(this.pfx); - const delay = line.getAttribute(`${this.pfx}-delay`) || this.lineDelay; - - if (type == 'input') { - line.setAttribute(`${this.pfx}-cursor`, this.cursor); - await this.type(line); - await this._wait(delay); - } - - else if (type == 'progress') { - await this.progress(line); - await this._wait(delay); - } - - else { - this.container.appendChild(line); - await this._wait(delay); - } - - line.removeAttribute(`${this.pfx}-cursor`); - } - this.addRestart() - this.finishElement.style.visibility = 'hidden' - this.lineDelay = this.originalLineDelay - this.typeDelay = this.originalTypeDelay - this.startDelay = this.originalStartDelay - } - - generateRestart() { - const restart = document.createElement('a') - restart.onclick = (e) => { - e.preventDefault() - this.container.innerHTML = '' - this.init() - } - restart.href = '#' - restart.setAttribute('data-terminal-control', '') - restart.innerHTML = "restart ↻" - return restart - } - - generateCopy() { - var dialog = document.getElementsByClassName('md-dialog')[0] - var dialog_text = document.getElementsByClassName('md-dialog__inner md-typeset')[0] - const copy = document.createElement('a') - copy.classList.add("md-clipboard") - copy.classList.add("md-icon") - copy.onclick = (e) => { - e.preventDefault() - var command = '' - for (let line of this.lines) { - if (line.getAttribute("data-ty") == 'input') { - command = command + line.innerHTML + '\n' - } - } - navigator.clipboard.writeText(command) - dialog.setAttribute('data-md-state', 'open'); - dialog_text.innerText = 'Copied to clipboard'; - - setTimeout(function () { - dialog.removeAttribute('data-md-state'); - }, 2000); - } - copy.setAttribute('data-terminal-copy', '') - return copy - } - - generateFinish() { - const finish = document.createElement('a') - finish.onclick = (e) => { - e.preventDefault() - this.lineDelay = 0 - this.typeDelay = 0 - this.startDelay = 0 - } - finish.href = '#' - finish.setAttribute('data-terminal-control', '') - finish.innerHTML = "fast →" - this.finishElement = finish - return finish - } - - addRestart() { - const restart = this.generateRestart() - this.container.appendChild(restart) - } - - addFinish() { - const finish = this.generateFinish() - this.container.appendChild(finish) - } - - addCopy() { - let copy = this.generateCopy() - this.container.appendChild(copy) - } - - /** - * Animate a typed line. - * @param {Node} line - The line element to render. - */ - async type(line) { - const chars = [...line.textContent]; - line.textContent = ''; - this.container.appendChild(line); - - for (let char of chars) { - const delay = line.getAttribute(`${this.pfx}-typeDelay`) || this.typeDelay; - await this._wait(delay); - line.textContent += char; - } - } - - /** - * Animate a progress bar. - * @param {Node} line - The line element to render. 
- */ - async progress(line) { - const progressLength = line.getAttribute(`${this.pfx}-progressLength`) - || this.progressLength; - const progressChar = line.getAttribute(`${this.pfx}-progressChar`) - || this.progressChar; - const chars = progressChar.repeat(progressLength); - const progressPercent = line.getAttribute(`${this.pfx}-progressPercent`) - || this.progressPercent; - line.textContent = ''; - this.container.appendChild(line); - - for (let i = 1; i < chars.length + 1; i++) { - await this._wait(this.typeDelay) / 4; - const percent = Math.round(i / chars.length * 100); - line.textContent = `${chars.slice(0, i)} ${percent}%`; - if (percent > progressPercent) { - break; - } - } - } - - /** - * Helper function for animation delays, called with `await`. - * @param {number} time - Timeout, in ms. - */ - _wait(time) { - return new Promise(resolve => setTimeout(resolve, time)); - } - - /** - * Converts line data objects into line elements. - * - * @param {Object[]} lineData - Dynamically loaded lines. - * @param {Object} line - Line data object. - * @returns {Element[]} - Array of line elements. - */ - lineDataToElements(lineData) { - return lineData.map(line => { - let div = document.createElement('div'); - div.innerHTML = `${line.value || ''}`; - - return div.firstElementChild; - }); - } - - /** - * Helper function for generating attributes string. - * - * @param {Object} line - Line data object. - * @returns {string} - String of attributes. - */ - _attributes(line) { - let attrs = ''; - for (let prop in line) { - // Custom add class - if (prop === 'class') { - attrs += ` class=${line[prop]} ` - continue - } - if (prop === 'type') { - attrs += `${this.pfx}="${line[prop]}" ` - } else if (prop !== 'value') { - attrs += `${this.pfx}-${prop}="${line[prop]}" ` - } - } - - return attrs; - } -} - -/** -* HTML API: If current script has container(s) specified, initialise Termynal. -*/ -if (document.currentScript.hasAttribute('data-termynal-container')) { - const containers = document.currentScript.getAttribute('data-termynal-container'); - containers.split('|') - .forEach(container => new Termynal(container)) -} - -document.querySelectorAll(".use-termynal").forEach(node => { - node.style.display = "block"; - new Termynal(node, { - lineDelay: 500 - }); -}); -const progressLiteralStart = "---> 100%"; -const promptLiteralStart = "$ "; -const customPromptLiteralStart = "$* "; -const commentPromptLiteralStart = "# "; -const colorOutputLiteralStart = "color:"; -const termynalActivateClass = "termy"; -let termynals = []; - -function createTermynals() { - document - .querySelectorAll(`.${termynalActivateClass} .highlight`) - .forEach(node => { - const text = node.textContent; - const lines = text.split("\n"); - const useLines = []; - let buffer = []; - function saveBuffer() { - if (buffer.length) { - let isBlankSpace = true; - buffer.forEach(line => { - if (line) { - isBlankSpace = false; - } - }); - var dataValue = {}; - if (isBlankSpace) { - dataValue["delay"] = 0; - } - if (buffer[buffer.length - 1] === "") { - // A last single
<br> won't have effect - // so put an additional one - buffer.push(""); - } - - const bufferValue = buffer.join("<br>
"); - dataValue["value"] = bufferValue; - useLines.push(dataValue); - buffer = []; - } - } - for (let line of lines) { - if (line === progressLiteralStart) { - saveBuffer(); - useLines.push({ - type: "progress" - }); - } else if (line.startsWith(promptLiteralStart)) { - saveBuffer(); - const value = line.replace(promptLiteralStart, "").trimEnd(); - useLines.push({ - type: "input", - value: value - }); - } else if (line.startsWith(commentPromptLiteralStart)) { - saveBuffer(); - const value = "💬 " + line.replace(commentPromptLiteralStart, "").trimEnd(); - const color_value = "" + value + "" - useLines.push({ - value: color_value, - class: "termynal-comment", - delay: 0 - }); - } else if (line.startsWith(customPromptLiteralStart)) { - saveBuffer(); - const prompt = line.slice(3, line.indexOf(' ', 3)) - let value = line.slice(line.indexOf(' ', 3)).trimEnd(); - useLines.push({ - type: "input", - value: value, - prompt: prompt - }); - } else if (line.startsWith(colorOutputLiteralStart)) { - let color = line.substring(0, line.indexOf(' ')); - let line_value = line.substring(line.indexOf(' ') + 1); - var color_line = "" + line_value + "" - buffer.push(color_line); - } else { - buffer.push(line); - } - } - saveBuffer(); - const div = document.createElement("div"); - node.replaceWith(div); - const termynal = new Termynal(div, { - lineData: useLines, - noInit: true, - lineDelay: 500 - }); - termynals.push(termynal); - }); -} - -function loadVisibleTermynals() { - termynals = termynals.filter(termynal => { - if (termynal.container.getBoundingClientRect().top - innerHeight <= 0) { - termynal.init(); - return false; - } - return true; - }); -} -window.addEventListener("scroll", loadVisibleTermynals); -createTermynals(); -loadVisibleTermynals(); diff --git a/docs/data/converters.md b/docs/data/converters.md index 6914dfa998..1897eabcf9 100644 --- a/docs/data/converters.md +++ b/docs/data/converters.md @@ -218,3 +218,13 @@ one per entity, that can be used to write to a dataframe. The schema of each pro options: heading_level: 4 show_source: false + +## Markup (`converter="markup"`) {: #edsnlp.data.converters.MarkupToDocConverter } + +This converter is used to convert markup data, such as Markdown or XML into documents. +This can be particularly useful when you want to create annotated documents from scratch (e.g., for testing purposes). 
+ +::: edsnlp.data.converters.MarkupToDocConverter + options: + heading_level: 4 + show_source: false diff --git a/docs/data/index.md b/docs/data/index.md index e1198590c7..4bbd9899fe 100644 --- a/docs/data/index.md +++ b/docs/data/index.md @@ -46,9 +46,10 @@ At the moment, we support the following data sources: and the following schemas: -| Schema | Snippet | -|:---------------------------------------------------------------------------|------------------------| -| [Custom](./converters/#custom) | `converter=custom_fn` | -| [OMOP](./converters/#omop) | `converter="omop"` | -| [Standoff](./converters/#standoff) | `converter="standoff"` | -| [Ents](./converters/#edsnlp.data.converters.EntsDoc2DictConverter) | `converter="ents"` | +| Schema | Snippet | +|:--------------------------------------------------------------------|------------------------| +| [Custom](./converters/#custom) | `converter=custom_fn` | +| [OMOP](./converters/#omop) | `converter="omop"` | +| [Standoff](./converters/#standoff) | `converter="standoff"` | +| [Ents](./converters/#edsnlp.data.converters.EntsDoc2DictConverter) | `converter="ents"` | +| [Markup](./converters/#edsnlp.data.converters.MarkupToDocConverter) | `converter="markup"` | diff --git a/docs/metrics/index.md b/docs/metrics/index.md new file mode 100644 index 0000000000..25201285ad --- /dev/null +++ b/docs/metrics/index.md @@ -0,0 +1,12 @@ +# Metrics + +EDS-NLP provides several metrics to evaluate the performance of its components. These metrics can be used to assess the quality of entity recognition, negation detection, and other tasks. + +At the moment, we support the following metrics: + +| Metric | Description | +|:---------------------|:---------------------------------------------------| +| `eds.ner_exact` | NER metric with exact match at the span level | +| `eds.ner_token` | NER metric with token-level match | +| `eds.ner_overlap` | NER metric with overlap match at the span level | +| `eds.span_attribute` | Span multi-label multi-class classification metric | diff --git a/docs/metrics/ner.md b/docs/metrics/ner.md new file mode 100644 index 0000000000..52855565d6 --- /dev/null +++ b/docs/metrics/ner.md @@ -0,0 +1,63 @@ +# NER Metrics + +We provide several metrics to evaluate the performance of Named Entity Recognition (NER) components. +Let's look at an example and see how they differ. We'll use the following two documents: a reference +document (ref) and a document with predicted entities (pred). + +### Shared example + ++-------------------------------------------------------------+------------------------------------------+ +| pred | ref | ++=============================================================+==========================================+ +| *La*{.chip data-chip=PER} *patiente*{.chip data-chip=PER} a | La *patiente*{.chip data-chip=PER} a | +| une *fièvre aigüe*{.chip data-chip=DIS} | *une fièvre*{.chip data-chip=DIS} aigüe. | ++-------------------------------------------------------------+------------------------------------------+ + +Let's create matching documents in EDS-NLP using the following code snippet: + +```python +from edsnlp.data.converters import MarkupToDocConverter + +conv = MarkupToDocConverter(preset="md", span_setter="entities") + +pred = conv("[La](PER) [patiente](PER) a une [fièvre aiguë](DIS).") +ref = conv("La [patiente](PER) a [une fièvre](DIS) aiguë.") +``` + +### Summary of metrics + +The table below shows the different scores depending on the metric used. 
+ +| Metric | Precision | Recall | F1 | +|--------------------|-----------|--------|------| +| Span-level exact | 0.33 | 0.5 | 0.40 | +| Token-level | 0.50 | 0.67 | 0.57 | +| Span-level overlap | 0.67 | 1.0 | 0.80 | + +## Span-level NER metric with exact match {: #edsnlp.metrics.ner.NerExactMetric } + +::: edsnlp.metrics.ner.NerExactMetric + options: + heading_level: 2 + show_bases: false + show_source: false + only_class_level: true + +## Span-level NER metric with approximate match {: #edsnlp.metrics.ner.NerOverlapMetric } + +::: edsnlp.metrics.ner.NerOverlapMetric + options: + heading_level: 2 + show_bases: false + show_source: false + only_class_level: true + + +## Token-level NER metric {: #edsnlp.metrics.ner.NerTokenMetric } + +::: edsnlp.metrics.ner.NerTokenMetric + options: + heading_level: 2 + show_bases: false + show_source: false + only_class_level: true diff --git a/docs/metrics/span-attribute.md b/docs/metrics/span-attribute.md new file mode 100644 index 0000000000..07db17157b --- /dev/null +++ b/docs/metrics/span-attribute.md @@ -0,0 +1,43 @@ +# Span Attribute Classification Metrics {: #edsnlp.metrics.span_attribute.SpanAttributeMetric } + +Several NLP tasks consist in classifying existing spans of text into multiple classes, +such as the detection of negation, hypothesis or span linking. We provide a metric +to evaluate the performance of such tasks. + +Let's look at an example. We'll use the following two documents: a reference +document (ref) and a document with predicted entities (pred). + ++-------------------------------------------------------------------+-------------------------------------------------------------------+ +| pred | ref | ++===================================================================+===================================================================+ +| Le patient n'est pas *fièvreux*{.chip data-chip="SYMP neg=true"}, | Le patient n'est pas *fièvreux*{.chip data-chip="SYMP neg=true"}, | +| son père a *du diabète*{.chip data-chip="DIS carrier=PATIENT"}. | son père a *du diabète*{.chip data-chip="DIS carrier=FATHER"}. | +| Pas d'évolution du | Pas d'évolution du | +| *cancer*{.chip data-chip="DIS neg=true carrier=PATIENT"}. | *cancer*{.chip data-chip="DIS carrier=PATIENT"}. | ++-------------------------------------------------------------------+-------------------------------------------------------------------+ + +We can quickly create matching documents in EDS-NLP using the following code snippet: + +```python +from edsnlp.data.converters import MarkupToDocConverter + +conv = MarkupToDocConverter(preset="md", span_setter="entities") +# Create a document with predicted attributes and a reference document +pred = conv( + "Le patient n'est pas [fièvreux](SYMP neg=true), " + "son père a [du diabète](DIS neg=false carrier=PATIENT). " + "Pas d'évolution du [cancer](DIS neg=true carrier=PATIENT)." +) +ref = conv( + "Le patient n'est pas [fièvreux](SYMP neg=true), " + "son père a [du diabète](DIS neg=false carrier=FATHER). " + "Pas d'évolution du [cancer](DIS neg=false carrier=PATIENT)." 
+) +``` + +::: edsnlp.metrics.span_attribute.SpanAttributeMetric + options: + heading_level: 2 + show_bases: false + show_source: false + only_class_level: true diff --git a/docs/scripts/clickable_snippets.py b/docs/scripts/clickable_snippets.py index 2b901448a1..affd0d4f90 100644 --- a/docs/scripts/clickable_snippets.py +++ b/docs/scripts/clickable_snippets.py @@ -99,6 +99,7 @@ def on_post_page( for ep in ( *self.get_ep_namespace(ep, "spacy_factories"), *self.get_ep_namespace(ep, "edsnlp_factories"), + *self.get_ep_namespace(ep, "spacy_scorers"), ) } diff --git a/edsnlp/__init__.py b/edsnlp/__init__.py index 965d8db6b2..59bc1a46d7 100644 --- a/edsnlp/__init__.py +++ b/edsnlp/__init__.py @@ -45,7 +45,11 @@ def find_spec(self, fullname, path, target=None): # pragma: no cover spec = importlib.util.spec_from_loader(fullname, AliasLoader(new_name)) return spec if fullname.startswith("edsnlp.metrics.span_classification"): - new_name = "edsnlp.metrics.span_attributes" + fullname[34:] + new_name = "edsnlp.metrics.span_attribute" + fullname[34:] + spec = importlib.util.spec_from_loader(fullname, AliasLoader(new_name)) + return spec + if fullname.startswith("edsnlp.metrics.span_attributes"): + new_name = "edsnlp.metrics.span_attribute" + fullname[30:] spec = importlib.util.spec_from_loader(fullname, AliasLoader(new_name)) return spec if "span_qualifier" in fullname.split("."): diff --git a/edsnlp/data/converters.py b/edsnlp/data/converters.py index c1247a14d0..c601b4d3f3 100644 --- a/edsnlp/data/converters.py +++ b/edsnlp/data/converters.py @@ -24,6 +24,7 @@ from confit.registry import ValidatedFunction from spacy.tokenizer import Tokenizer from spacy.tokens import Doc, Span +from typing_extensions import Literal import edsnlp from edsnlp import registry @@ -707,6 +708,225 @@ def __call__(self, doc): ] +# ex: `[The [cat](ANIMAL) is [black](COLOR hex="#000000")]. + + +@registry.factory.register("eds.markup_to_doc", spacy_compatible=False) +class MarkupToDocConverter: + """ + Examples + -------- + ```python + import edsnlp + + # Any kind of reader (`edsnlp.data.read/from_...`) can be used here + # If input items are dicts, the converter expects a "text" key/column. + docs = list( + edsnlp.data.from_iterable( + [ + "This [is](VERB negation=True) not a [test](NOUN).", + "This is another [test](NOUN).", + ], + converter="markup", + span_setter="entities", + ), + ) + print(docs[0].spans["entities"]) + # Out: [is, test] + ``` + + You can also use it directly on a string: + + ```python + from edsnlp.data.converters import MarkupToDocConverter + + converter = MarkupToDocConverter( + span_setter={"verb": "VERB", "noun": "NOUN"}, + preset="xml", + ) + doc = converter("This is not a test.") + print(doc.spans["verb"]) + # Out: [is] + print(doc.spans["verb"][0]._.negation) + # Out: True + ``` + + Parameters + ---------- + preset: Literal["md", "xml"] + The preset to use for the markup format. Defaults to "md" (Markdown-like + syntax). Use "xml" for XML-like syntax. + opener: Optional[str] + The regex pattern to match the opening tag of the markup. Defaults to the + preset's opener. + closer: Optional[str] + The regex pattern to match the closing tag of the markup. Defaults to the + preset's closer. + tokenizer: Optional[Tokenizer] + The tokenizer instance used to tokenize the documents. Likely not needed since + by default it uses the current context tokenizer : + + - the tokenizer of the next pipeline run by `.map_pipeline` in a + [Stream][edsnlp.core.stream.Stream]. + - or the `eds` tokenizer by default. 
+ span_setter: SpanSetterArg + The span setter to use when setting the spans in the documents. Defaults to + setting the spans in the `ents` attribute and creates a new span group for + each JSON entity label. + span_attributes: Optional[AttributesMappingArg] + Mapping from markup attributes to Span extensions (can be a list too). + By default, all attributes are imported as Span extensions with the same name. + keep_raw_attribute_values: bool + Whether to keep the raw attribute values (as strings) or to convert them to + Python objects (e.g. booleans). + default_attributes: AttributesMappingArg + How to set attributes on spans for which no attribute value was found in the + input format. This is especially useful for negation, or frequent attributes + values (e.g. "negated" is often False, "temporal" is often "present"), that + annotators may not want to annotate every time. + bool_attributes: AsList[str] + List of boolean attributes to set to False by default. This is useful for + attributes that are often not annotated, but you want to have a default value + for them. + """ + + PRESETS = { + "md": { + "opener": r"(?P\[)", + "closer": r"(?P\]\(\s*(?P[a-zA-Z0-9]+)\s*(?P.*?)\))", # noqa: E501 + }, + "xml": { + "opener": r"(?P<(?P[a-zA-Z0-9]+)(?P.*?)>)", # noqa: E501 + "closer": r"(?P[a-zA-Z0-9]+)>)", + }, + } + + def __init__( + self, + *, + tokenizer: Optional[Tokenizer] = None, + span_setter: SpanSetterArg = {"ents": True, "*": True}, + span_attributes: Optional[AttributesMappingArg] = None, + keep_raw_attribute_values: bool = False, + default_attributes: AttributesMappingArg = {}, + bool_attributes: AsList[str] = [], + preset: Literal["md", "xml"] = "md", + opener: Optional[str] = None, + closer: Optional[str] = None, + ): + self.tokenizer = tokenizer + self.span_setter = span_setter + self.span_attributes = span_attributes + self.keep_raw_attribute_values = keep_raw_attribute_values + self.default_attributes = dict(default_attributes) + for attr in bool_attributes: + self.default_attributes[attr] = False + self.opener = opener or self.PRESETS[preset]["opener"] + self.closer = closer or self.PRESETS[preset]["closer"] + + def _as_python(self, value: str): + import ast + + if self.keep_raw_attribute_values: + return value + try: + return ast.literal_eval(value) + except Exception: + if value.lower() == "true": + return True + elif value.lower() == "false": + return False + return value + + def _parse(self, inline_text: str): + import re + + last_inline_offset = 0 + starts = [] + text = "" + seps = list(re.finditer(self.opener + "|" + self.closer, inline_text)) + entities = [] + for i, sep in enumerate(seps): + is_opener = bool(sep["opener"]) + groups = sep.groupdict() + inline_start = sep.start("opener") if is_opener else sep.start("closer") + inline_end = sep.end("opener") if is_opener else sep.end("closer") + label = groups.get("closer_label", groups.get("opener_label")) + attrs = groups.get("closer_attrs", groups.get("opener_attrs")) or "" + attrs = { + k: self._as_python(v) + for k, v in (kv.split("=") for kv in attrs.split()) + } + text += inline_text[last_inline_offset:inline_start] + if is_opener: + starts.append((len(text), label, attrs)) + else: + try: + idx = next( + i + for i in range(len(starts) - 1, -1, -1) + if starts[i][1] == label or not label or not starts[i][1] + ) + except StopIteration: + warnings.warn(f"Unmatched closing tag for '{sep.group()}'") + continue + start, start_label, start_attrs = starts.pop(idx) + entities.append( + (start, len(text), start_label or 
label, {**attrs, **start_attrs}) + ) + last_inline_offset = inline_end + if last_inline_offset < len(inline_text): + text += inline_text[last_inline_offset:] + if starts: + warnings.warn( + f"Unmatched opening tags at indices {', '.join(s[1] for s in starts)}" + ) + entities = sorted(entities) + return text, entities + + def __call__(self, obj, tokenizer=None): + tok = tokenizer or self.tokenizer or get_current_tokenizer() + if isinstance(obj, str): + obj = {"text": obj} + annotated = obj["text"] + plain, raw_ents = self._parse(annotated) + + doc = tok(plain) + doc._.note_id = obj.get("doc_id", obj.get(FILENAME)) + + for dst in ( + *(() if self.span_attributes is None else self.span_attributes.values()), + *self.default_attributes, + ): + if not Span.has_extension(dst): + Span.set_extension(dst, default=None) + + spans = [] + for start, end, label, attrs in raw_ents: + span = doc.char_span(start, end, label=label, alignment_mode="expand") + if span is None: + continue + for k, v in attrs.items(): + new_k = ( + self.span_attributes.get(k) + if self.span_attributes is not None + else k + ) + if self.span_attributes is None and not Span.has_extension(new_k): + Span.set_extension(new_k, default=None) + if new_k: + span._.set(new_k, v) + spans.append(span) + + set_spans(doc, spans, span_setter=self.span_setter) + for attr, value in self.default_attributes.items(): + for span in spans: + if span._.get(attr) is None: + span._.set(attr, value) + + return doc + + def get_dict2doc_converter( converter: Union[str, Callable], kwargs ) -> Tuple[Callable, Dict]: @@ -716,7 +936,11 @@ def get_dict2doc_converter( filtered = [ name for name in available - if converter == name or (converter in name and "dict2doc" in name) + if converter == name + or ( + converter in name + and (name.endswith("2doc") or name.endswith("to_doc")) + ) ] converter = edsnlp.registry.factory.get(filtered[0]) nlp = kwargs.pop("nlp", None) @@ -726,7 +950,9 @@ def get_dict2doc_converter( kwargs = {} return converter, kwargs except (KeyError, IndexError): - available = [v for v in available if "dict2doc" in v] + available = [ + v for v in available if (v.endswith("2doc") or v.endswith("to_doc")) + ] raise ValueError( f"Cannot find converter for format {converter}. " f"Available converters are {', '.join(available)}" @@ -745,14 +971,20 @@ def get_doc2dict_converter( filtered = [ name for name in available - if converter == name or (converter in name and "doc2dict" in name) + if converter == name + or ( + converter in name + and (name.endswith("2dict") or name.endswith("to_dict")) + ) ] converter = edsnlp.registry.factory.get(filtered[0]) converter = converter(**kwargs) kwargs = {} return converter, kwargs except (KeyError, IndexError): - available = [v for v in available if "doc2dict" in v] + available = [ + v for v in available if (v.endswith("2dict") or v.endswith("to_dict")) + ] raise ValueError( f"Cannot find converter for format {converter}. 
" f"Available converters are {', '.join(available)}" diff --git a/edsnlp/metrics/__init__.py b/edsnlp/metrics/__init__.py index e96f74017a..df17ab40a3 100644 --- a/edsnlp/metrics/__init__.py +++ b/edsnlp/metrics/__init__.py @@ -25,7 +25,7 @@ def average_precision(pred: Dict[Any, float], gold: Iterable[Any]): for i in range(1, len(precisions)): if recalls[i] > recalls[i - 1]: ap += (recalls[i] - recalls[i - 1]) * precisions[i] - return ap + return float(ap) def prf(pred: Collection, gold: Collection): diff --git a/edsnlp/metrics/ner.py b/edsnlp/metrics/ner.py index d39306445d..17bc9f5f71 100644 --- a/edsnlp/metrics/ner.py +++ b/edsnlp/metrics/ner.py @@ -1,3 +1,27 @@ +""" +We provide several metrics to evaluate the performance of Named Entity Recognition (NER) components. +Let's look at an example and see how they differ. We'll use the following two documents: a reference +document (ref) and a document with predicted entities (pred). + ++-------------------------------------------------------------+------------------------------------------+ +| pred | ref | ++=============================================================+==========================================+ +| *La*{.chip data-chip=PER} *patiente*{.chip data-chip=PER} a | La *patiente*{.chip data-chip=PER} a | +| une *fièvre aigüe*{.chip data-chip=DIS} | *une fièvre*{.chip data-chip=DIS} aigüe. | ++-------------------------------------------------------------+------------------------------------------+ + +Let's create matching documents in EDS-NLP using the following code snippet: + +```python +from edsnlp.data.converters import MarkupToDocConverter + +conv = MarkupToDocConverter(preset="md", span_setter="entities") + +pred = conv("[La](PER) [patiente](PER) a une [fièvre aiguë](DIS).") +ref = conv("La [patiente](PER) a [une fièvre](DIS) aiguë.") +``` +""" # noqa: E501 + import abc from collections import defaultdict from typing import Any, Dict, Optional @@ -13,26 +37,6 @@ def ner_exact_metric( micro_key: str = "micro", filter_expr: Optional[str] = None, ) -> Dict[str, Any]: - """ - Scores the extracted entities that may be overlapping or nested - by looking in the spans returned by a given SpanGetter object. - - Parameters - ---------- - examples: Examples - The examples to score, either a tuple of (golds, preds) or a list of - spacy.training.Example objects - span_getter: SpanGetter - The span getter to use to extract the spans from the document - micro_key: str - The key to use to store the micro-averaged results for spans of all types - filter_expr: str - The filter expression to use to filter the documents - - Returns - ------- - Dict[str, Any] - """ examples = make_examples(examples) if filter_expr is not None: filter_fn = eval(f"lambda doc: {filter_expr}") @@ -65,27 +69,6 @@ def ner_token_metric( micro_key: str = "micro", filter_expr: Optional[str] = None, ) -> Dict[str, Any]: - """ - Scores the extracted entities that may be overlapping or nested - by looking in `doc.ents`, and `doc.spans`, and comparing the predicted - and gold entities at the TOKEN level. 
- - Parameters - ---------- - examples: Examples - The examples to score, either a tuple of (golds, preds) or a list of - spacy.training.Example objects - span_getter: SpanGetter - The span getter to use to extract the spans from the document - micro_key: str - The key to use to store the micro-averaged results for spans of all types - filter_expr: str - The filter expression to use to filter the documents - - Returns - ------- - Dict[str, Any] - """ examples = make_examples(examples) if filter_expr is not None: filter_fn = eval(f"lambda doc: {filter_expr}") @@ -130,30 +113,6 @@ def ner_overlap_metric( filter_expr: Optional[str] = None, threshold: float = 0.5, ) -> Dict[str, Any]: - """ - Scores the extracted entities that may be overlapping or nested - by looking in `doc.ents`, and `doc.spans`, and comparing the predicted - and gold entities and counting true when a predicted entity overlaps - with a gold entity of the same label - - Parameters - ---------- - examples: Examples - The examples to score, either a tuple of (golds, preds) or a list of - spacy.training.Example objects - span_getter: SpanGetter - The span getter to use to extract the spans from the document - micro_key: str - The key to use to store the micro-averaged results for spans of all types - filter_expr: str - The filter expression to use to filter the documents - threshold: float - The threshold to use to consider that two spans overlap - - Returns - ------- - Dict[str, Any] - """ examples = make_examples(*examples) if filter_expr is not None: filter_fn = eval(f"lambda doc: {filter_expr}") @@ -239,6 +198,54 @@ def __call__(self, *examples) -> Dict[str, Any]: deprecated=["eds.ner_exact_metric"], ) class NerExactMetric(NerMetric): + r""" + The `eds.ner_exact` metric + scores the extracted entities (that may be overlapping or nested) + by looking in the spans returned by a given SpanGetter object and + comparing predicted spans to gold spans for **exact** boundary and label matches. + + Let's view these elements as collections of (span → label) and count how + many of the predicted spans match the gold spans exactly (and vice versa): + + +----------------------------------------------+--------------------------------------------+ + | pred | ref | + +==============================================+============================================+ + | *La*{.chip .fp data-chip=PER}
| *patiente*{.chip .tp data-chip=PER}<br>
| + | *patiente*{.chip .tp data-chip=PER}<br>
| *une fièvre*{.chip .fp data-chip=DIS}<br>
| + | *fièvre aiguë*{.chip .fp data-chip=DIS}<br>
| | + +----------------------------------------------+--------------------------------------------+ + + Precision, Recall and F1 (micro-average and per‐label) are computed as follows: + + - Precision: `p = |matched items of pred| / |pred|` + - Recall: `r = |matched items of ref| / |ref|` + - F1: `f = 2 / (1/p + 1/f)` + + Examples + -------- + + ```python + from edsnlp.metrics.ner import NerExactMetric + + metric = NerExactMetric(span_getter=conv.span_setter, micro_key="micro") + metric([ref], [pred]) + # Out: { + # 'micro': {'f': 0.4, 'p': 0.33, 'r': 0.5, 'tp': 1, 'support': 2, 'positives': 3}, + # 'PER': {'f': 0.67, 'p': 0.5, 'r': 1, 'tp': 1, 'support': 1, 'positives': 2}, + # 'DIS': {'f': 0.0, 'p': 0.0, 'r': 0.0, 'tp': 0, 'support': 1, 'positives': 1}, + # } + ``` + + Parameters + ---------- + span_getter: SpanGetter + The span getter to use to extract the spans from the document + micro_key: str + The key to use to store the micro-averaged results for spans of all types + filter_expr: str + The filter expression to use to filter the documents. Evaluated with `doc` as the variable. + """ # noqa: E501 + def __init__( self, span_getter: SpanGetterArg, @@ -265,6 +272,58 @@ def __call__(self, *examples): deprecated=["eds.ner_token_metric"], ) class NerTokenMetric(NerMetric): + r""" + The `eds.ner_token` metric + scores the extracted entities that may be overlapping or nested by looking in + `doc.ents`, and `doc.spans`, and comparing the predicted and gold entities at the + **token** level. + + Assuming we use the `eds` (or `fr` or `en`) tokenizer, in the above example, there + are 3 annotated tokens in the reference, and 4 annotated tokens in the prediction. + Let's view these elements as sets of (token, label) and count how many of the + predicted tokens match the gold tokens exactly (and vice versa): + + +------------------------------------------+------------------------------------------+ + | pred | ref | + +==========================================+==========================================+ + | *La*{.chip .fp data-chip=PER}
| *patiente*{.chip .tp data-chip=PER}<br>
| + | *patiente*{.chip .tp data-chip=PER}<br>
| *une*{.chip .fp data-chip=DIS}<br>
| + | *fièvre*{.chip .tp data-chip=DIS}<br>
| *fièvre*{.chip .tp data-chip=DIS} | + | *aiguë*{.chip .fp data-chip=DIS} | | + +------------------------------------------+------------------------------------------+ + + Precision, Recall and F1 (micro-average and per‐label) are computed as follows: + + - Precision: `p = |matched items of pred| / |pred|` + - Recall: `r = |matched items of ref| / |ref|` + - F1: `f = 2 / (1/p + 1/f)` + + Examples + -------- + + ```python + from edsnlp.metrics.ner import NerTokenMetric + + metric = NerTokenMetric(span_getter=conv.span_setter, micro_key="micro") + metric([ref], [pred]) + # Out: { + # 'micro': {'f': 0.57, 'p': 0.5, 'r': 0.67, 'tp': 2, 'support': 3, 'positives': 4}, + # 'PER': {'f': 0.67, 'p': 0.5, 'r': 1, 'tp': 1, 'support': 1, 'positives': 2}, + # 'DIS': {'f': 0.5, 'p': 0.5, 'r': 0.5, 'tp': 1, 'support': 2, 'positives': 2} + # } + ``` + + Parameters + ---------- + span_getter: SpanGetter + The span getter to use to extract the spans from the document + micro_key: str + The key to use to store the micro-averaged results for spans of all types + filter_expr: str + The filter expression to use to filter the documents. Will be evaluated + with `doc` as the variable name, so you can use `doc.ents`, `doc.spans`, etc. + """ # noqa: E501 + def __init__( self, span_getter: SpanGetterArg, @@ -291,6 +350,71 @@ def __call__(self, *examples): deprecated=["eds.ner_overlap_metric"], ) class NerOverlapMetric(NerMetric): + r""" + The `eds.ner_overlap` metric + scores the extracted entities that may be overlapping or nested + by looking in the spans returned by a given SpanGetter object and + counting a prediction as correct if it overlaps by at least the given + Dice‐coefficient threshold with a gold span of the same label. + + This metric is useful for evaluating NER systems where the exact boundaries + do not matter too much, but the presence of the entity at the same spot is important. + For instance, you may not want to penalize a system that forgets determiners if + the rest of the entity is correctly identified. + + Let's view these elements as sets of (span → label) and count how many of the + predicted spans match the gold spans by at least the given Dice coefficient + (and vice versa): + + +---------------------------------------------+------------------------------------------+ + | pred | ref | + +=============================================+==========================================+ + | *La*{.chip .fp data-chip=PER}
| *patiente*{.chip .tp data-chip=PER}<br>
| + | *patiente*{.chip .tp data-chip=PER}<br>
| *une fièvre*{.chip .tp data-chip=DIS} | + | *fièvre aiguë*{.chip .tp data-chip=DIS}<br>
| | + +---------------------------------------------+------------------------------------------+ + + Precision, Recall and F1 (micro-average and per‐label) are computed as follows: + + - Precision: `p = |matched items of pred| / |pred|` + - Recall: `r = |matched items of ref| / |ref|` + - F1: `f = 2 / (1/p + 1/f)` + + !!! note "Overlap threshold" + + The threshold is the minimum Dice coefficient to consider two spans as overlapping. Setting + it to 1.0 will yield the same results as the `eds.ner_exact` metric, while setting it to a + near-zero value (e.g., like 1e-14) will match any two spans that share at least one token. + + Examples + -------- + + ```python + from edsnlp.metrics.ner import NerOverlapMetric + + metric = NerOverlapMetric( + span_getter=conv.span_setter, micro_key="micro", threshold=0.5 + ) + metric([ref], [pred]) + # Out: { + # 'micro': {'f': 0.8, 'p': 0.67, 'r': 1.0, 'tp': 2, 'support': 2, 'positives': 3}, + # 'PER': {'f': 0.67, 'p': 0.5, 'r': 1.0, 'tp': 1, 'support': 1, 'positives': 2}, + # 'DIS': {'f': 1.0, 'p': 1.0, 'r': 1.0, 'tp': 1, 'support': 1, 'positives': 1} + # } + ``` + + Parameters + ---------- + span_getter: SpanGetter + The span getter to use to extract the spans from the document + micro_key: str + The key to use to store the micro-averaged results for spans of all types + filter_expr: str + The filter expression to use to filter the documents + threshold: float + The threshold on the Dice coefficient to consider two spans as overlapping + """ # noqa: E501 + def __init__( self, span_getter: SpanGetterArg, diff --git a/edsnlp/metrics/span_attribute.py b/edsnlp/metrics/span_attribute.py new file mode 100644 index 0000000000..d1c7b6dac4 --- /dev/null +++ b/edsnlp/metrics/span_attribute.py @@ -0,0 +1,311 @@ +""" +Metrics for Span Attribute Classification + +# Span Attribute Classification Metrics {: #edsnlp.metrics.span_attribute.SpanAttributeMetric } + +Several NLP tasks consist in classifying existing spans of text into multiple classes, +such as the detection of negation, hypothesis or span linking. + +We provide a metric to evaluate the performance of such tasks, + +Let's look at an example: + ++-------------------------------------------------------------------+-------------------------------------------------------------------+ +| pred | ref | ++===================================================================+===================================================================+ +| Le patient n'est pas *fièvreux*{.chip data-chip="SYMP neg=true"}, | Le patient n'est pas *fièvreux*{.chip data-chip="SYMP neg=true"}, | +| son père a *du diabète*{.chip data-chip="DIS carrier=PATIENT"}. | son père a *du diabète*{.chip data-chip="DIS carrier=FATHER"}. | +| Pas d'évolution du | Pas d'évolution du | +| *cancer*{.chip data-chip="DIS neg=true carrier=PATIENT"}. | *cancer*{.chip data-chip="DIS carrier=PATIENT"}. | ++-------------------------------------------------------------------+-------------------------------------------------------------------+ + +We can quickly create matching documents in EDS-NLP using the following code snippet: + +```python +from edsnlp.data.converters import MarkupToDocConverter + +conv = MarkupToDocConverter(preset="md", span_setter="entities") +# Create a document with predicted attributes and a reference document +pred = conv( + "Le patient n'est pas [fièvreux](SYMP neg=true), " + "son père a [du diabète](DIS neg=false carrier=PATIENT). " + "Pas d'évolution du [cancer](DIS neg=true carrier=PATIENT)." 
+) +ref = conv( + "Le patient n'est pas [fièvreux](SYMP neg=true), " + "son père a [du diabète](DIS neg=false carrier=FATHER). " + "Pas d'évolution du [cancer](DIS neg=false carrier=PATIENT)." +) +``` +""" # noqa: E501 + +import warnings +from collections import defaultdict +from typing import Any, Dict, Optional + +from edsnlp import registry +from edsnlp.metrics import Examples, average_precision, make_examples, prf +from edsnlp.utils.bindings import BINDING_GETTERS, Attributes, AttributesArg +from edsnlp.utils.span_getters import SpanGetterArg, get_spans + + +def span_attribute_metric( + examples: Examples, + span_getter: SpanGetterArg, + attributes: Attributes = None, + include_falsy: bool = False, + default_values: Dict = {}, + micro_key: str = "micro", + filter_expr: Optional[str] = None, + **kwargs: Any, +): + if "qualifiers" in kwargs: + warnings.warn( + "The `qualifiers` argument of span_attribute_metric() is " + "deprecated. Use `attributes` instead.", + DeprecationWarning, + ) + assert attributes is None + attributes = kwargs.pop("qualifiers") + if attributes is None: + raise TypeError( + "span_attribute_metric() missing 1 required argument: 'attributes'" + ) + if kwargs: + raise TypeError( + f"span_attribute_metric() got unexpected keyword arguments: " + f"{', '.join(kwargs.keys())}" + ) + examples = make_examples(examples) + if filter_expr is not None: + filter_fn = eval(f"lambda doc: {filter_expr}") + examples = [eg for eg in examples if filter_fn(eg.reference)] + labels = defaultdict(lambda: (set(), set(), dict())) + labels["micro"] = (set(), set(), dict()) + total_pred_count = 0 + total_gold_count = 0 + + if not include_falsy: + default_values_ = defaultdict(lambda: False) + default_values_.update(default_values) + default_values = default_values_ + del default_values_ + for eg_idx, eg in enumerate(examples): + doc_spans = get_spans(eg.predicted, span_getter) + for span in doc_spans: + total_pred_count += 1 + beg, end = span.start, span.end + for attr, span_filter in attributes.items(): + if not (span_filter is True or span.label_ in span_filter): + continue + getter_key = attr if attr.startswith("_.") else f"_.{attr}" + value = BINDING_GETTERS[getter_key](span) + top_val, top_p = max( + getattr(span._, "prob", {}).get(attr, {}).items(), + key=lambda x: x[1], + default=(value, 1.0), + ) + if (top_val or include_falsy) and default_values[attr] != top_val: + labels[attr][2][(eg_idx, beg, end, attr, top_val)] = top_p + labels[micro_key][2][(eg_idx, beg, end, attr, top_val)] = top_p + if (value or include_falsy) and default_values[attr] != value: + labels[micro_key][0].add((eg_idx, beg, end, attr, value)) + labels[attr][0].add((eg_idx, beg, end, attr, value)) + + doc_spans = get_spans(eg.reference, span_getter) + for span in doc_spans: + total_gold_count += 1 + beg, end = span.start, span.end + for attr, span_filter in attributes.items(): + if not (span_filter is True or span.label_ in span_filter): + continue + getter_key = attr if attr.startswith("_.") else f"_.{attr}" + value = BINDING_GETTERS[getter_key](span) + if (value or include_falsy) and default_values[attr] != value: + labels[micro_key][1].add((eg_idx, beg, end, attr, value)) + labels[attr][1].add((eg_idx, beg, end, attr, value)) + + if total_pred_count != total_gold_count: + raise ValueError( + f"Number of predicted and gold spans differ: {total_pred_count} != " + f"{total_gold_count}. 
Make sure that you are running your span " + "attribute classification pipe on the gold annotations, and not spans " + "predicted by another NER pipe in your model." + ) + + for name, (pred, gold, pred_with_prob) in labels.items(): + print("-", name, "pred/gold", pred, gold, "=>", prf(pred, gold)) + return { + name: { + **prf(pred, gold), + "ap": average_precision(pred_with_prob, gold), + } + for name, (pred, gold, pred_with_prob) in labels.items() + } + + +@registry.metrics.register( + "eds.span_attribute", + deprecated=["eds.span_classification_scorer", "eds.span_attribute_scorer"], +) +class SpanAttributeMetric: + """ + The `eds.span_attribute` metric + evaluates span‐level attribute classification by comparing predicted and gold + attribute values on the same set of spans. For each attribute you specify, it + computes Precision, Recall, F1, number of true positives (tp), number of + gold instances (support), number of predicted instances (positives), and + the Average Precision (ap). A micro‐average over all attributes is also + provided under `micro_key`. + + ```python + from edsnlp.metrics.span_attribute import SpanAttributeMetric + + metric = SpanAttributeMetric( + span_getter=conv.span_setter, + # Evaluated attributes + attributes={ + "neg": True, # 'neg' on every entity + "carrier": ["DIS"], # 'carrier' only on 'DIS' entities + }, + # Ignore these default values when counting matches + default_values={ + "neg": False, + }, + micro_key="micro", + ) + ``` + + Let's enumerate (span -> attr = value) items in our documents. Only the items with + matching span boundaries, attribute name, and value are counted as a true positives. + For instance, with the predicted and reference spans of the example above: + + +--------------------------------------------------+-------------------------------------------------+ + | pred | ref | + +==================================================+=================================================+ + | *fièvreux → neg = True*{.chip .tp}
| *fièvreux → neg = True*{.chip .tp}<br>
| + | *du diabète → neg = False*{.chip .na}<br>
| *du diabète → neg = False*{.chip .na}<br>
| + | *du diabète → carrier = PATIENT*{.chip .fp}<br>
| *du diabète → carrier = FATHER*{.chip .fn}<br>
| + | *cancer → neg = True*{.chip .fp}<br>
| *cancer → neg = False*{.chip .na}<br>
| + | *cancer → carrier = PATIENT*{.chip .tp} | *cancer → carrier = PATIENT*{.chip .tp} | + +--------------------------------------------------+-------------------------------------------------+ + + !!! note "Default values" + + Note that there we don't count "neg=False" items, shown in grey in the table. In EDS-NLP, + this is done by setting `defaults_values={"neg": False}` when creating the metric. This + is quite common in classification tasks, where one of the values is both the most common + and the "default" (hence the name of the parameter). Counting these values would likely + skew the micro-average metrics towards the default value. + + Precision, Recall and F1 (micro-average and per‐label) are computed as follows: + + - Precision: `p = |matched items of pred| / |pred|` + - Recall: `r = |matched items of ref| / |ref|` + - F1: `f = 2 / (1/p + 1/f)` + + This yields the following metrics: + + ```python + metric([ref], [pred]) + # Out: { + # 'micro': {'f': 0.57, 'p': 0.5, 'r': 0.67, 'tp': 2, 'support': 3, 'positives': 4, 'ap': 0.17}, + # 'neg': {'f': 0.67, 'p': 0.5, 'r': 1, 'tp': 1, 'support': 1, 'positives': 2, 'ap': 0.0}, + # 'carrier': {'f': 0.5, 'p': 0.5, 'r': 0.5, 'tp': 1, 'support': 2, 'positives': 2, 'ap': 0.25}, + # } + ``` + + Parameters + ---------- + span_getter : SpanGetterArg + The span getter to extract spans from each `Doc`. + attributes : Mapping[str, Union[bool, Sequence[str]]] + Map each attribute name to `True` (evaluate on all spans) or a sequence of + labels restricting which spans to test. + default_values : Dict[str, Any] + Attribute values to omit from micro‐average counts (e.g., common negative or + default labels). + include_falsy : bool + If `False`, ignore falsy values (e.g., `False`, `None`, `''`) in predictions + or gold when computing metrics; if `True`, count them. + micro_key : str + Key under which to store the micro‐averaged results across all attributes. + filter_expr : Optional[str] + A Python expression (using `doc`) to filter which examples are scored. + + Returns + ------- + Dict[str, Dict[str, float]] + A dictionary mapping each attribute name (and the `micro_key`) to its metrics: + + - `label` or micro_key : + + - `p` : precision + - `r` : recall + - `f` : F1 score + - `tp` : true positive count + - `support` : number of gold instances + - `positives` : number of predicted instances + - `ap` : [average precision](https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#Mean_average_precision) + """ # noqa: E501 + + attributes: Attributes + + def __init__( + self, + span_getter: SpanGetterArg, + attributes: AttributesArg = None, + qualifiers: AttributesArg = None, + default_values: Dict = {}, + include_falsy: bool = False, + micro_key: str = "micro", + filter_expr: Optional[str] = None, + ): + if qualifiers is not None: + warnings.warn( + "The `qualifiers` argument is deprecated. Use `attributes` instead.", + DeprecationWarning, + ) + self.span_getter = span_getter + self.attributes = attributes or qualifiers + self.default_values = default_values + self.include_falsy = include_falsy + self.micro_key = micro_key + self.filter_expr = filter_expr + + __init__.__doc__ = span_attribute_metric.__doc__ + + def __call__(self, *examples: Any): + """ + Compute the span attribute metrics for the given examples. 
+ + Parameters + ---------- + examples : Examples + The examples to score, either a tuple of (golds, preds) or a list of + spacy.training.Example objects + + Returns + ------- + Dict[str, Dict[str, float]] + The scores for the attributes + """ + return span_attribute_metric( + examples, + span_getter=self.span_getter, + attributes=self.attributes, + default_values=self.default_values, + include_falsy=self.include_falsy, + micro_key=self.micro_key, + filter_expr=self.filter_expr, + ) + + +# For backward compatibility +span_classification_scorer = span_attribute_scorer = span_attribute_metric +create_span_attributes_scorer = SpanAttributeScorer = SpanAttributeMetric + +__all__ = [ + "span_attribute_metric", + "SpanAttributeMetric", +] diff --git a/edsnlp/metrics/span_attributes.py b/edsnlp/metrics/span_attributes.py deleted file mode 100644 index be1b2a948e..0000000000 --- a/edsnlp/metrics/span_attributes.py +++ /dev/null @@ -1,182 +0,0 @@ -import warnings -from collections import defaultdict -from typing import Any, Dict, Optional - -from edsnlp import registry -from edsnlp.metrics import Examples, average_precision, make_examples, prf -from edsnlp.utils.bindings import BINDING_GETTERS, Attributes, AttributesArg -from edsnlp.utils.span_getters import SpanGetterArg, get_spans - - -def span_attribute_metric( - examples: Examples, - span_getter: SpanGetterArg, - attributes: Attributes = None, - include_falsy: bool = False, - default_values: Dict = {}, - micro_key: str = "micro", - filter_expr: Optional[str] = None, - **kwargs: Any, -): - """ - Scores the attributes predictions between a list of gold and predicted spans. - - Parameters - ---------- - examples : Examples - The examples to score, either a tuple of (golds, preds) or a list of - spacy.training.Example objects - span_getter : SpanGetterArg - The span getter to use to extract the spans from the document - attributes : Sequence[str] - The attributes to use to score the spans - default_values: Dict - Values to dismiss when computing the micro-average per label. This is - useful to compute precision and recall for certain attributes that have - imbalanced value repartitions, such as "negation", "family related" - or "certainty" attributes. - include_falsy : bool - Whether to count predicted or gold occurrences of falsy values when computing - the metrics. If `False`, only the non-falsy values will be counted and matched - together. - micro_key : str - The key to use to store the micro-averaged results for spans of all types - filter_expr : Optional[str] - The filter expression to use to filter the documents - - Returns - ------- - Dict[str, float] - """ - if "qualifiers" in kwargs: - warnings.warn( - "The `qualifiers` argument of span_attribute_metric() is " - "deprecated. 
Use `attributes` instead.", - DeprecationWarning, - ) - assert attributes is None - attributes = kwargs.pop("qualifiers") - if attributes is None: - raise TypeError( - "span_attribute_metric() missing 1 required argument: 'attributes'" - ) - if kwargs: - raise TypeError( - f"span_attribute_metric() got unexpected keyword arguments: " - f"{', '.join(kwargs.keys())}" - ) - examples = make_examples(examples) - if filter_expr is not None: - filter_fn = eval(f"lambda doc: {filter_expr}") - examples = [eg for eg in examples if filter_fn(eg.reference)] - labels = defaultdict(lambda: (set(), set(), dict())) - labels["micro"] = (set(), set(), dict()) - total_pred_count = 0 - total_gold_count = 0 - - if not include_falsy: - default_values_ = defaultdict(lambda: False) - default_values_.update(default_values) - default_values = default_values_ - del default_values_ - for eg_idx, eg in enumerate(examples): - doc_spans = get_spans(eg.predicted, span_getter) - for span_idx, span in enumerate(doc_spans): - total_pred_count += 1 - for attr, span_filter in attributes.items(): - if not (span_filter is True or span.label_ in span_filter): - continue - getter_key = attr if attr.startswith("_.") else f"_.{attr}" - value = BINDING_GETTERS[getter_key](span) - top_val, top_p = max( - getattr(span._, "prob", {}).get(attr, {}).items(), - key=lambda x: x[1], - default=(value, 1.0), - ) - if (top_val or include_falsy) and default_values[attr] != top_val: - labels[attr][2][(eg_idx, span_idx, attr, top_val)] = top_p - labels[micro_key][2][(eg_idx, span_idx, attr, top_val)] = top_p - if (value or include_falsy) and default_values[attr] != value: - labels[micro_key][0].add((eg_idx, span_idx, attr, value)) - labels[attr][0].add((eg_idx, span_idx, attr, value)) - - doc_spans = get_spans(eg.reference, span_getter) - for span_idx, span in enumerate(doc_spans): - total_gold_count += 1 - for attr, span_filter in attributes.items(): - if not (span_filter is True or span.label_ in span_filter): - continue - getter_key = attr if attr.startswith("_.") else f"_.{attr}" - value = BINDING_GETTERS[getter_key](span) - if (value or include_falsy) and default_values[attr] != value: - labels[micro_key][1].add((eg_idx, span_idx, attr, value)) - labels[attr][1].add((eg_idx, span_idx, attr, value)) - - if total_pred_count != total_gold_count: - raise ValueError( - f"Number of predicted and gold spans differ: {total_pred_count} != " - f"{total_gold_count}. Make sure that you are running your span " - "attribute classification pipe on the gold annotations, and not spans " - "predicted by another NER pipe in your model." - ) - - return { - name: { - **prf(pred, gold), - "ap": average_precision(pred_with_prob, gold), - } - for name, (pred, gold, pred_with_prob) in labels.items() - } - - -@registry.metrics.register( - "eds.span_attribute", - deprecated=["eds.span_classification_scorer", "eds.span_attribute_scorer"], -) -class SpanAttributeMetric: - attributes: Attributes - - def __init__( - self, - span_getter: SpanGetterArg, - attributes: AttributesArg = None, - qualifiers: AttributesArg = None, - default_values: Dict = {}, - include_falsy: bool = False, - micro_key: str = "micro", - filter_expr: Optional[str] = None, - ): - if qualifiers is not None: - warnings.warn( - "The `qualifiers` argument is deprecated. 
Use `attributes` instead.", - DeprecationWarning, - ) - self.span_getter = span_getter - self.attributes = attributes or qualifiers - self.default_values = default_values - self.include_falsy = include_falsy - self.micro_key = micro_key - self.filter_expr = filter_expr - - __init__.__doc__ = span_attribute_metric.__doc__ - - def __call__(self, *examples: Any): - return span_attribute_metric( - examples, - span_getter=self.span_getter, - attributes=self.attributes, - default_values=self.default_values, - include_falsy=self.include_falsy, - micro_key=self.micro_key, - filter_expr=self.filter_expr, - ) - - -# For backward compatibility -span_classification_scorer = span_attribute_scorer = span_attribute_metric -create_span_attributes_scorer = SpanAttributeScorer = SpanAttributeMetric - -__all__ = [ - "span_attribute_metric", - "SpanAttributeMetric", -] diff --git a/mkdocs.yml b/mkdocs.yml index 35d3c30484..272d50ec99 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -146,6 +146,10 @@ nav: - concepts/pipeline.md - concepts/torch-component.md - concepts/inference.md + - Metrics: + - metrics/index.md + - metrics/ner.md + - metrics/span-attribute.md - Utilities: - utilities/index.md - utilities/tests/blocs.md @@ -171,15 +175,14 @@ extra: extra_css: - assets/stylesheets/extra.css - assets/stylesheets/cards.css - - assets/termynal/termynal.css + #- assets/termynal/termynal.css extra_javascript: - - https://cdn.jsdelivr.net/npm/vega@5 - - https://cdn.jsdelivr.net/npm/vega-lite@5 - - https://cdn.jsdelivr.net/npm/vega-embed@6 - - assets/termynal/termynal.js + #- https://cdn.jsdelivr.net/npm/vega@5 + #- https://cdn.jsdelivr.net/npm/vega-lite@5 + #- https://cdn.jsdelivr.net/npm/vega-embed@6 - https://polyfill.io/v3/polyfill.min.js?features=es6 - - https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js + # - https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js watch: - contributing.md @@ -245,8 +248,9 @@ markdown_extensions: slugify: !!python/object/apply:pymdownx.slugs.slugify kwds: case: lower - - pymdownx.arithmatex: - generic: true + #- pymdownx.arithmatex: + # generic: true + - markdown_grid_tables - footnotes - md_in_html - attr_list diff --git a/pyproject.toml b/pyproject.toml index c391302ab6..47d1e83f92 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,6 +74,7 @@ docs-no-ml = [ "mkdocstrings-python~=1.1", "mkdocs-minify-plugin", "mkdocs-redirects>=1.2.1;python_version>='3.8'", + "markdown-grid-tables==0.4.0", "pybtex~=0.24.0", "pathspec>=0.11.1", # required by vendored mkdocs-autorefs PR "astunparse", @@ -212,6 +213,7 @@ where = ["."] "eds.omop_dict2doc" = "edsnlp.data.converters:OmopDict2DocConverter" "eds.omop_doc2dict" = "edsnlp.data.converters:OmopDoc2DictConverter" "eds.ents_doc2dict" = "edsnlp.data.converters:EntsDoc2DictConverter" +"eds.markup_to_doc" = "edsnlp.data.converters:MarkupToDocConverter" # Deprecated (links to the same factories as above) "SOFA" = "edsnlp.pipes.ner.scores.sofa.factory:create_component" @@ -278,21 +280,22 @@ where = ["."] "linear" = "edsnlp.training.optimizer:LinearSchedule" [project.entry-points."spacy_scorers"] -"eds.ner_exact" = "edsnlp.metrics.ner:NerExactMetric" -"eds.ner_token" = "edsnlp.metrics.ner:NerTokenMetric" -"eds.ner_overlap" = "edsnlp.metrics.ner:NerOverlapMetric" -"eds.span_attributes" = "edsnlp.metrics.span_attributes:SpanAttributeMetric" -"eds.dep_parsing" = "edsnlp.metrics.dep_parsing:DependencyParsingMetric" +"eds.ner_exact" = "edsnlp.metrics.ner:NerExactMetric" +"eds.ner_token" = "edsnlp.metrics.ner:NerTokenMetric" 
+"eds.ner_overlap" = "edsnlp.metrics.ner:NerOverlapMetric" +"eds.span_attribute" = "edsnlp.metrics.span_attribute:SpanAttributeMetric" +"eds.dep_parsing" = "edsnlp.metrics.dep_parsing:DependencyParsingMetric" # Deprecated -"eds.ner_exact_metric" = "edsnlp.metrics.ner:NerExactMetric" -"eds.ner_token_metric" = "edsnlp.metrics.ner:NerTokenMetric" -"eds.ner_overlap_metric" = "edsnlp.metrics.ner:NerOverlapMetric" -"eds.span_attributes_metric" = "edsnlp.metrics.span_attributes:SpanAttributeMetric" -"eds.ner_exact_scorer" = "edsnlp.metrics.ner:NerExactMetric" -"eds.ner_token_scorer" = "edsnlp.metrics.ner:NerTokenMetric" -"eds.ner_overlap_scorer" = "edsnlp.metrics.ner:NerOverlapMetric" -"eds.span_attributes_scorer" = "edsnlp.metrics.span_attributes:SpanAttributeMetric" +"eds.ner_exact_metric" = "edsnlp.metrics.ner:NerExactMetric" +"eds.ner_token_metric" = "edsnlp.metrics.ner:NerTokenMetric" +"eds.ner_overlap_metric" = "edsnlp.metrics.ner:NerOverlapMetric" +"eds.span_attributes_metric" = "edsnlp.metrics.span_attributes:SpanAttributeMetric" +"eds.span_attributes" = "edsnlp.metrics.span_attribute:SpanAttributeMetric" +"eds.ner_exact_scorer" = "edsnlp.metrics.ner:NerExactMetric" +"eds.ner_token_scorer" = "edsnlp.metrics.ner:NerTokenMetric" +"eds.ner_overlap_scorer" = "edsnlp.metrics.ner:NerOverlapMetric" +"eds.span_attributes_scorer" = "edsnlp.metrics.span_attributes:SpanAttributeMetric" [project.entry-points."edsnlp_readers"] "spark" = "edsnlp.data:from_spark" diff --git a/tests/test_docs.py b/tests/test_docs.py index b6975e7b74..58de9a3cfa 100644 --- a/tests/test_docs.py +++ b/tests/test_docs.py @@ -4,6 +4,7 @@ import sys import textwrap import warnings +from math import isclose import catalogue import pytest @@ -30,6 +31,42 @@ assert len(url_to_code) > 50 +class nested_approx: + def __init__(self, value, rel=1e-12, abs=1e-12): + self._value, self._rel, self._abs = value, rel, abs + + def __eq__(self, other): + return self._match(self._value, other) + + def __req__(self, other): + return self._match(other, self._value) + + __hash__ = None # keep it un-hashable + + def _match(self, a, b): + if isinstance(a, (int, float)) and isinstance(b, (int, float)): + return isclose(a, b, rel_tol=self._rel, abs_tol=self._abs) + if isinstance(a, (list, tuple)): + return ( + isinstance(b, (list, tuple)) + and len(a) == len(b) + and all(self._match(x, y) for x, y in zip(a, b)) + ) + if isinstance(a, dict): + return ( + isinstance(b, dict) + and a.keys() == b.keys() + and all(self._match(a[k], b[k]) for k in a) + ) + return a == b + + def __repr__(self): + return f"nested_approx({self._value!r}, rel={self._rel}, abs={self._abs})" + + +pytest.nested_approx = nested_approx + + def printer(code: str) -> None: """ Prints a code bloc with lines for easier debugging. 
@@ -62,16 +99,22 @@ def insert_assert_statements(code): if stmt.end_lineno == lineno: if isinstance(stmt, ast.Expr): expected = textwrap.dedent(match.group(1)).replace("\n# ", "\n") + expected_s = expected begin = line_table[stmt.lineno - 1] if not (expected.startswith("'") or expected.startswith('"')): - expected = repr(expected) + expected_s = repr(expected) end = match.end() stmt_str = ast.unparse(stmt) if stmt_str.startswith("print("): stmt_str = stmt_str[len("print") :] repl = f"""\ -value = {stmt_str} -assert {expected} == str(value) +val = {stmt_str} +try: + import ast + expected = ast.literal_eval({expected_s}) +except (ValueError, SyntaxError): + expected = None +assert str(val) == {expected_s} or val == pytest.nested_approx(expected, 0.01, 0.01) """ replacements.append((begin, end, repl)) if isinstance(stmt, ast.For): @@ -83,7 +126,7 @@ def insert_assert_statements(code): repl = f"""\ printed = [] {stmt_str} -assert {expected} == printed +assert printed == {expected} """ replacements.append((begin, end, repl)) @@ -123,6 +166,8 @@ def reset_imports(): def test_code_blocks(url, tmpdir, reset_imports): code = url_to_code[url] code_with_asserts = """ +import pytest + def assert_print(*args, sep=" ", end="\\n", file=None, flush=False): printed.append((sep.join(map(str, args)) + end).rstrip('\\n'))
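As a side note on the test-harness change above: the `nested_approx` helper compares numeric leaves with `math.isclose`, recurses into lists, tuples, and dicts, and falls back to plain equality for anything else. The snippet below is a minimal illustrative sketch, not part of the patch; it assumes the `nested_approx` class defined above is in scope, and the values are made up.

```python
# Hypothetical values, chosen only to illustrate the comparison behaviour
# (they are not taken from the test suite).
computed = {"micro": {"f": 0.5714, "p": 0.5, "r": 0.6667}, "ap": [0.17, 0.25]}
expected = {"micro": {"f": 0.57, "p": 0.5, "r": 0.67}, "ap": [0.17, 0.25]}

# With rel/abs tolerances of 0.01 (the values used in the rewritten asserts),
# the nested structures compare equal despite the rounded floats.
assert computed == nested_approx(expected, rel=0.01, abs=0.01)

# A structural mismatch (e.g., a missing key) still fails the comparison.
assert not (computed == nested_approx({"micro": {"f": 0.57}}, rel=0.01, abs=0.01))
```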