diff --git a/.vscode/settings.json b/.vscode/settings.json
deleted file mode 100644
index f83ca5cf..00000000
--- a/.vscode/settings.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-    "python.analysis.typeCheckingMode": "strict",
-    "python.testing.pytestArgs": [
-        "src"
-    ],
-    "python.testing.unittestEnabled": false,
-    "python.testing.pytestEnabled": true,
-}
\ No newline at end of file
diff --git a/data-processing/pyproject.toml b/data-processing/pyproject.toml
index b25fefb6..9ffd7eba 100644
--- a/data-processing/pyproject.toml
+++ b/data-processing/pyproject.toml
@@ -30,6 +30,7 @@ dependencies = [
     "tqdm",
     "zstandard",
     "nlp_dedup",
+    "pyyaml",
 ]
 
 [project.optional-dependencies]
diff --git a/data-processing/scripts/dataset_validator.py b/data-processing/scripts/dataset_validator.py
new file mode 100644
index 00000000..fb006ad8
--- /dev/null
+++ b/data-processing/scripts/dataset_validator.py
@@ -0,0 +1,266 @@
+"""
+This CLI tool evaluates whether datasets adhere to the expected schema.
+
+The tool takes two arguments:
+- dataset_folder: Path to the dataset folder. On UCloud this is danish-foundation-models/dfm-data/pre-training
+- datasheets_folder: Path to the datasheets folder. On GitHub this is 'danish-foundation-models/docs/datasheets'
+
+The tool then checks the following attributes of the datasets in the dataset_folder:
+
+Datasheets:
+
+A datasheet of the same name should be located in the datasheets_folder and should contain the following information:
+- License
+- Languages
+
+Folder Structure:
+
+The dataset folder should contain one folder per dataset. Each dataset folder should have the following structure:
+
+```
+dataset_folder
+│
+└── dataset_name
+    │
+    ├── documents
+    │   ├── document1.jsonl.gz  # MANDATORY: one or more files containing the documents in the dataset
+    │   └── ...
+    │
+    └── attributes              # OPTIONAL: folder containing annotations from dataset cleaning
+```
+
+
+Schema:
+
+An entry in the dataset should adhere to the Document schema (defined below).
+
+```
+{
+    "id": "...",       # MANDATORY: source-specific identifier
+    "text": "foo",     # MANDATORY: textual content of the document
+    "source": "...",   # MANDATORY: source of the data, such as peS2o, common-crawl, etc.
+    "added": "...",    # MANDATORY: timestamp we acquired this data (time file was created), specified as
+                       # YYYY-MM-DD e.g. 2021-04-13
+    "created": "...",  # MANDATORY: timestamp when orig document was created (best-guess if not available),
+                       # should be specified as a range;
+                       # "YYYY-MM-DD, YYYY-MM-DD"
+    "metadata": {      # OPTIONAL: source-specific metadata
+        ...
+    }
+}
+```
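+
+Example invocation (this mirrors the `validate-datasets` target in the repository makefile;
+the paths below assume the standard UCloud setup and should be adjusted to your own):
+
+    python data-processing/scripts/dataset_validator.py \
+        --dataset_folder /work/dfm-data/pre-training \
+        --datasheets_folder /work/danish-foundation-models/docs/datasheets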
+"""
+
+
+import argparse
+import gzip
+import json
+import logging
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+
+import yaml
+from pydantic import BaseModel, field_validator
+from tqdm import tqdm
+
+logger = logging.getLogger(__name__)
+
+
+class Document(BaseModel):
+    id: str  # noqa
+    text: str
+    source: str
+    added: str
+    created: str
+    metadata: dict[str, Any] = {}
+
+    @field_validator("added")
+    @classmethod
+    def check_timestamp_format(cls, v: str):  # noqa
+        if not v:
+            raise ValueError("Timestamp 'added' is required.")
+        try:
+            datetime.strptime(v, "%Y-%m-%dT%H:%M:%S.%fZ")
+        except ValueError:
+            raise ValueError(
+                "Timestamp 'added' should be in the format 'YYYY-MM-DDTHH:MM:SS.fffZ', e.g. '2021-04-13T12:52:46.000Z'.",
+            )
+        return v
+
+    @field_validator("created")
+    @classmethod
+    def check_created_format(cls, v: str):
+        if not v:
+            raise ValueError("Timestamp 'created' is required.")
+        try:
+            start, end = v.split(", ")
+            start_date = datetime.strptime(start, "%Y-%m-%d")
+            end_date = datetime.strptime(end, "%Y-%m-%d")
+            if start_date > end_date:
+                raise ValueError(
+                    "The start of the 'created' range must not be after its end.",
+                )
+        except ValueError as e:
+            raise ValueError(
+                "Timestamp 'created' should be a range in the format 'YYYY-MM-DD, YYYY-MM-DD'. Got additional error:\n"
+                + str(e),
+            )
+        return v
+
+
+def check_first_entry(document_file: Path):
+    """
+    Read the first entry of a jsonl.gz file and validate it against the Document schema.
+    """
+    dataset_name = document_file.parent.parent.name
+    with gzip.open(document_file, "rb") as f:
+        line = f.readline().decode("utf-8")
+
+    json_entry = json.loads(line)
+    doc = Document(**json_entry)
+    assert (
+        doc.source == dataset_name
+    ), f"Source should be {dataset_name}, but is {doc.source}"
+
+
+def create_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--dataset_folder",
+        type=str,
+        help="Path to the dataset folder. On UCloud this is danish-foundation-models/dfm-data/pre-training",
+    )
+    parser.add_argument(
+        "--datasheets_folder",
+        type=str,
+        help="Path to the datasheets folder. On GitHub this is 'danish-foundation-models/docs/datasheets'",
+    )
+    return parser
+
+
+def convert_to_paths(dataset_folder: str, datasheets_folder: str) -> tuple[Path, Path]:
+    _dataset_folder = Path(dataset_folder)
+    _datasheets_folder = Path(datasheets_folder)
+
+    if not _dataset_folder.exists():
+        raise FileNotFoundError(
+            f"Dataset folder {_dataset_folder.resolve()} does not exist.",
+        )
+    if not _datasheets_folder.exists():
+        raise FileNotFoundError(
+            f"Datasheets folder {_datasheets_folder.resolve()} does not exist.",
+        )
+
+    return _dataset_folder, _datasheets_folder
+
+
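+# NOTE: check_datasheet() below expects each datasheet to start with a YAML frontmatter
+# block delimited by '---' lines. A minimal sketch of such a frontmatter (the values are
+# illustrative placeholders, not taken from a real datasheet):
+#
+# ---
+# license: other
+# license_name: <name of the license>
+# language:
+#   - da
+# ---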
+def check_datasheet(dataset_path: Path, datasheets_path: Path) -> str:
+    datasheet_path = datasheets_path / dataset_path.name
+    msg = ""
+
+    if not datasheet_path.exists():
+        msg += f"Datasheet {datasheet_path.name} does not exist.\n"
+        return msg
+
+    # extract frontmatter from datasheet
+    try:
+        with datasheet_path.open("r") as f:
+            markdown = f.read()
+        frontmatter = yaml.safe_load(markdown.split("---")[1])
+    except Exception as e:
+        msg += f"Error reading datasheet {datasheet_path.name}: {e}\n"
+        return msg
+
+    # check if datasheet contains required fields
+    required_fields = ["license", "language"]
+    missing_fields = [field for field in required_fields if field not in frontmatter]
+    if missing_fields:
+        msg += f"Datasheet {datasheet_path.name} is missing the following fields: {missing_fields}\n"
+
+    # check license == "other"
+    if frontmatter.get("license") == "other":
+        # license name should be specified
+        if not frontmatter.get("license_name"):
+            msg += f"Datasheet {datasheet_path.name} has license 'other' but is missing 'license_name' field\n"
+
+    return msg
+
+
+def check_folder_structure(dataset_path: Path) -> str:
+    msg = ""
+
+    required_folders = ["documents"]
+    allowed_folders = ["attributes"]
+    subfolders = list(dataset_path.glob("*"))
+
+    for folder in subfolders:
+        if folder.is_dir():
+            if folder.name not in required_folders + allowed_folders:
+                msg += f"Folder {folder.name} is not allowed in dataset {dataset_path.name}\n"
+        else:
+            msg += f"File {folder.name} is not allowed in dataset {dataset_path.name}\n"
+    return msg
+
+
+def check_schema(dataset_path: Path) -> str:
+    msg = ""
+    documents_path = dataset_path / "documents"
+
+    if not documents_path.exists():
+        msg += f"Folder 'documents' does not exist in dataset {dataset_path.name}\n"
+        return msg
+
+    document_files = list(documents_path.glob("*.jsonl.gz"))
+    if not document_files:
+        msg += f"Folder 'documents' does not contain any document files in dataset {dataset_path.name}\n"
+
+    n_errors = 3  # only report up to 3 errors
+    for document_file in document_files:
+        try:
+            check_first_entry(document_file)
+        except Exception as e:
+            n_errors -= 1
+            msg += f"Error in document file {document_file.name}: {e}\n"
+            if n_errors < 1:
+                break
+
+    return msg
+
+
+def check_datasets(dataset_folder: str, datasheets_folder: str):
+    logger.info(
+        f"Checking datasets in {dataset_folder} against datasheets in {datasheets_folder}.",
+    )
+
+    datasets_path, datasheets_path = convert_to_paths(dataset_folder, datasheets_folder)
+    datasets = list(datasets_path.glob("*"))
+
+    failed_datasets: list[str] = []
+
+    pbar = tqdm(datasets)
+
+    for dataset_path in pbar:
+        # update progress bar description
+        pbar.set_description(f"Checking dataset: {dataset_path.name}")  # type: ignore
+
+        msg = check_datasheet(dataset_path, datasheets_path)
+        msg += check_folder_structure(dataset_path)
+        msg += check_schema(dataset_path)
+
+        dataset_failed_checks = msg != ""
+        if dataset_failed_checks:
+            logger.error(
+                f"--- Dataset {dataset_path.name} failed validation ------------",
+            )
+            logger.error(msg)
+
+        if dataset_failed_checks:
+            failed_datasets.append(dataset_path.name)
+
+    if failed_datasets:
+        logger.error("The following datasets failed validation:")
+        logger.error("\n - ".join(failed_datasets))
+
+
+if __name__ == "__main__":
+    parser = create_parser()
+    args = parser.parse_args()
+    logging.basicConfig(level=logging.INFO)
+
+    check_datasets(args.dataset_folder, args.datasheets_folder)
diff --git a/literature.md b/literature.md
deleted file mode 100644
index 45e4329a..00000000
--- a/literature.md
+++ /dev/null
@@ -1,23 +0,0 @@
-# Litererature
-This is a working document for literature. This is much for sharing and organizing material as opposed to the Zotero which is for citation management.
-
-## Model suggestions
-- [DeBERTa v3](https://arxiv.org/abs/2111.09543?context=cs)
-
-## Training
-- Performance [docs](https://huggingface.co/docs/transformers/performance) for transformers
-- [Deepspeed blog post](https://www.deepspeed.ai/news/2021/12/09/deepspeed-moe-nlg.html)
-
-potentially also:
-- [xformer](https://devblog.pytorchlightning.ai/part-i-simplifying-transformer-research-with-xformers-lightning-a715737b8ad4)
-
-
-# Collaborative models
-- Blog Post by Collin on building models like [open-source code](https://colinraffel.com/blog/a-call-to-build-models-like-we-build-open-source-software.html).
-
-
-# Large scale models:
-- Google recently released 3 new papers on large scale language models, there is a blog post detailing it [here](https://deepmind.com/blog/article/language-modelling-at-scale).
-
-# Data cleaning
-- [Deduplicating Training Data Makes Language Models Better](https://arxiv.org/abs/2107.06499)
diff --git a/makefile b/makefile
index 4f91530a..9faa064b 100644
--- a/makefile
+++ b/makefile
@@ -1,33 +1,19 @@
-install:
-	@echo "Installing package and required dependencies"
-	pip install -e .[dev,test,docs]
+validate-datasets:
+	@echo "Validating that all datasets are correctly formatted"
+	@echo "Note that this command assumes the 'dfm-data' folder and the 'danish-foundation-models' GitHub repo folder are available in your UCloud setup"
+	python data-processing/scripts/dataset_validator.py --dataset_folder /work/dfm-data/pre-training --datasheets_folder /work/danish-foundation-models/docs/datasheets
 
-test:
-	@echo "Running tests"
-	pytest src
+data-processing-install:
+	@echo "Installing package and required dependencies"
+	pip install -e "data-processing/.[dev,test,docs]"
 
 lint:
 	@echo "Linting code"
 	ruff check src --fix
 	black .
 
-type-check:
-	@echo "Type-checking code-base"
-	pyright src
-
-validate:
-	@echo "Running all checks"
-	make lint
-	make type-check
-	make test
-
-pr:
-	@echo "Running relevant checks before PR"
-	make validate
-	gh pr create -w
-
 docs-serve:
 	@echo "Serving documentation"
 	@echo "Make sure you have installed docs:"
-	@echo "pip install -e .[docs]"
+	@echo "pip install -r docs/requirements.txt"
 	mkdocs serve
\ No newline at end of file