diff --git a/.envrc b/.envrc new file mode 100644 index 0000000..e21dc40 --- /dev/null +++ b/.envrc @@ -0,0 +1,52 @@ +# Environment variables go here, and can be read in by Python using `os.getenv`: +# +# -------------------------------------------------------- +# import os +# +# # Example variable +# EXAMPLE_VARIABLE = os.getenv("EXAMPLE_VARIABLE") +# -------------------------------------------------------- +# +# To ensure the `sed` command below works correctly, make sure all file paths in environment variables are absolute +# (recommended), or are relative paths using other environment variables (works for Python users only). Environment +# variable names are expected to contain letters, numbers or underscores only. +# +# DO NOT STORE SECRETS HERE - this file is version-controlled! You should store secrets in a `.secrets` file, which is +# not version-controlled - this can then be sourced here, using `source_env ".secrets"`. + +# Extract the variables to `.env` if required. Note `.env` is NOT version-controlled, so `.secrets` will not be +# committed +sed -n 's/^export \(.*\)$/\1/p' .envrc .secrets | sed -e 's?$(pwd)?'"$(pwd)"'?g' | sed -e 's?$\([a-zA-Z0-9_]\{1,\}\)?${\1}?g' > .env + +# Add the working directory to `PYTHONPATH`; allows Jupyter notebooks in the `notebooks` folder to import `src` +export PYTHONPATH="$PYTHONPATH:$(pwd)" + +# Import secrets from an untracked file `.secrets` +source_env ".secrets" + +# Add environment variables for the `data` directories +export DIR_DATA=$(pwd)/data +export DIR_DATA_EXTERNAL=$(pwd)/data/external +export DIR_DATA_RAW=$(pwd)/data/raw +export DIR_DATA_INTERIM=$(pwd)/data/interim +export DIR_DATA_PROCESSED=$(pwd)/data/processed + +# Add environment variables for the `docs` directory +export DIR_DOCS=$(pwd)/docs + +# Add environment variables for the `notebooks` directory +export DIR_NOTEBOOKS=$(pwd)/notebooks + +# Add environment variables for the `outputs` directory +export DIR_OUTPUTS=$(pwd)/outputs + +# Add 
environment variables for the `src` directories +export DIR_SRC=$(pwd)/src +export DIR_SRC_MAKE_DATA=$(pwd)/src/make_data +export DIR_SRC_MAKE_FEATURES=$(pwd)/src/make_features +export DIR_SRC_MAKE_MODELS=$(pwd)/src/make_models +export DIR_SRC_MAKE_VISUALISATIONS=$(pwd)/src/make_visualisations +export DIR_SRC_UTILS=$(pwd)/src/utils + +# Add environment variables for the `tests` directory +export DIR_TESTS=$(pwd)/tests diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..e9bfa57 --- /dev/null +++ b/.flake8 @@ -0,0 +1,9 @@ +[flake8] +# Rule definitions: http://flake8.pycqa.org/en/latest/user/error-codes.html +# D203: 1 blank line required before class docstring +# W503: line break before binary operator +exclude = venv*,__pycache__,node_modules,bower_components,migrations +ignore = D203,W503 +max-complexity = 9 +max-line-length = 88 +extend-ignore = E203 diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 0000000..a6fc3c0 --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,34 @@ +# Summary + +Add your summary here - keep it brief, to the point, and in plain English. [For further +information about pull requests, check out the GDS +Way](https://gds-way.cloudapps.digital/standards/pull-requests.html). + +# Checklists + + + +This pull/merge request meets the following requirements: + +- [ ] code runs +- [ ] [developments are ethical][data-ethics-framework] and secure +- [ ] you have made proportionate checks that the code works correctly +- [ ] test suite passes +- [ ] developments adhere to AQA plan (see `docs/aqa/aqa_plan.md`) +- [ ] data log updated (see `docs/aqa/data_log.md`), if necessary +- [ ] assumptions and caveats log updated (see `docs/aqa/assumptions_caveats.md`), if + necessary +- [ ] [minimum usable documentation][agilemodeling] written in the `docs` folder + +Comments have been added below around the incomplete checks. 
+ +[agilemodeling]: http://agilemodeling.com/essays/documentLate.htm +[data-ethics-framework]: https://www.gov.uk/government/publications/data-ethics-framework diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1064212 --- /dev/null +++ b/.gitignore @@ -0,0 +1,886 @@ +# Created by https://www.toptal.com/developers/gitignore/api/vim,venv,pydev,linux,macos,flask,dotenv,django,direnv,python,windows,virtualenv,pycharm+all,visualstudio,jupyternotebooks,visualstudiocode +# Edit at https://www.toptal.com/developers/gitignore?templates=vim,venv,pydev,linux,macos,flask,dotenv,django,direnv,python,windows,virtualenv,pycharm+all,visualstudio,jupyternotebooks,visualstudiocode + +### direnv ### +.direnv +#.envrc + +### Django ### +*.log +*.pot +*.pyc +__pycache__/ +local_settings.py +db.sqlite3 +db.sqlite3-journal +media + +# If your build process includes running collectstatic, then you probably don't need or want to include staticfiles/ +# in your Git repository. Update and uncomment the following line accordingly. +# /staticfiles/ + +### Django.Python Stack ### +# Byte-compiled / optimized / DLL files +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +pytestdebug.log + +# Translations +*.mo + +# Django stuff: + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ +doc/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +pythonenv* + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# profiling data +.prof + +### dotenv ### + +### Flask ### +instance/* +!instance/.gitignore + +### Flask.Python Stack ### +# Byte-compiled / optimized / DLL files + +# C extensions + +# Distribution / packaging + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+ +# Installer logs + +# Unit test / coverage reports + +# Translations + +# Django stuff: + +# Flask stuff: + +# Scrapy stuff: + +# Sphinx documentation + +# PyBuilder + +# Jupyter Notebook + +# IPython + +# pyenv + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow + +# Celery stuff + +# SageMath parsed files + +# Environments + +# Spyder project settings + +# Rope project settings + +# mkdocs documentation + +# mypy + +# Pyre type checker + +# pytype static type analyzer + +# profiling data + +### JupyterNotebooks ### +# gitignore template for Jupyter Notebooks +# website: http://jupyter.org/ + +*/.ipynb_checkpoints/* + +# IPython + +# Remove previous ipynb_checkpoints +# git rm -r .ipynb_checkpoints/ + +### Linux ### +*~ + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* + +### macOS ### +# General +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +### PyCharm+all ### +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider +# Reference: 
https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# Generated files +.idea/**/contentModel.xml + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# Gradle and Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. +# .idea/artifacts +# .idea/compiler.xml +# .idea/jarRepositories.xml +# .idea/modules.xml +# .idea/*.iml +# .idea/modules +# *.iml +# *.ipr + +# CMake +cmake-build-*/ + +# Mongo Explorer plugin +.idea/**/mongoSettings.xml + +# File-based project format +*.iws + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +# Editor-based Rest Client +.idea/httpRequests + +# Android studio 3.1+ serialized cache file +.idea/caches/build_file_checksums.ser + +### PyCharm+all Patch ### +# Ignores the whole .idea folder and all .iml files +# See https://github.com/joeblau/gitignore.io/issues/186 and https://github.com/joeblau/gitignore.io/issues/360 + +.idea/ + +# Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023 + +*.iml +modules.xml +.idea/misc.xml +*.ipr + +# Sonarlint plugin +.idea/sonarlint + +### pydev ### +.pydevproject + +### Python ### +# Byte-compiled / optimized / DLL files + +# C extensions + +# Distribution / packaging + +# PyInstaller +# Usually 
these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. + +# Installer logs + +# Unit test / coverage reports + +# Translations + +# Django stuff: + +# Flask stuff: + +# Scrapy stuff: + +# Sphinx documentation + +# PyBuilder + +# Jupyter Notebook + +# IPython + +# pyenv + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow + +# Celery stuff + +# SageMath parsed files + +# Environments + +# Spyder project settings + +# Rope project settings + +# mkdocs documentation + +# mypy + +# Pyre type checker + +# pytype static type analyzer + +# profiling data + +### venv ### +# Virtualenv +# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ +[Bb]in +[Ii]nclude +[Ll]ib +[Ll]ib64 +[Ll]ocal +[Ss]cripts +pyvenv.cfg +pip-selfcheck.json + +### Vim ### +# Swap +[._]*.s[a-v][a-z] +!*.svg # comment out if you don't need vector files +[._]*.sw[a-p] +[._]s[a-rt-v][a-z] +[._]ss[a-gi-z] +[._]sw[a-p] + +# Session +Session.vim +Sessionx.vim + +# Temporary +.netrwhist +# Auto-generated tag files +tags +# Persistent undo +[._]*.un~ + +### VirtualEnv ### +# Virtualenv +# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ + +### VisualStudioCode ### +.vscode/* +#!.vscode/tasks.json +#!.vscode/launch.json +*.code-workspace + +### VisualStudioCode Patch ### +# Ignore all local history of files +.history +.ionide + +### Windows ### +# Windows thumbnail cache files +Thumbs.db +Thumbs.db:encryptable +ehthumbs.db +ehthumbs_vista.db + +# Dump file +*.stackdump + +# Folder config file +[Dd]esktop.ini + +# Recycle Bin used on file shares +$RECYCLE.BIN/ + +# Windows Installer files 
+*.cab +*.msi +*.msix +*.msm +*.msp + +# Windows shortcuts +*.lnk + +### VisualStudio ### +## Ignore Visual Studio temporary files, build results, and +## files generated by popular Visual Studio add-ons. +## +## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore + +# User-specific files +*.rsuser +*.suo +*.user +*.userosscache +*.sln.docstates + +# User-specific files (MonoDevelop/Xamarin Studio) +*.userprefs + +# Mono auto generated files +mono_crash.* + +# Build results +[Dd]ebug/ +[Dd]ebugPublic/ +[Rr]elease/ +[Rr]eleases/ +x64/ +x86/ +[Aa][Rr][Mm]/ +[Aa][Rr][Mm]64/ +bld/ +[Bb]in/ +[Oo]bj/ +[Ll]og/ +[Ll]ogs/ + +# Visual Studio 2015/2017 cache/options directory +.vs/ +# Uncomment if you have tasks that create the project's static files in wwwroot +#wwwroot/ + +# Visual Studio 2017 auto generated files +Generated\ Files/ + +# MSTest test Results +[Tt]est[Rr]esult*/ +[Bb]uild[Ll]og.* + +# NUnit +*.VisualState.xml +TestResult.xml +nunit-*.xml + +# Build Results of an ATL Project +[Dd]ebugPS/ +[Rr]eleasePS/ +dlldata.c + +# Benchmark Results +BenchmarkDotNet.Artifacts/ + +# .NET Core +project.lock.json +project.fragment.lock.json +artifacts/ + +# StyleCop +StyleCopReport.xml + +# Files built by Visual Studio +*_i.c +*_p.c +*_h.h +*.ilk +*.meta +*.obj +*.iobj +*.pch +*.pdb +*.ipdb +*.pgc +*.pgd +*.rsp +*.sbr +*.tlb +*.tli +*.tlh +*.tmp +*.tmp_proj +*_wpftmp.csproj +*.vspscc +*.vssscc +.builds +*.pidb +*.svclog +*.scc + +# Chutzpah Test files +_Chutzpah* + +# Visual C++ cache files +ipch/ +*.aps +*.ncb +*.opendb +*.opensdf +*.sdf +*.cachefile +*.VC.db +*.VC.VC.opendb + +# Visual Studio profiler +*.psess +*.vsp +*.vspx +*.sap + +# Visual Studio Trace Files +*.e2e + +# TFS 2012 Local Workspace +$tf/ + +# Guidance Automation Toolkit +*.gpState + +# ReSharper is a .NET coding add-in +_ReSharper*/ +*.[Rr]e[Ss]harper +*.DotSettings.user + +# TeamCity is a build add-in +_TeamCity* + +# DotCover is a Code Coverage Tool +*.dotCover + +# 
AxoCover is a Code Coverage Tool +.axoCover/* +!.axoCover/settings.json + +# Coverlet is a free, cross platform Code Coverage Tool +coverage*[.json, .xml, .info] + +# Visual Studio code coverage results +*.coverage +*.coveragexml + +# NCrunch +_NCrunch_* +.*crunch*.local.xml +nCrunchTemp_* + +# MightyMoose +*.mm.* +AutoTest.Net/ + +# Web workbench (sass) +.sass-cache/ + +# Installshield output folder +[Ee]xpress/ + +# DocProject is a documentation generator add-in +DocProject/buildhelp/ +DocProject/Help/*.HxT +DocProject/Help/*.HxC +DocProject/Help/*.hhc +DocProject/Help/*.hhk +DocProject/Help/*.hhp +DocProject/Help/Html2 +DocProject/Help/html + +# Click-Once directory +publish/ + +# Publish Web Output +*.[Pp]ublish.xml +*.azurePubxml +# Note: Comment the next line if you want to checkin your web deploy settings, +# but database connection strings (with potential passwords) will be unencrypted +*.pubxml +*.publishproj + +# Microsoft Azure Web App publish settings. Comment the next line if you want to +# checkin your Azure Web App publish settings, but sensitive information contained +# in these scripts will be unencrypted +PublishScripts/ + +# NuGet Packages +*.nupkg +# NuGet Symbol Packages +*.snupkg +# The packages folder can be ignored because of Package Restore +**/[Pp]ackages/* +# except build/, which is used as an MSBuild target. 
+!**/[Pp]ackages/build/ +# Uncomment if necessary however generally it will be regenerated when needed +#!**/[Pp]ackages/repositories.config +# NuGet v3's project.json files produces more ignorable files +*.nuget.props +*.nuget.targets + +# Microsoft Azure Build Output +csx/ +*.build.csdef + +# Microsoft Azure Emulator +ecf/ +rcf/ + +# Windows Store app package directories and files +AppPackages/ +BundleArtifacts/ +Package.StoreAssociation.xml +_pkginfo.txt +*.appx +*.appxbundle +*.appxupload + +# Visual Studio cache files +# files ending in .cache can be ignored +*.[Cc]ache +# but keep track of directories ending in .cache +!?*.[Cc]ache/ + +# Others +ClientBin/ +~$* +*.dbmdl +*.dbproj.schemaview +*.jfm +*.pfx +*.publishsettings +orleans.codegen.cs + +# Including strong name files can present a security risk +# (https://github.com/github/gitignore/pull/2483#issue-259490424) +#*.snk + +# Since there are multiple workflows, uncomment next line to ignore bower_components +# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) +#bower_components/ + +# RIA/Silverlight projects +Generated_Code/ + +# Backup & report files from converting an old project file +# to a newer Visual Studio version. Backup files are not needed, +# because we have git ;-) +_UpgradeReport_Files/ +Backup*/ +UpgradeLog*.XML +UpgradeLog*.htm +ServiceFabricBackup/ +*.rptproj.bak + +# SQL Server files +*.mdf +*.ldf +*.ndf + +# Business Intelligence projects +*.rdl.data +*.bim.layout +*.bim_*.settings +*.rptproj.rsuser +*- [Bb]ackup.rdl +*- [Bb]ackup ([0-9]).rdl +*- [Bb]ackup ([0-9][0-9]).rdl + +# Microsoft Fakes +FakesAssemblies/ + +# GhostDoc plugin setting file +*.GhostDoc.xml + +# Node.js Tools for Visual Studio +.ntvs_analysis.dat +node_modules/ + +# Visual Studio 6 build log +*.plg + +# Visual Studio 6 workspace options file +*.opt + +# Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
+*.vbw + +# Visual Studio LightSwitch build output +**/*.HTMLClient/GeneratedArtifacts +**/*.DesktopClient/GeneratedArtifacts +**/*.DesktopClient/ModelManifest.xml +**/*.Server/GeneratedArtifacts +**/*.Server/ModelManifest.xml +_Pvt_Extensions + +# Paket dependency manager +.paket/paket.exe +paket-files/ + +# FAKE - F# Make +.fake/ + +# CodeRush personal settings +.cr/personal + +# Python Tools for Visual Studio (PTVS) + +# Cake - Uncomment if you are using it +# tools/** +# !tools/packages.config + +# Tabs Studio +*.tss + +# Telerik's JustMock configuration file +*.jmconfig + +# BizTalk build output +*.btp.cs +*.btm.cs +*.odx.cs +*.xsd.cs + +# OpenCover UI analysis results +OpenCover/ + +# Azure Stream Analytics local run output +ASALocalRun/ + +# MSBuild Binary and Structured Log +*.binlog + +# NVidia Nsight GPU debugger configuration file +*.nvuser + +# MFractors (Xamarin productivity tool) working folder +.mfractor/ + +# Local History for Visual Studio +.localhistory/ + +# BeatPulse healthcheck temp database +healthchecksdb + +# Backup folder for Package Reference Convert tool in Visual Studio 2017 +MigrationBackup/ + +# Ionide (cross platform F# VS Code tools) working folder +.ionide/ + +# End of https://www.toptal.com/developers/gitignore/api/vim,venv,pydev,linux,macos,flask,dotenv,django,direnv,python,windows,virtualenv,pycharm+all,visualstudio,jupyternotebooks,visualstudiocode + +### Intro workshop about GOV.UK data science ### + +# Ignore the contents of the `data` folder, except for the `README.md` file +data/* +!data/README.md + +# Ignore the contents of the `data` sub-folders, except for their `.gitkeep` files; adapted from +# https://stackoverflow.com/a/20652768 +!data/**/ +data/external/* +data/raw/* +data/interim/* +data/processed/* +!data/external/.gitkeep +!data/raw/.gitkeep +!data/interim/.gitkeep +!data/processed/.gitkeep + +# Ignore the `docs/reference/api` folder +docs/reference/api/* + +# Ignore the `.secrets` file +.secrets + +# Ignore R 
artifacts +*.Renviron +*.Rhistory + +# Ignore Sphinx documentation link checking folder +docs/_linkcheck/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..eabe49b --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,62 @@ +# See https://pre-commit.com for more information +# See https://pre-commit.com/hooks.html for more hooks +repos: + - repo: https://github.com/kynan/nbstripout + rev: 0.4.0 + hooks: + - id: nbstripout + name: nbstripout - Strip outputs from notebooks (auto-fixes) + args: + - --extra-keys + - "metadata.colab metadata.kernelspec cell.metadata.colab cell.metadata.executionInfo cell.metadata.id cell.metadata.outputId" + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.0.1 + hooks: + - id: check-added-large-files + name: Check for files larger than 5 MB + args: [ "--maxkb=5120" ] + - id: end-of-file-fixer + name: Check for a blank line at the end of scripts (auto-fixes) + exclude: '\.Rd' + - id: trailing-whitespace + name: Check for trailing whitespaces (auto-fixes) + - repo: https://github.com/pycqa/isort + rev: 5.8.0 + hooks: + - id: isort + name: isort - Sort Python imports (auto-fixes) + types: [ cython, pyi, python ] + args: [ "--profile", "black", "--filter-files" ] + - repo: https://github.com/psf/black + rev: 21.5b2 # Replace by any tag/version: https://github.com/psf/black/tags + hooks: + - id: black + name: black - consistent Python code formatting (auto-fixes) + language_version: python # Should be a command that runs python3.6+ + - repo: https://gitlab.com/pycqa/flake8 + rev: 3.9.2 + hooks: + - id: flake8 + name: flake8 - Python linting + - repo: https://github.com/nbQA-dev/nbQA + rev: 0.12.0 + hooks: + - id: nbqa-isort + name: nbqa-isort - Sort Python imports (notebooks; auto-fixes) + args: [ --nbqa-mutate ] + additional_dependencies: [ isort==5.8.0 ] + - id: nbqa-black + name: nbqa-black - consistent Python code formatting (notebooks; auto-fixes) + args: [ --nbqa-mutate 
] + additional_dependencies: [ black==21.5b2 ] + # TODO: Disabled for now until it's clear how to add noqa to specific cells of a Jupyter notebook + #- id: nbqa-flake8 + # name: nbqa-flake8 - Python linting (notebooks) + # additional_dependencies: [ flake8==3.9.2 ] + - repo: https://github.com/Yelp/detect-secrets + rev: v1.0.3 + hooks: + - id: detect-secrets + name: detect-secrets - Detect secrets in staged code + args: [ "--baseline", ".secrets.baseline" ] + exclude: .*/tests/.*|^\.cruft\.json$ diff --git a/.secrets.baseline b/.secrets.baseline new file mode 100644 index 0000000..58012f6 --- /dev/null +++ b/.secrets.baseline @@ -0,0 +1,94 @@ +{ + "version": "1.0.3", + "plugins_used": [ + { + "name": "ArtifactoryDetector" + }, + { + "name": "AWSKeyDetector" + }, + { + "name": "AzureStorageKeyDetector" + }, + { + "name": "Base64HighEntropyString", + "limit": 4.5 + }, + { + "name": "BasicAuthDetector" + }, + { + "name": "CloudantDetector" + }, + { + "name": "HexHighEntropyString", + "limit": 3.0 + }, + { + "name": "IbmCloudIamDetector" + }, + { + "name": "IbmCosHmacDetector" + }, + { + "name": "JwtTokenDetector" + }, + { + "name": "KeywordDetector", + "keyword_exclude": "" + }, + { + "name": "MailchimpDetector" + }, + { + "name": "NpmDetector" + }, + { + "name": "PrivateKeyDetector" + }, + { + "name": "SlackDetector" + }, + { + "name": "SoftlayerDetector" + }, + { + "name": "SquareOAuthDetector" + }, + { + "name": "StripeDetector" + }, + { + "name": "TwilioKeyDetector" + } + ], + "filters_used": [ + { + "path": "detect_secrets.filters.allowlist.is_line_allowlisted" + }, + { + "path": "detect_secrets.filters.common.is_ignored_due_to_verification_policies", + "min_level": 2 + }, + { + "path": "detect_secrets.filters.heuristic.is_indirect_reference" + }, + { + "path": "detect_secrets.filters.heuristic.is_likely_id_string" + }, + { + "path": "detect_secrets.filters.heuristic.is_potential_uuid" + }, + { + "path": 
"detect_secrets.filters.heuristic.is_prefixed_with_dollar_sign" + }, + { + "path": "detect_secrets.filters.heuristic.is_sequential_string" + }, + { + "path": "detect_secrets.filters.heuristic.is_templated_secret" + } + ], + "results": {}, + "generated_at": "2021-06-14T10:43:14Z" +} diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..50f8976 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,6 @@ +# Code of conduct for `govuk-data-science-workshop` + +[Our code of conduct can be found at +`docs/contributor_guide/CODE_OF_CONDUCT.md`][code-of-conduct]. + +[code-of-conduct]: ./docs/contributor_guide/CODE_OF_CONDUCT.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..46ca528 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,6 @@ +# Contributing + +[Our contributing guidelines can be found at +`docs/contributor_guide/CONTRIBUTING.md`][contributing]. + +[contributing]: ./docs/contributor_guide/CONTRIBUTING.md diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..4cb022c --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 Crown copyright (Government Digital Service) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..d22da3f --- /dev/null +++ b/Makefile @@ -0,0 +1,82 @@ +.PHONY: + coverage + coverage_html + coverage_xml + docs + docs_check_external_links + help + prepare_docs_folder + requirements + +.DEFAULT_GOAL := help + +## Install the Python requirements for contributors, and install pre-commit hooks +requirements: + python -m pip install -U pip setuptools + python -m pip install -r requirements.txt + pre-commit install + +## Create a `docs/_build` folder, if it does not exist. Otherwise delete any sub-folders and their contents within it +prepare_docs_folder: + if [ ! -d "./docs/_build" ]; then mkdir ./docs/_build; fi + find ./docs/_build -mindepth 1 -maxdepth 1 -type d -exec rm -rf {} \; + +## Compile the Sphinx documentation in HTML format in the docs/_build folder from a clean build +docs: prepare_docs_folder requirements + sphinx-build -b html ./docs ./docs/_build + +## Check external links in the Sphinx documentation using linkcheck in the docs/_build folder from a clean build +docs_check_external_links: prepare_docs_folder requirements + sphinx-build -b linkcheck ./docs ./docs/_build + +## Run code coverage +coverage: requirements + coverage run -m pytest + +## Run code coverage, and produce a HTML output +coverage_html: coverage + coverage html + +## Run code coverage, and produce an XML output +coverage_xml: coverage + coverage xml + +## Get help on all make commands; referenced from https://github.com/drivendata/cookiecutter-data-science +help: + @echo "$$(tput bold)Available rules:$$(tput sgr0)" + @echo + @sed -n -e "/^## / { \ + h; \ + s/.*//; \ + :doc" \ + -e "H; \ + n; \ + s/^## //; \ + t doc" \ + -e "s/:.*//; \ + G; \ + s/\\n## 
/---/; \ + s/\\n/ /g; \ + p; \ + }" ${MAKEFILE_LIST} \ + | LC_ALL='C' sort --ignore-case \ + | awk -F '---' \ + -v ncol=$$(tput cols) \ + -v indent=25 \ + -v col_on="$$(tput setaf 6)" \ + -v col_off="$$(tput sgr0)" \ + '{ \ + printf "%s%*s%s ", col_on, -indent, $$1, col_off; \ + n = split($$2, words, " "); \ + line_length = ncol - indent; \ + for (i = 1; i <= n; i++) { \ + line_length -= length(words[i]) + 1; \ + if (line_length <= 0) { \ + line_length = ncol - indent - length(words[i]) - 1; \ + printf "\n%*s ", -indent, " "; \ + } \ + printf "%s ", words[i]; \ + } \ + printf "\n"; \ + }' \ + | more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars') diff --git a/README.md b/README.md new file mode 100644 index 0000000..72b487c --- /dev/null +++ b/README.md @@ -0,0 +1,67 @@ +# `govuk-data-science-workshop` + +Materials for a workshop about data science and GOV.UK + +```{warning} +Where this documentation refers to the root folder we mean where this README.md is +located. +``` + +## Getting started + +To start using this project, [first make sure your system meets its +requirements](#requirements). + +To be added. + +### Requirements + +```{note} Requirements for contributors +[Contributors have some additional requirements][contributing]! +``` + +- Python 3.6.1+ installed +- a `.secrets` file with the [required secrets and + credentials](#required-secrets-and-credentials) +- [load environment variables][docs-loading-environment-variables] from `.envrc` + +To install the Python requirements, open your terminal and enter: + +```shell +pip install -r requirements.txt +``` + +## Required secrets and credentials + +To run this project, [you need a `.secrets` file with secrets/credentials as +environmental variables][docs-loading-environment-variables-secrets]. 
The +secrets/credentials should have the following environment variable name(s): + +| Secret/credential | Environment variable name | Description | +|-------------------|---------------------------|--------------------------------------------| +| Secret 1 | `SECRET_VARIABLE_1` | Plain English description of Secret 1. | +| Credential 1 | `CREDENTIAL_VARIABLE_1` | Plain English description of Credential 1. | + +Once you've added them, [load these environment variables using +`.envrc`][docs-loading-environment-variables]. + +## Licence + +Unless stated otherwise, the codebase is released under the MIT License. This covers +both the codebase and any sample code in the documentation. The documentation is © +Crown copyright and available under the terms of the Open Government Licence v3.0. + +## Contributing + +[If you want to help us build and improve `govuk-data-science-workshop`, view our +contributing guidelines][contributing]. + +## Acknowledgements + +[This project structure is based on the `govcookiecutter` template +project][govcookiecutter]. + +[contributing]: ./docs/contributor_guide/CONTRIBUTING.md +[govcookiecutter]: https://github.com/best-practice-and-impact/govcookiecutter +[docs-loading-environment-variables]: ./docs/user_guide/loading_environment_variables.md +[docs-loading-environment-variables-secrets]: ./docs/user_guide/loading_environment_variables.md#storing-secrets-and-credentials diff --git a/conftest.py b/conftest.py new file mode 100644 index 0000000..e69de29 diff --git a/data/README.md b/data/README.md new file mode 100644 index 0000000..c362b5a --- /dev/null +++ b/data/README.md @@ -0,0 +1,29 @@ +# `data` folder overview + +Any data that needs to be stored locally should be saved in this location. This folder, +and its sub-folders, are not version-controlled. 
+ +The sub-folders should be used as follows: + +- `external`: any data that will not be processed at all, such as reference data; +- `raw`: any raw data before any processing; +- `interim`: any raw data that has been partially processed and, for whatever reason, + needs to be stored before further processing is completed; and +- `processed`: any raw or interim data that has been fully processed into its final + state. + +The paths for these directories are loaded as environment variables by the +`.envrc` file. To load them in Python, use any or all of the following code: + +```python +import os + +# Load environment variables for the `data` folder, and its sub-folders +DIR_DATA = os.getenv("DIR_DATA") +DIR_DATA_EXTERNAL = os.getenv("DIR_DATA_EXTERNAL") +DIR_DATA_RAW = os.getenv("DIR_DATA_RAW") +DIR_DATA_INTERIM = os.getenv("DIR_DATA_INTERIM") +DIR_DATA_PROCESSED = os.getenv("DIR_DATA_PROCESSED") +``` + +[docs-envrc]: ../docs/structure/README.md#envrc diff --git a/data/external/.gitkeep b/data/external/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/data/interim/.gitkeep b/data/interim/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/data/processed/.gitkeep b/data/processed/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/data/raw/.gitkeep b/data/raw/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..5f6285e --- /dev/null +++ b/docs/README.md @@ -0,0 +1,32 @@ +# `docs` folder overview + +All documentation for the project should be included in this folder in either +reStructuredText or Markdown files, with acceptable formatting for Sphinx. [Guidance on +how to write Sphinx documentation is supplied in the contributor +guide][writing-sphinx-documentation]. + +To build the documentation, run the `docs` command [from `Makefile` using the `make` +utility at the top-level of this repository][docs-makefile]. 
+ +```shell +make docs +``` + +or, alternatively, run: + +```shell +sphinx-build -b html ./docs ./docs/_build +``` + +The HTML version of this documentation can then be viewed at `docs/_build/index.html`, +relative to the top-level of this repository. + +## Analytical quality assurance (AQA) + +All analytical quality assurance (AQA) documents can be found in the `docs/aqa` folder. +These files document how this project meets organisational [guidance on producing +quality analysis for HM Government projects][aqua-book]. + +[aqua-book]: https://www.gov.uk/government/publications/the-aqua-book-guidance-on-producing-quality-analysis-for-government +[docs-makefile]: ../docs/structure/README.md#makefile +[writing-sphinx-documentation]: ../docs/contributor_guide/writing_sphinx_documentation.md diff --git a/docs/_static/.gitkeep b/docs/_static/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/docs/aqa/README.md b/docs/aqa/README.md new file mode 100644 index 0000000..6e081ee --- /dev/null +++ b/docs/aqa/README.md @@ -0,0 +1,10 @@ +# Analytical quality assurance + +These pages summarise analytical quality assurance (AQA) required for this project: + +```{toctree} +:maxdepth: 2 +./aqa_plan.md +./data_log.md +./assumptions_caveats.md +``` diff --git a/docs/aqa/aqa_plan.md b/docs/aqa/aqa_plan.md new file mode 100644 index 0000000..3fa183d --- /dev/null +++ b/docs/aqa/aqa_plan.md @@ -0,0 +1,12 @@ +# Analytical quality assurance plan + +This analytical quality assurance (AQA) plan outlines [our implementation of the +Aqua Book][aqua-book] for this project. [Further resources related to the Aqua Book +are also available on GOV.UK][aqua-book-resources]. + +This is a living document, and should be updated and/or modified as necessary. For +example, if new tasks not listed here become relevant to project success, please add +them to this plan.
+ +[aqua-book]: https://www.gov.uk/government/publications/the-aqua-book-guidance-on-producing-quality-analysis-for-government +[aqua-book-resources]: https://www.gov.uk/government/collections/aqua-book-resources diff --git a/docs/aqa/assumptions_caveats.md b/docs/aqa/assumptions_caveats.md new file mode 100644 index 0000000..30a5d03 --- /dev/null +++ b/docs/aqa/assumptions_caveats.md @@ -0,0 +1,30 @@ +# Assumptions and caveats log + +This log contains a list of assumptions and caveats used in this analysis. + +## Definitions + +Assumptions are RAG-rated according to the following definitions for quality and +impact[^1]: + +[^1]: With thanks to the Home Office Analytical Quality Assurance team for these definitions. + +| RAG | Assumption quality | Assumption impact | +|-------|---------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------| +| Green | Reliable assumption, well understood and/or documented; anything up to a validated & recent set of actual data. | Marginal assumptions; their changes have no or limited impact on the outputs. | +| Amber | Some evidence to support the assumption; may vary from a source with poor methodology to a good source that is a few years old. | Assumptions with a relevant, even if not critical, impact on the outputs. | +| Red | Little evidence to support the assumption; may vary from an opinion to a limited data source with poor methodology. | Core assumptions of the analysis; the output would be drastically affected by their change. | + +## Assumption 1: Insert plain English title here + +* Quality: Insert RAG rating here +* Impact: Insert RAG rating here + +Add plain English description here. + +## Assumption 2: Insert plain English title here + +* Quality: Insert RAG rating here +* Impact: Insert RAG rating here + +Add plain English description here. 
diff --git a/docs/aqa/data_log.md b/docs/aqa/data_log.md new file mode 100644 index 0000000..0a020f8 --- /dev/null +++ b/docs/aqa/data_log.md @@ -0,0 +1,30 @@ +# Data log + +This log contains a list of data sources used in this analysis. + +## Definitions + +Data sources are RAG-rated according to the following definitions for quality and +suitability[^1]: + +[^1]: With thanks to the Home Office Analytical Quality Assurance team for these definitions. + +| RAG | Data quality | Data suitability | +|-------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Green | Data is well understood and there are no major issues with quality. Minor issues are understood and documented. | Data is best available for the required purpose and has been validated (for example against published statistics). | +| Amber | Data is well understood. There are quality issues (for example missing values, step changes, large number of outliers) that can be explained, documented or shown to have negligible impact. | Not the ideal data set for the analysis, but the best available at the time. Results will reflect the fact that it is not the ideal data set and it will be subject to sensitivity analysis where appropriate. | +| Red | Data is not well understood. There are major quality issues that cannot be fully explained and/or have a significant impact on analysis outputs. | There are concerns about the suitability of the data set for this application, which could negatively affect the quality and accuracy of the analysis. Its derivation / sample size is not known.
| + +## Source 1: Insert plain English title here + +* Quality: Insert RAG rating here +* Suitability: Insert RAG rating here + +Add plain English description here. + +## Source 2: Insert plain English title here + +* Quality: Insert RAG rating here +* Suitability: Insert RAG rating here + +Add plain English description here. diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..54ec47c --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,201 @@ +# Intro workshop about GOV.UK data science documentation build configuration file +# +# This file is execfile()d with the current directory set to its containing dir. +# +# Note that not all possible configuration values are present in this autogenerated +# file. +# +# All configuration values have a default; values that are commented out serve to show +# the default. + +# If extensions (or modules to document with autodoc) are in another directory, add +# these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. + +import os +import sys + +sys.path.insert(0, os.path.abspath("..")) + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +needs_sphinx = "4.0" + +# Add any Sphinx extension module names here, as strings. They can be extensions coming +# with Sphinx (named 'sphinx.ext.*') or your custom ones. +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.autosectionlabel", + "sphinx.ext.autosummary", + "sphinx.ext.napoleon", + "myst_parser", +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ["_templates"] + +# The suffix(es) of source filenames. You can specify multiple suffix as a list of +# string: +source_suffix = { + ".rst": "restructuredtext", + ".md": "markdown", +} + +# The master toctree document. +master_doc = "index" + +# General information about the project.
+project = "Intro workshop about GOV.UK data science" +author = "alphagov" + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the built +# documents. +# The short X.Y.Z version. +version = "0.0.1" +# The full version, including alpha/beta/rc tags. +release = "0.0.1" + +# List of patterns, relative to source directory, that match files and directories to +# ignore when looking for source files. These patterns also affect html_static_path and +# html_extra_path +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "README.md"] + +# -- Options for HTML output ----------------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for a list of +# builtin themes. +html_theme = "alabaster" + +# Theme options are theme-specific and customize the look and feel of a theme further. +# For a list of options available for each theme, see the documentation. +# html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +# html_theme_path = [] + +# The name for this set of Sphinx documents. " v documentation" by +# default. +# html_title = "None" + +# A shorter title for the navigation bar. Default is the same as html_title. +# html_short_title = None + +# The name of an image file (relative to this directory) to place at the top of the +# sidebar. +# html_logo = None + +# The name of an image file (relative to this directory) to use as a favicon of the +# docs. This file should be a +# Windows icon file (.ico) being 16x16 or 32x32 pixels large. +# html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, relative +# to this directory. They are copied after the builtin static files, so a file named +# "default.css" will overwrite the builtin "default.css". 
+html_static_path = ["_static"] + +# Add any extra paths that contain custom files (such as robots.txt or .htaccess) here, +# relative to this directory. These files are copied directly to the root of the +# documentation. +# html_extra_path = [] + +# If not None, a 'Last updated on:' timestamp is inserted at every page bottom, using +# the given strftime format. The empty string is equivalent to '%b %d, %Y'. +# html_last_updated_fmt = None + +# Custom sidebar templates, maps document names to template names. +# html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to template +# names. +# html_additional_pages = {} + +# If false, no module index is generated. +# html_domain_indices = True + +# If false, no index is generated. +# html_use_index = True + +# If true, the index is split into individual pages for each letter. +# html_split_index = False + +# If true, links to the reST sources are added to the pages. +# html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +# html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +# html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will contain a +# tag referring to it. The value of this option must be the base URL from +# which the finished HTML is served. +# html_use_opensearch = "" + +# This is the file name suffix for HTML files (for example ".xhtml"). +# html_file_suffix = None + +# Language to be used for generating the HTML full-text search index. Sphinx supports +# the following languages: +# 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' +# 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh' +# html_search_language = "en" + +# A dictionary with options for the search language support, empty by default. 'ja' +# uses this config value. 'zh' user can custom change `jieba` dictionary path. 
+# html_search_options = {"type": "default"} + +# The name of a javascript file (relative to the configuration directory) that +# implements a search results scorer. If empty, the default will be used. +# html_search_scorer = "scorer.js" + +# Output file base name for HTML help builder. +htmlhelp_basename = "govuk-data-science-workshopdoc" + +# -- Options for autosection output ---------------------------------------------------- + +# Prefix document path to section labels, otherwise autogenerated labels would look +# like 'heading' rather than 'path/to/file:heading' +autosectionlabel_prefix_document = True + +# -- Options for autosummary output ---------------------------------------------------- + +# Set the autosummary to generate stub files +autosummary_generate = True + +# -- Options for Napoleon extension ---------------------------------------------------- + +# Napoleon settings to enable parsing of Google- and NumPy-style docstrings. +# napoleon_google_docstring = True +# napoleon_numpy_docstring = True +# napoleon_include_init_with_doc = False +# napoleon_include_private_with_doc = False +# napoleon_include_special_with_doc = True +# napoleon_use_admonition_for_examples = False +# napoleon_use_admonition_for_notes = False +# napoleon_use_admonition_for_references = False +# napoleon_use_ivar = False +# napoleon_use_param = True +# napoleon_use_rtype = True + +# -- Options for MyST ------------------------------------------------------------------ + +# Enforce heading anchors for h1 to h6 headings +myst_heading_anchors = 6 + +# Enable MyST extensions +myst_enable_extensions = [ + "amsmath", + "colon_fence", + "deflist", + "dollarmath", + "html_admonition", + "html_image", + # "linkify", + "replacements", + "smartquotes", + "substitution", +] diff --git a/docs/contributor_guide/CODE_OF_CONDUCT.md b/docs/contributor_guide/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..a119d5a --- /dev/null +++ b/docs/contributor_guide/CODE_OF_CONDUCT.md @@ -0,0 +1,98 
@@ +# Code of conduct for `govuk-data-science-workshop` + +Contributors to this repository hosted by `alphagov` are expected to follow the +Contributor Covenant Code of Conduct, and those working within Her Majesty's Government +are also expected to follow the Civil Service Code. + +## Civil Service Code + +Contributors working within Her Majesty's Government must review the [Civil Service +Code][civil-service-code], and are expected to follow it in their contributions. + +## Contributor Covenant Code of Conduct + +### Definitions + +Where this Code of Conduct says: + +- "Project", we mean this `govuk-data-science-workshop` GitHub repository; +- "Maintainer", we mean the `alphagov` organisation owners; and +- "Leadership", we mean both `alphagov` organisation owners, line managers, and other + leadership within the Government Digital Service. + +### Our Pledge + +In the interest of fostering an open and welcoming environment, we as contributors and +maintainers pledge to make participation in our project, and our community a +harassment-free experience for everyone, regardless of age, body size, disability, +ethnicity, sex characteristics, gender identity and expression, level of experience, +education, socio-economic status, nationality, personal appearance, race, religion, or +sexual identity and orientation. 
+ +### Our Standards + +Examples of behaviour that contributes to creating a positive environment include: + +- Using welcoming and inclusive language +- Being respectful of differing viewpoints and experiences +- Gracefully accepting constructive criticism +- Focusing on what is best for the community +- Showing empathy towards other community members + +Examples of unacceptable behaviour by participants include: + +- The use of sexualised language or imagery and unwelcome sexual attention or advances +- Trolling, insulting/derogatory comments, and personal or political attacks +- Public or private harassment +- Publishing others' private information, such as a physical or electronic address, + without explicit permission +- Other conduct which could reasonably be considered inappropriate in a professional + setting + +### Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behaviour and are expected to take appropriate and fair corrective action in response +to any instances of unacceptable behaviour. + +Project maintainers have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are not +aligned to this Code of Conduct, or to ban temporarily or permanently any contributor +for other behaviours that they deem inappropriate, threatening, offensive, or harmful. + +### Scope + +This Code of Conduct applies within all project spaces, and it also applies when an +individual is representing the project or its community in public spaces. Examples of +representing a project or community include using an official project e-mail address, +posting using an official social media account, or acting as an appointed +representative at an online or offline event. Representation of a project may be +further defined and clarified by project maintainers. 
+ +### Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behaviour may be reported by +contacting the project team at +[gov.uk-data-science@digital.cabinet-office.gov.uk][email-address]. All complaints will be +reviewed and investigated and will result in a response that is deemed necessary and +appropriate to the circumstances. The project team is obligated to maintain +confidentiality with regard to the reporter of an incident. Further details of +specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good faith may +face temporary or permanent repercussions as determined by other members of the +project's leadership. + +### Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][contributor-covenant], +version 1.4, available at +[https://www.contributor-covenant.org/version/1/4/code-of-conduct/][contributor-covenant-code-of-conduct], +and the `alphagov` Code of Conduct available at +[https://github.com/alphagov/.github/blob/main/CODE_OF_CONDUCT.md][alphagov-code-of-conduct]. + +[alphagov-code-of-conduct]: https://github.com/alphagov/.github/blob/main/CODE_OF_CONDUCT.md +[civil-service-code]: https://www.gov.uk/government/publications/civil-service-code/the-civil-service-code +[contributor-covenant]: https://www.contributor-covenant.org +[contributor-covenant-code-of-conduct]: https://www.contributor-covenant.org/version/1/4/code-of-conduct/ +[email-address]: mailto:gov.uk-data-science@digital.cabinet-office.gov.uk diff --git a/docs/contributor_guide/CONTRIBUTING.md b/docs/contributor_guide/CONTRIBUTING.md new file mode 100644 index 0000000..224ce8c --- /dev/null +++ b/docs/contributor_guide/CONTRIBUTING.md @@ -0,0 +1,128 @@ +# Contributing + +We love contributions! We've compiled this documentation to help you understand our +contributing guidelines. [If you still have questions, please contact us][email] and +we'd be happy to help! 
+ +## Code of Conduct + +[Please read `CODE_OF_CONDUCT.md` before contributing][code-of-conduct]. + +## Getting started + +To start contributing, open your terminal, and install the required Python packages, +and [pre-commit hooks][pre-commit] using: + +```shell +pip install -r requirements.txt +pre-commit install +``` + +or the `make` command: + +```shell +make requirements +``` + +The pre-commit hooks are a security feature to ensure, for example, no secrets[^1], +large data files, and Jupyter notebook outputs are accidentally committed into the +repository. [For more information on pre-commit hooks see our +documentation][docs-pre-commit-hooks]. + +[^1]: [Only secrets of specific patterns are detected by the pre-commit + hooks][docs-pre-commit-hooks-secrets-definition]. + +## Code conventions + +[We mainly follow the GDS Way in our code conventions][gds-way]. + +### Git and GitHub + +We use Git to version control the source code. [Please read the GDS Way for details on +Git best practice][gds-way-git]. This includes how to write good commit messages, use +`git rebase` for local branches and `git merge --no-ff` for merges, as well as using +`git push --force-with-lease` instead of `git push -f`. + +[If you want to modify the `.gitignore` files, see the template +documentation][docs-updating-gitignore] for further details. + +Our source code is stored on GitHub. Pull requests into `main` require at least one +approved review. + +### Python + +For Python code, [we follow the GDS Way Python style guide][gds-way-python] with a line +length of 88; the flake8 pre-commit hook should help with this! + +### Markdown + +Local links can be written as normal, but external links should be referenced at the +bottom of the Markdown file for clarity. For example: + +```md +Use a [local link to reference the `README.md`](../../README.md) file, but [an external +link for GOV.UK][gov-uk]. 
+ +[gov-uk]: https://www.gov.uk/ +``` + +We also try to wrap Markdown to a line length of 88 characters, but this is not +strictly enforced in all cases, for example with long hyperlinks. + +## Testing + +[Tests are written using the `pytest` framework][pytest], with its configuration in the +`pyproject.toml` file. Note, only tests in the `tests` folder are run. To run the +tests, enter the following command in your terminal: + +```shell +pytest +``` + +### Code coverage + +[Code coverage of Python scripts is measured using the `coverage` Python +package][coverage]; its configuration can be found in `pyproject.toml`. Note coverage +only extends to Python scripts in the `src` folder. + +To run code coverage, and view it as an HTML report, enter the following command in +your terminal: + +```shell +coverage run -m pytest +coverage html +``` + +or use the `make` command: + +```shell +make coverage_html +``` + +The HTML report can be accessed at `htmlcov/index.html`. + +## Documentation + +[We write our documentation in MyST Markdown for use in Sphinx][myst]. This is mainly +stored in the `docs` folder, unless it's more appropriate to store it elsewhere, like +this file. + +[Please read our guidance on how to write accessible +documentation][docs-write-accessible-documentation], as well as our [guidance on +writing Sphinx documentation][docs-write-sphinx-documentation]. This allows you to +build the documentation into an accessible, searchable website. 
+ +[code-of-conduct]: ./CODE_OF_CONDUCT.md +[coverage]: https://coverage.readthedocs.io/ +[docs-pre-commit-hooks]: ./pre_commit_hooks.md +[docs-pre-commit-hooks-secrets-definition]: ./pre_commit_hooks.md#definition-of-a-secret-according-to-detect-secrets +[docs-updating-gitignore]: ./updating_gitignore.md +[docs-write-accessible-documentation]: ./writing_accessible_documentation.md +[docs-write-sphinx-documentation]: ./writing_sphinx_documentation.md +[email]: mailto:gov.uk-data-science@digital.cabinet-office.gov.uk +[gds-way]: https://gds-way.cloudapps.digital/ +[gds-way-git]: https://gds-way.cloudapps.digital/standards/source-code.html +[gds-way-python]: https://gds-way.cloudapps.digital/manuals/programming-languages/python/python.html#python-style-guide +[myst]: https://myst-parser.readthedocs.io/ +[pre-commit]: https://pre-commit.com/ +[pytest]: https://docs.pytest.org/ diff --git a/docs/contributor_guide/README.md b/docs/contributor_guide/README.md new file mode 100644 index 0000000..2fec5f9 --- /dev/null +++ b/docs/contributor_guide/README.md @@ -0,0 +1,13 @@ +# Contributing guide + +This is the contributor guide for the `govuk-data-science-workshop` project. + +```{toctree} +:maxdepth: 2 +./CODE_OF_CONDUCT.md +./CONTRIBUTING.md +./pre_commit_hooks.md +./updating_gitignore.md +./writing_accessible_documentation.md +./writing_sphinx_documentation.md +``` diff --git a/docs/contributor_guide/pre_commit_hooks.md b/docs/contributor_guide/pre_commit_hooks.md new file mode 100644 index 0000000..d4a8805 --- /dev/null +++ b/docs/contributor_guide/pre_commit_hooks.md @@ -0,0 +1,173 @@ +# Pre-commit hooks + +[This repository uses the Python package `pre-commit` to manage pre-commit +hooks][pre-commit]. Pre-commit hooks are actions which are run automatically, typically +on each commit, to perform some common set of tasks. For example, a pre-commit hook +might be used to run any code linting automatically before code is committed, ensuring +common code quality. 
+ +## Purpose + +For this repository, we are using `pre-commit` for a number of purposes: + +- checking for secrets being committed accidentally — there is a strict [definition of + a "secret"](#definition-of-a-secret-according-to-detect-secrets); +- checking for any large files (over 5 MB) being committed; and +- cleaning Jupyter notebooks, which means removing all outputs, execution counts, + Python kernels, and, for Google Colaboratory (Colab), stripping out user information. + +We have configured `pre-commit` to run automatically on every commit. By running on +each commit, we ensure that `pre-commit` will be able to detect all contraventions and +keep our repository in a healthy state. + +```{note} Pre-commit hooks and Google Colab +No pre-commit hooks will be run on Google Colab notebooks pushed directly to GitHub. +For security reasons, it is recommended that you manually download your notebook, and +commit up locally to ensure pre-commit hooks are run on your changes. +``` + +## Installation + +In order for `pre-commit` to run, you need to configure it on your system: + +- install the `pre-commit` package into your Python environment from + `requirements.txt`; and +- run `pre-commit install` in your terminal to set up `pre-commit` to run when code is + committed. + +## Using the `detect-secrets` pre-commit hook + +```{note} Secret detection limitations +The `detect-secrets` package does its best to prevent accidental committing of secrets, +but it may miss things. Instead, focus on good software development practices! See the +[definition of a secret for further +information](#definition-of-a-secret-according-to-detect-secrets). +``` + +[We use `detect-secrets` to check that no secrets are accidentally +committed][detect-secrets]. This hook requires you to generate a baseline file if one +is not already present within the root directory.
To create the baseline file, run the +following at the root of the repository: + +```shell +detect-secrets scan > .secrets.baseline +``` + +Next, audit the baseline that has been generated by running: + +```shell +detect-secrets audit .secrets.baseline +``` + +When you run this command, you'll enter an interactive console. This will present you +with a list of high-entropy strings and/or anything which could be a secret. It will +then ask you to verify whether this is the case. This allows the hook to remember false +positives in the future, and alert you to new secrets. + +### Definition of a "secret" according to `detect-secrets` + +The `detect-secrets` documentation, as of January 2021, says it works: + +> ...by running periodic diff outputs against heuristically crafted \[regular +> expression\] statements, to identify whether any new secret has been committed. + +This means it uses regular expression patterns to scan your code changes for anything +that looks like a secret according to the patterns. By definition, there are only a +limited number of patterns, so the `detect-secrets` package cannot detect every +conceivable type of secret. + +To understand what types of secrets will be detected, read the `detect-secrets` +documentation on caveats, and the list of supported plugins. Also, you should use +secret variable names with words that will trip the KeywordDetector plugin; see the +[`DENYLIST` variable for the full list of words][detect-secrets-keyword-detector]. + +### If `pre-commit` detects secrets during commit + +If `pre-commit` detects any secrets when you try to create a commit, it will detail +what it found and where to go to check the secret.
+ +If the detected secret is a false positive, there are two options to resolve this, and +prevent your commit from being blocked: + +- [inline allowlisting of false positives + (recommended)](#inline-allowlisting-recommended); or +- [updating the `.secrets.baseline` to include the false + positives](#updating-secretsbaseline). + +In either case, if an actual secret is detected (or a combination of actual secrets and +false positives), first remove the actual secret. Then follow either of these +processes. + +#### Inline allowlisting (recommended) + +To exclude a false positive, add a `pragma` comment such as: + +```python +secret = "Password123" # pragma: allowlist secret +``` + +or + +```python +# pragma: allowlist nextline secret +secret = "Password123" +``` + +If the detected secret is actually a secret (or other sensitive information), remove +the secret and re-commit; there is no need to add any `pragma` comments. + +If your commit contains a mixture of false positives and actual secrets, remove the +actual secrets first before adding `pragma` comments to the false positives. + +#### Updating `.secrets.baseline` + +To exclude a false positive, you can also [update the `.secrets.baseline` by repeating +the same two commands as in the initial +setup](#using-the-detect-secrets-pre-commit-hook). + +During auditing, if the detected secret is actually a secret (or other sensitive +information), remove the secret and re-commit. There is no need to update the +`.secrets.baseline` file in this case. + +If your commit contains a mixture of false positives and actual secrets, remove the +actual secrets first before updating and auditing the `.secrets.baseline` file. + +## Keeping specific Jupyter notebook outputs + +It may be necessary or useful to keep certain output cells of a Jupyter notebook, for +example charts or graphs visualising some set of data. To do this, [according to the +documentation for the `nbstripout` package][nbstripout], either: + +1.
add a `keep_output` tag to the desired cell; or +2. add `"keep_output": true` to the desired cell's metadata. + +You can access cell tags or metadata in Jupyter by enabling the "Tags" or +"Edit Metadata" toolbar (View > Cell Toolbar > Tags; View > Cell Toolbar > +Edit Metadata). + +For the tags approach, enter `keep_output` in the text field for each desired cell, and +press the "Add tag" button. For the metadata approach, press the "Edit Metadata" button +on each desired cell, and edit the metadata to look like this: + +```json +{ + "keep_output": true +} +``` + +This will tell the hook not to strip the resulting output of the desired cell(s), +allowing the output(s) to be committed. + +```{note} Tags and metadata on Google Colab +Currently (March 2020) there is no way to add tags and/or metadata to Google Colab +notebooks. + +It's strongly suggested that you download the Colab as a .ipynb file, and edit tags +and/or metadata using Jupyter before committing the code if you want to keep some +outputs. +``` + +[detect-secrets]: https://github.com/Yelp/detect-secrets +[detect-secrets-plugins]: https://github.com/Yelp/detect-secrets#currently-supported-plugins +[nbstripout]: https://github.com/kynan/nbstripout +[pre-commit]: https://pre-commit.com/ diff --git a/docs/contributor_guide/updating_gitignore.md b/docs/contributor_guide/updating_gitignore.md new file mode 100644 index 0000000..52cd119 --- /dev/null +++ b/docs/contributor_guide/updating_gitignore.md @@ -0,0 +1,10 @@ +# Updating the `.gitignore` file + +[The `.gitignore` used in this repository was created with generic exclusions from +gitignore.io][gitignore-io], with project-specific exclusions listed afterwards. + +If you want to add exclusions for new programming languages and/or IDEs, use the first +line to recreate the generic exclusions from gitignore.io. Add all other +project-specific exclusions afterwards. 
+ +[gitignore-io]: https://www.toptal.com/developers/gitignore diff --git a/docs/contributor_guide/writing_accessible_documentation.md b/docs/contributor_guide/writing_accessible_documentation.md new file mode 100644 index 0000000..35467c5 --- /dev/null +++ b/docs/contributor_guide/writing_accessible_documentation.md @@ -0,0 +1,52 @@ +# Writing accessible documentation + +[You can build this project's documentation into a website using +Sphinx][docs-write-sphinx-documentation]. If you work in the public sector, and build a +website, by law the website must be accessible. + +The full name of the accessibility regulations is the Public Sector Bodies (Websites +and Mobile Applications) (No. 2) Accessibility Regulations 2018. + +They came into force on 23 September 2018, and all public sector bodies have to meet +these requirements unless exempt. [GOV.UK has further details to help you understand +the impact of the 2018 requirements][govuk-accessibility]. + +We use the following checklist to determine how accessible our documentation is, when +rendered as a website using Sphinx. 
+ +- [check the website against the WAVE Web Accessibility Evaluation Tool][wave] +- check that link text is descriptive +- check the hierarchy of page headings, which should go in order from `h2` to `h4` with + no gaps +- remove italics, and bold text +- only use block capitals inside curly braces for placeholders in code examples +- check for accessible language + - use [`alex.js` to identify insensitive, and inconsiderate writing][alex-js] + - replace instances of `click` with `select` or `choose` + - remove Latin phrases (`e.g.`, `i.e.`, `ad hoc`, `via`) + - [use GOV.UK inclusive language][govuk-language] + - [replace negative contractions][negative-contractions] + - aim not to have long sentences (maximum 25 words per sentence) + - aim not to have long paragraphs (maximum 5 lines per paragraph) + - check for unique titles in documentation + - check diagrams and images for alternative text as well as surrounding contextual + text + - remove diagrams/images that do not add anything to a user's understanding + - remove screenshots if possible + - [use accessible SVGs][govuk-design-system-images] + - [check for inaccessible formats][govuk-accessible-formats] + +This checklist was created by the Government Digital Service (GDS) technical writing +team with help from the GDS accessibility team. We then [draft a suitable accessibility +statement for the project; an example is available on +GOV.UK][govuk-sample-accessibility]. 
+ +[alex-js]: https://alexjs.com/ +[docs-write-sphinx-documentation]: ./writing_sphinx_documentation.md +[govuk-accessible-formats]: https://www.gov.uk/guidance/how-to-publish-on-gov-uk/accessible-pdfs +[govuk-accessibility]: https://www.gov.uk/guidance/accessibility-requirements-for-public-sector-websites-and-apps +[govuk-design-system-images]: https://design-system.service.gov.uk/styles/images/ +[govuk-language]: https://www.gov.uk/government/publications/inclusive-communication/inclusive-language-words-to-use-and-avoid-when-writing-about-disability +[govuk-sample-accessibility]: https://www.gov.uk/government/publications/sample-accessibility-statement +[negative-contractions]: https://www.englishclub.com/vocabulary/contractions-negative.htm +[wave]: https://wave.webaim.org/ diff --git a/docs/contributor_guide/writing_sphinx_documentation.md b/docs/contributor_guide/writing_sphinx_documentation.md new file mode 100644 index 0000000..cc80669 --- /dev/null +++ b/docs/contributor_guide/writing_sphinx_documentation.md @@ -0,0 +1,148 @@ +# Writing Sphinx documentation + +[This project is set up to produce documentation using Sphinx][sphinx]; this page +should give you a quick overview on how to write documentation for it. If you'd like to +know how to write good documentation take a look at [Write the Docs guide on writing +documentation][writethedocs]. [For Agile projects, consider documenting +late][agilemodeling] as well. + +## Why should I bother? And why Sphinx? + +Keeping as much of the documentation in a centralised location is a good thing. It +means contributors, users, and anyone else can quickly find as much information as they +need to understand and/or run what you've done. + +Sphinx is a Python-based package to compile documentation into different formats, +including HTML. This means you can write your documentation and, with a single terminal +command, build it into a searchable website. 
+ +It's widely used, such as for the documentation of the [`pandas`][pandas], and +[PyTorch][pytorch] Python packages as well as many [others][sphinx-examples]. It is +highly customisable with different extensions, and themes. Included with this project +is: + +- support for both [reStructuredText (ReST)][sphinx-rest], and [ReST-enabled + Markdown][myst]; +- automatic building of documentation from Python docstrings; and +- support for [ReStructuredText][docstring-rst], [NumPy][docstring-numpy], or + [Google][docstring-google] docstring formats. + +### Creating a searchable website + +To create a website with your documentation, run the following command in your terminal +at the top-level of this project: + +```shell +make docs +``` + +This should create an HTML version of your documentation accessible from +`docs/_build/index.html`. + +## Writing in reStructuredText + +[Sphinx provides good documentation on writing in ReST][sphinx-rest] — we would highly +recommend reading that for guidance. We will cover automatically creating docstrings in +the next subsection. + +### Automatically creating docstring documentation (ReST) + +Let us say that `src/__init__.py` has functions called `hello` and `world` imported +into it, and both have docstrings. To automatically generate docstring documentation, +create a ReST file, and add the following line to reference the `src` module: + +```rest +.. currentmodule:: src +``` + +Then, elsewhere in the body, [call the `autosummary` directive to generate the +docstrings as ReST stub files][sphinx-autosummary]. + +```rest +.. autosummary:: + :toctree: api/ + + hello + world + +``` + +[This will create something similar to the `pandas` API +reference][pandas-api-reference]. + +## Writing in ReST-enabled Markdown + +[We use the `myst-parser` package (MyST) to write Markdown that can also include ReST +elements][myst]. The package documentation is detailed, so we would recommend reviewing +it. 
We will cover some of the more widely used elements in the following subsections. + +### Embedding ReST directives + +[Most ReST directives can be embedded into MyST Markdown][myst-rst-directives]. + +### Automatically creating docstring documentation (MyST Markdown) + +Let us say that `src/__init__.py` has functions called `hello` and `world` imported +into it, and both have docstrings. To automatically generate docstring documentation, +create a Markdown file, and add the following line to reference the `src` module: + +````md +```{eval-rst} +.. currentmodule:: src +``` +```` + +Then, elsewhere in the body, [call the `autosummary` directive to generate the +docstrings as ReST stub files][sphinx-autosummary]. + +````md +```{eval-rst} +.. autosummary:: + :toctree: api/ + + hello + world + +``` +```` + +### Including Markdown files outside the `docs` folder + +[MyST lets you include Markdown files outside the `docs` folder][myst-include]. + +If a Markdown file (`../example.md`) only contains links that do not reference anything +else in this project (including images), create a Markdown file within the `docs` +folder with the following lines: + +````md +```{include} ../example.md +``` +```` + +However, if it includes relative links referencing other files in this project +(including images), we need to tell MyST what those links actually refer to. 
For example, +if the relative link is `../hello/world.md`, we need to create a Markdown file within +the `docs` folder with the following lines: + +````md +```{include} ../example.md +:relative-docs: ../hello +:relative-images: +``` +```` + +[agilemodeling]: http://agilemodeling.com/essays/documentLate.htm +[docstring-google]: https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings +[docstring-numpy]: https://numpydoc.readthedocs.io/en/latest/format.html +[docstring-rst]: https://www.python.org/dev/peps/pep-0287/ +[myst]: https://myst-parser.readthedocs.io/ +[myst-include]: https://myst-parser.readthedocs.io/en/latest/sphinx/use.html#include-a-file-from-outside-the-docs-folder-like-readme-md +[myst-rst-directives]: https://myst-parser.readthedocs.io/en/latest/syntax/syntax.html +[pandas]: https://pandas.pydata.org/docs/ +[pandas-api-reference]: https://pandas.pydata.org/docs/reference/index.html +[pytorch]: https://pytorch.org/docs/stable/index.html +[sphinx]: https://www.sphinx-doc.org/ +[sphinx-autosummary]: https://www.sphinx-doc.org/en/master/usage/extensions/autosummary.html +[sphinx-examples]: https://www.sphinx-doc.org/en/master/examples.html +[sphinx-rest]: https://www.sphinx-doc.org/en/master/usage/restructuredtext/index.html +[writethedocs]: https://www.writethedocs.org/guide/writing/beginners-guide-to-docs/ diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..8e4234d --- /dev/null +++ b/docs/index.md @@ -0,0 +1,14 @@ +```{include} ../README.md +:relative-docs: ./docs +``` + +```{toctree} +:hidden: +:maxdepth: 2 +self +./user_guide/README.md +./contributor_guide/README.md +./aqa/README.md +./structure/README.md +./reference/README.md +``` diff --git a/docs/reference/README.md b/docs/reference/README.md new file mode 100644 index 0000000..d018b63 --- /dev/null +++ b/docs/reference/README.md @@ -0,0 +1,14 @@ +# `src` API reference + +This page gives an overview of all public `src` objects, functions and methods. 
All +classes and functions exposed in `src.*` namespace are public. + +```{toctree} +:maxdepth: 2 +./make_data.md +./make_features.md +./make_models.md +./make_visualisations.md +./utils.md +./src.md +``` diff --git a/docs/reference/make_data.md b/docs/reference/make_data.md new file mode 100644 index 0000000..3411b49 --- /dev/null +++ b/docs/reference/make_data.md @@ -0,0 +1,15 @@ +# Data generation + +These `src` package functions generate data. + +```{eval-rst} +.. currentmodule:: src +``` + +## Function heading + +```{eval-rst} +.. autosummary:: + :toctree: api/ + +``` diff --git a/docs/reference/make_features.md b/docs/reference/make_features.md new file mode 100644 index 0000000..359be20 --- /dev/null +++ b/docs/reference/make_features.md @@ -0,0 +1,15 @@ +# Feature generation + +These `src` package functions create features. + +```{eval-rst} +.. currentmodule:: src +``` + +## Function heading + +```{eval-rst} +.. autosummary:: + :toctree: api/ + +``` diff --git a/docs/reference/make_models.md b/docs/reference/make_models.md new file mode 100644 index 0000000..e1fa8e8 --- /dev/null +++ b/docs/reference/make_models.md @@ -0,0 +1,15 @@ +# Model generation + +These `src` package functions create models. + +```{eval-rst} +.. currentmodule:: src +``` + +## Function heading + +```{eval-rst} +.. autosummary:: + :toctree: api/ + +``` diff --git a/docs/reference/make_visualisations.md b/docs/reference/make_visualisations.md new file mode 100644 index 0000000..323a950 --- /dev/null +++ b/docs/reference/make_visualisations.md @@ -0,0 +1,15 @@ +# Create visualisations + +These `src` package functions create visualisations. + +```{eval-rst} +.. currentmodule:: src +``` + +## Function heading + +```{eval-rst} +.. 
autosummary:: + :toctree: api/ + +``` diff --git a/docs/reference/src.md b/docs/reference/src.md new file mode 100644 index 0000000..cfbcb90 --- /dev/null +++ b/docs/reference/src.md @@ -0,0 +1,15 @@ +# Package functions + +These are miscellaneous functions in the `src` package. + +```{eval-rst} +.. currentmodule:: src +``` + +## Function heading + +```{eval-rst} +.. autosummary:: + :toctree: api/ + +``` diff --git a/docs/reference/utils.md b/docs/reference/utils.md new file mode 100644 index 0000000..640a975 --- /dev/null +++ b/docs/reference/utils.md @@ -0,0 +1,15 @@ +# Create utility functions + +These `src` package functions create utility functions. + +```{eval-rst} +.. currentmodule:: src +``` + +## Function heading + +```{eval-rst} +.. autosummary:: + :toctree: api/ + +``` diff --git a/docs/structure/README.md b/docs/structure/README.md new file mode 100644 index 0000000..9f6450a --- /dev/null +++ b/docs/structure/README.md @@ -0,0 +1,166 @@ +# `govuk-data-science-workshop` structure + +This page provides information on the repository's structure. The repository's folder +structure is explained here: + +```{toctree} +:maxdepth: 2 +./data.md +./docs.md +./notebooks.md +./outputs.md +./src.md +./tests.md +``` + +## Top-level files + +Each subsection here contains a brief description about the files at the top-level of +this Git repository. + +### `.envrc` + +A file containing environment variables for the Git repository that can be selectively +loaded. [`.envrc` uses the `direnv` shell extension to load these environment +variables][direnv]. + +This file contains a `sed` command to output a `.env` file with all the environment +variables. This may be useful for sourcing environment variables, for example in +conjunction with PyCharm's EnvFile plugin. + +To ensure this `sed` command works correctly, make sure any file paths listed in this +file are absolute file paths (recommended). 
Relative file paths using other +environment variables only work for Python users. Environment variable names can +only contain letters, numbers or underscores as well. For example: + +```shell +export DIR_DATA=$(pwd)/data # fine for Python and R users +export DIR_DATA_EXTERNAL=$(pwd)/data/external # fine for Python and R users +export DIR_DATA_EXTERNAL=./data/external # fine for Python and R users +export DIR_DATA_EXTERNAL=$DIR_DATA/external # fine for Python users only +export DIR-DATA-EXTERNAL=$DIR_DATA/external # will break the `sed` command! +``` + +### `.flake8` + +A configuration file for the `flake8` Python package that provides linting. This file +is based on the [common configuration described in the GDS Way][gds-way-flake8]. + +### `.gitignore` + +A `.gitignore` file to ignore certain files and folders from this Git repository. [See +the contributor guide to modify the `.gitignore` file][docs-updating-gitignore]. + +### `.pre-commit-config.yaml` + +[A pre-commit hook configuration file][docs-pre-commit-hooks]. + +### `.secrets` + +A file to store all secrets and credentials as environment variables. [This file is +read-in by `.envrc`](#envrc), when [loading environment variables with the `direnv` +shell extension][direnv], but is not tracked by Git. + +### `.secrets.baseline` + +[Baseline file for the `detect-secrets` to detect secrets][detect-secrets]. In +conjunction with `pre-commit`, `detect-secrets` prevents secrets from being committed +to the repository. The baseline file flags secret-like data that the user deliberately +wishes to commit to the repository. + +### `CODE_OF_CONDUCT.md` + +[The Code of Conduct for contributors to this project][code-of-conduct], including +maintainers and `alphagov` organisation owners. + +### `conftest.py` + +File to contain shared fixture functions for the `pytest` tests in the `tests` folder. + +### `CONTRIBUTING.md` + +The contributing guidelines for this project. + +### `LICENSE` + +The licence for this project. 
Unless stated otherwise, the codebase is released under +the MIT License. This covers both the codebase and any sample code in the +documentation. The documentation is © Crown copyright and available under the terms of +the Open Government Licence v3.0. + +### `Makefile` + +The `Makefile` contains a set of commands for the `make` utility. Run the `help` +command for further information at the top-level of the Git repository. + +```shell +make help +``` + +### `pyproject.toml` + +A file containing Python project settings. This includes configuration settings for: + +- [`isort`](#isort) +- [`pytest`](#pytest) +- [code coverage](#code-coverage) + +#### `isort` + +Python imports are arranged according to the [specification defined by `black`][black]. + +#### `pytest` + +To run the tests within the `tests` folder using the `pytest` Python package, enter +the following command: + +```shell +pytest +``` + +#### Code coverage + +To run code coverage using the `coverage` Python package with `pytest`, enter the +following command: + +```shell +coverage run -m pytest +coverage html +``` + +or using the `make` command: + +```shell +make coverage_html +``` + +A code coverage report in HTML will be produced on the code in the `src` folder. This +HTML report can be accessed at `htmlcov/index.html`. + +### `README.md` + +An overview of the Git repository, including all necessary instructions to run the code. + +### `requirements.txt` + +A list of Python package requirements for this Git repository, which can be installed +using the `pip install` command. 
+ +```shell +pip install --requirement requirements.txt +``` + +Alternatively, to install the requirements file along with pre-commit hooks, run the +following command: + +```shell +make requirements +``` + +[black]: https://black.readthedocs.io/en/stable/ +[code-of-conduct]:../contributor_guide/CODE_OF_CONDUCT.md +[detect-secrets]: https://github.com/Yelp/detect-secrets +[direnv]: https://direnv.net/ +[docs-pre-commit-hooks]: ../contributor_guide/pre_commit_hooks.md +[docs-updating-gitignore]: ../contributor_guide/updating_gitignore.md +[gds-way-flake8]: https://gds-way.cloudapps.digital/manuals/programming-languages/python/python.html#common-configuration diff --git a/docs/structure/data.md b/docs/structure/data.md new file mode 100644 index 0000000..f06a7ee --- /dev/null +++ b/docs/structure/data.md @@ -0,0 +1,3 @@ +```{include} ../../data/README.md +:relative-docs: ../docs/structure +``` diff --git a/docs/structure/docs.md b/docs/structure/docs.md new file mode 100644 index 0000000..b95ce84 --- /dev/null +++ b/docs/structure/docs.md @@ -0,0 +1,3 @@ +```{include} ../README.md +:relative-docs: ../docs +``` diff --git a/docs/structure/notebooks.md b/docs/structure/notebooks.md new file mode 100644 index 0000000..b950f2c --- /dev/null +++ b/docs/structure/notebooks.md @@ -0,0 +1,3 @@ +```{include} ../../notebooks/README.md +:relative-docs: ../docs/structure +``` diff --git a/docs/structure/outputs.md b/docs/structure/outputs.md new file mode 100644 index 0000000..f15500f --- /dev/null +++ b/docs/structure/outputs.md @@ -0,0 +1,3 @@ +```{include} ../../outputs/README.md +:relative-docs: ../docs/structure +``` diff --git a/docs/structure/src.md b/docs/structure/src.md new file mode 100644 index 0000000..6c3a21c --- /dev/null +++ b/docs/structure/src.md @@ -0,0 +1,2 @@ +```{include} ../../src/README.md +``` diff --git a/docs/structure/tests.md b/docs/structure/tests.md new file mode 100644 index 0000000..66d3ee5 --- /dev/null +++ b/docs/structure/tests.md @@ -0,0 +1,2 
@@ +```{include} ../../tests/README.md +``` diff --git a/docs/user_guide/README.md b/docs/user_guide/README.md new file mode 100644 index 0000000..c17333a --- /dev/null +++ b/docs/user_guide/README.md @@ -0,0 +1,8 @@ +# User guide + +This is the user guide for the `govuk-data-science-workshop` project. + +```{toctree} +:maxdepth: 2 +./loading_environment_variables.md +``` diff --git a/docs/user_guide/loading_environment_variables.md b/docs/user_guide/loading_environment_variables.md new file mode 100644 index 0000000..0dba5b7 --- /dev/null +++ b/docs/user_guide/loading_environment_variables.md @@ -0,0 +1,77 @@ +# Loading environment variables + +[We use `direnv` to load environment variables][direnv], as these are only loaded when +inside the project folder. This can prevent accidental conflicts with identically named +variables. + +## Using `direnv` + +To load the environment variables, first [follow the `direnv` installation +instructions](#installing-direnv), and [make sure you have a `.secrets` file to store +secrets and credentials](#storing-secrets-and-credentials). Then: + +1. Open your terminal; +2. Navigate to the project folder; and + - You should see the following message: + ```shell + direnv: error .envrc is blocked. Run `direnv allow` to approve its content. + ``` +3. Allow `direnv`. + ```shell + direnv allow + ``` + +You only need to do this once, and again each time `.envrc` and `.secrets` are modified. + +### Installing `direnv` + +These instructions assume you are running on macOS with administrator privileges using +a bash terminal. For other ways of installing `direnv`, and its shell hooks, consult +the `direnv` documentation. + +1. Open your terminal; +2. [Install `direnv` using Homebrew][homebrew]; + ```shell + brew install direnv + ``` +3. Add the shell hooks to your `.bash_profile`; + ```shell + echo 'eval "$(direnv hook bash)"' >> ~/.bash_profile + ``` +4. 
Check that the shell hooks have been added correctly; and + ```shell + cat ~/.bash_profile + ``` + - This should display `eval "$(direnv hook bash)"` +5. Restart your terminal. + +## Storing secrets and credentials + +Secrets and credentials must be stored in the `.secrets` file. This file is not +version-controlled, so no secrets should be committed to GitHub. + +In your terminal navigate to the root folder, and create a `.secrets` file. + +```shell +touch .secrets +``` + +Open this new `.secrets` file using your preferred text editor, and add any secrets as +environment variables. For example, to add a JSON credentials file for Google +BigQuery, save the following changes to `.secrets`. + +```shell +export GOOGLE_APPLICATION_CREDENTIALS="path/to/credentials.json" +``` + +Once complete, make sure the `.envrc` file has the following line uncommented: + +```shell +source_env ".secrets" +``` + +This ensures [`direnv`][direnv] loads the `.secrets` file using `.envrc` without +version-controlling `.secrets`. + +[direnv]: https://direnv.net/ +[homebrew]: https://brew.sh/ diff --git a/notebooks/README.md b/notebooks/README.md new file mode 100644 index 0000000..b657204 --- /dev/null +++ b/notebooks/README.md @@ -0,0 +1,7 @@ +# `notebooks` folder overview + +All Jupyter notebooks should be stored in this folder. + +The `.envrc` file should automatically add the entire project path into the +`PYTHONPATH` environment variable. This should allow you to directly import `src` in +your notebook. diff --git a/outputs/README.md b/outputs/README.md new file mode 100644 index 0000000..74fb5bc --- /dev/null +++ b/outputs/README.md @@ -0,0 +1,12 @@ +# `outputs` folder overview + +All outputs from the project should be stored here. 
The path to this +folder is loaded as an environment variable by the `.envrc` file; to load it in +Python, use the following code: + +```python +import os + +# Load environment variables for the `outputs` folder +DIR_OUTPUTS = os.getenv("DIR_OUTPUTS") +``` diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..d8d811a --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,25 @@ +# `coverage` configurations +[tool.coverage.run] +source = [ + "./src" +] + +[tool.coverage.report] +exclude_lines = [ + "if __name__ == .__main__.:" +] + +# `isort` configurations +[tool.isort] +profile = "black" + +# `pytest` configurations +[tool.pytest.ini_options] +addopts = [ + "-vv", + "--doctest-modules" +] +doctest_optionflags = "NORMALIZE_WHITESPACE" +testpaths = [ + "./tests" +] diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..52ebf20 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,7 @@ +coverage +detect-secrets==1.0.3 +myst-parser +pre-commit +pytest +Sphinx +toml diff --git a/src/README.md b/src/README.md new file mode 100644 index 0000000..e737ef5 --- /dev/null +++ b/src/README.md @@ -0,0 +1,24 @@ +# `src` package overview + +All functions for this project should be stored in this folder. All tests should be +stored in the `tests` folder, which is one-level above this folder in the main project +directory. + +The sub-folders should be used as follows: + +- `make_data`: data processing-related functions +- `make_features`: feature-related functions, for example, functions to create features + from processed data +- `make_models`: model-related functions +- `make_visualisations`: functions to produce visualisations +- `utils`: utility functions that are helpful in the project + +Feel free to create/rename/delete these folders as required, as they will not be +necessary for each and every project. + +It is strongly suggested that you import functions in the `src/__init__.py` script. 
You +should also try to use absolute imports in this script whenever possible. Relative +imports are not discouraged, but can be an issue for projects where the directory +structure is likely to change. See [PEP 328 for details on absolute imports][pep-328]. + +[pep-328]: https://www.python.org/dev/peps/pep-0328/ diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/make_data/__init__.py b/src/make_data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/make_features/__init__.py b/src/make_features/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/make_models/__init__.py b/src/make_models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/make_visualisations/__init__.py b/src/make_visualisations/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/utils/__init__.py b/src/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 0000000..eac4207 --- /dev/null +++ b/tests/README.md @@ -0,0 +1,3 @@ +# `tests` folder overview + +All tests for the functions defined in the `src` folder should be stored here.