♻️ REFACTOR: Move archive backend to aiida/storage (#5375)
This PR moves the archive storage backend implementation (`sqlite_zip`) into `aiida/storage`
alongside the "main" `psql_dos` backend.
It includes both the storage reading and migration functionality.

In line with this synchronisation of the archive and storage backends:

- A number of archive-specific exceptions have been replaced with general storage ones (see the sketch after this list):
  - `UnreadableArchiveError`/`CorruptArchive` -> `CorruptStorage`
  - `IncompatibleArchiveVersionError` -> `IncompatibleStorageSchema`
  - `ArchiveMigrationError` -> `StorageMigrationError`
  - `ArchiveClosedError` -> `ClosedStorage`
- `MIGRATE_LOGGER` has been moved to `aiida/storage/log.py`
- `verdi archive inspect` has been deprecated
    and replaced by `verdi archive version` and `verdi archive info`,
    to bring it in line with `verdi storage`.
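
As a hedged illustration of the rename (not code from this diff), downstream code that previously caught the archive-specific exceptions would now catch the general storage ones from `aiida.common.exceptions`:

```python
# Sketch only: the exception classes are the real new names (also imported
# in `cmd_archive.py` below); the helper itself is hypothetical, modelled
# on the new `verdi archive info` command.
from aiida.common.exceptions import CorruptStorage, IncompatibleStorageSchema
from aiida.storage.sqlite_zip.backend import SqliteZipBackend

def check_archive(path: str) -> None:
    """Try opening an archive with the relocated backend."""
    try:
        storage = SqliteZipBackend(SqliteZipBackend.create_profile(path))
    except CorruptStorage:             # was: CorruptArchive / UnreadableArchiveError
        print(f'{path} is not a readable archive')
    except IncompatibleStorageSchema:  # was: IncompatibleArchiveVersionError
        print(f'{path} must be migrated before use')
    else:
        storage.close()
```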

Alembic migrations have been added to the `sqlite_zip` migration logic,
to support future migrations of the sqlite database.
Alongside this, the non-null restrictions on legacy archive fields have been relaxed,
to improve robustness.
Subsequent migrations have then been added:
first, to replace any existing (unwanted) null values with default values,
then to modify the schema to re-add the non-null constraints (a sketch follows).
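
A minimal sketch of what such a two-step migration could look like, assuming a hypothetical table `db_dbnode` with a nullable `label` column (the real revisions under `aiida/storage/sqlite_zip` may touch different tables and columns):

```python
"""Hypothetical alembic revision: back-fill NULLs, then restore NOT NULL."""
import sqlalchemy as sa
from alembic import op

def upgrade():
    # step 1: replace unwanted NULL values left by legacy archives with a default
    op.execute("UPDATE db_dbnode SET label = '' WHERE label IS NULL")
    # step 2: re-add the non-null constraint now that no NULLs remain
    # (batch mode, since sqlite cannot ALTER COLUMN in place)
    with op.batch_alter_table('db_dbnode') as batch_op:
        batch_op.alter_column('label', existing_type=sa.String(255), nullable=False)
```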

Finally, additional tests have been put in place (in `tests/tools/archive/test_schema.py`)
to ensure that the sqlite schema always stays synchronised with the PostgreSQL schema.
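
A minimal sketch of such a synchronisation test; `get_orm_metadata` for `psql_dos` appears in this diff, while the `sqlite_zip` metadata fixture shown here is a hypothetical stand-in for whatever the real test module uses:

```python
"""Sketch of a schema-synchronisation check (hypothetical fixture)."""
from aiida.storage.psql_dos.models.base import get_orm_metadata

def test_table_names_in_sync(sqlite_metadata):  # hypothetical pytest fixture
    psql_tables = {table.name for table in get_orm_metadata().sorted_tables}
    sqlite_tables = {table.name for table in sqlite_metadata.sorted_tables}
    assert psql_tables == sqlite_tables
```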
chrisjsewell committed Mar 6, 2022
1 parent 09765ec commit bc9ae71
Showing 106 changed files with 2,562 additions and 1,467 deletions.
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
@@ -103,6 +103,7 @@ repos:
aiida/storage/psql_dos/backend.py|
aiida/storage/psql_dos/orm/querybuilder/.*py|
aiida/storage/psql_dos/utils.py|
aiida/storage/sqlite_zip/.*.py|
aiida/tools/graph/graph_traversers.py|
aiida/tools/groups/paths.py|
aiida/tools/archive/.*py|
122 changes: 62 additions & 60 deletions aiida/cmdline/commands/cmd_archive.py
@@ -11,18 +11,19 @@
"""`verdi archive` command."""
from enum import Enum
import logging
from pathlib import Path
import traceback
from typing import List, Tuple
import urllib.request

import click
from click_spinner import spinner
import tabulate

from aiida.cmdline.commands.cmd_verdi import verdi
from aiida.cmdline.params import arguments, options
from aiida.cmdline.params.types import GroupParamType, PathOrUrl
from aiida.cmdline.utils import decorators, echo
from aiida.common.exceptions import CorruptStorage, IncompatibleStorageSchema, UnreachableStorage
from aiida.common.links import GraphTraversalRules
from aiida.common.log import AIIDA_LOGGER

@@ -36,66 +37,68 @@ def verdi_archive():
"""Create, inspect and import AiiDA archives."""


@verdi_archive.command('inspect')
@verdi_archive.command('version')
@click.argument('path', nargs=1, type=click.Path(exists=True, readable=True))
def archive_version(path):
"""Print the current version of an archive's schema."""
# note: this mirrors `cmd_storage:storage_version`
# it is currently hardcoded to the `SqliteZipBackend`, but could be generalized in the future
from aiida.storage.sqlite_zip.backend import SqliteZipBackend
storage_cls = SqliteZipBackend
profile = storage_cls.create_profile(path)
head_version = storage_cls.version_head()
try:
profile_version = storage_cls.version_profile(profile)
except (UnreachableStorage, CorruptStorage) as exc:
echo.echo_critical(f'archive file version unreadable: {exc}')
echo.echo(f'Latest archive schema version: {head_version!r}')
echo.echo(f'Archive schema version of {Path(path).name!r}: {profile_version!r}')


@verdi_archive.command('info')
@click.argument('path', nargs=1, type=click.Path(exists=True, readable=True))
@click.option('--statistics', is_flag=True, help='Provides more in-detail statistically relevant data.')
def archive_info(path, statistics):
"""Summarise the contents of an archive."""
# note: this mirrors `cmd_storage:storage_info`
# it is currently hardcoded to the `SqliteZipBackend`, but could be generalized in the future
from aiida.storage.sqlite_zip.backend import SqliteZipBackend
try:
storage = SqliteZipBackend(SqliteZipBackend.create_profile(path))
except (UnreachableStorage, CorruptStorage) as exc:
echo.echo_critical(f'archive file unreadable: {exc}')
except IncompatibleStorageSchema as exc:
echo.echo_critical(f'archive version incompatible: {exc}')
with spinner():
try:
data = storage.get_info(statistics=statistics)
finally:
storage.close()

echo.echo_dictionary(data, sort_keys=False, fmt='yaml')


@verdi_archive.command('inspect', hidden=True)
@click.argument('archive', nargs=1, type=click.Path(exists=True, readable=True))
@click.option('-v', '--version', is_flag=True, help='Print the archive format version and exit.')
@click.option('-m', '--meta-data', is_flag=True, help='Print the meta data contents and exit.')
@click.option('-d', '--database', is_flag=True, help='Include information on entities in the database.')
def inspect(archive, version, meta_data, database):
@decorators.deprecated_command(
'This command has been deprecated and will be removed soon. '
'Please call `verdi archive version` or `verdi archive info` instead.\n'
)
@click.pass_context
def inspect(ctx, archive, version, meta_data, database): # pylint: disable=unused-argument
"""Inspect contents of an archive without importing it.
By default a summary of the archive contents will be printed.
The various options can be used to change exactly what information is displayed.
.. deprecated:: v2.0.0, use `verdi archive version` or `verdi archive info` instead.
"""
from aiida.tools.archive.abstract import get_format
from aiida.tools.archive.exceptions import UnreadableArchiveError

archive_format = get_format()
latest_version = archive_format.latest_version
try:
current_version = archive_format.read_version(archive)
except UnreadableArchiveError as exc:
echo.echo_critical(f'archive file of unknown format: {exc}')

if version:
echo.echo(current_version)
return

if current_version != latest_version:
echo.echo_critical(
f"Archive version is not the latest: '{current_version}' != '{latest_version}'. "
'Use `verdi migrate` to upgrade to the latest version'
)

with archive_format.open(archive, 'r') as archive_reader:
metadata = archive_reader.get_metadata()

if meta_data:
echo.echo_dictionary(metadata, sort_keys=False)
return

statistics = {
name: metadata[key] for key, name in [
['export_version', 'Version archive'],
['aiida_version', 'Version aiida'],
['compression', 'Compression'],
['ctime', 'Created'],
['mtime', 'Modified'],
] if key in metadata
}
if 'conversion_info' in metadata:
statistics['Conversion info'] = '\n'.join(metadata['conversion_info'])

echo.echo(tabulate.tabulate(statistics.items()))

if database:
echo.echo('')
echo.echo('Database statistics')
echo.echo('-------------------')
with spinner():
with archive_format.open(archive, 'r') as archive_reader:
data = archive_reader.get_backend().get_info(statistics=True)
echo.echo_dictionary(data, sort_keys=False, fmt='yaml')
ctx.invoke(archive_version, path=archive)
elif database:
ctx.invoke(archive_info, path=archive, statistics=True)
else:
ctx.invoke(archive_info, path=archive, statistics=False)


@verdi_archive.command('create')
@@ -136,7 +139,7 @@ def create(
create_backward, return_backward, call_calc_backward, call_work_backward, include_comments, include_logs,
include_authinfos, compress, batch_size, test_run
):
"""Write subsets of the provenance graph to a single file.
"""Create an archive from all or part of a profiles's data.
Besides Nodes of the provenance graph, you can archive Groups, Codes, Computers, Comments and Logs.
@@ -214,7 +217,7 @@ def create(
help='Archive format version to migrate to (defaults to latest version).',
)
def migrate(input_file, output_file, force, in_place, version):
"""Migrate an export archive to a more recent format version."""
"""Migrate an archive to a more recent schema version."""
from aiida.common.progress_reporter import set_progress_bar_tqdm, set_progress_reporter
from aiida.tools.archive.abstract import get_format

@@ -248,7 +251,7 @@ def migrate(input_file, output_file, force, in_place, version):
f'{error.__class__.__name__}:{error}'
)

echo.echo_success(f'migrated the archive to version {version}')
echo.echo_success(f'migrated the archive to version {version!r}')


class ExtrasImportCode(Enum):
@@ -333,7 +336,7 @@ def import_archive(
ctx, archives, webpages, extras_mode_existing, extras_mode_new, comment_mode, include_authinfos, migration,
batch_size, import_group, group, test_run
):
"""Import data from an AiiDA archive file.
"""Import archived data to a profile.
The archive can be specified by its relative or absolute file path, or its HTTP URL.
"""
@@ -424,12 +427,11 @@ def _import_archive_and_migrate(archive: str, web_based: bool, import_kwargs: di
:param archive: the path or URL to the archive
:param web_based: If the archive needs to be downloaded first
:param import_kwargs: keyword arguments to pass to the import function
:param try_migration: whether to try a migration if the import raises IncompatibleArchiveVersionError
:param try_migration: whether to try a migration if the import raises `IncompatibleStorageSchema`
"""
from aiida.common.folders import SandboxFolder
from aiida.tools.archive.abstract import get_format
from aiida.tools.archive.exceptions import IncompatibleArchiveVersionError
from aiida.tools.archive.imports import import_archive as _import_archive

archive_format = get_format()
Expand All @@ -452,7 +454,7 @@ def _import_archive_and_migrate(archive: str, web_based: bool, import_kwargs: di
echo.echo_report(f'starting import: {archive}')
try:
_import_archive(archive_path, archive_format=archive_format, **import_kwargs)
except IncompatibleArchiveVersionError as exception:
except IncompatibleStorageSchema as exception:
if try_migration:

echo.echo_report(f'incompatible version detected for {archive}, trying migration')
6 changes: 3 additions & 3 deletions aiida/manage/configuration/profile.py
@@ -127,9 +127,9 @@ def storage_cls(self) -> Type['StorageBackend']:
if self.storage_backend == 'psql_dos':
from aiida.storage.psql_dos.backend import PsqlDosBackend
return PsqlDosBackend
if self.storage_backend == 'archive.sqlite':
from aiida.tools.archive.implementations.sqlite.backend import ArchiveReadOnlyBackend
return ArchiveReadOnlyBackend
if self.storage_backend == 'sqlite_zip':
from aiida.storage.sqlite_zip.backend import SqliteZipBackend
return SqliteZipBackend
raise ValueError(f'unknown storage backend type: {self.storage_backend}')

@property
1 change: 1 addition & 0 deletions aiida/storage/log.py
@@ -12,3 +12,4 @@
from aiida.common.log import AIIDA_LOGGER

STORAGE_LOGGER = AIIDA_LOGGER.getChild('storage')
MIGRATE_LOGGER = STORAGE_LOGGER.getChild('migrate')
2 changes: 1 addition & 1 deletion aiida/storage/psql_dos/__init__.py
@@ -7,7 +7,7 @@
# For further information on the license, see the LICENSE.txt file #
# For further information please visit http://www.aiida.net #
###########################################################################
"""Module with implementation of the storage backend using SqlAlchemy and the disk-objectstore."""
"""Module with implementation of the storage backend using PostGreSQL and the disk-objectstore."""

# AUTO-GENERATED

2 changes: 1 addition & 1 deletion aiida/storage/psql_dos/backend.py
@@ -55,7 +55,7 @@ def version_head(cls) -> str:
return cls.migrator.get_schema_version_head()

@classmethod
def version_profile(cls, profile: Profile) -> None:
def version_profile(cls, profile: Profile) -> Optional[str]:
return cls.migrator(profile).get_schema_version_profile(check_legacy=True)

@classmethod
14 changes: 2 additions & 12 deletions aiida/storage/psql_dos/migrations/env.py
@@ -16,18 +16,8 @@ def run_migrations_online():
The connection should have been passed to the config, which we use to configure the migration context.
"""
from aiida.storage.psql_dos.models.base import get_orm_metadata

# pylint: disable=unused-import
from aiida.common.exceptions import DbContentError
from aiida.storage.psql_dos.models.authinfo import DbAuthInfo
from aiida.storage.psql_dos.models.base import Base
from aiida.storage.psql_dos.models.comment import DbComment
from aiida.storage.psql_dos.models.computer import DbComputer
from aiida.storage.psql_dos.models.group import DbGroup
from aiida.storage.psql_dos.models.log import DbLog
from aiida.storage.psql_dos.models.node import DbLink, DbNode
from aiida.storage.psql_dos.models.settings import DbSetting
from aiida.storage.psql_dos.models.user import DbUser
config = context.config # pylint: disable=no-member

connection = config.attributes.get('connection', None)
@@ -43,7 +33,7 @@ def run_migrations_online():

context.configure( # pylint: disable=no-member
connection=connection,
target_metadata=Base.metadata,
target_metadata=get_orm_metadata(),
transaction_per_migration=True,
aiida_profile=aiida_profile,
on_version_apply=on_version_apply
18 changes: 8 additions & 10 deletions aiida/storage/psql_dos/migrator.py
@@ -33,6 +33,7 @@

from aiida.common import exceptions
from aiida.manage.configuration.profile import Profile
from aiida.storage.log import MIGRATE_LOGGER
from aiida.storage.psql_dos.models.settings import DbSetting
from aiida.storage.psql_dos.utils import create_sqlalchemy_engine

@@ -197,8 +198,6 @@ def migrate(self) -> None:
:raises: :class:`~aiida.common.exceptions.UnreachableStorage` if the storage cannot be accessed
"""
from aiida.cmdline.utils import echo

# the database can be in one of a few states:
# 1. Completely empty -> we can simply initialise it with the current ORM schema
# 2. Legacy django database -> we transfer the version to alembic, migrate to the head of the django branch,
@@ -211,7 +210,7 @@ def migrate(self) -> None:
if not inspect(connection).has_table(self.alembic_version_tbl_name):
if not inspect(connection).has_table(self.django_version_table.name):
# the database is assumed to be empty, so we need to initialise it
echo.echo_report('initialising empty storage schema')
MIGRATE_LOGGER.report('initialising empty storage schema')
self.initialise()
return
# the database is a legacy django one,
@@ -238,10 +237,10 @@ def migrate(self) -> None:
if 'django' in branches or 'sqlalchemy' in branches:
# migrate up to the top of the respective legacy branches
if 'django' in branches:
echo.echo_report('Migrating to the head of the legacy django branch')
MIGRATE_LOGGER.report('Migrating to the head of the legacy django branch')
self.migrate_up('django@head')
elif 'sqlalchemy' in branches:
echo.echo_report('Migrating to the head of the legacy sqlalchemy branch')
MIGRATE_LOGGER.report('Migrating to the head of the legacy sqlalchemy branch')
self.migrate_up('sqlalchemy@head')
# now re-stamp with the comparable revision on the main branch
with self._connection_context() as connection:
@@ -251,7 +250,7 @@ def migrate(self) -> None:
connection.commit()

# finally migrate to the main head revision
echo.echo_report('Migrating to the head of the main branch')
MIGRATE_LOGGER.report('Migrating to the head of the main branch')
self.migrate_up('main@head')

def migrate_up(self, version: str) -> None:
@@ -284,7 +283,7 @@ def _alembic_script(cls):
return ScriptDirectory.from_config(cls._alembic_config())

@contextlib.contextmanager
def _alembic_connect(self, _connection: Optional[Connection] = None):
def _alembic_connect(self, _connection: Optional[Connection] = None) -> Iterator[Config]:
"""Context manager to return an instance of an Alembic configuration.
The profile's database connection is added in the `attributes` property, through which it can then also be
@@ -297,16 +296,15 @@ def _alembic_connect(self, _connection: Optional[Connection] = None):

def _callback(step: MigrationInfo, **kwargs): # pylint: disable=unused-argument
"""Callback to be called after a migration step is executed."""
from aiida.cmdline.utils import echo
from_rev = step.down_revision_ids[0] if step.down_revision_ids else '<base>'
echo.echo_report(f'- {from_rev} -> {step.up_revision_id}')
MIGRATE_LOGGER.report(f'- {from_rev} -> {step.up_revision_id}')

config.attributes['on_version_apply'] = _callback # pylint: disable=unsupported-assignment-operation

yield config

@contextlib.contextmanager
def _migration_context(self, _connection: Optional[Connection] = None) -> MigrationContext:
def _migration_context(self, _connection: Optional[Connection] = None) -> Iterator[MigrationContext]:
"""Context manager to return an instance of an Alembic migration context.
This migration context will have been configured with the current database connection, which allows this context
33 changes: 33 additions & 0 deletions aiida/storage/sqlite_zip/__init__.py
@@ -0,0 +1,33 @@
# -*- coding: utf-8 -*-
###########################################################################
# Copyright (c), The AiiDA team. All rights reserved. #
# This file is part of the AiiDA code. #
# #
# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core #
# For further information on the license, see the LICENSE.txt file #
# For further information please visit http://www.aiida.net #
###########################################################################
"""Module with implementation of the storage backend,
using an SQLite database and repository files, within a zipfile.
The content of the zip file is::
|- storage.zip
|- metadata.json
|- db.sqlite3
|- repo/
|- hashkey1
|- hashkey2
...
For quick access, the metadata (such as the version) is stored in a `metadata.json` file,
at the "top" of the zip file, with the sqlite database, just below it, then the repository files.
Repository files are named by their SHA256 content hash.
This storage method is primarily intended for the AiiDA archive,
as a read-only storage method.
This is because sqlite and zip are not suitable for concurrent write access.
The archive format originally used a JSON file to store the database,
and these revisions are handled by the `version_profile` and `migrate` backend methods.
"""
