diff --git a/README.md b/README.md index 6ac76c76..de04a821 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,12 @@ ![Python](https://img.shields.io/badge/python-3.9-blue.svg) ![Python](https://img.shields.io/badge/python-3.10-blue.svg) [![PyPI version](https://badge.fury.io/py/biocypher.svg)](https://badge.fury.io/py/biocypher) -[![Project Status: Active – The project has reached a stable, usable state and is being actively developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active) -![Docs build](https://github.com/biocypher/biocypher/actions/workflows/sphinx_autodoc.yml/badge.svg) -[![Downloads](https://static.pepy.tech/badge/biocypher)](https://pepy.tech/project/biocypher) -[![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit)](https://github.com/pre-commit/pre-commit) +[![Project Status: Active – The project has reached a stable, usable state and is being actively developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active) +![Docs build](https://github.com/biocypher/biocypher/actions/workflows/sphinx_autodoc.yml/badge.svg) +[![Downloads](https://static.pepy.tech/badge/biocypher)](https://pepy.tech/project/biocypher) +[![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit)](https://github.com/pre-commit/pre-commit) [![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg?style=flat-square)](http://makeapullrequest.com) -[![Powered by the Bioregistry](https://img.shields.io/static/v1?label=Powered%20by&message=Bioregistry&color=BA274A&style=flat&logo=image/png;base64,iVBORw0KGgoAAAANSUhEUgAAACgAAAAoCAYAAACM/rhtAAAACXBIWXMAAAEnAAABJwGNvPDMAAAAGXRFWHRTb2Z0d2FyZQB3d3cuaW5rc2NhcGUub3Jnm+48GgAACi9JREFUWIWtmXl41MUZxz/z291sstmQO9mQG0ISwHBtOOSwgpUQhApWgUfEowKigKI81actypaqFbWPVkGFFKU0Vgs+YgvhEAoqEUESrnDlEEhCbkLYJtlkk9399Y/N/rKbzQXt96+Zed+Z9/t7Z+adeecnuA1s5yFVSGrLOAf2qTiEEYlUZKIAfYdKE7KoBLkQSc4XgkPfXxz/owmT41ZtiVtR3j94eqxQq5aDeASIvkVb12RBtt0mb5xZsvfa/5XgnqTMcI3Eq7IQjwM+7jJJo8YvNhK/qDBUOl8A7JZWWqqu01Jeg6Pd1nW4NuBjjax6eWrRruv/M8EDqTMflmXeB0Jcbb6RIRhmTCJ0ymgC0wYjadTd9nW0tWMu+In63NNU7c3FWtvgJpXrZVlakVGU8/ltEcwzGjU3miI/ABa72vwTB5K45AEi7x2PUEl9fZsHZLuDmgPHuLJpJ82lle6iTSH6mpXp+fnt/Sa4yzhbp22yfwFkgnMaBy17kPhFmQh1997qLxztNkq35XB505fINtf0iz1WvfTQ7Pxdlj4Jdnjuny5yvpEhjHh7FQOGD/YyZi4owS86HJ+QQMDpJaBf3jUXlHD21+8q0y4LDppV/vfNO7+jzV3Pa6SOac0E8I8fSPonpm7JAVR+eRhzwU/Ofj+e49tpT/HdtGXcyLvQJ8HAtCTGfmJCF2dwfpTMz4NszX/uqqdyr+xPyVwoEK+C03PGrDX4GkJ7NBJ+txH/hCgAit7cRlNxOY62dmzmZgwzJvZJUh2gI/xnRmoOHsfe3AqQ/kho0qXs+pLzLh3FgwdT54YKxLsAQq0mbf1zHuTsltZejemHJSrlgGGDPGTXc09zdM5qTi59jZbKOg+Zb1QYI95+XokEQogPDifPDnPJFQ8uCkl8FyGmACQtn4dhxp3KINX7jnHi0ZeJnT8dla8Plbu+48zzfyJ08kh8ggIACB4zlIAhsURm3EnML6eB6Fzep1a+SUt5DS2VddTs+4GQccPRhgV1kowIQRaChhMXAPxkIev/Vl+8R/HgnqTMmI4gjH/iQOIXZSqdzQUlXDB9RPyi+1DrdVx67WMursvCkDERXYxB0ROSIOKecURMG+tBzkXAhbYbZk6teNPLkwmPzUIX71wuMiw+MHx2nEJQrWIFHSdE4pIHlFDisLZxYe1HhIwfTtLK+RSu30rVnlxGvrOapOcW9DsW3vH6CgKS4zxIXlz3Fw8dSaMmcfEcV9XHYbc/DSCZMEkgFoJzY0TeO17pVL7jANbaBoauWUJlTi4VOw+T9sazBKYl0ZB/qV/kALThQRi3vOJB0lpzw0vPMONOtOHOqRcyi7bzkEqanJo3HogBMGROUrziaGundGsOsQsyUPn6UPx2NvELZxIybhinn3uLyx9uVwaW7XbqjxdQmr2X0uy93Dh+Dtlu9zCu9vdj1PsvEWwcii7OwJAXFnoRFCoVhoxJrmr0gOQWo9qBfaorXodOHq0o1x8roN3cSMyC6ZT942uQBIlL53Jl804sV6oY9/fXAGg4WcjFdZuxlFV7GNPFRzFs7VKCRiV7ejJrTa/eDr1rFKXZOQCocEyTgHQAyUdD4B2d4cF8pohg4zC0YUFU7z5C9Jy7sVvbKPtsH6GT0tCGBtFwspBTz/zRixyApbSKk8te5+aZ4l4JdUVQWpIScmQhjGocUjJCRhcTieSjURQTF89FtttpuVaLpaya8Knp1B3OQ5Zlag/nU//9cmScS6EnONrauWjazIQ
v3kCoVD3quUPS+uAXHU7z1SpATpEQchSA78AwD0WVnxa1XkdjURlCJRGQHMfN/EuEjk9jyr4NRN47Hltjc58Gm0sraTjZ/w3l5BLuKkZJdFzT1f5+3Sq3NZjRDNAjaX1orb2BX2wEmkA9fvGGbvW7Q+OlUu+2wlIqdx+h3dzkJVPrda5iQJ93p+DRqcQ/PhsAw8xJ6AfHdkhuIVvoEribLl/jxKOv4Gi34T8omgnb1yOk7sdTA01AiK3J6yoGgP+gaPwHOdOP6LlTlXb3mNYXAlI8da9/e0pJBZovV2BrakYzQK/I3bg0SsiiCqClqs/0wAPB6UOVo6k3+CdEETwm1aPtP+dLlLJPSKAHOYDWCoVLlYTkKAKcCU4vO7IrhErFsLVLPXZ+V0haDcN+v8xjB9strdQfPavUA0ckefRxWNuwVNS6rBRKQB44r+Lmc5f7TRAgaFQyYzb9Dv/4gd18ASQ8/gsC0zwJNJVcw97aeWmOcDtaAW6eLXZLBchTC8EhWXbW6o+cInhMipetuu9OUvTWNnwNodzx+krlvAQIGjmECV+spyH/Ak3F5QDok+OoPXicip2HiJiWTuH6rQx6eh7BxlT0STH4xUbSUl6Df/xAIqaO9bBVn3taKUuy/ZAwYZImpvx4FYjVRgQzOec9r1vK0TmrldMiIDkO45ZXegxLLrRW13P0/heQHQ4CUhIYvfElNIHOtWaztNJ4qZQBqfFKLg3OMz135rNY624ClB0tHJcomTA5ZMGnANbaBmoOHPMy5hvZebNuLCoj71frXIN0i9pDJzj24IsIlUTCo7NI3/KyQg5ArfMleEyKBzmA6r1HO8eV+dSEySEB2G3yRpwZP1c2f+n1GjB07RIlcwNoKi7j3G839EhQF2cg6fmHmbznPRKevJ/GorIedV1wtLVzJesrV9WqQtoIHRfWjreSjwGar1ZRui3Ho7PfwHBGb3jRg6S1roGeoIuNJGBIPKV/zSF31irOrn4HXAu9B1zduhtLecelQxZZ9xTtrgC342Df8IwQyaYqBMKEWo0xaw1BI4d4DNJSWcfF32fRWnuD5NWPEDZ5lIe8NDuHq1v+ha2xGdkho4szYJg1hbj501EH6OgJ5oIS8hf/oWPm5HqNrE51vdt4nC/7k+9bIIT8GYA2Ipixn5jwjQrrZsju0XT5GubTRfiEBqFPisUvOrzPPi0VdeQ9YcJ63bWmxbzphTk7XHKvA/DrlJkfAU+Bcy2N+fA3vZK0WVoxny4idOKIfn+IO7lTz7zRObWCjdMv7VnhruOV9dws9F8u4CsAS1k1J54wYS4o6arWaaS8hvLP998yuZtnisl7wuROLkdjsKzqqtfL45FjB8gzwZnIJy6dS8Jjs3p8ausvHG3tXN26mytZO5W8Rcjsbg1Qze/X45ELHY9I7wHLXG26+CgSl8zFkDGh3zdkF2S7nep9PzhzmnK3FEGwUWOwrJr6zTdeL529EnRhf3LmfCHEBkBZiNrwIAwZkwi9a5Qzh9D6dNvXYW3jZkEJ9UdOOYPwdY/gXgdiufuGuC2C4Hy3kWXrOhmeBLQeA6jV6GLC8Y0KR613Hn+2phZaK69jqah1P/hdsCKLLIfGtnbG+f3eyfHtEHTh38mzom2SY4WQWQjE9tnBE+XIZKuQNrqCcH9wSwRdMGGSJiTnpatwTJOFMIKcgvPVX/kNIcM1gSgC8iTZfii3aEL+7fyG+C+6O8izl1GE5gAAAABJRU5ErkJggg==)](https://github.com/biopragmatics/bioregistry) +[![Powered by the 
Bioregistry](https://img.shields.io/static/v1?label=Powered%20by&message=Bioregistry&color=BA274A&style=flat&logo=image/png;base64,iVBORw0KGgoAAAANSUhEUgAAACgAAAAoCAYAAACM/rhtAAAACXBIWXMAAAEnAAABJwGNvPDMAAAAGXRFWHRTb2Z0d2FyZQB3d3cuaW5rc2NhcGUub3Jnm+48GgAACi9JREFUWIWtmXl41MUZxz/z291sstmQO9mQG0ISwHBtOOSwgpUQhApWgUfEowKigKI81actypaqFbWPVkGFFKU0Vgs+YgvhEAoqEUESrnDlEEhCbkLYJtlkk9399Y/N/rKbzQXt96+Zed+Z9/t7Z+adeecnuA1s5yFVSGrLOAf2qTiEEYlUZKIAfYdKE7KoBLkQSc4XgkPfXxz/owmT41ZtiVtR3j94eqxQq5aDeASIvkVb12RBtt0mb5xZsvfa/5XgnqTMcI3Eq7IQjwM+7jJJo8YvNhK/qDBUOl8A7JZWWqqu01Jeg6Pd1nW4NuBjjax6eWrRruv/M8EDqTMflmXeB0Jcbb6RIRhmTCJ0ymgC0wYjadTd9nW0tWMu+In63NNU7c3FWtvgJpXrZVlakVGU8/ltEcwzGjU3miI/ABa72vwTB5K45AEi7x2PUEl9fZsHZLuDmgPHuLJpJ82lle6iTSH6mpXp+fnt/Sa4yzhbp22yfwFkgnMaBy17kPhFmQh1997qLxztNkq35XB505fINtf0iz1WvfTQ7Pxdlj4Jdnjuny5yvpEhjHh7FQOGD/YyZi4owS86HJ+QQMDpJaBf3jUXlHD21+8q0y4LDppV/vfNO7+jzV3Pa6SOac0E8I8fSPonpm7JAVR+eRhzwU/Ofj+e49tpT/HdtGXcyLvQJ8HAtCTGfmJCF2dwfpTMz4NszX/uqqdyr+xPyVwoEK+C03PGrDX4GkJ7NBJ+txH/hCgAit7cRlNxOY62dmzmZgwzJvZJUh2gI/xnRmoOHsfe3AqQ/kho0qXs+pLzLh3FgwdT54YKxLsAQq0mbf1zHuTsltZejemHJSrlgGGDPGTXc09zdM5qTi59jZbKOg+Zb1QYI95+XokEQogPDifPDnPJFQ8uCkl8FyGmACQtn4dhxp3KINX7jnHi0ZeJnT8dla8Plbu+48zzfyJ08kh8ggIACB4zlIAhsURm3EnML6eB6Fzep1a+SUt5DS2VddTs+4GQccPRhgV1kowIQRaChhMXAPxkIev/Vl+8R/HgnqTMmI4gjH/iQOIXZSqdzQUlXDB9RPyi+1DrdVx67WMursvCkDERXYxB0ROSIOKecURMG+tBzkXAhbYbZk6teNPLkwmPzUIX71wuMiw+MHx2nEJQrWIFHSdE4pIHlFDisLZxYe1HhIwfTtLK+RSu30rVnlxGvrOapOcW9DsW3vH6CgKS4zxIXlz3Fw8dSaMmcfEcV9XHYbc/DSCZMEkgFoJzY0TeO17pVL7jANbaBoauWUJlTi4VOw+T9sazBKYl0ZB/qV/kALThQRi3vOJB0lpzw0vPMONOtOHOqRcyi7bzkEqanJo3HogBMGROUrziaGundGsOsQsyUPn6UPx2NvELZxIybhinn3uLyx9uVwaW7XbqjxdQmr2X0uy93Dh+Dtlu9zCu9vdj1PsvEWwcii7OwJAXFnoRFCoVhoxJrmr0gOQWo9qBfaorXodOHq0o1x8roN3cSMyC6ZT942uQBIlL53Jl804sV6oY9/fXAGg4WcjFdZuxlFV7GNPFRzFs7VKCRiV7ejJrTa/eDr1rFKXZOQCocEyTgHQAyUdD4B2d4cF8pohg4zC0YUFU7z5C9Jy7sVvbKPtsH6GT0tCGBtFwspBTz/zRixyApbSKk8te5+aZ4l4JdUVQWpIScmQhjGocUjJCRhcTieSjURQTF89FtttpuVaLpaya8Knp1B3OQ5Zlag/nU//9cmScS6EnONrauWjazIQv3kCoVD3quUPS+uAXHU7z1SpATpEQchSA78AwD0WVnxa1XkdjURlCJRGQHMfN/EuEjk9jyr4NRN47Hltjc58Gm0sraTjZ/w3l5BLuKkZJdFzT1f5+3Sq3NZjRDNAjaX1orb2BX2wEmkA9fvGGbvW7Q+OlUu+2wlIqdx+h3dzkJVPrda5iQJ93p+DRqcQ/PhsAw8xJ6AfHdkhuIVvoEribLl/jxKOv4Gi34T8omgnb1yOk7sdTA01AiK3J6yoGgP+gaPwHOdOP6LlTlXb3mNYXAlI8da9/e0pJBZovV2BrakYzQK/I3bg0SsiiCqClqs/0wAPB6UOVo6k3+CdEETwm1aPtP+dLlLJPSKAHOYDWCoVLlYTkKAKcCU4vO7IrhErFsLVLPXZ+V0haDcN+v8xjB9strdQfPavUA0ckefRxWNuwVNS6rBRKQB44r+Lmc5f7TRAgaFQyYzb9Dv/4gd18ASQ8/gsC0zwJNJVcw97aeWmOcDtaAW6eLXZLBchTC8EhWXbW6o+cInhMipetuu9OUvTWNnwNodzx+krlvAQIGjmECV+spyH/Ak3F5QDok+OoPXicip2HiJiWTuH6rQx6eh7BxlT0STH4xUbSUl6Df/xAIqaO9bBVn3taKUuy/ZAwYZImpvx4FYjVRgQzOec9r1vK0TmrldMiIDkO45ZXegxLLrRW13P0/heQHQ4CUhIYvfElNIHOtWaztNJ4qZQBqfFKLg3OMz135rNY624ClB0tHJcomTA5ZMGnANbaBmoOHPMy5hvZebNuLCoj71frXIN0i9pDJzj24IsIlUTCo7NI3/KyQg5ArfMleEyKBzmA6r1HO8eV+dSEySEB2G3yRpwZP1c2f+n1GjB07RIlcwNoKi7j3G839EhQF2cg6fmHmbznPRKevJ/GorIedV1wtLVzJesrV9WqQtoIHRfWjreSjwGar1ZRui3Ho7PfwHBGb3jRg6S1roGeoIuNJGBIPKV/zSF31irOrn4HXAu9B1zduhtLecelQxZZ9xTtrgC342Df8IwQyaYqBMKEWo0xaw1BI4d4DNJSWcfF32fRWnuD5NWPEDZ5lIe8NDuHq1v+ha2xGdkho4szYJg1hbj501EH6OgJ5oIS8hf/oWPm5HqNrE51vdt4nC/7k+9bIIT8GYA2Ipixn5jwjQrrZsju0XT5GubTRfiEBqFPisUvOrzPPi0VdeQ9YcJ63bWmxbzphTk7XHKvA/DrlJkfAU+Bcy2N+fA3vZK0WVoxny4idOKIfn+IO7lTz7zRObWCjdMv7VnhruOV9dws9F8u4CsAS1k1J54wYS4o6arWaaS8hvLP998yuZtnisl7wuROLkdjsKzqqtfL45FjB8gzwZnIJy6dS8Jjs3p8ausvHG3tXN26mytZO5W8Rcjsbg1Qze/X45ELHY9I7wHLXG26+CgSl8zFkDGh3zdkF2S7nep9PzhzmnK3FEGwUWOwrJr6zTdeL529EnRhf3LmfCHEBkBZiNrwIAwZkwi9a5Qzh9D6dNvXYW3jZkEJ9UdOOYPwdY/gXgdi
ufuGuC2C4Hy3kWXrOhmeBLQeA6jV6GLC8Y0KR613Hn+2phZaK69jqah1P/hdsCKLLIfGtnbG+f3eyfHtEHTh38mzom2SY4WQWQjE9tnBE+XIZKuQNrqCcH9wSwRdMGGSJiTnpatwTJOFMIKcgvPVX/kNIcM1gSgC8iTZfii3aEL+7fyG+C+6O8izl1GE5gAAAABJRU5ErkJggg==)](https://github.com/biopragmatics/bioregistry) ## ❓ Description Knowledge graphs (KGs) are an [approach to knowledge @@ -60,8 +60,8 @@ please join our community at https://biocypher.zulipchat.com! > This disclaimer was adapted from the [Pooch](https://github.com/fatiando/pooch) project. ## ✍️ Citation -The BioCypher paper has been peer-reviewed in -[Nature Biotechnology](https://www.nature.com/articles/s41587-023-01848-y). +The BioCypher paper has been peer-reviewed in +[Nature Biotechnology](https://www.nature.com/articles/s41587-023-01848-y). Before, it was available as a preprint at https://arxiv.org/abs/2212.13543. ## Acknowledgements diff --git a/biocypher/__init__.py b/biocypher/__init__.py index 52c067da..6222ea76 100644 --- a/biocypher/__init__.py +++ b/biocypher/__init__.py @@ -13,14 +13,14 @@ """ __all__ = [ - '__version__', - '__author__', - 'module_data', - 'config', - 'logfile', - 'log', - 'Driver', - 'BioCypher', + "__version__", + "__author__", + "module_data", + "config", + "logfile", + "log", + "Driver", + "BioCypher", ] from ._core import BioCypher @@ -30,11 +30,10 @@ class Driver(BioCypher): - # initialise parent class but log a warning def __init__(self, *args, **kwargs): logger.warning( - 'The class `Driver` is deprecated and will be removed in a future ' - 'release. Please use `BioCypher` instead.' + "The class `Driver` is deprecated and will be removed in a future " + "release. Please use `BioCypher` instead." ) super().__init__(*args, **kwargs) diff --git a/biocypher/_config/__init__.py b/biocypher/_config/__init__.py index 584a30a5..3d421c1e 100644 --- a/biocypher/_config/__init__.py +++ b/biocypher/_config/__init__.py @@ -23,10 +23,10 @@ import yaml import appdirs -__all__ = ['module_data', 'module_data_path', 'read_config', 'config', 'reset'] +__all__ = ["module_data", "module_data_path", "read_config", "config", "reset"] -_USER_CONFIG_DIR = appdirs.user_config_dir('biocypher', 'saezlab') -_USER_CONFIG_FILE = os.path.join(_USER_CONFIG_DIR, 'conf.yaml') +_USER_CONFIG_DIR = appdirs.user_config_dir("biocypher", "saezlab") +_USER_CONFIG_FILE = os.path.join(_USER_CONFIG_DIR, "conf.yaml") class MyLoader(yaml.SafeLoader): @@ -34,18 +34,18 @@ def construct_scalar(self, node): # Check if the scalar contains double quotes and an escape sequence value = super().construct_scalar(node) q = bool(node.style == '"') - b = bool('\\' in value.encode('unicode_escape').decode('utf-8')) + b = bool("\\" in value.encode("unicode_escape").decode("utf-8")) if q and b: warnings.warn( ( - 'Double quotes detected in YAML configuration scalar: ' + "Double quotes detected in YAML configuration scalar: " f"{value.encode('unicode_escape')}. " - 'These allow escape sequences and may cause problems, for ' + "These allow escape sequences and may cause problems, for " "instance with the Neo4j admin import files (e.g. '\\t'). " - 'Make sure you wanted to do this, and use single quotes ' - 'whenever possible.' + "Make sure you wanted to do this, and use single quotes " + "whenever possible." 
), - category=UserWarning + category=UserWarning, ) return value @@ -57,7 +57,7 @@ def module_data_path(name: str) -> str: here = os.path.dirname(os.path.abspath(__file__)) - return os.path.join(here, f'{name}.yaml') + return os.path.join(here, f"{name}.yaml") def module_data(name: str) -> Any: @@ -71,11 +71,8 @@ def module_data(name: str) -> Any: def _read_yaml(path: str) -> Optional[dict]: - if os.path.exists(path): - - with open(path, 'r') as fp: - + with open(path, "r") as fp: return yaml.load(fp.read(), Loader=MyLoader) @@ -89,18 +86,22 @@ def read_config() -> dict: TODO explain path configuration """ - defaults = module_data('biocypher_config') + defaults = module_data("biocypher_config") user = _read_yaml(_USER_CONFIG_FILE) or {} # TODO account for .yml? - local = _read_yaml('biocypher_config.yaml' - ) or _read_yaml('config/biocypher_config.yaml') or {} + local = ( + _read_yaml("biocypher_config.yaml") + or _read_yaml("config/biocypher_config.yaml") + or {} + ) for key in defaults: - - value = local[key] if key in local else user[key] if key in user else None + value = ( + local[key] if key in local else user[key] if key in user else None + ) if value is not None: - if type(defaults[key]) == str: # first level config (like title) + if type(defaults[key]) == str: # first level config (like title) defaults[key] = value else: defaults[key].update(value) @@ -114,20 +115,17 @@ def config(*args, **kwargs) -> Optional[Any]: """ if args and kwargs: - raise ValueError( - 'Setting and getting values in the same call is not allowed.', + "Setting and getting values in the same call is not allowed.", ) if args: - - result = tuple(globals()['_config'].get(key, None) for key in args) + result = tuple(globals()["_config"].get(key, None) for key in args) return result[0] if len(result) == 1 else result for key, value in kwargs.items(): - - globals()['_config'][key].update(value) + globals()["_config"][key].update(value) def reset(): @@ -135,7 +133,7 @@ def reset(): Reload configuration from the config files. """ - globals()['_config'] = read_config() + globals()["_config"] = read_config() reset() diff --git a/biocypher/_config/biocypher_config.yaml b/biocypher/_config/biocypher_config.yaml index 8fae981f..a31167be 100644 --- a/biocypher/_config/biocypher_config.yaml +++ b/biocypher/_config/biocypher_config.yaml @@ -109,5 +109,3 @@ postgresql: delimiter: '\t' # import_call_bin_prefix: '' # path to "psql" # import_call_file_prefix: '/path/to/files' - - \ No newline at end of file diff --git a/biocypher/_connect.py b/biocypher/_connect.py index 88f3b3aa..3e2a2a93 100644 --- a/biocypher/_connect.py +++ b/biocypher/_connect.py @@ -13,7 +13,7 @@ """ from ._logger import logger -logger.debug(f'Loading module {__name__}.') +logger.debug(f"Loading module {__name__}.") from typing import Optional from collections.abc import Iterable @@ -27,10 +27,10 @@ from ._ontology import Ontology from ._translate import Translator -__all__ = ['_Neo4jDriver'] +__all__ = ["_Neo4jDriver"] -class _Neo4jDriver(): +class _Neo4jDriver: """ Manages a BioCypher connection to a Neo4j database using the ``neo4j_utils.Driver`` class. @@ -58,6 +58,7 @@ class _Neo4jDriver(): translator (Translator): The translator to use for mapping. 
""" + def __init__( self, database_name: str, @@ -71,7 +72,6 @@ def __init__( fetch_size: int = 1000, increment_version: bool = True, ): - self._ontology = ontology self._translator = translator @@ -89,23 +89,18 @@ def __init__( # check for biocypher config in connected graph if wipe: - self.init_db() if increment_version: - # set new current version node self._update_meta_graph() def _update_meta_graph(self): - - logger.info('Updating Neo4j meta graph.') + logger.info("Updating Neo4j meta graph.") # find current version node db_version = self._driver.query( - 'MATCH (v:BioCypher) ' - 'WHERE NOT (v)-[:PRECEDES]->() ' - 'RETURN v', + "MATCH (v:BioCypher) " "WHERE NOT (v)-[:PRECEDES]->() " "RETURN v", ) # add version node self.add_biocypher_nodes(self._ontology) @@ -113,11 +108,11 @@ def _update_meta_graph(self): # connect version node to previous if db_version[0]: previous = db_version[0][0] - previous_id = previous['v']['id'] + previous_id = previous["v"]["id"] e_meta = BioCypherEdge( previous_id, - self._ontology.get_dict().get('node_id'), - 'PRECEDES', + self._ontology.get_dict().get("node_id"), + "PRECEDES", ) self.add_biocypher_edges(e_meta) @@ -132,7 +127,7 @@ def init_db(self): need of the database """ - logger.info('Initialising database.') + logger.info("Initialising database.") self._create_constraints() def _create_constraints(self): @@ -144,17 +139,16 @@ def _create_constraints(self): constraints on the id of all entities represented as nodes. """ - logger.info('Creating constraints for node types in config.') + logger.info("Creating constraints for node types in config.") # get structure for leaf in self._ontology.extended_schema.items(): label = _misc.sentencecase_to_pascalcase(leaf[0]) - if leaf[1]['represented_as'] == 'node': - + if leaf[1]["represented_as"] == "node": s = ( - f'CREATE CONSTRAINT `{label}_id` ' - f'IF NOT EXISTS ON (n:`{label}`) ' - 'ASSERT n.id IS UNIQUE' + f"CREATE CONSTRAINT `{label}_id` " + f"IF NOT EXISTS ON (n:`{label}`) " + "ASSERT n.id IS UNIQUE" ) self._driver.query(s) @@ -246,38 +240,36 @@ def add_biocypher_nodes( """ try: - nodes = _misc.to_list(nodes) entities = [node.get_dict() for node in nodes] except AttributeError: - - msg = 'Nodes must have a `get_dict` method.' + msg = "Nodes must have a `get_dict` method." logger.error(msg) raise ValueError(msg) - logger.info(f'Merging {len(entities)} nodes.') + logger.info(f"Merging {len(entities)} nodes.") entity_query = ( - 'UNWIND $entities AS ent ' - 'CALL apoc.merge.node([ent.node_label], ' - '{id: ent.node_id}, ent.properties, ent.properties) ' - 'YIELD node ' - 'RETURN node' + "UNWIND $entities AS ent " + "CALL apoc.merge.node([ent.node_label], " + "{id: ent.node_id}, ent.properties, ent.properties) " + "YIELD node " + "RETURN node" ) - method = 'explain' if explain else 'profile' if profile else 'query' + method = "explain" if explain else "profile" if profile else "query" result = getattr(self._driver, method)( entity_query, parameters={ - 'entities': entities, + "entities": entities, }, ) - logger.info('Finished merging nodes.') + logger.info("Finished merging nodes.") return result @@ -326,28 +318,23 @@ def add_biocypher_edges( rels = [] try: - for e in edges: - - if hasattr(e, 'get_node'): - + if hasattr(e, "get_node"): nodes.append(e.get_node()) rels.append(e.get_source_edge().get_dict()) rels.append(e.get_target_edge().get_dict()) else: - rels.append(e.get_dict()) except AttributeError: - - msg = 'Edges and nodes must have a `get_dict` method.' 
+ msg = "Edges and nodes must have a `get_dict` method." logger.error(msg) raise ValueError(msg) self.add_biocypher_nodes(nodes) - logger.info(f'Merging {len(rels)} edges.') + logger.info(f"Merging {len(rels)} edges.") # cypher query @@ -355,41 +342,40 @@ def add_biocypher_edges( # properties on match and on create; # TODO add node labels? node_query = ( - 'UNWIND $rels AS r ' - 'MERGE (src {id: r.source_id}) ' - 'MERGE (tar {id: r.target_id}) ' + "UNWIND $rels AS r " + "MERGE (src {id: r.source_id}) " + "MERGE (tar {id: r.target_id}) " ) - self._driver.query(node_query, parameters={'rels': rels}) + self._driver.query(node_query, parameters={"rels": rels}) edge_query = ( - 'UNWIND $rels AS r ' - 'MATCH (src {id: r.source_id}) ' - 'MATCH (tar {id: r.target_id}) ' - 'WITH src, tar, r ' - 'CALL apoc.merge.relationship' - '(src, r.relationship_label, NULL, ' - 'r.properties, tar, r.properties) ' - 'YIELD rel ' - 'RETURN rel' + "UNWIND $rels AS r " + "MATCH (src {id: r.source_id}) " + "MATCH (tar {id: r.target_id}) " + "WITH src, tar, r " + "CALL apoc.merge.relationship" + "(src, r.relationship_label, NULL, " + "r.properties, tar, r.properties) " + "YIELD rel " + "RETURN rel" ) - method = 'explain' if explain else 'profile' if profile else 'query' + method = "explain" if explain else "profile" if profile else "query" - result = getattr(self._driver, - method)(edge_query, parameters={ - 'rels': rels - }) + result = getattr(self._driver, method)( + edge_query, parameters={"rels": rels} + ) - logger.info('Finished merging edges.') + logger.info("Finished merging edges.") return result def get_driver( dbms: str, - translator: 'Translator', - ontology: 'Ontology', + translator: "Translator", + ontology: "Ontology", ): """ Function to return the writer class. @@ -400,14 +386,14 @@ def get_driver( dbms_config = _config(dbms) - if dbms == 'neo4j': + if dbms == "neo4j": return _Neo4jDriver( - database_name=dbms_config['database_name'], - wipe=dbms_config['wipe'], - uri=dbms_config['uri'], - user=dbms_config['user'], - password=dbms_config['password'], - multi_db=dbms_config['multi_db'], + database_name=dbms_config["database_name"], + wipe=dbms_config["wipe"], + uri=dbms_config["uri"], + user=dbms_config["user"], + password=dbms_config["password"], + multi_db=dbms_config["multi_db"], ontology=ontology, translator=translator, ) diff --git a/biocypher/_core.py b/biocypher/_core.py index a6096eb0..2cf6f796 100644 --- a/biocypher/_core.py +++ b/biocypher/_core.py @@ -12,34 +12,36 @@ BioCypher core module. Interfaces with the user and distributes tasks to submodules. 
""" -from typing import Dict, List, Optional +from typing import Optional + from more_itertools import peekable + import pandas as pd from ._logger import logger -logger.debug(f'Loading module {__name__}.') +logger.debug(f"Loading module {__name__}.") from ._write import get_writer -from ._pandas import Pandas from ._config import config as _config from ._config import update_from_file as _file_update from ._create import BioCypherEdge, BioCypherNode +from ._pandas import Pandas from ._connect import get_driver from ._mapping import OntologyMapping from ._ontology import Ontology from ._translate import Translator from ._deduplicate import Deduplicator -__all__ = ['BioCypher'] +__all__ = ["BioCypher"] -SUPPORTED_DBMS = ['neo4j', 'postgresql'] +SUPPORTED_DBMS = ["neo4j", "postgresql"] REQUIRED_CONFIG = [ - 'dbms', - 'offline', - 'strict_mode', - 'head_ontology', + "dbms", + "offline", + "strict_mode", + "head_ontology", ] @@ -75,6 +77,7 @@ class BioCypher: provided, the default value 'biocypher-out' will be used. """ + def __init__( self, dbms: str = None, @@ -88,65 +91,64 @@ def __init__( # legacy params db_name: str = None, ): - # Update configuration if custom path is provided if biocypher_config_path: _file_update(biocypher_config_path) if db_name: logger.warning( - 'The parameter `db_name` is deprecated. Please set the ' - '`database_name` setting in the `biocypher_config.yaml` file ' - 'instead.' + "The parameter `db_name` is deprecated. Please set the " + "`database_name` setting in the `biocypher_config.yaml` file " + "instead." ) - _config(**{db_name: {'database_name': db_name}}) + _config(**{db_name: {"database_name": db_name}}) # Load configuration - self.base_config = _config('biocypher') + self.base_config = _config("biocypher") # Check for required configuration for key in REQUIRED_CONFIG: if key not in self.base_config: - raise ValueError(f'Configuration key {key} is required.') + raise ValueError(f"Configuration key {key} is required.") # Set configuration - mandatory - self._dbms = dbms or self.base_config['dbms'] + self._dbms = dbms or self.base_config["dbms"] if offline is None: - self._offline = self.base_config['offline'] + self._offline = self.base_config["offline"] else: self._offline = offline if strict_mode is None: - self._strict_mode = self.base_config['strict_mode'] + self._strict_mode = self.base_config["strict_mode"] else: self._strict_mode = strict_mode self._schema_config_path = schema_config_path or self.base_config.get( - 'schema_config_path' + "schema_config_path" ) if not self._schema_config_path: raise ValueError( - 'BioCypher requires a schema configuration; please provide a ' - 'path to the schema configuration YAML file via ' - '`biocypher_config.yaml` or `BioCypher` class parameter.' + "BioCypher requires a schema configuration; please provide a " + "path to the schema configuration YAML file via " + "`biocypher_config.yaml` or `BioCypher` class parameter." ) - self._head_ontology = head_ontology or self.base_config['head_ontology'] + self._head_ontology = head_ontology or self.base_config["head_ontology"] # Set configuration - optional self._output_directory = output_directory or self.base_config.get( - 'output_directory' + "output_directory" ) self._tail_ontologies = tail_ontologies or self.base_config.get( - 'tail_ontologies' + "tail_ontologies" ) if self._dbms not in SUPPORTED_DBMS: raise ValueError( - f'DBMS {self._dbms} not supported. ' - f'Please select from {SUPPORTED_DBMS}.' + f"DBMS {self._dbms} not supported. 
" + f"Please select from {SUPPORTED_DBMS}." ) # Initialize @@ -156,7 +158,7 @@ def __init__( self._ontology = None self._writer = None self._pd = None - + def _get_deduplicator(self) -> Deduplicator: """ Create deduplicator if not exists and return. @@ -222,7 +224,7 @@ def _get_writer(self): strict_mode=self._strict_mode, ) else: - raise NotImplementedError('Cannot get writer in online mode.') + raise NotImplementedError("Cannot get writer in online mode.") def _get_driver(self): """ @@ -237,12 +239,12 @@ def _get_driver(self): deduplicator=self._get_deduplicator(), ) else: - raise NotImplementedError('Cannot get driver in offline mode.') + raise NotImplementedError("Cannot get driver in offline mode.") def write_nodes(self, nodes, batch_size: int = int(1e6)) -> bool: """ Write nodes to database. Either takes an iterable of tuples (if given, - translates to ``BioCypherNode`` objects) or an iterable of + translates to ``BioCypherNode`` objects) or an iterable of ``BioCypherNode`` objects. Args: @@ -287,7 +289,7 @@ def write_edges(self, edges, batch_size: int = int(1e6)) -> bool: # write edge files return self._writer.write_edges(tedges, batch_size=batch_size) - def to_df(self) -> List[pd.DataFrame]: + def to_df(self) -> list[pd.DataFrame]: """ Convert entities to a pandas DataFrame for each entity type and return a list. @@ -303,9 +305,8 @@ def to_df(self) -> List[pd.DataFrame]: raise ValueError( "No pandas instance found. Please call `add()` first." ) - + return self._pd.dfs - def add(self, entities): """ @@ -323,7 +324,9 @@ def add(self, entities): entities = peekable(entities) - if isinstance(entities.peek(), BioCypherNode) or isinstance(entities.peek(), BioCypherEdge): + if isinstance(entities.peek(), BioCypherNode) or isinstance( + entities.peek(), BioCypherEdge + ): tentities = entities elif len(entities.peek()) < 4: tentities = self._translator.translate_nodes(entities) @@ -367,11 +370,11 @@ def merge_edges(self, edges) -> bool: Merge edges into database. Either takes an iterable of tuples (if given, translates to ``BioCypherEdge`` objects) or an iterable of ``BioCypherEdge`` objects. - + Args: - edges (iterable): An iterable of edges to merge into the database. + edges (iterable): An iterable of edges to merge into the database. - Returns: + Returns: bool: True if successful. 
""" @@ -388,7 +391,7 @@ def merge_edges(self, edges) -> bool: # OVERVIEW AND CONVENIENCE METHODS ### - def log_missing_input_labels(self) -> Optional[Dict[str, List[str]]]: + def log_missing_input_labels(self) -> Optional[dict[str, list[str]]]: """ Get the set of input labels encountered without an entry in the @@ -405,19 +408,19 @@ def log_missing_input_labels(self) -> Optional[Dict[str, List[str]]]: if mt: msg = ( - 'Input entities not accounted for due to them not being ' - 'present in the `schema_config.yaml` configuration file ' - '(this is not necessarily a problem, if you did not intend ' - 'to include them in the database; see the log for details): \n' + "Input entities not accounted for due to them not being " + "present in the `schema_config.yaml` configuration file " + "(this is not necessarily a problem, if you did not intend " + "to include them in the database; see the log for details): \n" ) for k, v in mt.items(): - msg += f' {k}: {v} \n' + msg += f" {k}: {v} \n" logger.info(msg) return mt else: - logger.info('No missing labels in input.') + logger.info("No missing labels in input.") return None def log_duplicates(self) -> None: @@ -429,46 +432,44 @@ def log_duplicates(self) -> None: dn = self._deduplicator.get_duplicate_nodes() if dn: - ntypes = dn[0] nids = dn[1] - msg = ('Duplicate node types encountered (IDs in log): \n') + msg = "Duplicate node types encountered (IDs in log): \n" for typ in ntypes: - msg += f' {typ}\n' + msg += f" {typ}\n" logger.info(msg) - idmsg = ('Duplicate node IDs encountered: \n') + idmsg = "Duplicate node IDs encountered: \n" for _id in nids: - idmsg += f' {_id}\n' + idmsg += f" {_id}\n" logger.debug(idmsg) else: - logger.info('No duplicate nodes in input.') + logger.info("No duplicate nodes in input.") de = self._deduplicator.get_duplicate_edges() if de: - etypes = de[0] eids = de[1] - msg = ('Duplicate edge types encountered (IDs in log): \n') + msg = "Duplicate edge types encountered (IDs in log): \n" for typ in etypes: - msg += f' {typ}\n' + msg += f" {typ}\n" logger.info(msg) - idmsg = ('Duplicate edge IDs encountered: \n') + idmsg = "Duplicate edge IDs encountered: \n" for _id in eids: - idmsg += f' {_id}\n' + idmsg += f" {_id}\n" logger.debug(idmsg) else: - logger.info('No duplicate edges in input.') + logger.info("No duplicate edges in input.") def show_ontology_structure(self, **kwargs) -> None: """ @@ -498,7 +499,7 @@ def write_import_call(self) -> None: if not self._offline: raise NotImplementedError( - 'Cannot write import call in online mode.' + "Cannot write import call in online mode." 
) self._writer.write_import_call() @@ -520,7 +521,7 @@ def translate_term(self, term: str) -> str: self.start_ontology() return self._translator.translate_term(term) - + def summary(self) -> None: """ Wrapper for showing ontology structure and logging duplicates and diff --git a/biocypher/_create.py b/biocypher/_create.py index ca33e21b..0e6b7c00 100644 --- a/biocypher/_create.py +++ b/biocypher/_create.py @@ -13,16 +13,16 @@ """ from ._logger import logger -logger.debug(f'Loading module {__name__}.') +logger.debug(f"Loading module {__name__}.") from typing import Union from dataclasses import field, dataclass import os __all__ = [ - 'BioCypherEdge', - 'BioCypherNode', - 'BioCypherRelAsNode', + "BioCypherEdge", + "BioCypherNode", + "BioCypherRelAsNode", ] @@ -53,7 +53,7 @@ class BioCypherNode: node_id: str node_label: str - preferred_id: str = 'id' + preferred_id: str = "id" properties: dict = field(default_factory=dict) def __post_init__(self): @@ -64,47 +64,50 @@ def __post_init__(self): Replace unwanted characters in properties. """ - self.properties['id'] = self.node_id - self.properties['preferred_id'] = self.preferred_id or None + self.properties["id"] = self.node_id + self.properties["preferred_id"] = self.preferred_id or None # TODO actually make None possible here; as is, "id" is the default in # the dataclass as well as in the configuration file - if ':TYPE' in self.properties.keys(): + if ":TYPE" in self.properties.keys(): logger.warning( "Keyword ':TYPE' is reserved for Neo4j. " - 'Removing from properties.', + "Removing from properties.", # "Renaming to 'type'." ) # self.properties["type"] = self.properties[":TYPE"] - del self.properties[':TYPE'] + del self.properties[":TYPE"] for k, v in self.properties.items(): if isinstance(v, str): self.properties[k] = ( v.replace( os.linesep, - ' ', - ).replace( - '\n', - ' ', - ).replace( - '\r', - ' ', + " ", + ) + .replace( + "\n", + " ", + ) + .replace( + "\r", + " ", ) ) elif isinstance(v, list): - self.properties[k] = ( - [ - val.replace( - os.linesep, - ' ', - ).replace( - '\n', - ' ', - ).replace('\r', ' ') for val in v - ] - ) + self.properties[k] = [ + val.replace( + os.linesep, + " ", + ) + .replace( + "\n", + " ", + ) + .replace("\r", " ") + for val in v + ] def get_id(self) -> str: """ @@ -123,7 +126,7 @@ def get_label(self) -> str: str: node_label """ return self.node_label - + def get_type(self) -> str: """ Returns primary node label. @@ -161,9 +164,9 @@ def get_dict(self) -> dict: properties as second-level dict. """ return { - 'node_id': self.node_id, - 'node_label': self.node_label, - 'properties': self.properties, + "node_id": self.node_id, + "node_label": self.node_label, + "properties": self.properties, } @@ -204,30 +207,30 @@ def __post_init__(self): Check for reserved keywords. """ - if ':TYPE' in self.properties.keys(): + if ":TYPE" in self.properties.keys(): logger.debug( "Keyword ':TYPE' is reserved for Neo4j. " - 'Removing from properties.', + "Removing from properties.", # "Renaming to 'type'." ) # self.properties["type"] = self.properties[":TYPE"] - del self.properties[':TYPE'] - elif 'id' in self.properties.keys(): + del self.properties[":TYPE"] + elif "id" in self.properties.keys(): logger.debug( "Keyword 'id' is reserved for Neo4j. " - 'Removing from properties.', + "Removing from properties.", # "Renaming to 'type'." 
) # self.properties["type"] = self.properties[":TYPE"] - del self.properties['id'] - elif '_ID' in self.properties.keys(): + del self.properties["id"] + elif "_ID" in self.properties.keys(): logger.debug( "Keyword '_ID' is reserved for Postgres. " - 'Removing from properties.', + "Removing from properties.", # "Renaming to 'type'." ) # self.properties["type"] = self.properties[":TYPE"] - del self.properties['_ID'] + del self.properties["_ID"] def get_id(self) -> Union[str, None]: """ @@ -295,11 +298,11 @@ def get_dict(self) -> dict: dict. """ return { - 'relationship_id': self.relationship_id or None, - 'source_id': self.source_id, - 'target_id': self.target_id, - 'relationship_label': self.relationship_label, - 'properties': self.properties, + "relationship_id": self.relationship_id or None, + "source_id": self.source_id, + "target_id": self.target_id, + "relationship_label": self.relationship_label, + "properties": self.properties, } @@ -331,20 +334,20 @@ class BioCypherRelAsNode: def __post_init__(self): if not isinstance(self.node, BioCypherNode): raise TypeError( - f'BioCypherRelAsNode.node must be a BioCypherNode, ' - f'not {type(self.node)}.', + f"BioCypherRelAsNode.node must be a BioCypherNode, " + f"not {type(self.node)}.", ) if not isinstance(self.source_edge, BioCypherEdge): raise TypeError( - f'BioCypherRelAsNode.source_edge must be a BioCypherEdge, ' - f'not {type(self.source_edge)}.', + f"BioCypherRelAsNode.source_edge must be a BioCypherEdge, " + f"not {type(self.source_edge)}.", ) if not isinstance(self.target_edge, BioCypherEdge): raise TypeError( - f'BioCypherRelAsNode.target_edge must be a BioCypherEdge, ' - f'not {type(self.target_edge)}.', + f"BioCypherRelAsNode.target_edge must be a BioCypherEdge, " + f"not {type(self.target_edge)}.", ) def get_node(self) -> BioCypherNode: diff --git a/biocypher/_deduplicate.py b/biocypher/_deduplicate.py index e1cd5c69..5ac79abb 100644 --- a/biocypher/_deduplicate.py +++ b/biocypher/_deduplicate.py @@ -1,9 +1,10 @@ from ._logger import logger -logger.debug(f'Loading module {__name__}.') +logger.debug(f"Loading module {__name__}.") from ._create import BioCypherEdge, BioCypherNode + class Deduplicator: """ Singleton class responsible of deduplicating BioCypher inputs. Maintains @@ -18,13 +19,13 @@ class Deduplicator: """ def __init__(self): - self.seen_node_ids = set() - self.duplicate_node_ids = set() - self.duplicate_node_types = set() + self.seen_node_ids = set() + self.duplicate_node_ids = set() + self.duplicate_node_types = set() - self.seen_edges = {} - self.duplicate_edge_ids = set() - self.duplicate_edge_types = set() + self.seen_edges = {} + self.duplicate_edge_ids = set() + self.duplicate_edge_types = set() def node_seen(self, node: BioCypherNode) -> bool: """ @@ -39,13 +40,15 @@ def node_seen(self, node: BioCypherNode) -> bool: if node.get_id() in self.seen_node_ids: self.duplicate_node_ids.add(node.get_id()) if node.get_label() not in self.duplicate_node_types: - logger.warning(f"Duplicate node type {node.get_label()} found. ") + logger.warning( + f"Duplicate node type {node.get_label()} found. " + ) self.duplicate_node_types.add(node.get_label()) return True - + self.seen_node_ids.add(node.get_id()) return False - + def edge_seen(self, edge: BioCypherEdge) -> bool: """ Adds an edge to the instance and checks if it has been seen before. @@ -71,10 +74,10 @@ def edge_seen(self, edge: BioCypherEdge) -> bool: logger.warning(f"Duplicate edge type {edge.get_type()} found. 
") self.duplicate_edge_types.add(edge.get_type()) return True - + self.seen_edges[edge.get_type()].add(_id) return False - + def get_duplicate_nodes(self): """ Function to return a list of duplicate nodes. @@ -99,4 +102,4 @@ def get_duplicate_edges(self): if self.duplicate_edge_types: return (self.duplicate_edge_types, self.duplicate_edge_ids) else: - return None \ No newline at end of file + return None diff --git a/biocypher/_logger.py b/biocypher/_logger.py index bb09a825..c936a44f 100644 --- a/biocypher/_logger.py +++ b/biocypher/_logger.py @@ -12,7 +12,7 @@ Configuration of the module logger. """ -__all__ = ['get_logger', 'log', 'logfile'] +__all__ = ["get_logger", "log", "logfile"] from datetime import datetime import os @@ -23,7 +23,7 @@ from biocypher._metadata import __version__ -def get_logger(name: str = 'biocypher') -> logging.Logger: +def get_logger(name: str = "biocypher") -> logging.Logger: """ Access the module logger, create a new one if does not exist yet. @@ -45,7 +45,6 @@ def get_logger(name: str = 'biocypher') -> logging.Logger: """ if not logging.getLogger(name).hasHandlers(): - # create logger logger = logging.getLogger(name) logger.setLevel(logging.DEBUG) @@ -53,18 +52,19 @@ def get_logger(name: str = 'biocypher') -> logging.Logger: # formatting file_formatter = logging.Formatter( - '%(asctime)s\t%(levelname)s\tmodule:%(module)s\n%(message)s', + "%(asctime)s\t%(levelname)s\tmodule:%(module)s\n%(message)s", ) - stdout_formatter = logging.Formatter('%(levelname)s -- %(message)s') + stdout_formatter = logging.Formatter("%(levelname)s -- %(message)s") # file name and creation now = datetime.now() - date_time = now.strftime('%Y%m%d-%H%M%S') + date_time = now.strftime("%Y%m%d-%H%M%S") - logdir = _config.config('biocypher' - ).get('log_directory') or 'biocypher-log' + logdir = ( + _config.config("biocypher").get("log_directory") or "biocypher-log" + ) os.makedirs(logdir, exist_ok=True) - logfile = os.path.join(logdir, f'biocypher-{date_time}.log') + logfile = os.path.join(logdir, f"biocypher-{date_time}.log") # handlers # stream handler @@ -75,7 +75,7 @@ def get_logger(name: str = 'biocypher') -> logging.Logger: # file handler file_handler = logging.FileHandler(logfile) - if _config.config('biocypher').get('debug'): + if _config.config("biocypher").get("debug"): file_handler.setLevel(logging.DEBUG) else: file_handler.setLevel(logging.INFO) @@ -87,8 +87,8 @@ def get_logger(name: str = 'biocypher') -> logging.Logger: logger.addHandler(stdout_handler) # startup message - logger.info(f'This is BioCypher v{__version__}.') - logger.info(f'Logging into `{logfile}`.') + logger.info(f"This is BioCypher v{__version__}.") + logger.info(f"Logging into `{logfile}`.") return logging.getLogger(name) @@ -107,7 +107,6 @@ def log(): """ with open(logfile()) as fp: - pydoc.pager(fp.read()) diff --git a/biocypher/_mapping.py b/biocypher/_mapping.py index 7a242bfe..1269b28a 100644 --- a/biocypher/_mapping.py +++ b/biocypher/_mapping.py @@ -14,7 +14,7 @@ """ from ._logger import logger -logger.debug(f'Loading module {__name__}.') +logger.debug(f"Loading module {__name__}.") from typing import Optional from urllib.request import urlopen @@ -29,8 +29,8 @@ class OntologyMapping: """ Class to store the ontology mapping and extensions. 
""" - def __init__(self, config_file: str = None): + def __init__(self, config_file: str = None): self.schema = self._read_config(config_file) self.extended_schema = self._extend_schema() @@ -40,21 +40,16 @@ def _read_config(self, config_file: str = None): Read the configuration file and store the ontology mapping and extensions. """ if config_file is None: - - schema_config = _config.module_data('schema_config') + schema_config = _config.module_data("schema_config") # load yaml file from web - elif config_file.startswith('http'): - + elif config_file.startswith("http"): with urlopen(config_file) as f: - schema_config = yaml.safe_load(f) # get graph state from config (assume file is local) else: - - with open(config_file, 'r') as f: - + with open(config_file, "r") as f: schema_config = yaml.safe_load(f) return schema_config @@ -78,30 +73,28 @@ def _extend_schema(self, d: Optional[dict] = None) -> dict: # first pass: get parent leaves with direct representation in ontology for k, v in d.items(): - # k is not an entity - if 'represented_as' not in v: + if "represented_as" not in v: continue # preferred_id optional: if not provided, use `id` - if not v.get('preferred_id'): - v['preferred_id'] = 'id' + if not v.get("preferred_id"): + v["preferred_id"] = "id" # k is an entity that is present in the ontology - if 'is_a' not in v: + if "is_a" not in v: extended_schema[k] = v # second pass: "vertical" inheritance d = self._vertical_property_inheritance(d) for k, v in d.items(): - if 'is_a' in v: - + if "is_a" in v: # prevent loops - if k == v['is_a']: + if k == v["is_a"]: logger.warning( - f'Loop detected in ontology mapping: {k} -> {v}. ' - 'Removing item. Please fix the inheritance if you want ' - 'to use this item.' + f"Loop detected in ontology mapping: {k} -> {v}. " + "Removing item. Please fix the inheritance if you want " + "to use this item." ) continue @@ -112,16 +105,15 @@ def _extend_schema(self, d: Optional[dict] = None) -> dict: mi_leaves = {} ms_leaves = {} for k, v in d.items(): - # k is not an entity - if 'represented_as' not in v: + if "represented_as" not in v: continue - if isinstance(v.get('preferred_id'), list): + if isinstance(v.get("preferred_id"), list): mi_leaves = self._horizontal_inheritance_pid(k, v) extended_schema.update(mi_leaves) - elif isinstance(v.get('source'), list): + elif isinstance(v.get("source"), list): ms_leaves = self._horizontal_inheritance_source(k, v) extended_schema.update(ms_leaves) @@ -132,40 +124,38 @@ def _vertical_property_inheritance(self, d): Inherit properties from parents to children and update `d` accordingly. 
""" for k, v in d.items(): - # k is not an entity - if 'represented_as' not in v: + if "represented_as" not in v: continue # k is an entity that is present in the ontology - if 'is_a' not in v: + if "is_a" not in v: continue # "vertical" inheritance: inherit properties from parent - if v.get('inherit_properties', False): - + if v.get("inherit_properties", False): # get direct ancestor - if isinstance(v['is_a'], list): - parent = v['is_a'][0] + if isinstance(v["is_a"], list): + parent = v["is_a"][0] else: - parent = v['is_a'] + parent = v["is_a"] # ensure child has properties and exclude_properties - if 'properties' not in v: - v['properties'] = {} - if 'exclude_properties' not in v: - v['exclude_properties'] = {} + if "properties" not in v: + v["properties"] = {} + if "exclude_properties" not in v: + v["exclude_properties"] = {} # update properties of child - parent_props = self.schema[parent].get('properties', {}) + parent_props = self.schema[parent].get("properties", {}) if parent_props: - v['properties'].update(parent_props) + v["properties"].update(parent_props) parent_excl_props = self.schema[parent].get( - 'exclude_properties', {} + "exclude_properties", {} ) if parent_excl_props: - v['exclude_properties'].update(parent_excl_props) + v["exclude_properties"].update(parent_excl_props) # update schema (d) d[k] = v @@ -182,9 +172,9 @@ def _horizontal_inheritance_pid(self, key, value): leaves = {} - preferred_id = value['preferred_id'] - input_label = value.get('input_label') or value['label_in_input'] - represented_as = value['represented_as'] + preferred_id = value["preferred_id"] + input_label = value.get("input_label") or value["label_in_input"] + represented_as = value["represented_as"] # adjust lengths max_l = max( @@ -208,40 +198,38 @@ def _horizontal_inheritance_pid(self, key, value): reps = represented_as for pid, lab, rep in zip(pids, input_label, reps): - - skey = pid + '.' + key + skey = pid + "." + key svalue = { - 'preferred_id': pid, - 'input_label': lab, - 'represented_as': rep, + "preferred_id": pid, + "input_label": lab, + "represented_as": rep, # mark as virtual - 'virtual': True, + "virtual": True, } # inherit is_a if exists - if 'is_a' in value.keys(): - + if "is_a" in value.keys(): # treat as multiple inheritance - if isinstance(value['is_a'], list): - v = list(value['is_a']) + if isinstance(value["is_a"], list): + v = list(value["is_a"]) v.insert(0, key) - svalue['is_a'] = v + svalue["is_a"] = v else: - svalue['is_a'] = [key, value['is_a']] + svalue["is_a"] = [key, value["is_a"]] else: # set parent as is_a - svalue['is_a'] = key + svalue["is_a"] = key # inherit everything except core attributes for k, v in value.items(): if k not in [ - 'is_a', - 'preferred_id', - 'input_label', - 'label_in_input', - 'represented_as', + "is_a", + "preferred_id", + "input_label", + "label_in_input", + "represented_as", ]: svalue[k] = v @@ -259,9 +247,9 @@ def _horizontal_inheritance_source(self, key, value): leaves = {} - source = value['source'] - input_label = value.get('input_label') or value['label_in_input'] - represented_as = value['represented_as'] + source = value["source"] + input_label = value.get("input_label") or value["label_in_input"] + represented_as = value["represented_as"] # adjust lengths src_l = len(source) @@ -279,40 +267,38 @@ def _horizontal_inheritance_source(self, key, value): reps = represented_as for src, lab, rep in zip(source, labels, reps): - - skey = src + '.' + key + skey = src + "." 
+ key svalue = { - 'source': src, - 'input_label': lab, - 'represented_as': rep, + "source": src, + "input_label": lab, + "represented_as": rep, # mark as virtual - 'virtual': True, + "virtual": True, } # inherit is_a if exists - if 'is_a' in value.keys(): - + if "is_a" in value.keys(): # treat as multiple inheritance - if isinstance(value['is_a'], list): - v = list(value['is_a']) + if isinstance(value["is_a"], list): + v = list(value["is_a"]) v.insert(0, key) - svalue['is_a'] = v + svalue["is_a"] = v else: - svalue['is_a'] = [key, value['is_a']] + svalue["is_a"] = [key, value["is_a"]] else: # set parent as is_a - svalue['is_a'] = key + svalue["is_a"] = key # inherit everything except core attributes for k, v in value.items(): if k not in [ - 'is_a', - 'source', - 'input_label', - 'label_in_input', - 'represented_as', + "is_a", + "source", + "input_label", + "label_in_input", + "represented_as", ]: svalue[k] = v diff --git a/biocypher/_metadata.py b/biocypher/_metadata.py index e8cac084..cbc1426c 100644 --- a/biocypher/_metadata.py +++ b/biocypher/_metadata.py @@ -11,7 +11,7 @@ Package metadata (version, authors, etc). """ -__all__ = ['get_metadata'] +__all__ = ["get_metadata"] import os import pathlib @@ -19,7 +19,7 @@ import toml -_VERSION = '0.5.17' +_VERSION = "0.5.17" def get_metadata(): @@ -31,46 +31,41 @@ def get_metadata(): """ here = pathlib.Path(__file__).parent - pyproj_toml = 'pyproject.toml' + pyproj_toml = "pyproject.toml" meta = {} for project_dir in (here, here.parent): - toml_path = str(project_dir.joinpath(pyproj_toml).absolute()) if os.path.exists(toml_path): - pyproject = toml.load(toml_path) meta = { - 'name': pyproject['tool']['poetry']['name'], - 'version': pyproject['tool']['poetry']['version'], - 'author': pyproject['tool']['poetry']['authors'], - 'license': pyproject['tool']['poetry']['license'], - 'full_metadata': pyproject, + "name": pyproject["tool"]["poetry"]["name"], + "version": pyproject["tool"]["poetry"]["version"], + "author": pyproject["tool"]["poetry"]["authors"], + "license": pyproject["tool"]["poetry"]["license"], + "full_metadata": pyproject, } break if not meta: - try: - meta = { k.lower(): v for k, v in importlib.metadata.metadata(here.name).items() } except importlib.metadata.PackageNotFoundError: - pass - meta['version'] = meta.get('version', None) or _VERSION + meta["version"] = meta.get("version", None) or _VERSION return meta metadata = get_metadata() -__version__ = metadata.get('version', None) -__author__ = metadata.get('author', None) -__license__ = metadata.get('license', None) +__version__ = metadata.get("version", None) +__author__ = metadata.get("author", None) +__license__ = metadata.get("license", None) diff --git a/biocypher/_misc.py b/biocypher/_misc.py index 82fc0b40..b516a048 100644 --- a/biocypher/_misc.py +++ b/biocypher/_misc.py @@ -13,7 +13,7 @@ """ from ._logger import logger -logger.debug(f'Loading module {__name__}.') +logger.debug(f"Loading module {__name__}.") from typing import ( Any, @@ -31,7 +31,7 @@ import networkx as nx import stringcase -__all__ = ['LIST_LIKE', 'SIMPLE_TYPES', 'ensure_iterable', 'to_list'] +__all__ = ["LIST_LIKE", "SIMPLE_TYPES", "ensure_iterable", "to_list"] SIMPLE_TYPES = ( bytes, @@ -60,11 +60,9 @@ def to_list(value: Any) -> list: """ if isinstance(value, LIST_LIKE): - value = list(value) else: - value = [value] return value @@ -75,7 +73,7 @@ def ensure_iterable(value: Any) -> Iterable: Returns iterables, except strings, wraps simple types into tuple. 
""" - return value if isinstance(value, LIST_LIKE) else (value, ) + return value if isinstance(value, LIST_LIKE) else (value,) def create_tree_visualisation(inheritance_tree: Union[dict, nx.Graph]) -> str: @@ -84,7 +82,6 @@ def create_tree_visualisation(inheritance_tree: Union[dict, nx.Graph]) -> str: """ if isinstance(inheritance_tree, nx.Graph): - inheritance_tree = nx.to_dict_of_lists(inheritance_tree) # unlist values inheritance_tree = {k: v[0] for k, v in inheritance_tree.items() if v} @@ -95,56 +92,48 @@ def create_tree_visualisation(inheritance_tree: Union[dict, nx.Graph]) -> str: root = list(parents - classes) if len(root) > 1: - - if 'entity' in root: - - root = 'entity' # default: good standard? TODO + if "entity" in root: + root = "entity" # default: good standard? TODO else: - raise ValueError( - 'Inheritance tree cannot have more than one root node. ' - f'Found {len(root)}: {root}.' + "Inheritance tree cannot have more than one root node. " + f"Found {len(root)}: {root}." ) else: - root = root[0] if not root: # find key whose value is None - root = list(inheritance_tree.keys())[list(inheritance_tree.values() - ).index(None)] + root = list(inheritance_tree.keys())[ + list(inheritance_tree.values()).index(None) + ] tree = Tree() tree.create_node(root, root) while classes: - for child in classes: - parent = inheritance_tree[child] if parent in tree.nodes.keys() or parent == root: - tree.create_node(child, child, parent=parent) for node in tree.nodes.keys(): - if node in classes: - classes.remove(node) return tree # string conversion, adapted from Biolink Model Toolkit -lowercase_pattern = re.compile(r'[a-zA-Z]*[a-z][a-zA-Z]*') -underscore_pattern = re.compile(r'(? str: +def from_pascal(s: str, sep: str = " ") -> str: underscored = underscore_pattern.sub(sep, s) lowercased = lowercase_pattern.sub( lambda match: match.group(0).lower(), @@ -163,7 +152,7 @@ def pascalcase_to_sentencecase(s: str) -> str: Returns: string in sentence case form """ - return from_pascal(s, sep=' ') + return from_pascal(s, sep=" ") def snakecase_to_sentencecase(s: str) -> str: @@ -202,7 +191,7 @@ def sentencecase_to_pascalcase(s: str) -> str: Returns: string in PascalCase form """ - return re.sub(r'(?:^| )([a-zA-Z])', lambda match: match.group(1).upper(), s) + return re.sub(r"(?:^| )([a-zA-Z])", lambda match: match.group(1).upper(), s) def to_lower_sentence_case(s: str) -> str: @@ -216,9 +205,9 @@ def to_lower_sentence_case(s: str) -> str: Returns: string in lower sentence case form """ - if '_' in s: + if "_" in s: return snakecase_to_sentencecase(s) - elif ' ' in s: + elif " " in s: return s.lower() elif s[0].isupper(): return pascalcase_to_sentencecase(s) diff --git a/biocypher/_ontology.py b/biocypher/_ontology.py index 0738d7b9..e22e4465 100644 --- a/biocypher/_ontology.py +++ b/biocypher/_ontology.py @@ -17,7 +17,7 @@ from ._logger import logger -logger.debug(f'Loading module {__name__}.') +logger.debug(f"Loading module {__name__}.") from typing import Optional from datetime import datetime @@ -40,6 +40,7 @@ class OntologyAdapter: labels are formatted in lower sentence case. In some cases, this means that we replace underscores with spaces. """ + def __init__( self, ontology_file: str, @@ -63,7 +64,7 @@ def __init__( node in the head ontology that should be used to join to the root node of the tail ontology. Defaults to None. - merge_nodes (bool): If True, head and tail join nodes will be + merge_nodes (bool): If True, head and tail join nodes will be merged, using the label of the head join node. 
If False, the tail join node will be attached as a child of the head join node. @@ -76,7 +77,7 @@ def __init__( be removed. Defaults to True. """ - logger.info(f'Instantiating OntologyAdapter class for {ontology_file}.') + logger.info(f"Instantiating OntologyAdapter class for {ontology_file}.") self._ontology_file = ontology_file self._root_label = root_label @@ -93,7 +94,6 @@ def __init__( ) def _rdf_to_nx(self, g, root_label, switch_id_and_label=True): - # Loop through all labels in the ontology for s, _, o in g.triples((None, rdflib.RDFS.label, None)): # If the label is the root label, set the root node to the subject of the label @@ -102,7 +102,7 @@ def _rdf_to_nx(self, g, root_label, switch_id_and_label=True): break else: raise ValueError( - f'Could not find root node with label {root_label}' + f"Could not find root node with label {root_label}" ) # Create a directed graph to represent the ontology as a tree @@ -110,7 +110,6 @@ def _rdf_to_nx(self, g, root_label, switch_id_and_label=True): # Define a recursive function to add subclasses to the graph def add_subclasses(node): - # Only add nodes that have a label if (node, rdflib.RDFS.label, None) not in g: return @@ -119,25 +118,23 @@ def add_subclasses(node): if nx_id not in G: G.add_node(nx_id) - G.nodes[nx_id]['label'] = nx_label + G.nodes[nx_id]["label"] = nx_label # Recursively add all subclasses of the node to the graph for s, _, o in g.triples((None, rdflib.RDFS.subClassOf, node)): - # Only add nodes that have a label if (s, rdflib.RDFS.label, None) not in g: continue s_id, s_label = _get_nx_id_and_label(s) G.add_node(s_id) - G.nodes[s_id]['label'] = s_label + G.nodes[s_id]["label"] = s_label G.add_edge(s_id, nx_id) add_subclasses(s) add_parents(s) def add_parents(node): - # Only add nodes that have a label if (node, rdflib.RDFS.label, None) not in g: return @@ -146,7 +143,6 @@ def add_parents(node): # Recursively add all parents of the node to the graph for s, _, o in g.triples((node, rdflib.RDFS.subClassOf, None)): - # Only add nodes that have a label if (o, rdflib.RDFS.label, None) not in g: continue @@ -158,15 +154,16 @@ def add_parents(node): continue G.add_node(o_id) - G.nodes[o_id]['label'] = o_label + G.nodes[o_id]["label"] = o_label G.add_edge(nx_id, o_id) add_parents(o) def _get_nx_id_and_label(node): node_id_str = self._remove_prefix(str(node)) - node_label_str = str(g.value(node, - rdflib.RDFS.label)).replace('_', ' ') + node_label_str = str(g.value(node, rdflib.RDFS.label)).replace( + "_", " " + ) node_label_str = _misc.to_lower_sentence_case(node_label_str) nx_id = node_label_str if switch_id_and_label else node_id_str @@ -185,7 +182,7 @@ def _remove_prefix(self, uri: str) -> str: everything before the last separator. """ if self._remove_prefixes: - return uri.rsplit('#', 1)[-1].rsplit('/', 1)[-1] + return uri.rsplit("#", 1)[-1].rsplit("/", 1)[-1] else: return uri @@ -202,17 +199,17 @@ def _get_format(self, ontology_file): """ Get the format of the ontology file. 
""" - if ontology_file.endswith('.owl'): - return 'application/rdf+xml' - elif ontology_file.endswith('.obo'): - raise NotImplementedError('OBO format not yet supported') - elif ontology_file.endswith('.rdf'): - return 'application/rdf+xml' - elif ontology_file.endswith('.ttl'): - return 'ttl' + if ontology_file.endswith(".owl"): + return "application/rdf+xml" + elif ontology_file.endswith(".obo"): + raise NotImplementedError("OBO format not yet supported") + elif ontology_file.endswith(".rdf"): + return "application/rdf+xml" + elif ontology_file.endswith(".ttl"): + return "ttl" else: raise ValueError( - f'Could not determine format of ontology file {ontology_file}' + f"Could not determine format of ontology file {ontology_file}" ) def get_nx_graph(self): @@ -254,10 +251,11 @@ class Ontology: while an arbitrary number of other resources can become "tail" ontologies at arbitrary fusion points inside the "head" ontology. """ + def __init__( self, head_ontology: dict, - ontology_mapping: 'OntologyMapping', + ontology_mapping: "OntologyMapping", tail_ontologies: Optional[dict] = None, ): """ @@ -311,21 +309,21 @@ def _load_ontologies(self) -> None: instance variable (head) or a dictionary (tail). """ - logger.info('Loading ontologies...') + logger.info("Loading ontologies...") self._head_ontology = OntologyAdapter( - self._head_ontology_meta['url'], - self._head_ontology_meta['root_node'], + self._head_ontology_meta["url"], + self._head_ontology_meta["root_node"], ) if self._tail_ontology_meta: self._tail_ontologies = {} for key, value in self._tail_ontology_meta.items(): self._tail_ontologies[key] = OntologyAdapter( - ontology_file = value['url'], - root_label = value['tail_join_node'], - head_join_node = value['head_join_node'], - merge_nodes = value.get('merge_nodes', True), + ontology_file=value["url"], + root_label=value["tail_join_node"], + head_join_node=value["head_join_node"], + merge_nodes=value.get("merge_nodes", True), ) def _assert_join_node(self, adapter: OntologyAdapter) -> None: @@ -342,10 +340,9 @@ def _assert_join_node(self, adapter: OntologyAdapter) -> None: head_join_node = adapter.get_head_join_node() if head_join_node not in self._head_ontology.get_nx_graph().nodes: - raise ValueError( - f'Head join node {head_join_node} not found in ' - f'head ontology.' + f"Head join node {head_join_node} not found in " + f"head ontology." ) def _join_ontologies(self, adapter: OntologyAdapter) -> None: @@ -383,11 +380,9 @@ def _join_ontologies(self, adapter: OntologyAdapter) -> None: # as parent of tail join node tail_ontology_subtree.add_node( head_join_node, - **self._head_ontology.get_nx_graph().nodes[head_join_node] - ) - tail_ontology_subtree.add_edge( - tail_join_node, head_join_node + **self._head_ontology.get_nx_graph().nodes[head_join_node], ) + tail_ontology_subtree.add_edge(tail_join_node, head_join_node) # else rename tail join node to match head join node if necessary elif not tail_join_node == head_join_node: @@ -409,46 +404,43 @@ def _extend_ontology(self) -> None: self._nx_graph = self._head_ontology.get_nx_graph().copy() for key, value in self.extended_schema.items(): - - if not value.get('is_a'): - - if self._nx_graph.has_node(value.get('synonym_for')): - + if not value.get("is_a"): + if self._nx_graph.has_node(value.get("synonym_for")): continue - + if not self._nx_graph.has_node(key): - raise ValueError( - f'Node {key} not found in ontology, but also has no ' - 'inheritance definition. Please check your schema for ' - 'spelling errors or a missing `is_a` definition.' 
+ f"Node {key} not found in ontology, but also has no " + "inheritance definition. Please check your schema for " + "spelling errors or a missing `is_a` definition." ) - + continue - parents = _misc.to_list(value.get('is_a')) + parents = _misc.to_list(value.get("is_a")) child = key while parents: parent = parents.pop(0) if parent not in self._nx_graph.nodes: - self._nx_graph.add_node(parent) self._nx_graph.nodes[parent][ - 'label'] = _misc.sentencecase_to_pascalcase(parent) + "label" + ] = _misc.sentencecase_to_pascalcase(parent) # mark parent as user extension - self._nx_graph.nodes[parent]['user_extension'] = True + self._nx_graph.nodes[parent]["user_extension"] = True self._extended_nodes.add(parent) if child not in self._nx_graph.nodes: self._nx_graph.add_node(child) self._nx_graph.nodes[child][ - 'label'] = _misc.sentencecase_to_pascalcase(child) + "label" + ] = _misc.sentencecase_to_pascalcase(child) # mark child as user extension - self._nx_graph.nodes[child]['user_extension'] = True + self._nx_graph.nodes[child]["user_extension"] = True self._extended_nodes.add(child) self._nx_graph.add_edge(child, parent) @@ -463,29 +455,28 @@ def _connect_biolink_classes(self) -> None: if not self._nx_graph: self._nx_graph = self._head_ontology.get_nx_graph().copy() - if 'entity' not in self._nx_graph.nodes: + if "entity" not in self._nx_graph.nodes: return # biolink classes that are disjoint from entity disjoint_classes = [ - 'frequency qualifier mixin', - 'chemical entity to entity association mixin', - 'ontology class', - 'relationship quantifier', - 'physical essence or occurrent', - 'gene or gene product', - 'subject of investigation', + "frequency qualifier mixin", + "chemical entity to entity association mixin", + "ontology class", + "relationship quantifier", + "physical essence or occurrent", + "gene or gene product", + "subject of investigation", ] for node in disjoint_classes: - if not self._nx_graph.nodes.get(node): - self._nx_graph.add_node(node) self._nx_graph.nodes[node][ - 'label'] = _misc.sentencecase_to_pascalcase(node) + "label" + ] = _misc.sentencecase_to_pascalcase(node) - self._nx_graph.add_edge(node, 'entity') + self._nx_graph.add_edge(node, "entity") def _add_properties(self) -> None: """ @@ -495,21 +486,18 @@ def _add_properties(self) -> None: """ for key, value in self.extended_schema.items(): - if key in self._nx_graph.nodes: - self._nx_graph.nodes[key].update(value) - if value.get('synonym_for'): - + if value.get("synonym_for"): # change node label to synonym - if value['synonym_for'] not in self._nx_graph.nodes: + if value["synonym_for"] not in self._nx_graph.nodes: raise ValueError( f'Node {value["synonym_for"]} not found in ontology.' 
) self._nx_graph = nx.relabel_nodes( - self._nx_graph, {value['synonym_for']: key} + self._nx_graph, {value["synonym_for"]: key} ) def get_ancestors(self, node_label: str) -> list: @@ -541,18 +529,17 @@ def show_ontology_structure(self, to_disk: str = None, full: bool = False): """ if not self._nx_graph: - raise ValueError('Ontology not loaded.') + raise ValueError("Ontology not loaded.") if not self._tail_ontologies: - msg = f'Showing ontology structure based on {self._head_ontology._ontology_file}' + msg = f"Showing ontology structure based on {self._head_ontology._ontology_file}" else: - msg = f'Showing ontology structure based on {len(self._tail_ontology_meta)+1} ontologies: ' + msg = f"Showing ontology structure based on {len(self._tail_ontology_meta)+1} ontologies: " print(msg) if not full: - # set of leaves and their intermediate parents up to the root filter_nodes = set(self.extended_schema.keys()) @@ -563,19 +550,17 @@ def show_ontology_structure(self, to_disk: str = None, full: bool = False): G = self._nx_graph.subgraph(filter_nodes) else: - G = self._nx_graph if not to_disk: - # create tree tree = _misc.create_tree_visualisation(G) # add synonym information for node in self.extended_schema: - if self.extended_schema[node].get('synonym_for'): + if self.extended_schema[node].get("synonym_for"): tree.nodes[node].tag = ( - f'{node} = ' + f"{node} = " f"{self.extended_schema[node].get('synonym_for')}" ) @@ -584,26 +569,24 @@ def show_ontology_structure(self, to_disk: str = None, full: bool = False): return tree else: - # convert lists/dicts to strings for vis only for node in G.nodes: - # rename node and use former id as label - label = G.nodes[node].get('label') + label = G.nodes[node].get("label") if not label: label = node G = nx.relabel_nodes(G, {node: label}) - G.nodes[label]['label'] = node + G.nodes[label]["label"] = node for attrib in G.nodes[label]: if type(G.nodes[label][attrib]) in [list, dict]: G.nodes[label][attrib] = str(G.nodes[label][attrib]) - path = os.path.join(to_disk, 'ontology_structure.graphml') + path = os.path.join(to_disk, "ontology_structure.graphml") - logger.info(f'Writing ontology structure to {path}.') + logger.info(f"Writing ontology structure to {path}.") nx.write_graphml(G, path) @@ -616,10 +599,10 @@ def get_dict(self) -> dict: """ d = { - 'node_id': self._get_current_id(), - 'node_label': 'BioCypher', - 'properties': { - 'schema': 'self.extended_schema', + "node_id": self._get_current_id(), + "node_label": "BioCypher", + "properties": { + "schema": "self.extended_schema", }, } @@ -635,5 +618,4 @@ def _get_current_id(self): """ now = datetime.now() - return now.strftime('v%Y%m%d-%H%M%S') - \ No newline at end of file + return now.strftime("v%Y%m%d-%H%M%S") diff --git a/biocypher/_pandas.py b/biocypher/_pandas.py index 44821d91..24898b4a 100644 --- a/biocypher/_pandas.py +++ b/biocypher/_pandas.py @@ -1,5 +1,7 @@ import pandas as pd -from ._create import BioCypherNode, BioCypherEdge + +from ._create import BioCypherEdge, BioCypherNode + class Pandas: def __init__(self, ontology, translator, deduplicator): @@ -16,9 +18,13 @@ def _separate_entity_types(self, entities): """ lists = {} for entity in entities: - if not isinstance(entity, BioCypherNode) and not isinstance(entity, BioCypherEdge): - raise TypeError(f"Expected a BioCypherNode or BioCypherEdge, got {type(entity)}.") - + if not isinstance(entity, BioCypherNode) and not isinstance( + entity, BioCypherEdge + ): + raise TypeError( + f"Expected a BioCypherNode or BioCypherEdge, got {type(entity)}." 
+ ) + if isinstance(entity, BioCypherNode): seen = self.deduplicator.node_seen(entity) elif isinstance(entity, BioCypherEdge): @@ -26,7 +32,7 @@ def _separate_entity_types(self, entities): if seen: continue - + _type = entity.get_label() if not _type in lists: lists[_type] = [] @@ -45,10 +51,14 @@ def add_tables(self, entities): self._add_entity_df(_type, _entities) def _add_entity_df(self, _type, _entities): - df = pd.DataFrame(pd.json_normalize([node.get_dict() for node in _entities])) - #replace "properties." with "" in column names + df = pd.DataFrame( + pd.json_normalize([node.get_dict() for node in _entities]) + ) + # replace "properties." with "" in column names df.columns = [col.replace("properties.", "") for col in df.columns] if _type not in self.dfs: self.dfs[_type] = df else: - self.dfs[_type] = pd.concat([self.dfs[_type], df], ignore_index=True) + self.dfs[_type] = pd.concat( + [self.dfs[_type], df], ignore_index=True + ) diff --git a/biocypher/_translate.py b/biocypher/_translate.py index 3b3bee29..663d11ab 100644 --- a/biocypher/_translate.py +++ b/biocypher/_translate.py @@ -14,7 +14,7 @@ """ from ._logger import logger -logger.debug(f'Loading module {__name__}.') +logger.debug(f"Loading module {__name__}.") from typing import Any, Union, Optional from collections.abc import Iterable, Generator @@ -25,7 +25,7 @@ from ._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode from ._mapping import OntologyMapping -__all__ = ['BiolinkAdapter', 'Translator'] +__all__ = ["BiolinkAdapter", "Translator"] class Translator: @@ -40,8 +40,9 @@ class Translator: Provides utility functions for translating between input and output labels and cypher queries. """ + def __init__( - self, ontology_mapping: 'OntologyMapping', strict_mode: bool = False + self, ontology_mapping: "OntologyMapping", strict_mode: bool = False ): """ Args: @@ -85,30 +86,28 @@ def translate_nodes( """ - self._log_begin_translate(id_type_prop_tuples, 'nodes') + self._log_begin_translate(id_type_prop_tuples, "nodes") for _id, _type, _props in id_type_prop_tuples: - # check for strict mode requirements - required_props = ['source', 'licence', 'version'] + required_props = ["source", "licence", "version"] if self.strict_mode: # rename 'license' to 'licence' in _props - if _props.get('license'): - _props['licence'] = _props.pop('license') + if _props.get("license"): + _props["licence"] = _props.pop("license") for prop in required_props: if prop not in _props: raise ValueError( - f'Property `{prop}` missing from node {_id}. ' - 'Strict mode is enabled, so this is not allowed.' + f"Property `{prop}` missing from node {_id}. " + "Strict mode is enabled, so this is not allowed." 
) # find the node in leaves that represents biolink node type _ontology_class = self._get_ontology_mapping(_type) if _ontology_class: - # filter properties for those specified in schema_config if any _filtered_props = self._filter_props(_ontology_class, _props) @@ -123,10 +122,9 @@ def translate_nodes( ) else: - self._record_no_type(_type, _id) - self._log_finish_translate('nodes') + self._log_finish_translate("nodes") def _get_preferred_id(self, _bl_type: str) -> str: """ @@ -134,8 +132,9 @@ def _get_preferred_id(self, _bl_type: str) -> str: """ return ( - self.extended_schema[_bl_type]['preferred_id'] if 'preferred_id' - in self.extended_schema.get(_bl_type, {}) else 'id' + self.extended_schema[_bl_type]["preferred_id"] + if "preferred_id" in self.extended_schema.get(_bl_type, {}) + else "id" ) def _filter_props(self, bl_type: str, props: dict) -> dict: @@ -143,27 +142,22 @@ def _filter_props(self, bl_type: str, props: dict) -> dict: Filters properties for those specified in schema_config if any. """ - filter_props = self.extended_schema[bl_type].get('properties', {}) + filter_props = self.extended_schema[bl_type].get("properties", {}) # strict mode: add required properties (only if there is a whitelist) if self.strict_mode and filter_props: filter_props.update( - { - 'source': 'str', - 'licence': 'str', - 'version': 'str' - }, + {"source": "str", "licence": "str", "version": "str"}, ) exclude_props = self.extended_schema[bl_type].get( - 'exclude_properties', [] + "exclude_properties", [] ) if isinstance(exclude_props, str): exclude_props = [exclude_props] if filter_props and exclude_props: - filtered_props = { k: v for k, v in props.items() @@ -171,21 +165,16 @@ def _filter_props(self, bl_type: str, props: dict) -> dict: } elif filter_props: - filtered_props = { - k: v - for k, v in props.items() if k in filter_props.keys() + k: v for k, v in props.items() if k in filter_props.keys() } elif exclude_props: - filtered_props = { - k: v - for k, v in props.items() if k not in exclude_props + k: v for k, v in props.items() if k not in exclude_props } else: - return props missing_props = [ @@ -193,7 +182,6 @@ def _filter_props(self, bl_type: str, props: dict) -> dict: ] # add missing properties with default values for k in missing_props: - filtered_props[k] = None return filtered_props @@ -218,7 +206,7 @@ def translate_edges( Can optionally possess its own ID. 
""" - self._log_begin_translate(id_src_tar_type_prop_tuples, 'edges') + self._log_begin_translate(id_src_tar_type_prop_tuples, "edges") # legacy: deal with 4-tuples (no edge id) # TODO remove for performance reasons once safe @@ -230,18 +218,17 @@ def translate_edges( ] for _id, _src, _tar, _type, _props in id_src_tar_type_prop_tuples: - # check for strict mode requirements if self.strict_mode: - if not 'source' in _props: + if not "source" in _props: raise ValueError( - f'Edge {_id if _id else (_src, _tar)} does not have a `source` property.', - ' This is required in strict mode.', + f"Edge {_id if _id else (_src, _tar)} does not have a `source` property.", + " This is required in strict mode.", ) - if not 'licence' in _props: + if not "licence" in _props: raise ValueError( - f'Edge {_id if _id else (_src, _tar)} does not have a `licence` property.', - ' This is required in strict mode.', + f"Edge {_id if _id else (_src, _tar)} does not have a `licence` property.", + " This is required in strict mode.", ) # match the input label (_type) to @@ -249,14 +236,12 @@ def translate_edges( bl_type = self._get_ontology_mapping(_type) if bl_type: - # filter properties for those specified in schema_config if any _filtered_props = self._filter_props(bl_type, _props) - rep = self.extended_schema[bl_type]['represented_as'] - - if rep == 'node': + rep = self.extended_schema[bl_type]["represented_as"] + if rep == "node": if _id: # if it brings its own ID, use it node_id = _id @@ -264,8 +249,11 @@ def translate_edges( else: # source target concat node_id = ( - str(_src) + '_' + str(_tar) + '_' + - '_'.join(str(v) for v in _filtered_props.values()) + str(_src) + + "_" + + str(_tar) + + "_" + + "_".join(str(v) for v in _filtered_props.values()) ) n = BioCypherNode( @@ -277,21 +265,18 @@ def translate_edges( # directionality check TODO generalise to account for # different descriptions of directionality or find a # more consistent solution for indicating directionality - if _filtered_props.get('directed') == True: - - l1 = 'IS_SOURCE_OF' - l2 = 'IS_TARGET_OF' + if _filtered_props.get("directed") == True: + l1 = "IS_SOURCE_OF" + l2 = "IS_TARGET_OF" elif _filtered_props.get( - 'src_role', - ) and _filtered_props.get('tar_role'): - - l1 = _filtered_props.get('src_role') - l2 = _filtered_props.get('tar_role') + "src_role", + ) and _filtered_props.get("tar_role"): + l1 = _filtered_props.get("src_role") + l2 = _filtered_props.get("tar_role") else: - - l1 = l2 = 'IS_PART_OF' + l1 = l2 = "IS_PART_OF" e_s = BioCypherEdge( source_id=_src, @@ -310,13 +295,11 @@ def translate_edges( yield BioCypherRelAsNode(n, e_s, e_t) else: - edge_label = self.extended_schema[bl_type].get( - 'label_as_edge' + "label_as_edge" ) if edge_label is None: - edge_label = bl_type yield BioCypherEdge( @@ -328,10 +311,9 @@ def translate_edges( ) else: - self._record_no_type(_type, (_src, _tar)) - self._log_finish_translate('edges') + self._log_finish_translate("edges") def _record_no_type(self, _type: Any, what: Any) -> None: """ @@ -339,14 +321,12 @@ def _record_no_type(self, _type: Any, what: Any) -> None: schema_config. 
""" - logger.debug(f'No Biolink type defined for `{_type}`: {what}') + logger.debug(f"No Biolink type defined for `{_type}`: {what}") if self.notype.get(_type, None): - self.notype[_type] += 1 else: - self.notype[_type] = 1 def get_missing_biolink_types(self) -> dict: @@ -359,15 +339,13 @@ def get_missing_biolink_types(self) -> dict: @staticmethod def _log_begin_translate(_input: Iterable, what: str): + n = f"{len(_input)} " if hasattr(_input, "__len__") else "" - n = f'{len(_input)} ' if hasattr(_input, '__len__') else '' - - logger.debug(f'Translating {n}{what} to BioCypher') + logger.debug(f"Translating {n}{what} to BioCypher") @staticmethod def _log_finish_translate(what: str): - - logger.debug(f'Finished translating {what} to BioCypher.') + logger.debug(f"Finished translating {what} to BioCypher.") def _update_ontology_types(self): """ @@ -379,24 +357,19 @@ def _update_ontology_types(self): self._ontology_mapping = {} for key, value in self.extended_schema.items(): - - labels = value.get('input_label') or value.get('label_in_input') + labels = value.get("input_label") or value.get("label_in_input") if isinstance(labels, str): - self._ontology_mapping[labels] = key elif isinstance(labels, list): - for label in labels: self._ontology_mapping[label] = key - if value.get('label_as_edge'): - - self._add_translation_mappings(labels, value['label_as_edge']) + if value.get("label_as_edge"): + self._add_translation_mappings(labels, value["label_as_edge"]) else: - self._add_translation_mappings(labels, key) def _get_ontology_mapping(self, label: str) -> Optional[str]: @@ -433,7 +406,7 @@ def translate(self, query): Translate a cypher query. Only translates labels as of now. """ for key in self.mappings: - query = query.replace(':' + key, ':' + self.mappings[key]) + query = query.replace(":" + key, ":" + self.mappings[key]) return query def reverse_translate(self, query): @@ -442,23 +415,22 @@ def reverse_translate(self, query): now. """ for key in self.reverse_mappings: - - a = ':' + key + ')' - b = ':' + key + ']' + a = ":" + key + ")" + b = ":" + key + "]" # TODO this conditional probably does not cover all cases if a in query or b in query: if isinstance(self.reverse_mappings[key], list): raise NotImplementedError( - 'Reverse translation of multiple inputs not ' - 'implemented yet. Many-to-one mappings are ' - 'not reversible. ' - f'({key} -> {self.reverse_mappings[key]})', + "Reverse translation of multiple inputs not " + "implemented yet. Many-to-one mappings are " + "not reversible. 
" + f"({key} -> {self.reverse_mappings[key]})", ) else: query = query.replace( a, - ':' + self.reverse_mappings[key] + ')', - ).replace(b, ':' + self.reverse_mappings[key] + ']') + ":" + self.reverse_mappings[key] + ")", + ).replace(b, ":" + self.reverse_mappings[key] + "]") return query def _add_translation_mappings(self, original_name, biocypher_name): @@ -479,12 +451,17 @@ def _add_translation_mappings(self, original_name, biocypher_name): if isinstance(biocypher_name, list): for bn in biocypher_name: - self.reverse_mappings[self.name_sentence_to_pascal(bn, ) - ] = original_name + self.reverse_mappings[ + self.name_sentence_to_pascal( + bn, + ) + ] = original_name else: - self.reverse_mappings[self.name_sentence_to_pascal( - biocypher_name, - )] = original_name + self.reverse_mappings[ + self.name_sentence_to_pascal( + biocypher_name, + ) + ] = original_name @staticmethod def name_sentence_to_pascal(name: str) -> str: @@ -492,9 +469,9 @@ def name_sentence_to_pascal(name: str) -> str: Converts a name in sentence case to pascal case. """ # split on dots if dot is present - if '.' in name: - return '.'.join( - [_misc.sentencecase_to_pascalcase(n) for n in name.split('.')], + if "." in name: + return ".".join( + [_misc.sentencecase_to_pascalcase(n) for n in name.split(".")], ) else: return _misc.sentencecase_to_pascalcase(name) diff --git a/biocypher/_write.py b/biocypher/_write.py index efbd60fb..233b69f9 100644 --- a/biocypher/_write.py +++ b/biocypher/_write.py @@ -17,7 +17,7 @@ from ._logger import logger -logger.debug(f'Loading module {__name__}.') +logger.debug(f"Loading module {__name__}.") from abc import ABC, abstractmethod from types import GeneratorType @@ -31,10 +31,9 @@ from ._config import config as _config from ._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode -__all__ = ['get_writer'] +__all__ = ["get_writer"] if TYPE_CHECKING: - from ._ontology import Ontology from ._translate import Translator from ._deduplicate import Deduplicator @@ -92,7 +91,7 @@ class _BatchWriter(ABC): Path prefix for the admin import call binary. import_call_file_prefix: - Path prefix for the data files (headers and parts) in the import + Path prefix for the data files (headers and parts) in the import call. wipe: @@ -108,6 +107,7 @@ class _BatchWriter(ABC): skip_duplicate_nodes: Whether to skip duplicate nodes. (Specific to Neo4j.) 
""" + @abstractmethod def _get_default_import_call_bin_prefix(self): """ @@ -193,14 +193,14 @@ def _get_import_script_name(self) -> str: def __init__( self, - ontology: 'Ontology', - translator: 'Translator', - deduplicator: 'Deduplicator', + ontology: "Ontology", + translator: "Translator", + deduplicator: "Deduplicator", delimiter: str, - array_delimiter: str = ',', + array_delimiter: str = ",", quote: str = '"', output_directory: Optional[str] = None, - db_name: str = 'neo4j', + db_name: str = "neo4j", import_call_bin_prefix: Optional[str] = None, import_call_file_prefix: Optional[str] = None, wipe: bool = True, @@ -209,7 +209,7 @@ def __init__( skip_duplicate_nodes: bool = False, db_user: str = None, db_password: str = None, - db_port: str = None + db_port: str = None, ): self.db_name = db_name self.db_user = db_user @@ -225,7 +225,8 @@ def __init__( self.skip_duplicate_nodes = skip_duplicate_nodes if import_call_bin_prefix is None: - self.import_call_bin_prefix = self._get_default_import_call_bin_prefix( + self.import_call_bin_prefix = ( + self._get_default_import_call_bin_prefix() ) else: self.import_call_bin_prefix = import_call_bin_prefix @@ -248,11 +249,11 @@ def __init__( if os.path.exists(self.outdir): logger.warning( - f'Output directory `{self.outdir}` already exists. ' - 'If this is not planned, file consistency may be compromised.' + f"Output directory `{self.outdir}` already exists. " + "If this is not planned, file consistency may be compromised." ) else: - logger.info(f'Creating output directory `{self.outdir}`.') + logger.info(f"Creating output directory `{self.outdir}`.") os.makedirs(self.outdir) self.parts = {} # dict to store the paths of part files for each label @@ -268,7 +269,6 @@ def outdir(self): return self._outdir - @property def import_call_file_prefix(self): """ @@ -286,12 +286,10 @@ def _process_delimiter(self, delimiter: str) -> str: representation (e.g. tab for '\t'). """ - if delimiter == '\\t': - - return '\t', '\\t' + if delimiter == "\\t": + return "\t", "\\t" else: - return delimiter, delimiter def write_nodes(self, nodes, batch_size: int = int(1e6)): @@ -310,12 +308,12 @@ def write_nodes(self, nodes, batch_size: int = int(1e6)): # write node data passed = self._write_node_data(nodes, batch_size) if not passed: - logger.error('Error while writing node data.') + logger.error("Error while writing node data.") return False # pass property data to header writer per node type written passed = self._write_node_headers() if not passed: - logger.error('Error while writing node headers.') + logger.error("Error while writing node headers.") return False return True @@ -348,7 +346,9 @@ def write_edges( e.get_source_edge(), e.get_target_edge(), ], - ) if isinstance(e, BioCypherRelAsNode) else (None, [e]) + ) + if isinstance(e, BioCypherRelAsNode) + else (None, [e]) for e in edges ) ) @@ -368,17 +368,17 @@ def write_edges( # is this a problem? if the generator or list is empty, we # don't write anything. 
logger.debug( - 'No edges to write, possibly due to no matched Biolink classes.', + "No edges to write, possibly due to no matched Biolink classes.", ) pass if not passed: - logger.error('Error while writing edge data.') + logger.error("Error while writing edge data.") return False # pass property data to header writer per edge type written passed = self._write_edge_headers() if not passed: - logger.error('Error while writing edge headers.') + logger.error("Error while writing edge headers.") return False return True @@ -401,7 +401,7 @@ def _write_node_data(self, nodes, batch_size): """ if isinstance(nodes, GeneratorType) or isinstance(nodes, peekable): - logger.debug('Writing node CSV from generator.') + logger.debug("Writing node CSV from generator.") bins = defaultdict(list) # dict to store a list for each # label that is passed in @@ -424,7 +424,7 @@ def _write_node_data(self, nodes, batch_size): # check for non-id if not _id: - logger.warning(f'Node {label} has no id; skipping.') + logger.warning(f"Node {label} has no id; skipping.") continue if not label in bins.keys(): @@ -434,20 +434,22 @@ def _write_node_data(self, nodes, batch_size): bin_l[label] = 1 # get properties from config if present - cprops = self.extended_schema.get(label).get('properties', ) + cprops = self.extended_schema.get(label).get( + "properties", + ) if cprops: d = dict(cprops) # add id and preferred id to properties; these are # created in node creation (`_create.BioCypherNode`) - d['id'] = 'str' - d['preferred_id'] = 'str' + d["id"] = "str" + d["preferred_id"] = "str" # add strict mode properties if self.strict_mode: - d['source'] = 'str' - d['version'] = 'str' - d['licence'] = 'str' + d["source"] = "str" + d["version"] = "str" + d["licence"] = "str" else: d = dict(node.get_properties()) @@ -531,7 +533,7 @@ def _write_node_data(self, nodes, batch_size): return True else: if type(nodes) is not list: - logger.error('Nodes must be passed as list or generator.') + logger.error("Nodes must be passed as list or generator.") return False else: @@ -563,14 +565,13 @@ def _write_single_node_list_to_file( bool: The return value. True for success, False otherwise. """ if not all(isinstance(n, BioCypherNode) for n in node_list): - logger.error('Nodes must be passed as type BioCypherNode.') + logger.error("Nodes must be passed as type BioCypherNode.") return False # from list of nodes to list of strings lines = [] for n in node_list: - # check for deviations in properties # node properties n_props = n.get_properties() @@ -584,46 +585,45 @@ def _write_single_node_list_to_file( oprop1 = set(ref_props).difference(n_keys) oprop2 = set(n_keys).difference(ref_props) logger.error( - f'At least one node of the class {n.get_label()} ' - f'has more or fewer properties than another. ' - f'Offending node: {onode!r}, offending property: ' - f'{max([oprop1, oprop2])}. ' - f'All reference properties: {ref_props}, ' - f'All node properties: {n_keys}.', + f"At least one node of the class {n.get_label()} " + f"has more or fewer properties than another. " + f"Offending node: {onode!r}, offending property: " + f"{max([oprop1, oprop2])}. " + f"All reference properties: {ref_props}, " + f"All node properties: {n_keys}.", ) return False line = [n.get_id()] if ref_props: - plist = [] # make all into strings, put actual strings in quotes for k, v in prop_dict.items(): p = n_props.get(k) if p is None: # TODO make field empty instead of ""? 
- plist.append('') + plist.append("") elif v in [ - 'int', - 'integer', - 'long', - 'float', - 'double', - 'dbl', - 'bool', - 'boolean', + "int", + "integer", + "long", + "float", + "double", + "dbl", + "bool", + "boolean", ]: plist.append(str(p)) else: if isinstance(p, list): plist.append(self._write_array_string(p)) else: - plist.append(f'{self.quote}{str(p)}{self.quote}') + plist.append(f"{self.quote}{str(p)}{self.quote}") line.append(self.delim.join(plist)) line.append(labels) - lines.append(self.delim.join(line) + '\n') + lines.append(self.delim.join(line) + "\n") # avoid writing empty files if lines: @@ -653,7 +653,7 @@ def _write_edge_data(self, edges, batch_size): """ if isinstance(edges, GeneratorType): - logger.debug('Writing edge CSV from generator.') + logger.debug("Writing edge CSV from generator.") bins = defaultdict(list) # dict to store a list for each # label that is passed in @@ -671,8 +671,8 @@ def _write_edge_data(self, edges, batch_size): if not (edge.get_source_id() and edge.get_target_id()): logger.error( - 'Edge must have source and target node. ' - f'Caused by: {edge}', + "Edge must have source and target node. " + f"Caused by: {edge}", ) continue @@ -691,23 +691,23 @@ def _write_edge_data(self, edges, batch_size): cprops = None if label in self.extended_schema: cprops = self.extended_schema.get(label).get( - 'properties', + "properties", ) else: # try via "label_as_edge" for k, v in self.extended_schema.items(): if isinstance(v, dict): - if v.get('label_as_edge') == label: - cprops = v.get('properties') + if v.get("label_as_edge") == label: + cprops = v.get("properties") break if cprops: d = cprops # add strict mode properties if self.strict_mode: - d['source'] = 'str' - d['version'] = 'str' - d['licence'] = 'str' + d["source"] = "str" + d["version"] = "str" + d["licence"] = "str" else: d = dict(edge.get_properties()) @@ -746,7 +746,6 @@ def _write_edge_data(self, edges, batch_size): # after generator depleted, write remainder of bins for label, nl in bins.items(): - passed = self._write_single_edge_list_to_file( nl, label, @@ -768,7 +767,7 @@ def _write_edge_data(self, edges, batch_size): return True else: if type(edges) is not list: - logger.error('Edges must be passed as list or generator.') + logger.error("Edges must be passed as list or generator.") return False else: @@ -800,8 +799,7 @@ def _write_single_edge_list_to_file( """ if not all(isinstance(n, BioCypherEdge) for n in edge_list): - - logger.error('Edges must be passed as type BioCypherEdge.') + logger.error("Edges must be passed as type BioCypherEdge.") return False # from list of edges to list of strings @@ -815,16 +813,16 @@ def _write_single_edge_list_to_file( # compare list order invariant if not set(ref_props) == set(e_keys): - oedge = f'{e.get_source_id()}-{e.get_target_id()}' + oedge = f"{e.get_source_id()}-{e.get_target_id()}" oprop1 = set(ref_props).difference(e_keys) oprop2 = set(e_keys).difference(ref_props) logger.error( - f'At least one edge of the class {e.get_label()} ' - f'has more or fewer properties than another. ' - f'Offending edge: {oedge!r}, offending property: ' - f'{max([oprop1, oprop2])}. ' - f'All reference properties: {ref_props}, ' - f'All edge properties: {e_keys}.', + f"At least one edge of the class {e.get_label()} " + f"has more or fewer properties than another. " + f"Offending edge: {oedge!r}, offending property: " + f"{max([oprop1, oprop2])}. 
" + f"All reference properties: {ref_props}, " + f"All edge properties: {e_keys}.", ) return False @@ -833,16 +831,16 @@ def _write_single_edge_list_to_file( for k, v in prop_dict.items(): p = e_props.get(k) if p is None: # TODO make field empty instead of ""? - plist.append('') + plist.append("") elif v in [ - 'int', - 'integer', - 'long', - 'float', - 'double', - 'dbl', - 'bool', - 'boolean', + "int", + "integer", + "long", + "float", + "double", + "dbl", + "bool", + "boolean", ]: plist.append(str(p)) else: @@ -850,7 +848,7 @@ def _write_single_edge_list_to_file( plist.append(self._write_array_string(p)) else: plist.append(self.quote + str(p) + self.quote) - + entries = [e.get_source_id()] skip_id = False @@ -861,29 +859,34 @@ def _write_single_edge_list_to_file( elif not self.extended_schema.get(label): # find label in schema by label_as_edge for k, v in self.extended_schema.items(): - if v.get('label_as_edge') == label: + if v.get("label_as_edge") == label: schema_label = k break else: schema_label = label if schema_label: - if self.extended_schema.get(schema_label).get('use_id') == False: + if ( + self.extended_schema.get(schema_label).get("use_id") + == False + ): skip_id = True if not skip_id: - entries.append(e.get_id() or '') + entries.append(e.get_id() or "") if ref_props: entries.append(self.delim.join(plist)) entries.append(e.get_target_id()) - entries.append(self.translator.name_sentence_to_pascal( - e.get_label(), - )) + entries.append( + self.translator.name_sentence_to_pascal( + e.get_label(), + ) + ) lines.append( - self.delim.join(entries) + '\n', + self.delim.join(entries) + "\n", ) # avoid writing empty files @@ -911,39 +914,34 @@ def _write_next_part(self, label: str, lines: list): # list files in self.outdir files = glob.glob( - os.path.join(self.outdir, f'{label_pascal}-part*.csv') + os.path.join(self.outdir, f"{label_pascal}-part*.csv") ) # find file with highest part number if not files: - next_part = 0 else: - next_part = ( max( [ - int( - f.split('.')[-2].split('-')[-1].replace('part', '') - ) for f in files + int(f.split(".")[-2].split("-")[-1].replace("part", "")) + for f in files ], - ) + 1 + ) + + 1 ) # write to file padded_part = str(next_part).zfill(3) logger.info( - f'Writing {len(lines)} entries to {label_pascal}-part{padded_part}.csv', + f"Writing {len(lines)} entries to {label_pascal}-part{padded_part}.csv", ) # store name only in case import_call_file_prefix is set - part = f'{label_pascal}-part{padded_part}.csv' - file_path = os.path.join( - self.outdir, part - ) - - with open(file_path, 'w', encoding='utf-8') as f: + part = f"{label_pascal}-part{padded_part}.csv" + file_path = os.path.join(self.outdir, part) + with open(file_path, "w", encoding="utf-8") as f: # concatenate with delimiter f.writelines(lines) @@ -975,10 +973,9 @@ def write_import_call(self) -> bool: """ file_path = os.path.join(self.outdir, self._get_import_script_name()) - logger.info(f'Writing {self.db_name} import call to `{file_path}`.') - - with open(file_path, 'w', encoding='utf-8') as f: + logger.info(f"Writing {self.db_name} import call to `{file_path}`.") + with open(file_path, "w", encoding="utf-8") as f: f.write(self._construct_import_call()) return True @@ -1000,6 +997,7 @@ class _Neo4jBatchWriter(_BatchWriter): - _construct_import_call - _write_array_string """ + def _get_default_import_call_bin_prefix(self): """ Method to provide the default string for the import call bin prefix. 
@@ -1007,7 +1005,7 @@ def _get_default_import_call_bin_prefix(self): Returns: str: The default location for the neo4j admin import location """ - return 'bin/' + return "bin/" def _write_array_string(self, string_list): """ @@ -1021,7 +1019,7 @@ def _write_array_string(self, string_list): str: The string representation of an array for the neo4j admin import """ string = self.adelim.join(string_list) - return f'{self.quote}{string}{self.quote}' + return f"{self.quote}{string}{self.quote}" def _write_node_headers(self): """ @@ -1035,56 +1033,55 @@ def _write_node_headers(self): # load headers from data parse if not self.node_property_dict: logger.error( - 'Header information not found. Was the data parsed first?', + "Header information not found. Was the data parsed first?", ) return False for label, props in self.node_property_dict.items(): - - _id = ':ID' + _id = ":ID" # translate label to PascalCase pascal_label = self.translator.name_sentence_to_pascal(label) - header = f'{pascal_label}-header.csv' + header = f"{pascal_label}-header.csv" header_path = os.path.join( self.outdir, header, ) - parts = f'{pascal_label}-part.*' + parts = f"{pascal_label}-part.*" # check if file already exists if os.path.exists(header_path): logger.warning( - f'Header file `{header_path}` already exists. Overwriting.', + f"Header file `{header_path}` already exists. Overwriting.", ) # concatenate key:value in props props_list = [] for k, v in props.items(): - if v in ['int', 'long', 'integer']: - props_list.append(f'{k}:long') - elif v in ['int[]', 'long[]', 'integer[]']: - props_list.append(f'{k}:long[]') - elif v in ['float', 'double', 'dbl']: - props_list.append(f'{k}:double') - elif v in ['float[]', 'double[]']: - props_list.append(f'{k}:double[]') - elif v in ['bool', 'boolean']: + if v in ["int", "long", "integer"]: + props_list.append(f"{k}:long") + elif v in ["int[]", "long[]", "integer[]"]: + props_list.append(f"{k}:long[]") + elif v in ["float", "double", "dbl"]: + props_list.append(f"{k}:double") + elif v in ["float[]", "double[]"]: + props_list.append(f"{k}:double[]") + elif v in ["bool", "boolean"]: # TODO Neo4j boolean support / spelling? - props_list.append(f'{k}:boolean') - elif v in ['bool[]', 'boolean[]']: - props_list.append(f'{k}:boolean[]') - elif v in ['str[]', 'string[]']: - props_list.append(f'{k}:string[]') + props_list.append(f"{k}:boolean") + elif v in ["bool[]", "boolean[]"]: + props_list.append(f"{k}:boolean[]") + elif v in ["str[]", "string[]"]: + props_list.append(f"{k}:string[]") else: - props_list.append(f'{k}') + props_list.append(f"{k}") # create list of lists and flatten - out_list = [[_id], props_list, [':LABEL']] + out_list = [[_id], props_list, [":LABEL"]] out_list = [val for sublist in out_list for val in sublist] - with open(header_path, 'w', encoding='utf-8') as f: + with open(header_path, "w", encoding="utf-8") as f: # concatenate with delimiter row = self.delim.join(out_list) f.write(row) @@ -1099,7 +1096,9 @@ def _write_node_headers(self): self.import_call_file_prefix, parts, ) - self.import_call_nodes.add((import_call_header_path, import_call_parts_path)) + self.import_call_nodes.add( + (import_call_header_path, import_call_parts_path) + ) return True @@ -1115,51 +1114,50 @@ def _write_edge_headers(self): # load headers from data parse if not self.edge_property_dict: logger.error( - 'Header information not found. Was the data parsed first?', + "Header information not found. 
Was the data parsed first?", ) return False for label, props in self.edge_property_dict.items(): - # translate label to PascalCase pascal_label = self.translator.name_sentence_to_pascal(label) # paths - header = f'{pascal_label}-header.csv' + header = f"{pascal_label}-header.csv" header_path = os.path.join( self.outdir, header, ) - parts = f'{pascal_label}-part.*' + parts = f"{pascal_label}-part.*" # check for file exists if os.path.exists(header_path): logger.warning( - f'File {header_path} already exists. Overwriting.' + f"File {header_path} already exists. Overwriting." ) # concatenate key:value in props props_list = [] for k, v in props.items(): - if v in ['int', 'long', 'integer']: - props_list.append(f'{k}:long') - elif v in ['int[]', 'long[]', 'integer[]']: - props_list.append(f'{k}:long[]') - elif v in ['float', 'double']: - props_list.append(f'{k}:double') - elif v in ['float[]', 'double[]']: - props_list.append(f'{k}:double[]') + if v in ["int", "long", "integer"]: + props_list.append(f"{k}:long") + elif v in ["int[]", "long[]", "integer[]"]: + props_list.append(f"{k}:long[]") + elif v in ["float", "double"]: + props_list.append(f"{k}:double") + elif v in ["float[]", "double[]"]: + props_list.append(f"{k}:double[]") elif v in [ - 'bool', - 'boolean', + "bool", + "boolean", ]: # TODO does Neo4j support bool? - props_list.append(f'{k}:boolean') - elif v in ['bool[]', 'boolean[]']: - props_list.append(f'{k}:boolean[]') - elif v in ['str[]', 'string[]']: - props_list.append(f'{k}:string[]') + props_list.append(f"{k}:boolean") + elif v in ["bool[]", "boolean[]"]: + props_list.append(f"{k}:boolean[]") + elif v in ["str[]", "string[]"]: + props_list.append(f"{k}:string[]") else: - props_list.append(f'{k}') + props_list.append(f"{k}") skip_id = False schema_label = None @@ -1169,25 +1167,28 @@ def _write_edge_headers(self): elif not self.extended_schema.get(label): # find label in schema by label_as_edge for k, v in self.extended_schema.items(): - if v.get('label_as_edge') == label: + if v.get("label_as_edge") == label: schema_label = k break else: schema_label = label - out_list = [':START_ID'] + out_list = [":START_ID"] if schema_label: - if self.extended_schema.get(schema_label).get('use_id') == False: + if ( + self.extended_schema.get(schema_label).get("use_id") + == False + ): skip_id = True if not skip_id: - out_list.append('id') + out_list.append("id") out_list.extend(props_list) - out_list.extend([':END_ID', ':TYPE']) + out_list.extend([":END_ID", ":TYPE"]) - with open(header_path, 'w', encoding='utf-8') as f: + with open(header_path, "w", encoding="utf-8") as f: # concatenate with delimiter row = self.delim.join(out_list) f.write(row) @@ -1202,7 +1203,9 @@ def _write_edge_headers(self): self.import_call_file_prefix, parts, ) - self.import_call_edges.add((import_call_header_path, import_call_parts_path)) + self.import_call_edges.add( + (import_call_header_path, import_call_parts_path) + ) return True @@ -1213,7 +1216,7 @@ def _get_import_script_name(self) -> str: Returns: str: The name of the import script (ending in .sh) """ - return 'neo4j-admin-import-call.sh' + return "neo4j-admin-import-call.sh" def _construct_import_call(self) -> str: """ @@ -1226,8 +1229,8 @@ def _construct_import_call(self) -> str: str: a bash command for neo4j-admin import """ import_call = ( - f'{self.import_call_bin_prefix}neo4j-admin import ' - f'--database={self.db_name} ' + f"{self.import_call_bin_prefix}neo4j-admin import " + f"--database={self.db_name} " f'--delimiter="{self.escaped_delim}" ' 
f'--array-delimiter="{self.escaped_adelim}" ' ) @@ -1238,11 +1241,11 @@ def _construct_import_call(self) -> str: import_call += f"--quote='{self.quote}' " if self.wipe: - import_call += f'--force=true ' + import_call += f"--force=true " if self.skip_bad_relationships: - import_call += '--skip-bad-relationships=true ' + import_call += "--skip-bad-relationships=true " if self.skip_duplicate_nodes: - import_call += '--skip-duplicate-nodes=true ' + import_call += "--skip-duplicate-nodes=true " # append node import calls for header_path, parts_path in self.import_call_nodes: @@ -1261,6 +1264,7 @@ class _ArangoDBBatchWriter(_Neo4jBatchWriter): specified by ArangoDB for the use of "arangoimport". Output files are similar to Neo4j, but with a different header format. """ + def _get_default_import_call_bin_prefix(self): """ Method to provide the default string for the import call bin prefix. @@ -1268,7 +1272,7 @@ def _get_default_import_call_bin_prefix(self): Returns: str: The default location for the neo4j admin import location """ - return '' + return "" def _get_import_script_name(self) -> str: """ @@ -1277,7 +1281,7 @@ def _get_import_script_name(self) -> str: Returns: str: The name of the import script (ending in .sh) """ - return 'arangodb-import-call.sh' + return "arangodb-import-call.sh" def _write_node_headers(self): """ @@ -1291,19 +1295,19 @@ def _write_node_headers(self): # load headers from data parse if not self.node_property_dict: logger.error( - 'Header information not found. Was the data parsed first?', + "Header information not found. Was the data parsed first?", ) return False for label, props in self.node_property_dict.items(): # create header CSV with ID, properties, labels - _id = '_key' + _id = "_key" # translate label to PascalCase pascal_label = self.translator.name_sentence_to_pascal(label) - header = f'{pascal_label}-header.csv' + header = f"{pascal_label}-header.csv" header_path = os.path.join( self.outdir, header, @@ -1312,28 +1316,27 @@ def _write_node_headers(self): # check if file already exists if os.path.exists(header_path): logger.warning( - f'File {header_path} already exists. Overwriting.' + f"File {header_path} already exists. Overwriting." ) # concatenate key:value in props props_list = [] for k in props.keys(): - - props_list.append(f'{k}') + props_list.append(f"{k}") # create list of lists and flatten # removes need for empty check of property list out_list = [[_id], props_list] out_list = [val for sublist in out_list for val in sublist] - with open(header_path, 'w', encoding='utf-8') as f: + with open(header_path, "w", encoding="utf-8") as f: # concatenate with delimiter row = self.delim.join(out_list) f.write(row) # add collection from schema config collection = self.extended_schema[label].get( - 'db_collection_name', None + "db_collection_name", None ) # add file path to neo4 admin import statement @@ -1341,14 +1344,12 @@ def _write_node_headers(self): parts = self.parts.get(label, []) if not parts: - raise ValueError( - f'No parts found for node label {label}. ' - f'Check that the data was parsed first.', + f"No parts found for node label {label}. 
" + f"Check that the data was parsed first.", ) for part in parts: - import_call_header_path = os.path.join( self.import_call_file_prefix, header, @@ -1358,7 +1359,13 @@ def _write_node_headers(self): part, ) - self.import_call_nodes.add((import_call_header_path, import_call_parts_path, collection)) + self.import_call_nodes.add( + ( + import_call_header_path, + import_call_parts_path, + collection, + ) + ) return True @@ -1374,54 +1381,50 @@ def _write_edge_headers(self): # load headers from data parse if not self.edge_property_dict: logger.error( - 'Header information not found. Was the data parsed first?', + "Header information not found. Was the data parsed first?", ) return False for label, props in self.edge_property_dict.items(): - # translate label to PascalCase pascal_label = self.translator.name_sentence_to_pascal(label) # paths - header = f'{pascal_label}-header.csv' + header = f"{pascal_label}-header.csv" header_path = os.path.join( self.outdir, header, ) - parts = f'{pascal_label}-part.*' + parts = f"{pascal_label}-part.*" # check for file exists if os.path.exists(header_path): logger.warning( - f'Header file {header_path} already exists. Overwriting.' + f"Header file {header_path} already exists. Overwriting." ) # concatenate key:value in props props_list = [] for k in props.keys(): + props_list.append(f"{k}") - props_list.append(f'{k}') + out_list = ["_from", "_key", *props_list, "_to"] - out_list = ['_from', '_key', *props_list, '_to'] - - with open(header_path, 'w', encoding='utf-8') as f: + with open(header_path, "w", encoding="utf-8") as f: # concatenate with delimiter row = self.delim.join(out_list) f.write(row) # add collection from schema config if not self.extended_schema.get(label): - for _, v in self.extended_schema.items(): - if v.get('label_as_edge') == label: - collection = v.get('db_collection_name', None) + if v.get("label_as_edge") == label: + collection = v.get("db_collection_name", None) break else: - collection = self.extended_schema[label].get( - 'db_collection_name', None + "db_collection_name", None ) # add file path to neo4 admin import statement (import call path @@ -1434,7 +1437,13 @@ def _write_edge_headers(self): self.import_call_file_prefix, parts, ) - self.import_call_edges.add((header_import_call_path, parts_import_call_path, collection,)) + self.import_call_edges.add( + ( + header_import_call_path, + parts_import_call_path, + collection, + ) + ) return True @@ -1449,8 +1458,8 @@ def _construct_import_call(self) -> str: str: a bash command for neo4j-admin import """ import_call = ( - f'{self.import_call_bin_prefix}arangoimp ' - f'--type csv ' + f"{self.import_call_bin_prefix}arangoimp " + f"--type csv " f'--separator="{self.escaped_delim}" ' ) @@ -1459,23 +1468,22 @@ def _construct_import_call(self) -> str: else: import_call += f"--quote='{self.quote}' " - node_lines = '' + node_lines = "" # node import calls: one line per node type for header_path, parts_path, collection in self.import_call_nodes: - line = ( - f'{import_call} ' - f'--headers-file {header_path} ' - f'--file= {parts_path} ' + f"{import_call} " + f"--headers-file {header_path} " + f"--file= {parts_path} " ) if collection: - line += f'--create-collection --collection {collection} ' + line += f"--create-collection --collection {collection} " - node_lines += f'{line}\n' + node_lines += f"{line}\n" - edge_lines = '' + edge_lines = "" # edge import calls: one line per edge type for header_path, parts_path, collection in self.import_call_edges: @@ -1502,15 +1510,15 @@ class 
_PostgreSQLBatchWriter(_BatchWriter): """ DATA_TYPE_LOOKUP = { - 'str': 'VARCHAR', # VARCHAR needs limit - 'int': 'INTEGER', - 'long': 'BIGINT', - 'float': 'NUMERIC', - 'double': 'NUMERIC', - 'dbl': 'NUMERIC', - 'boolean': 'BOOLEAN', - 'str[]': 'VARCHAR[]', - 'string[]': 'VARCHAR[]' + "str": "VARCHAR", # VARCHAR needs limit + "int": "INTEGER", + "long": "BIGINT", + "float": "NUMERIC", + "double": "NUMERIC", + "dbl": "NUMERIC", + "boolean": "BOOLEAN", + "str[]": "VARCHAR[]", + "string[]": "VARCHAR[]", } def __init__(self, *args, **kwargs): @@ -1524,7 +1532,7 @@ def _get_default_import_call_bin_prefix(self): Returns: str: The default location for the psql command """ - return '' + return "" def _get_data_type(self, string) -> str: try: @@ -1533,7 +1541,7 @@ def _get_data_type(self, string) -> str: logger.info( 'Could not determine data type {string}. Using default "VARCHAR"' ) - return 'VARCHAR' + return "VARCHAR" def _write_array_string(self, string_list) -> str: """ @@ -1546,7 +1554,7 @@ def _write_array_string(self, string_list) -> str: Returns: str: The string representation of an array for postgres COPY """ - string = ','.join(string_list) + string = ",".join(string_list) string = f'"{{{string}}}"' return string @@ -1557,10 +1565,10 @@ def _get_import_script_name(self) -> str: Returns: str: The name of the import script (ending in .sh) """ - return f'{self.db_name}-import-call.sh' + return f"{self.db_name}-import-call.sh" def _adjust_pascal_to_psql(self, string): - string = string.replace('.', '_') + string = string.replace(".", "_") string = string.lower() return string @@ -1576,7 +1584,7 @@ def _write_node_headers(self): # load headers from data parse if not self.node_property_dict: logger.error( - 'Header information not found. Was the data parsed first?', + "Header information not found. Was the data parsed first?", ) return False @@ -1586,7 +1594,7 @@ def _write_node_headers(self): # translate label to PascalCase pascal_label = self.translator.name_sentence_to_pascal(label) - parts = f'{pascal_label}-part*.csv' + parts = f"{pascal_label}-part*.csv" parts_paths = os.path.join(self.outdir, parts) parts_paths = glob.glob(parts_paths) parts_paths.sort() @@ -1595,36 +1603,36 @@ def _write_node_headers(self): pascal_label = self._adjust_pascal_to_psql(pascal_label) table_create_command_path = os.path.join( self.outdir, - f'{pascal_label}-create_table.sql', + f"{pascal_label}-create_table.sql", ) # check if file already exists if os.path.exists(table_create_command_path): logger.warning( - f'File {table_create_command_path} already exists. Overwriting.', + f"File {table_create_command_path} already exists. 
Overwriting.", ) # concatenate key:value in props - columns = ['_ID VARCHAR'] + columns = ["_ID VARCHAR"] for col_name, col_type in props.items(): col_type = self._get_data_type(col_type) col_name = self._adjust_pascal_to_psql(col_name) - columns.append(f'{col_name} {col_type}') - columns.append('_LABEL VARCHAR[]') + columns.append(f"{col_name} {col_type}") + columns.append("_LABEL VARCHAR[]") - with open(table_create_command_path, 'w', encoding='utf-8') as f: - - command = '' + with open(table_create_command_path, "w", encoding="utf-8") as f: + command = "" if self.wipe: - command += f'DROP TABLE IF EXISTS {pascal_label};\n' + command += f"DROP TABLE IF EXISTS {pascal_label};\n" # table creation requires comma separation - command += f'CREATE TABLE {pascal_label}({",".join(columns)});\n' + command += ( + f'CREATE TABLE {pascal_label}({",".join(columns)});\n' + ) f.write(command) for parts_path in parts_paths: - - # if import_call_file_prefix is set, replace actual path + # if import_call_file_prefix is set, replace actual path # with prefix if self.import_call_file_prefix != self.outdir: parts_path = parts_path.replace( @@ -1633,7 +1641,7 @@ def _write_node_headers(self): ) self._copy_from_csv_commands.add( - f'\\copy {pascal_label} FROM \'{parts_path}\' DELIMITER E\'{self.delim}\' CSV;' + f"\\copy {pascal_label} FROM '{parts_path}' DELIMITER E'{self.delim}' CSV;" ) # add file path to import statement @@ -1661,16 +1669,15 @@ def _write_edge_headers(self): # load headers from data parse if not self.edge_property_dict: logger.error( - 'Header information not found. Was the data parsed first?', + "Header information not found. Was the data parsed first?", ) return False for label, props in self.edge_property_dict.items(): - # translate label to PascalCase pascal_label = self.translator.name_sentence_to_pascal(label) - parts_paths = os.path.join(self.outdir, f'{pascal_label}-part*.csv') + parts_paths = os.path.join(self.outdir, f"{pascal_label}-part*.csv") parts_paths = glob.glob(parts_paths) parts_paths.sort() @@ -1678,13 +1685,13 @@ def _write_edge_headers(self): pascal_label = self._adjust_pascal_to_psql(pascal_label) table_create_command_path = os.path.join( self.outdir, - f'{pascal_label}-create_table.sql', + f"{pascal_label}-create_table.sql", ) # check for file exists if os.path.exists(table_create_command_path): logger.warning( - f'File {table_create_command_path} already exists. Overwriting.', + f"File {table_create_command_path} already exists. Overwriting.", ) # concatenate key:value in props @@ -1692,7 +1699,7 @@ def _write_edge_headers(self): for col_name, col_type in props.items(): col_type = self._get_data_type(col_type) col_name = self._adjust_pascal_to_psql(col_name) - if col_name == '_ID': + if col_name == "_ID": # should ideally never happen raise ValueError( "Column name '_ID' is reserved for internal use, " @@ -1700,26 +1707,30 @@ def _write_edge_headers(self): "different name for your column." 
) - columns.append(f'{col_name} {col_type}') + columns.append(f"{col_name} {col_type}") # create list of lists and flatten # removes need for empty check of property list out_list = [ - '_START_ID VARCHAR', '_ID VARCHAR', *columns, '_END_ID VARCHAR', - '_TYPE VARCHAR' + "_START_ID VARCHAR", + "_ID VARCHAR", + *columns, + "_END_ID VARCHAR", + "_TYPE VARCHAR", ] - with open(table_create_command_path, 'w', encoding='utf-8') as f: - command = '' + with open(table_create_command_path, "w", encoding="utf-8") as f: + command = "" if self.wipe: - command += f'DROP TABLE IF EXISTS {pascal_label};\n' + command += f"DROP TABLE IF EXISTS {pascal_label};\n" # table creation requires comma separation - command += f'CREATE TABLE {pascal_label}({",".join(out_list)});\n' + command += ( + f'CREATE TABLE {pascal_label}({",".join(out_list)});\n' + ) f.write(command) for parts_path in parts_paths: - # if import_call_file_prefix is set, replace actual path # with prefix if self.import_call_file_prefix != self.outdir: @@ -1729,7 +1740,7 @@ def _write_edge_headers(self): ) self._copy_from_csv_commands.add( - f'\\copy {pascal_label} FROM \'{parts_path}\' DELIMITER E\'{self.delim}\' CSV;' + f"\\copy {pascal_label} FROM '{parts_path}' DELIMITER E'{self.delim}' CSV;" ) # add file path to import statement @@ -1740,7 +1751,7 @@ def _write_edge_headers(self): self.outdir, self.import_call_file_prefix, ) - + self.import_call_edges.add(table_create_command_path) return True @@ -1755,59 +1766,62 @@ def _construct_import_call(self) -> str: Returns: str: a bash command for postgresql import """ - import_call = '' + import_call = "" # create tables # At this point, csv files of nodes and edges do not require differentiation for import_file_path in [ - *self.import_call_nodes, *self.import_call_edges + *self.import_call_nodes, + *self.import_call_edges, ]: import_call += f'echo "Setup {import_file_path}..."\n' if {self.db_password}: # set password variable inline - import_call += f'PGPASSWORD={self.db_password} ' - import_call += f'{self.import_call_bin_prefix}psql -f {import_file_path}' - import_call += f' --dbname {self.db_name}' - import_call += f' --port {self.db_port}' - import_call += f' --user {self.db_user}' + import_call += f"PGPASSWORD={self.db_password} " + import_call += ( + f"{self.import_call_bin_prefix}psql -f {import_file_path}" + ) + import_call += f" --dbname {self.db_name}" + import_call += f" --port {self.db_port}" + import_call += f" --user {self.db_user}" import_call += '\necho "Done!"\n' - import_call += '\n' + import_call += "\n" # copy data to tables for command in self._copy_from_csv_commands: - table_part = command.split(' ')[3] + table_part = command.split(" ")[3] import_call += f'echo "Importing {table_part}..."\n' if {self.db_password}: # set password variable inline - import_call += f'PGPASSWORD={self.db_password} ' + import_call += f"PGPASSWORD={self.db_password} " import_call += f'{self.import_call_bin_prefix}psql -c "{command}"' - import_call += f' --dbname {self.db_name}' - import_call += f' --port {self.db_port}' - import_call += f' --user {self.db_user}' + import_call += f" --dbname {self.db_name}" + import_call += f" --port {self.db_port}" + import_call += f" --user {self.db_user}" import_call += '\necho "Done!"\n' - import_call += '\n' + import_call += "\n" return import_call DBMS_TO_CLASS = { - 'neo': _Neo4jBatchWriter, - 'neo4j': _Neo4jBatchWriter, - 'Neo4j': _Neo4jBatchWriter, - 'postgres': _PostgreSQLBatchWriter, - 'postgresql': _PostgreSQLBatchWriter, - 'PostgreSQL': 
_PostgreSQLBatchWriter, - 'arango': _ArangoDBBatchWriter, - 'arangodb': _ArangoDBBatchWriter, - 'ArangoDB': _ArangoDBBatchWriter, + "neo": _Neo4jBatchWriter, + "neo4j": _Neo4jBatchWriter, + "Neo4j": _Neo4jBatchWriter, + "postgres": _PostgreSQLBatchWriter, + "postgresql": _PostgreSQLBatchWriter, + "PostgreSQL": _PostgreSQLBatchWriter, + "arango": _ArangoDBBatchWriter, + "arangodb": _ArangoDBBatchWriter, + "ArangoDB": _ArangoDBBatchWriter, } def get_writer( dbms: str, - translator: 'Translator', - ontology: 'Ontology', - deduplicator: 'Deduplicator', + translator: "Translator", + ontology: "Ontology", + deduplicator: "Deduplicator", output_directory: str, strict_mode: bool, ): @@ -1835,34 +1849,36 @@ def get_writer( dbms_config = _config(dbms) - timestamp = lambda: datetime.now().strftime('%Y%m%d%H%M%S') - outdir = output_directory or os.path.join('biocypher-out', timestamp()) + timestamp = lambda: datetime.now().strftime("%Y%m%d%H%M%S") + outdir = output_directory or os.path.join("biocypher-out", timestamp()) outdir = os.path.abspath(outdir) writer = DBMS_TO_CLASS[dbms] if not writer: - raise ValueError(f'Unknown dbms: {dbms}') + raise ValueError(f"Unknown dbms: {dbms}") if writer is not None: return writer( ontology=ontology, translator=translator, deduplicator=deduplicator, - delimiter=dbms_config.get('delimiter'), - array_delimiter=dbms_config.get('array_delimiter'), - quote=dbms_config.get('quote_character'), + delimiter=dbms_config.get("delimiter"), + array_delimiter=dbms_config.get("array_delimiter"), + quote=dbms_config.get("quote_character"), output_directory=outdir, - db_name=dbms_config.get('database_name'), - import_call_bin_prefix=dbms_config.get('import_call_bin_prefix'), - import_call_file_prefix=dbms_config.get('import_call_file_prefix'), - wipe=dbms_config.get('wipe'), + db_name=dbms_config.get("database_name"), + import_call_bin_prefix=dbms_config.get("import_call_bin_prefix"), + import_call_file_prefix=dbms_config.get("import_call_file_prefix"), + wipe=dbms_config.get("wipe"), strict_mode=strict_mode, - skip_bad_relationships=dbms_config.get('skip_bad_relationships' - ), # neo4j - skip_duplicate_nodes=dbms_config.get('skip_duplicate_nodes' - ), # neo4j - db_user=dbms_config.get('user'), # psql - db_password=dbms_config.get('password'), # psql - db_port=dbms_config.get('port'), # psql + skip_bad_relationships=dbms_config.get( + "skip_bad_relationships" + ), # neo4j + skip_duplicate_nodes=dbms_config.get( + "skip_duplicate_nodes" + ), # neo4j + db_user=dbms_config.get("user"), # psql + db_password=dbms_config.get("password"), # psql + db_port=dbms_config.get("port"), # psql ) diff --git a/docs/adapters.md b/docs/adapters.md index 4cf3c6b9..20feaee7 100644 --- a/docs/adapters.md +++ b/docs/adapters.md @@ -30,7 +30,7 @@ tutorial. :::: The project view is built from issues in the [BioCypher GitHub repository]( -https://github.com/biocypher/biocypher/issues), which carry ``Fields`` (a +https://github.com/biocypher/biocypher/issues), which carry ``Fields`` (a GitHub Projects-specific attribute) to describe their category and features. In detail, these are as follows: @@ -118,4 +118,4 @@ RETURN n ``` For more information on how to use the graph, please refer to the [Neo4j -documentation](https://neo4j.com/docs/). \ No newline at end of file +documentation](https://neo4j.com/docs/). 
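The `docs/adapters.md` fragment above ends with a Cypher snippet and a pointer to the Neo4j documentation; for readers who prefer to stay in Python, a minimal sketch of running such a query through the official `neo4j` driver is shown below. The URI, credentials, and the `Protein` label are illustrative assumptions, not values set by BioCypher.

```python
# Hypothetical usage sketch: querying a BioCypher-built Neo4j graph with
# the official neo4j Python driver. URI, credentials, and the Protein
# label are assumptions for illustration.
from neo4j import GraphDatabase

driver = GraphDatabase.driver(
    "bolt://localhost:7687", auth=("neo4j", "password")
)

with driver.session() as session:
    for record in session.run("MATCH (n:Protein) RETURN n.id AS id LIMIT 5"):
        print(record["id"])

driver.close()
```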
diff --git a/docs/conf.py b/docs/conf.py index 5c85a849..d51ea793 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -22,61 +22,60 @@ # -- Project information ----------------------------------------------------- -project = 'BioCypher' +project = "BioCypher" version = biocypher.__version__ -author = ', '.join(biocypher.__author__) -copyright = f'2021-{datetime.now():%Y}, BioCypher developers' +author = ", ".join(biocypher.__author__) +copyright = f"2021-{datetime.now():%Y}, BioCypher developers" # -- General configuration --------------------------------------------------- # TOC only in sidebar -master_doc = 'contents' +master_doc = "contents" html_sidebars = { - '**': - [ - 'globaltoc.html', - 'relations.html', - 'sourcelink.html', - 'searchbox.html', - ], + "**": [ + "globaltoc.html", + "relations.html", + "sourcelink.html", + "searchbox.html", + ], } # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.napoleon', - 'sphinx.ext.todo', # not for output but to remove warnings - 'sphinxext.opengraph', - 'myst_parser', # markdown support - 'sphinx_rtd_theme', - 'sphinx_design', + "sphinx.ext.autodoc", + "sphinx.ext.napoleon", + "sphinx.ext.todo", # not for output but to remove warnings + "sphinxext.opengraph", + "myst_parser", # markdown support + "sphinx_rtd_theme", + "sphinx_design", ] -myst_enable_extensions = ['colon_fence'] +myst_enable_extensions = ["colon_fence"] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', 'biocypher-log/'] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "biocypher-log/"] # -- Autodoc configuration --------------------------------------------------- -autodoc_mock_imports = ['bmt', 'neo4j-utils'] +autodoc_mock_imports = ["bmt", "neo4j-utils"] # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. -html_title = 'BioCypher' -html_theme = 'sphinx_rtd_theme' +html_title = "BioCypher" +html_theme = "sphinx_rtd_theme" html_theme_options = { - 'navigation_depth': 2, - 'collapse_navigation': True, + "navigation_depth": 2, + "collapse_navigation": True, } # Add any paths that contain custom static files (such as style sheets) here, @@ -86,8 +85,8 @@ # -- OpenGraph configuration ------------------------------------------------- -ogp_site_url = 'https://biocypher.org' -ogp_image = 'https://biocypher.org/_images/biocypher-open-graph.png' +ogp_site_url = "https://biocypher.org" +ogp_image = "https://biocypher.org/_images/biocypher-open-graph.png" ogp_custom_meta_tags = [ '', '', @@ -95,4 +94,4 @@ '', '', ] -ogp_enable_meta_description = True \ No newline at end of file +ogp_enable_meta_description = True diff --git a/docs/index.rst b/docs/index.rst index 4ee91a38..2480dd53 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -114,4 +114,4 @@ information. 
:link: https://github.com/biocypher/biochatter :text-align: center - :octicon:`mark-github;3em` :octicon:`repo;3em` \ No newline at end of file + :octicon:`mark-github;3em` :octicon:`repo;3em` diff --git a/docs/r-bioc.md b/docs/r-bioc.md index 74f4ecd3..383b4559 100644 --- a/docs/r-bioc.md +++ b/docs/r-bioc.md @@ -2,4 +2,4 @@ We are working on a Bioconductor package to make BioCypher functionality available to the R community. The current work in progress is available in [this repository](https://vjcitn.github.io/biocBiocypher/index.html). If you are -interested in contributing or using the package, please get in touch! \ No newline at end of file +interested in contributing or using the package, please get in touch! diff --git a/docs/tutorial-adapter.md b/docs/tutorial-adapter.md index ff8ba903..567b9f04 100644 --- a/docs/tutorial-adapter.md +++ b/docs/tutorial-adapter.md @@ -43,7 +43,7 @@ There are currently two 'flavours' of adapters. The first is simpler and used in workflows that are similar to harmonisation scripts, where the BioCypher interface is instantiated in the same script as the adapter(s). In the second, the BioCypher interface is contained in the adapter class, which makes for a -more complex architecture, but allows for more involved workflows. In +more complex architecture, but allows for more involved workflows. In pseudo-code, the two approaches look like this: ```{code-block} python @@ -109,7 +109,7 @@ Graph](https://github.com/IGVF-DACC/igvf-catalog/tree/main/data) and the [Clinical Knowledge Graph migration](https://github.com/biocypher/clinical-knowledge-graph). -```{note} +```{note} While there are differences in implementation details, both approaches are largely functionally equivalent. At the current time, there is no clear diff --git a/docs/tutorial.md b/docs/tutorial.md index 7effe22f..345e1cd8 100644 --- a/docs/tutorial.md +++ b/docs/tutorial.md @@ -567,4 +567,4 @@ protein protein interaction: represented_as: edge use_id: false # ... -``` \ No newline at end of file +``` diff --git a/docs/user-experiences.md b/docs/user-experiences.md index 5a76d14f..44d5e961 100644 --- a/docs/user-experiences.md +++ b/docs/user-experiences.md @@ -9,7 +9,7 @@ repositories ("storage") and (2) project-specific knowledge graph creation ## A Knowledge Graph for Impact of Genomic Variation on Function (IGVF) -:::{card} Impact of Genomic Variation on Function (IGVF) +:::{card} Impact of Genomic Variation on Function (IGVF) :link: https://www.igvf.org/ The Impact of Genomic Variation on Function (IGVF) project aims to provide a @@ -28,7 +28,7 @@ creating a user-facing API (and eventually UI) that will access this graph. BioCypher, which acts as an intermediary between Biolink and graph databases (we are using ArangoDB) has been instrumental in helping us design the schema and move our project forward. Specifically, it provides a framework we can use to -parse the dozens of data files and formats into a Biolink-inspired schema. +parse the dozens of data files and formats into a Biolink-inspired schema. — Ben Hitz, Director of Genomics Data Resources, Project Manager ENCODE, Stanford University @@ -37,10 +37,10 @@ Stanford University The BioCypher pipeline used to build the knowledge graph uses several adapters for genetics data sources; an overview is available in our -[meta-graph](metagraph) and on the [GitHub Components +[meta-graph](metagraph) and on the [GitHub Components Board](https://github.com/orgs/biocypher/projects/3) (pipelines column). 
The pipeline boasts a Docker Compose workflow that builds the graph and the API -(using [tRPC](https://trpc.io/)), and is available on +(using [tRPC](https://trpc.io/)), and is available on [GitHub](https://github.com/IGVF-DACC/igvf-catalog). ## Drug Repurposing with CROssBAR @@ -72,7 +72,7 @@ multiple genes/proteins, compounds/drugs, diseases, phenotypes, pathways, or any combination of those, this procedure gets extremely complicated, requiring an average of 64 NoSQL queries to construct one single user-specific KG. The total number of lines of code required for this procedure alone is around 8000. -This task could have been achieved significantly faster and more efficiently +This task could have been achieved significantly faster and more efficiently if we had had BioCypher five years ago. — Tunca Doğan, Department of Computer Engineering and Artificial Intelligence @@ -84,7 +84,7 @@ Institute (EMBL-EBI) Using BioCypher, CROssBAR v2 will be a flexible property graph database comprised of single input adapters for each data source. As above, you can see -its current state in the [meta-graph](metagraph) and on the [GitHub Components +its current state in the [meta-graph](metagraph) and on the [GitHub Components Board](https://github.com/orgs/biocypher/projects/3) (pipelines column). ## Building a Knowledge Graph for Contextualised Metabolic-Enzymatic Interactions @@ -124,4 +124,4 @@ The BioCypher pipeline used to build the knowledge graph uses several adapters, some of which overlap with the CROssBAR project, which helps synergising maintenance efforts. An overview is available in our [meta-graph](metagraph) and on the [GitHub Components -Board](https://github.com/orgs/biocypher/projects/3) (pipelines column). \ No newline at end of file +Board](https://github.com/orgs/biocypher/projects/3) (pipelines column). diff --git a/test/conftest.py b/test/conftest.py index 0a5abf48..c23a77fb 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -14,8 +14,8 @@ _ArangoDBBatchWriter, _PostgreSQLBatchWriter, ) -from biocypher._pandas import Pandas from biocypher._create import BioCypherEdge, BioCypherNode +from biocypher._pandas import Pandas from biocypher._connect import _Neo4jDriver from biocypher._mapping import OntologyMapping from biocypher._ontology import Ontology, OntologyAdapter @@ -25,29 +25,26 @@ # CLI option parser def pytest_addoption(parser): - options = ( # neo4j - ('database_name', 'The Neo4j database to be used for tests.'), - ('user', 'Tests access Neo4j as this user.'), - ('password', 'Password to access Neo4j.'), - ('uri', 'URI of the Neo4j server.'), - + ("database_name", "The Neo4j database to be used for tests."), + ("user", "Tests access Neo4j as this user."), + ("password", "Password to access Neo4j."), + ("uri", "URI of the Neo4j server."), # postgresql ( - 'database_name_postgresql', - 'The PostgreSQL database to be used for tests. Defaults to "postgresql-biocypher-test-TG2C7GsdNw".' + "database_name_postgresql", + 'The PostgreSQL database to be used for tests. 
Defaults to "postgresql-biocypher-test-TG2C7GsdNw".', ), - ('user_postgresql', 'Tests access PostgreSQL as this user.'), - ('password_postgresql', 'Password to access PostgreSQL.'), - ('port_postgresql', 'Port of the PostgreSQL server.'), + ("user_postgresql", "Tests access PostgreSQL as this user."), + ("password_postgresql", "Password to access PostgreSQL."), + ("port_postgresql", "Port of the PostgreSQL server."), ) for name, help_ in options: - parser.addoption( - f'--{name}', - action='store', + f"--{name}", + action="store", default=None, help=help_, ) @@ -56,33 +53,33 @@ def pytest_addoption(parser): # temporary output paths def get_random_string(length): letters = string.ascii_lowercase - return ''.join(random.choice(letters) for _ in range(length)) + return "".join(random.choice(letters) for _ in range(length)) # biocypher node generator -@pytest.fixture(scope='function') +@pytest.fixture(scope="function") def _get_nodes(l: int) -> list: nodes = [] for i in range(l): bnp = BioCypherNode( - node_id=f'p{i+1}', - node_label='protein', - preferred_id='uniprot', + node_id=f"p{i+1}", + node_label="protein", + preferred_id="uniprot", properties={ - 'score': 4 / (i + 1), - 'name': 'StringProperty1', - 'taxon': 9606, - 'genes': ['gene1', 'gene2'], + "score": 4 / (i + 1), + "name": "StringProperty1", + "taxon": 9606, + "genes": ["gene1", "gene2"], }, ) nodes.append(bnp) bnm = BioCypherNode( - node_id=f'm{i+1}', - node_label='microRNA', - preferred_id='mirbase', + node_id=f"m{i+1}", + node_label="microRNA", + preferred_id="mirbase", properties={ - 'name': 'StringProperty1', - 'taxon': 9606, + "name": "StringProperty1", + "taxon": 9606, }, ) nodes.append(bnm) @@ -91,31 +88,31 @@ def _get_nodes(l: int) -> list: # biocypher edge generator -@pytest.fixture(scope='function') +@pytest.fixture(scope="function") def _get_edges(l): edges = [] for i in range(l): e1 = BioCypherEdge( - relationship_id=f'prel{i}', - source_id=f'p{i}', - target_id=f'p{i + 1}', - relationship_label='PERTURBED_IN_DISEASE', + relationship_id=f"prel{i}", + source_id=f"p{i}", + target_id=f"p{i + 1}", + relationship_label="PERTURBED_IN_DISEASE", properties={ - 'residue': 'T253', - 'level': 4, + "residue": "T253", + "level": 4, }, # we suppose the verb-form relationship label is created by # translation functionality in translate.py ) edges.append(e1) e2 = BioCypherEdge( - relationship_id=f'mrel{i}', - source_id=f'm{i}', - target_id=f'p{i + 1}', - relationship_label='Is_Mutated_In', + relationship_id=f"mrel{i}", + source_id=f"m{i}", + target_id=f"p{i + 1}", + relationship_label="Is_Mutated_In", properties={ - 'site': '3-UTR', - 'confidence': 1, + "site": "3-UTR", + "confidence": 1, }, # we suppose the verb-form relationship label is created by # translation functionality in translate.py @@ -123,95 +120,95 @@ def _get_edges(l): edges.append(e2) return edges -@pytest.fixture(scope='function') + +@pytest.fixture(scope="function") def deduplicator(): return Deduplicator() -@pytest.fixture(scope='module') + +@pytest.fixture(scope="module") def ontology_mapping(): return OntologyMapping( - config_file='biocypher/_config/test_schema_config.yaml' + config_file="biocypher/_config/test_schema_config.yaml" ) -@pytest.fixture(scope='module') + +@pytest.fixture(scope="module") def extended_ontology_mapping(): return OntologyMapping( - config_file='biocypher/_config/test_schema_config_extended.yaml' + config_file="biocypher/_config/test_schema_config_extended.yaml" ) -@pytest.fixture(scope='module') + +@pytest.fixture(scope="module") def 
disconnected_mapping(): return OntologyMapping( - config_file='biocypher/_config/test_schema_config_disconnected.yaml' + config_file="biocypher/_config/test_schema_config_disconnected.yaml" ) -@pytest.fixture(scope='module') + +@pytest.fixture(scope="module") def translator(extended_ontology_mapping): return Translator(extended_ontology_mapping) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def biolink_adapter(): return OntologyAdapter( - 'https://github.com/biolink/biolink-model/raw/v3.2.1/biolink-model.owl.ttl', - 'entity' + "https://github.com/biolink/biolink-model/raw/v3.2.1/biolink-model.owl.ttl", + "entity", ) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def so_adapter(): - return OntologyAdapter('test/so.owl', 'sequence_variant') + return OntologyAdapter("test/so.owl", "sequence_variant") -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def go_adapter(): - return OntologyAdapter('test/go.owl', 'molecular_function') + return OntologyAdapter("test/go.owl", "molecular_function") -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def mondo_adapter(): - return OntologyAdapter('test/mondo.owl', 'disease') + return OntologyAdapter("test/mondo.owl", "disease") -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def hybrid_ontology(extended_ontology_mapping): return Ontology( head_ontology={ - 'url': - 'https://github.com/biolink/biolink-model/raw/v3.2.1/biolink-model.owl.ttl', - 'root_node': - 'entity', + "url": "https://github.com/biolink/biolink-model/raw/v3.2.1/biolink-model.owl.ttl", + "root_node": "entity", }, ontology_mapping=extended_ontology_mapping, tail_ontologies={ - 'so': - { - 'url': 'test/so.owl', - 'head_join_node': 'sequence variant', - 'tail_join_node': 'sequence_variant', - }, - 'mondo': - { - 'url': 'test/mondo.owl', - 'head_join_node': 'disease', - 'tail_join_node': 'human disease', - 'merge_nodes': False, - } + "so": { + "url": "test/so.owl", + "head_join_node": "sequence variant", + "tail_join_node": "sequence_variant", + }, + "mondo": { + "url": "test/mondo.owl", + "head_join_node": "disease", + "tail_join_node": "human disease", + "merge_nodes": False, + }, }, ) # neo4j batch writer fixtures -@pytest.fixture(scope='function') +@pytest.fixture(scope="function") def bw(hybrid_ontology, translator, deduplicator, tmp_path): - bw = _Neo4jBatchWriter( ontology=hybrid_ontology, translator=translator, deduplicator=deduplicator, output_directory=tmp_path, - delimiter=';', - array_delimiter='|', + delimiter=";", + array_delimiter="|", quote="'", ) @@ -224,16 +221,15 @@ def bw(hybrid_ontology, translator, deduplicator, tmp_path): # neo4j batch writer fixtures -@pytest.fixture(scope='function') +@pytest.fixture(scope="function") def bw_tab(hybrid_ontology, translator, deduplicator, tmp_path): - bw_tab = _Neo4jBatchWriter( ontology=hybrid_ontology, translator=translator, deduplicator=deduplicator, output_directory=tmp_path, - delimiter='\\t', - array_delimiter='|', + delimiter="\\t", + array_delimiter="|", quote="'", ) @@ -245,16 +241,15 @@ def bw_tab(hybrid_ontology, translator, deduplicator, tmp_path): os.rmdir(tmp_path) -@pytest.fixture(scope='function') +@pytest.fixture(scope="function") def bw_strict(hybrid_ontology, translator, deduplicator, tmp_path): - bw = _Neo4jBatchWriter( ontology=hybrid_ontology, translator=translator, deduplicator=deduplicator, output_directory=tmp_path, - delimiter=';', - array_delimiter='|', + delimiter=";", + array_delimiter="|", quote="'", strict_mode=True, ) 
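The `bw` and `bw_tab` fixtures above differ only in their field delimiter, and `bw_strict` additionally sets `strict_mode=True`. As a reference for what these separator settings produce, the row below is copied from the assertion in `test/test_integration.py` later in this patch, not newly generated output:

```python
# Node row written with delimiter=";", array_delimiter="|" and quote="'",
# as asserted in test/test_integration.py below:
row = "p1;'StringProperty1';4.0;9606;'gene1|gene2';'p1';'uniprot'"
# Fields: node id; name; score; taxon; genes (array joined with "|");
# 'id' property; preferred id. String values are wrapped in the quote char.
```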
@@ -268,36 +263,31 @@ def bw_strict(hybrid_ontology, translator, deduplicator, tmp_path): # core instance fixture -@pytest.fixture(name='core', scope='function') +@pytest.fixture(name="core", scope="function") def create_core(request, tmp_path): - # TODO why does the integration test use a different path than this fixture? - marker = request.node.get_closest_marker('inject_core_args') + marker = request.node.get_closest_marker("inject_core_args") marker_args = {} # check if marker has attribute param - if marker and hasattr(marker, 'param'): - + if marker and hasattr(marker, "param"): marker_args = marker.param - if not marker_args and 'CORE' in globals(): - - c = globals()['CORE'] + if not marker_args and "CORE" in globals(): + c = globals()["CORE"] else: - core_args = { - 'schema_config_path': 'biocypher/_config/test_schema_config.yaml', - 'output_directory': tmp_path, + "schema_config_path": "biocypher/_config/test_schema_config.yaml", + "output_directory": tmp_path, } core_args.update(marker_args) c = BioCypher(**core_args) if not marker_args: - - globals()['CORE'] = c + globals()["CORE"] = c c._deduplicator = Deduplicator() # seems to reuse deduplicator from previous test, unsure why @@ -309,7 +299,8 @@ def create_core(request, tmp_path): os.remove(os.path.join(tmp_path, f)) os.rmdir(tmp_path) -@pytest.fixture(scope='function') + +@pytest.fixture(scope="function") def _pd(deduplicator): return Pandas( ontology=None, @@ -317,22 +308,21 @@ def _pd(deduplicator): deduplicator=deduplicator, ) + # neo4j parameters -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def neo4j_param(request): - keys = ( - 'database_name', - 'user', - 'password', - 'uri', + "database_name", + "user", + "password", + "uri", ) - param = bcy_config('neo4j') + param = bcy_config("neo4j") cli = { - key: request.config.getoption(f'--{key}') or param[key] - for key in keys + key: request.config.getoption(f"--{key}") or param[key] for key in keys } return cli @@ -341,118 +331,104 @@ def neo4j_param(request): # skip test if neo4j is offline @pytest.fixture(autouse=True) def skip_if_offline_neo4j(request, neo4j_param, translator, hybrid_ontology): - - marker = request.node.get_closest_marker('requires_neo4j') + marker = request.node.get_closest_marker("requires_neo4j") if marker: - try: - marker_args = {} # check if marker has attribute param - if marker and hasattr(marker, 'param'): - + if marker and hasattr(marker, "param"): marker_args = marker.param driver_args = { - 'wipe': True, - 'multi_db': True, - 'translator': translator, - 'ontology': hybrid_ontology, + "wipe": True, + "multi_db": True, + "translator": translator, + "ontology": hybrid_ontology, } driver_args.update(marker_args) driver_args.update(neo4j_param) - driver_args['database_name'] = 'test' + driver_args["database_name"] = "test" _Neo4jDriver(**driver_args) except ServiceUnavailable as e: - - pytest.skip(f'Neo4j is offline: {e}') + pytest.skip(f"Neo4j is offline: {e}") # neo4j driver fixture -@pytest.fixture(name='driver', scope='function') +@pytest.fixture(name="driver", scope="function") def create_driver(request, neo4j_param, translator, hybrid_ontology): - marker = None # request.node.get_closest_marker('inject_driver_args') marker_args = {} # check if marker has attribute param - if marker and hasattr(marker, 'param'): - + if marker and hasattr(marker, "param"): marker_args = marker.param - if not marker_args and 'DRIVER' in globals(): - - d = globals()['DRIVER'] + if not marker_args and "DRIVER" in globals(): + d = 
globals()["DRIVER"] else: - driver_args = { - 'wipe': True, - 'multi_db': True, - 'translator': translator, - 'ontology': hybrid_ontology, + "wipe": True, + "multi_db": True, + "translator": translator, + "ontology": hybrid_ontology, } driver_args.update(marker_args) driver_args.update(neo4j_param) - driver_args['database_name'] = 'test' + driver_args["database_name"] = "test" d = _Neo4jDriver(**driver_args) if not marker_args: - - globals()['DRIVER'] = d + globals()["DRIVER"] = d yield d # teardown - d._driver.query('MATCH (n:Test)' - 'DETACH DELETE n') - d._driver.query('MATCH (n:Int1)' - 'DETACH DELETE n') - d._driver.query('MATCH (n:Int2)' - 'DETACH DELETE n') + d._driver.query("MATCH (n:Test)" "DETACH DELETE n") + d._driver.query("MATCH (n:Int1)" "DETACH DELETE n") + d._driver.query("MATCH (n:Int2)" "DETACH DELETE n") # to deal with merging on non-existing nodes # see test_add_single_biocypher_edge_missing_nodes() - d._driver.query("MATCH (n2) WHERE n2.id = 'src'" - 'DETACH DELETE n2') - d._driver.query("MATCH (n3) WHERE n3.id = 'tar'" - 'DETACH DELETE n3') + d._driver.query("MATCH (n2) WHERE n2.id = 'src'" "DETACH DELETE n2") + d._driver.query("MATCH (n3) WHERE n3.id = 'tar'" "DETACH DELETE n3") d._driver.close() ### postgresql ### -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def postgresql_param(request): - keys = ( - 'user_postgresql', - 'password_postgresql', - 'port_postgresql', + "user_postgresql", + "password_postgresql", + "port_postgresql", ) # get fallback parameters from biocypher config - param = bcy_config('postgresql') + param = bcy_config("postgresql") cli = {} for key in keys: # remove '_postgresql' suffix key_short = key[:-11] # change into format of input parameters - cli[f'db_{key_short}'] = request.config.getoption(f'--{key}' - ) or param[key_short] + cli[f"db_{key_short}"] = ( + request.config.getoption(f"--{key}") or param[key_short] + ) # hardcoded string for test-db name. test-db will be created for testing and droped after testing. 
# Do not take db_name from config to avoid accidental testing on the production database - cli['db_name'] = request.config.getoption( - '--database_name_postgresql' - ) or 'postgresql-biocypher-test-TG2C7GsdNw' + cli["db_name"] = ( + request.config.getoption("--database_name_postgresql") + or "postgresql-biocypher-test-TG2C7GsdNw" + ) return cli @@ -460,36 +436,38 @@ def postgresql_param(request): # skip test if postgresql is offline @pytest.fixture(autouse=True) def skip_if_offline_postgresql(request, postgresql_param): - - marker = request.node.get_closest_marker('requires_postgresql') + marker = request.node.get_closest_marker("requires_postgresql") if marker: - params = postgresql_param - user, port, password = params['db_user'], params['db_port'], params[ - 'db_password'] + user, port, password = ( + params["db_user"], + params["db_port"], + params["db_password"], + ) # an empty command, just to test if connection is possible - command = f'PGPASSWORD={password} psql -c \'\' --port {port} --user {user}' + command = ( + f"PGPASSWORD={password} psql -c '' --port {port} --user {user}" + ) process = subprocess.run(command, shell=True) # returncode is 0 when success if process.returncode != 0: - pytest.skip('Requires psql and connection to Postgresql server.') + pytest.skip("Requires psql and connection to Postgresql server.") -@pytest.fixture(scope='function') +@pytest.fixture(scope="function") def bw_comma_postgresql( postgresql_param, hybrid_ontology, translator, deduplicator, tmp_path ): - bw_comma = _PostgreSQLBatchWriter( ontology=hybrid_ontology, translator=translator, deduplicator=deduplicator, output_directory=tmp_path, - delimiter=',', - **postgresql_param + delimiter=",", + **postgresql_param, ) yield bw_comma @@ -500,16 +478,17 @@ def bw_comma_postgresql( os.rmdir(tmp_path) -@pytest.fixture(scope='function') -def bw_tab_postgresql(postgresql_param, hybrid_ontology, translator, deduplicator, tmp_path): - +@pytest.fixture(scope="function") +def bw_tab_postgresql( + postgresql_param, hybrid_ontology, translator, deduplicator, tmp_path +): bw_tab = _PostgreSQLBatchWriter( ontology=hybrid_ontology, translator=translator, deduplicator=deduplicator, output_directory=tmp_path, - delimiter='\\t', - **postgresql_param + delimiter="\\t", + **postgresql_param, ) yield bw_tab @@ -520,32 +499,35 @@ def bw_tab_postgresql(postgresql_param, hybrid_ontology, translator, deduplicato os.rmdir(tmp_path) -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def create_database_postgres(postgresql_param): params = postgresql_param - dbname, user, port, password = params['db_name'], params['db_user'], params[ - 'db_port'], params['db_password'] + dbname, user, port, password = ( + params["db_name"], + params["db_user"], + params["db_port"], + params["db_password"], + ) # create the database - command = f'PGPASSWORD={password} psql -c \'CREATE DATABASE "{dbname}";\' --port {port} --user {user}' + command = f"PGPASSWORD={password} psql -c 'CREATE DATABASE \"{dbname}\";' --port {port} --user {user}" process = subprocess.run(command, shell=True) yield dbname, user, port, password, process.returncode == 0 # 0 if success # teardown - command = f'PGPASSWORD={password} psql -c \'DROP DATABASE "{dbname}";\' --port {port} --user {user}' + command = f"PGPASSWORD={password} psql -c 'DROP DATABASE \"{dbname}\";' --port {port} --user {user}" process = subprocess.run(command, shell=True) -@pytest.fixture(scope='function') +@pytest.fixture(scope="function") def bw_arango(hybrid_ontology, translator, 
deduplicator, tmp_path): - bw_arango = _ArangoDBBatchWriter( ontology=hybrid_ontology, translator=translator, deduplicator=deduplicator, output_directory=tmp_path, - delimiter=',', + delimiter=",", ) yield bw_arango diff --git a/test/profile_performance.py b/test/profile_performance.py index bee843cd..a4921484 100644 --- a/test/profile_performance.py +++ b/test/profile_performance.py @@ -11,15 +11,15 @@ from biocypher._connect import _Neo4jDriver __all__ = [ - 'create_network_by_gen', - 'create_network_by_list', - 'create_networks', - 'delete_test_network', - 'explain_neo4j', - 'profile_neo4j', - 'remove_constraint', - 'setup_constraint', - 'visualise_benchmark', + "create_network_by_gen", + "create_network_by_list", + "create_networks", + "delete_test_network", + "explain_neo4j", + "profile_neo4j", + "remove_constraint", + "setup_constraint", + "visualise_benchmark", ] @@ -28,14 +28,14 @@ def create_network_by_gen(num_nodes, num_edges, profile=False, explain=False): def node_gen(num_nodes): for i in range(num_nodes): - yield BioCypherNode(i, 'test') + yield BioCypherNode(i, "test") def edge_gen(num_edges): for _ in range(num_edges): src = random.randint(1, num_nodes) tar = random.randint(1, num_nodes) - yield BioCypherEdge(src, tar, 'test') + yield BioCypherEdge(src, tar, "test") node_profile, np_printout = d.add_biocypher_nodes( node_gen(num_nodes), @@ -82,7 +82,7 @@ def create_network_by_list(num_nodes, num_edges): def node_list(num_nodes): ls = [] for i in range(num_nodes): - ls.append(BioCypherNode(i, 'test')) + ls.append(BioCypherNode(i, "test")) return ls @@ -91,7 +91,7 @@ def edge_list(num_edges): for _ in range(num_edges): src = random.randint(1, num_nodes) tar = random.randint(1, num_nodes) - ls.append(BioCypherEdge(src, tar, 'test')) + ls.append(BioCypherEdge(src, tar, "test")) return ls @@ -104,23 +104,23 @@ def edge_list(num_edges): def setup_constraint(): d = _Neo4jDriver(increment_version=False) d.query( - 'CREATE CONSTRAINT test_id ' - 'IF NOT EXISTS ON (n:test) ' - 'ASSERT n.id IS UNIQUE ', + "CREATE CONSTRAINT test_id " + "IF NOT EXISTS ON (n:test) " + "ASSERT n.id IS UNIQUE ", ) d.close() def remove_constraint(): d = _Neo4jDriver(increment_version=False) - d.query('DROP CONSTRAINT test_id') + d.query("DROP CONSTRAINT test_id") d.close() def delete_test_network(): d = _Neo4jDriver(increment_version=False) - d.query('MATCH (n)-[:test]-() DETACH DELETE n') - d.query('MATCH (n:test) DETACH DELETE n') + d.query("MATCH (n)-[:test]-() DETACH DELETE n") + d.query("MATCH (n:test) DETACH DELETE n") d.close() @@ -140,9 +140,9 @@ def create_networks(): ) delete_test_network() - res.update({'lis%s' % n: lis, 'lism%s' % n: lism}) + res.update({"lis%s" % n: lis, "lism%s" % n: lism}) - with open('benchmark.pickle', 'wb') as f: + with open("benchmark.pickle", "wb") as f: pickle.dump(res, f) print(res) @@ -153,57 +153,55 @@ def visualise_benchmark(): import matplotlib.pyplot as plt - with open('benchmark.pickle', 'rb') as f: + with open("benchmark.pickle", "rb") as f: res = pickle.load(f) - x = [key for key in res.keys() if 'lism' in key] - x = [int(e.replace('lism', '')) for e in x] - lis = [value for key, value in res.items() if 'lism' not in key] - lism = [value for key, value in res.items() if 'lism' in key] + x = [key for key in res.keys() if "lism" in key] + x = [int(e.replace("lism", "")) for e in x] + lis = [value for key, value in res.items() if "lism" not in key] + lism = [value for key, value in res.items() if "lism" in key] - plt.plot(x, lis, marker='o', label='List') - 
plt.plot(x, lism, marker='o', label='List (modified)') - plt.xlabel('Network size (nodes)') - plt.ylabel('Time (s)') + plt.plot(x, lis, marker="o", label="List") + plt.plot(x, lism, marker="o", label="List (modified)") + plt.xlabel("Network size (nodes)") + plt.ylabel("Time (s)") plt.legend() plt.show() def profile_neo4j(num_nodes, num_edges): - np, ep, epm = create_network_by_gen(num_nodes, num_edges, profile=True) - print('') - print(f'{bcolors.HEADER}### NODE PROFILE ###{bcolors.ENDC}') + print("") + print(f"{bcolors.HEADER}### NODE PROFILE ###{bcolors.ENDC}") for p in np[1]: print(p) - print('') - print(f'{bcolors.HEADER}### EDGE PROFILE ###{bcolors.ENDC}') + print("") + print(f"{bcolors.HEADER}### EDGE PROFILE ###{bcolors.ENDC}") for p in ep[1]: print(p) - print('') - print(f'{bcolors.HEADER}### MODIFIED EDGE PROFILE ###{bcolors.ENDC}') + print("") + print(f"{bcolors.HEADER}### MODIFIED EDGE PROFILE ###{bcolors.ENDC}") for p in epm[1]: print(p) def explain_neo4j(num_nodes, num_edges): - np, ep, epm = create_network_by_gen(num_nodes, num_edges, explain=True) - print('') - print(f'{bcolors.HEADER}### NODE PROFILE ###{bcolors.ENDC}') + print("") + print(f"{bcolors.HEADER}### NODE PROFILE ###{bcolors.ENDC}") for p in np[1]: print(p) - print('') - print(f'{bcolors.HEADER}### EDGE PROFILE ###{bcolors.ENDC}') + print("") + print(f"{bcolors.HEADER}### EDGE PROFILE ###{bcolors.ENDC}") for p in ep[1]: print(p) - print('') - print(f'{bcolors.HEADER}### MODIFIED EDGE PROFILE ###{bcolors.ENDC}') + print("") + print(f"{bcolors.HEADER}### MODIFIED EDGE PROFILE ###{bcolors.ENDC}") for p in epm[1]: print(p) -if __name__ == '__main__': +if __name__ == "__main__": # profile python performance with cProfile python_prof = False # run network creation (needed for python profiling) @@ -233,7 +231,7 @@ def explain_neo4j(num_nodes, num_edges): ps = pstats.Stats(profile, stream=s).sort_stats(sortby) ps.print_stats() # print(s.getvalue()) - filename = 'create_network.prof' + filename = "create_network.prof" ps.dump_stats(filename) if viz: diff --git a/test/rdflib_playground.py b/test/rdflib_playground.py index 6dca50c0..44ae877b 100644 --- a/test/rdflib_playground.py +++ b/test/rdflib_playground.py @@ -3,10 +3,9 @@ def ontology_to_tree(ontology_path, root_label, switch_id_and_label=True): - # Load the ontology into an rdflib Graph g = rdflib.Graph() - g.parse(ontology_path, format='ttl') + g.parse(ontology_path, format="ttl") # Loop through all labels in the ontology for s, _, o in g.triples((None, rdflib.RDFS.label, None)): @@ -15,14 +14,13 @@ def ontology_to_tree(ontology_path, root_label, switch_id_and_label=True): root = s break else: - raise ValueError(f'Could not find root node with label {root_label}') + raise ValueError(f"Could not find root node with label {root_label}") # Create a directed graph to represent the ontology as a tree G = nx.DiGraph() # Define a recursive function to add subclasses to the graph def add_subclasses(node): - # Only add nodes that have a label if (node, rdflib.RDFS.label, None) not in g: return @@ -31,25 +29,23 @@ def add_subclasses(node): if nx_id not in G: G.add_node(nx_id) - G.nodes[nx_id]['label'] = nx_label + G.nodes[nx_id]["label"] = nx_label # Recursively add all subclasses of the node to the graph for s, _, o in g.triples((None, rdflib.RDFS.subClassOf, node)): - # Only add nodes that have a label if (s, rdflib.RDFS.label, None) not in g: continue s_id, s_label = _get_nx_id_and_label(s) G.add_node(s_id) - G.nodes[s_id]['label'] = s_label + G.nodes[s_id]["label"] = 
s_label G.add_edge(s_id, nx_id) add_subclasses(s) add_parents(s) def add_parents(node): - # Only add nodes that have a label if (node, rdflib.RDFS.label, None) not in g: return @@ -58,7 +54,6 @@ def add_parents(node): # Recursively add all parents of the node to the graph for s, _, o in g.triples((node, rdflib.RDFS.subClassOf, None)): - # Only add nodes that have a label if (o, rdflib.RDFS.label, None) not in g: continue @@ -70,7 +65,7 @@ def add_parents(node): continue G.add_node(o_id) - G.nodes[o_id]['label'] = o_label + G.nodes[o_id]["label"] = o_label G.add_edge(nx_id, o_id) add_parents(o) @@ -95,15 +90,15 @@ def remove_prefix(uri: str) -> str: separator between the prefix and the local name. The prefix is everything before the last separator. """ - return uri.rsplit('#', 1)[-1].rsplit('/', 1)[-1] + return uri.rsplit("#", 1)[-1].rsplit("/", 1)[-1] -if __name__ == '__main__': - path = 'test/so.owl' - url = 'https://raw.githubusercontent.com/biolink/biolink-model/v3.2.1/biolink-model.owl.ttl' - root_label = 'entity' +if __name__ == "__main__": + path = "test/so.owl" + url = "https://raw.githubusercontent.com/biolink/biolink-model/v3.2.1/biolink-model.owl.ttl" + root_label = "entity" G = ontology_to_tree(url, root_label, switch_id_and_label=True) # depth first search: ancestors of the "protein" node - ancestors = nx.dfs_preorder_nodes(G, 'macromolecular complex') + ancestors = nx.dfs_preorder_nodes(G, "macromolecular complex") print(list(ancestors)) diff --git a/test/test_config.py b/test/test_config.py index e26ae474..e2611f43 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -4,12 +4,11 @@ def test_read_yaml(): - schema_config = _read_yaml('biocypher/_config/test_schema_config.yaml') + schema_config = _read_yaml("biocypher/_config/test_schema_config.yaml") - assert 'protein' in schema_config + assert "protein" in schema_config def test_for_special_characters(): - with pytest.warns(UserWarning): - _read_yaml('biocypher/_config/test_config.yaml') + _read_yaml("biocypher/_config/test_config.yaml") diff --git a/test/test_core.py b/test/test_core.py index ac227a96..0ba455a0 100644 --- a/test/test_core.py +++ b/test/test_core.py @@ -1,7 +1,8 @@ import pytest + def test_biocypher(core): - assert core._dbms == 'neo4j' + assert core._dbms == "neo4j" assert core._offline == True assert core._strict_mode == False @@ -11,12 +12,13 @@ def test_log_missing_types(core, translator): core._translator.notype = {} assert core.log_missing_input_labels() == None - core._translator.notype = {'a': 1, 'b': 2} + core._translator.notype = {"a": 1, "b": 2} mt = core.log_missing_input_labels() - assert mt.get('a') == 1 and mt.get('b') == 2 + assert mt.get("a") == 1 and mt.get("b") == 2 + -@pytest.mark.parametrize('l', [4], scope='module') +@pytest.mark.parametrize("l", [4], scope="module") def test_log_duplicates(core, deduplicator, _get_nodes): core._deduplicator = deduplicator nodes = _get_nodes + _get_nodes @@ -26,6 +28,7 @@ def test_log_duplicates(core, deduplicator, _get_nodes): assert True + # def test_access_translate(driver): # driver.start_ontology() diff --git a/test/test_create.py b/test/test_create.py index ce425c88..abedc0f8 100644 --- a/test/test_create.py +++ b/test/test_create.py @@ -12,7 +12,7 @@ def test_node(node): assert isinstance(node.get_properties(), dict) assert isinstance(node.get_dict(), dict) - assert 'id' in node.get_properties().keys() + assert "id" in node.get_properties().keys() @given(st.builds(BioCypherEdge)) @@ -34,4 +34,4 @@ def test_rel_as_node(rel_as_node): def 
test_rel_as_node_invalid_node(): with pytest.raises(TypeError): - BioCypherRelAsNode('str', 1, 2.5122) + BioCypherRelAsNode("str", 1, 2.5122) diff --git a/test/test_deduplicate.py b/test/test_deduplicate.py index e82c0e99..1efa122e 100644 --- a/test/test_deduplicate.py +++ b/test/test_deduplicate.py @@ -1,45 +1,47 @@ import pytest -from biocypher._create import BioCypherNode, BioCypherEdge + +from biocypher._create import BioCypherEdge, BioCypherNode from biocypher._deduplicate import Deduplicator -@pytest.mark.parametrize('l', [4], scope='module') + +@pytest.mark.parametrize("l", [4], scope="module") def test_duplicate_nodes(_get_nodes): dedup = Deduplicator() nodes = _get_nodes nodes.append( BioCypherNode( - node_id='p1', - node_label='protein', + node_id="p1", + node_label="protein", properties={ - 'name': 'StringProperty1', - 'score': 4.32, - 'taxon': 9606, - 'genes': ['gene1', 'gene2'] - } + "name": "StringProperty1", + "score": 4.32, + "taxon": 9606, + "genes": ["gene1", "gene2"], + }, ) ) for node in nodes: dedup.node_seen(node) - assert 'protein' in dedup.duplicate_node_types - assert 'p1' in dedup.duplicate_node_ids + assert "protein" in dedup.duplicate_node_types + assert "p1" in dedup.duplicate_node_ids -@pytest.mark.parametrize('l', [4], scope='module') +@pytest.mark.parametrize("l", [4], scope="module") def test_get_duplicate_nodes(_get_nodes): dedup = Deduplicator() nodes = _get_nodes nodes.append( BioCypherNode( - node_id='p1', - node_label='protein', + node_id="p1", + node_label="protein", properties={ - 'name': 'StringProperty1', - 'score': 4.32, - 'taxon': 9606, - 'genes': ['gene1', 'gene2'] - } + "name": "StringProperty1", + "score": 4.32, + "taxon": 9606, + "genes": ["gene1", "gene2"], + }, ) ) @@ -50,24 +52,25 @@ def test_get_duplicate_nodes(_get_nodes): types = d[0] ids = d[1] - assert 'protein' in types - assert 'p1' in ids + assert "protein" in types + assert "p1" in ids -@pytest.mark.parametrize('l', [4], scope='module') + +@pytest.mark.parametrize("l", [4], scope="module") def test_duplicate_edges(_get_edges): dedup = Deduplicator() edges = _get_edges edges.append( BioCypherEdge( - relationship_id='mrel2', - source_id='m2', - target_id='p3', - relationship_label='Is_Mutated_In', + relationship_id="mrel2", + source_id="m2", + target_id="p3", + relationship_label="Is_Mutated_In", properties={ - 'score': 4.32, - 'taxon': 9606, - 'genes': ['gene1', 'gene2'] - } + "score": 4.32, + "taxon": 9606, + "genes": ["gene1", "gene2"], + }, ) ) # this will fail if we go beyond concatenation of ids @@ -75,24 +78,25 @@ def test_duplicate_edges(_get_edges): for edge in edges: dedup.edge_seen(edge) - assert 'Is_Mutated_In' in dedup.duplicate_edge_types - assert ('mrel2') in dedup.duplicate_edge_ids + assert "Is_Mutated_In" in dedup.duplicate_edge_types + assert ("mrel2") in dedup.duplicate_edge_ids + -@pytest.mark.parametrize('l', [4], scope='module') +@pytest.mark.parametrize("l", [4], scope="module") def test_get_duplicate_edges(_get_edges): dedup = Deduplicator() edges = _get_edges edges.append( BioCypherEdge( - relationship_id='mrel2', - source_id='m2', - target_id='p3', - relationship_label='Is_Mutated_In', + relationship_id="mrel2", + source_id="m2", + target_id="p3", + relationship_label="Is_Mutated_In", properties={ - 'score': 4.32, - 'taxon': 9606, - 'genes': ['gene1', 'gene2'] - } + "score": 4.32, + "taxon": 9606, + "genes": ["gene1", "gene2"], + }, ) ) # this will fail if we go beyond concatenation of ids @@ -104,5 +108,5 @@ def test_get_duplicate_edges(_get_edges): types 
= d[0] ids = d[1] - assert 'Is_Mutated_In' in types - assert ('mrel2') in ids \ No newline at end of file + assert "Is_Mutated_In" in types + assert ("mrel2") in ids diff --git a/test/test_driver.py b/test/test_driver.py index 82a0ee1f..2273ed3d 100644 --- a/test/test_driver.py +++ b/test/test_driver.py @@ -7,13 +7,11 @@ @pytest.mark.requires_neo4j def test_create_driver(driver): - assert isinstance(driver, _Neo4jDriver) @pytest.mark.requires_neo4j def test_connect_to_db(driver): - assert isinstance(driver._driver.driver, neo4j.Neo4jDriver) @@ -24,28 +22,29 @@ def test_increment_version(driver): driver._driver.query(query) driver._update_meta_graph() - r, summary = driver._driver.query('MATCH (n:BioCypher) ' - 'RETURN n', ) + r, summary = driver._driver.query( + "MATCH (n:BioCypher) " "RETURN n", + ) assert len(r) == 2 @pytest.mark.requires_neo4j def test_explain(driver): - query = 'MATCH (n) WITH n LIMIT 25 MATCH (n)--(m)--(f) RETURN n, m, f' + query = "MATCH (n) WITH n LIMIT 25 MATCH (n)--(m)--(f) RETURN n, m, f" e = driver._driver.explain(query) t = e[0] - assert 'args' in t and 'identifiers' in t + assert "args" in t and "identifiers" in t @pytest.mark.requires_neo4j def test_profile(driver): - query = 'MATCH (n) RETURN n LIMIT 100' + query = "MATCH (n) RETURN n LIMIT 100" p = driver._driver.profile(query) t = p[0] - assert 'args' in t and 'identifiers' in t + assert "args" in t and "identifiers" in t @pytest.mark.requires_neo4j @@ -56,34 +55,30 @@ def test_add_invalid_biocypher_node(driver): driver.add_biocypher_nodes(1) with pytest.raises(ValueError): - driver.add_biocypher_nodes('String') + driver.add_biocypher_nodes("String") @pytest.mark.requires_neo4j def test_add_single_biocypher_node(driver): # neo4j database needs to be running! - n = BioCypherNode(node_id='test_id1', node_label='Test') + n = BioCypherNode(node_id="test_id1", node_label="Test") driver.add_biocypher_nodes(n) r, summary = driver._driver.query( - 'MATCH (n:Test) ' - 'WITH n, n.id AS id ' - 'RETURN id ', + "MATCH (n:Test) " "WITH n, n.id AS id " "RETURN id ", ) - assert r[0]['id'] == 'test_id1' + assert r[0]["id"] == "test_id1" @pytest.mark.requires_neo4j def test_add_biocypher_node_list(driver): # neo4j database needs to be running! 
- n1 = BioCypherNode(node_id='test_id1', node_label='Test') - n2 = BioCypherNode(node_id='test_id2', node_label='Test') + n1 = BioCypherNode(node_id="test_id1", node_label="Test") + n2 = BioCypherNode(node_id="test_id2", node_label="Test") driver.add_biocypher_nodes([n1, n2]) r, summary = driver._driver.query( - 'MATCH (n:Test) ' - 'WITH n, n.id AS id ' - 'RETURN id ', + "MATCH (n:Test) " "WITH n, n.id AS id " "RETURN id ", ) - assert set([r[0]['id'], r[1]['id']]) == set(['test_id1', 'test_id2']) + assert set([r[0]["id"], r[1]["id"]]) == set(["test_id1", "test_id2"]) @pytest.mark.requires_neo4j @@ -94,42 +89,42 @@ def gen(nodes): for g in nodes: yield BioCypherNode(g[0], g[1]) - g = gen([('test_id1', 'Test'), ('test_id2', 'Test')]) + g = gen([("test_id1", "Test"), ("test_id2", "Test")]) driver.add_biocypher_nodes(g) r, summary = driver._driver.query( - 'MATCH (n:Test) ' - 'WITH n, n.id AS id ' - 'RETURN id ', + "MATCH (n:Test) " "WITH n, n.id AS id " "RETURN id ", ) - ids = [n['id'] for n in r] + ids = [n["id"] for n in r] - assert 'test_id1' in ids - assert 'test_id2' in ids + assert "test_id1" in ids + assert "test_id2" in ids @pytest.mark.requires_neo4j def test_add_specific_id_node(driver): - n = BioCypherNode(node_id='CHAT', node_label='Gene', preferred_id='hgnc') + n = BioCypherNode(node_id="CHAT", node_label="Gene", preferred_id="hgnc") driver.add_biocypher_nodes(n) - r, summary = driver._driver.query('MATCH (n:Gene) ' - 'RETURN n', ) + r, summary = driver._driver.query( + "MATCH (n:Gene) " "RETURN n", + ) - assert r[0]['n'].get('id') == 'CHAT' - assert r[0]['n'].get('preferred_id') == 'hgnc' + assert r[0]["n"].get("id") == "CHAT" + assert r[0]["n"].get("preferred_id") == "hgnc" @pytest.mark.requires_neo4j def test_add_generic_id_node(driver): - n = BioCypherNode(node_id='CHAT', node_label='Gene', preferred_id='HGNC') + n = BioCypherNode(node_id="CHAT", node_label="Gene", preferred_id="HGNC") driver.add_biocypher_nodes(n) - r, summary = driver._driver.query('MATCH (n:Gene) ' - 'RETURN n', ) + r, summary = driver._driver.query( + "MATCH (n:Gene) " "RETURN n", + ) - assert r[0]['n'].get('id') is not None + assert r[0]["n"].get("id") is not None @pytest.mark.requires_neo4j @@ -142,20 +137,21 @@ def test_add_invalid_biocypher_edge(driver): @pytest.mark.requires_neo4j def test_add_single_biocypher_edge_explicit_node_creation(driver): # neo4j database needs to be running! - n1 = BioCypherNode('src', 'Test') - n2 = BioCypherNode('tar', 'Test') + n1 = BioCypherNode("src", "Test") + n2 = BioCypherNode("tar", "Test") driver.add_biocypher_nodes([n1, n2]) - e = BioCypherEdge('src', 'tar', 'Test') + e = BioCypherEdge("src", "tar", "Test") driver.add_biocypher_edges(e) r, summary = driver._driver.query( - 'MATCH (n1)-[r:Test]->(n2) ' - 'WITH n1, n2, n1.id AS id1, n2.id AS id2, type(r) AS label ' - 'RETURN id1, id2, label', + "MATCH (n1)-[r:Test]->(n2) " + "WITH n1, n2, n1.id AS id1, n2.id AS id2, type(r) AS label " + "RETURN id1, id2, label", ) assert ( - r[0]['id1'] == 'src' and r[0]['id2'] == 'tar' and - r[0]['label'] == 'Test' + r[0]["id1"] == "src" + and r[0]["id2"] == "tar" + and r[0]["label"] == "Test" ) @@ -165,50 +161,53 @@ def test_add_single_biocypher_edge_missing_nodes(driver): # merging on non-existing nodes creates them without labels; what is # the desired behaviour here? do we only want to MATCH? 
- e = BioCypherEdge('src', 'tar', 'Test') + e = BioCypherEdge("src", "tar", "Test") driver.add_biocypher_edges(e) r, summary = driver._driver.query( - 'MATCH (n1)-[r:Test]->(n2) ' - 'WITH n1, n2, n1.id AS id1, n2.id AS id2, type(r) AS label ' - 'RETURN id1, id2, label', + "MATCH (n1)-[r:Test]->(n2) " + "WITH n1, n2, n1.id AS id1, n2.id AS id2, type(r) AS label " + "RETURN id1, id2, label", ) assert ( - r[0]['id1'] == 'src' and r[0]['id2'] == 'tar' and - r[0]['label'] == 'Test' + r[0]["id1"] == "src" + and r[0]["id2"] == "tar" + and r[0]["label"] == "Test" ) @pytest.mark.requires_neo4j def test_add_biocypher_edge_list(driver): # neo4j database needs to be running! - n1 = BioCypherNode('src', 'Test') - n2 = BioCypherNode('tar1', 'Test') - n3 = BioCypherNode('tar2', 'Test') + n1 = BioCypherNode("src", "Test") + n2 = BioCypherNode("tar1", "Test") + n3 = BioCypherNode("tar2", "Test") driver.add_biocypher_nodes([n1, n2, n3]) # edge list - e1 = BioCypherEdge('src', 'tar1', 'Test1') - e2 = BioCypherEdge('src', 'tar2', 'Test2') + e1 = BioCypherEdge("src", "tar1", "Test1") + e2 = BioCypherEdge("src", "tar2", "Test2") driver.add_biocypher_edges([e1, e2]) r, summary = driver._driver.query( - 'MATCH (n3)<-[r2:Test2]-(n1)-[r1:Test1]->(n2) ' - 'WITH n1, n2, n3, n1.id AS id1, n2.id AS id2, n3.id AS id3, ' - 'type(r1) AS label1, type(r2) AS label2 ' - 'RETURN id1, id2, id3, label1, label2', + "MATCH (n3)<-[r2:Test2]-(n1)-[r1:Test1]->(n2) " + "WITH n1, n2, n3, n1.id AS id1, n2.id AS id2, n3.id AS id3, " + "type(r1) AS label1, type(r2) AS label2 " + "RETURN id1, id2, id3, label1, label2", ) assert ( - r[0]['id1'] == 'src' and r[0]['id2'] == 'tar1' and - r[0]['id3'] == 'tar2' and r[0]['label1'] == 'Test1' and - r[0]['label2'] == 'Test2' + r[0]["id1"] == "src" + and r[0]["id2"] == "tar1" + and r[0]["id3"] == "tar2" + and r[0]["label1"] == "Test1" + and r[0]["label2"] == "Test2" ) @pytest.mark.requires_neo4j def test_add_biocypher_edge_generator(driver): # neo4j database needs to be running! - n1 = BioCypherNode('src', 'Test') - n2 = BioCypherNode('tar1', 'Test') - n3 = BioCypherNode('tar2', 'Test') + n1 = BioCypherNode("src", "Test") + n2 = BioCypherNode("tar1", "Test") + n3 = BioCypherNode("tar2", "Test") driver.add_biocypher_nodes([n1, n2, n3]) # generator @@ -221,64 +220,69 @@ def gen(edges): ) # edge list - e1 = BioCypherEdge('src', 'tar1', 'Test1') - e2 = BioCypherEdge('src', 'tar2', 'Test2') + e1 = BioCypherEdge("src", "tar1", "Test1") + e2 = BioCypherEdge("src", "tar2", "Test2") g = gen([e1, e2]) driver.add_biocypher_edges(g) r, summary = driver._driver.query( - 'MATCH (n3)<-[r2:Test2]-(n1)-[r1:Test1]->(n2) ' - 'WITH n1, n2, n3, n1.id AS id1, n2.id AS id2, n3.id AS id3, ' - 'type(r1) AS label1, type(r2) AS label2 ' - 'RETURN id1, id2, id3, label1, label2', + "MATCH (n3)<-[r2:Test2]-(n1)-[r1:Test1]->(n2) " + "WITH n1, n2, n3, n1.id AS id1, n2.id AS id2, n3.id AS id3, " + "type(r1) AS label1, type(r2) AS label2 " + "RETURN id1, id2, id3, label1, label2", ) assert ( - r[0]['id1'] == 'src' and r[0]['id2'] == 'tar1' and - r[0]['id3'] == 'tar2' and r[0]['label1'] == 'Test1' and - r[0]['label2'] == 'Test2' + r[0]["id1"] == "src" + and r[0]["id2"] == "tar1" + and r[0]["id3"] == "tar2" + and r[0]["label1"] == "Test1" + and r[0]["label2"] == "Test2" ) @pytest.mark.requires_neo4j def test_add_biocypher_interaction_as_BioCypherRelAsNode_list(driver): # neo4j database needs to be running! 
- i1 = BioCypherNode('int1', 'Int1') - i2 = BioCypherNode('int2', 'Int2') + i1 = BioCypherNode("int1", "Int1") + i2 = BioCypherNode("int2", "Int2") driver.add_biocypher_nodes([i1, i2]) - e1 = BioCypherEdge('src', 'int1', 'is_source_of') - e2 = BioCypherEdge('tar', 'int1', 'is_target_of') - e3 = BioCypherEdge('src', 'int2', 'is_source_of') - e4 = BioCypherEdge('tar', 'int2', 'is_target_of') + e1 = BioCypherEdge("src", "int1", "is_source_of") + e2 = BioCypherEdge("tar", "int1", "is_target_of") + e3 = BioCypherEdge("src", "int2", "is_source_of") + e4 = BioCypherEdge("tar", "int2", "is_target_of") r1, r2 = BioCypherRelAsNode(i1, e1, e2), BioCypherRelAsNode(i2, e3, e4) driver.add_biocypher_edges([r1, r2]) r, summary = driver._driver.query( - 'MATCH (n2)-[e4:is_target_of]->(i2:Int2)<-[e3:is_source_of]-' - '(n1)-[e1:is_source_of]->(i1:Int1)<-[e2:is_target_of]-(n2)' - 'WITH n1, n2, i1, i2, n1.id AS id1, n2.id AS id2, ' - 'i1.id AS id3, i2.id AS id4, ' - 'type(e1) AS label1, type(e2) AS label2, ' - 'type(e3) AS label3, type(e4) AS label4 ' - 'RETURN id1, id2, id3, id4, label1, label2, label3, label4', + "MATCH (n2)-[e4:is_target_of]->(i2:Int2)<-[e3:is_source_of]-" + "(n1)-[e1:is_source_of]->(i1:Int1)<-[e2:is_target_of]-(n2)" + "WITH n1, n2, i1, i2, n1.id AS id1, n2.id AS id2, " + "i1.id AS id3, i2.id AS id4, " + "type(e1) AS label1, type(e2) AS label2, " + "type(e3) AS label3, type(e4) AS label4 " + "RETURN id1, id2, id3, id4, label1, label2, label3, label4", ) assert ( - r[0]['id1'] == 'src' and r[0]['id2'] == 'tar' and - r[0]['id3'] == 'int1' and r[0]['id4'] == 'int2' and - r[0]['label1'] == 'is_source_of' and - r[0]['label2'] == 'is_target_of' and - r[0]['label3'] == 'is_source_of' and r[0]['label4'] == 'is_target_of' + r[0]["id1"] == "src" + and r[0]["id2"] == "tar" + and r[0]["id3"] == "int1" + and r[0]["id4"] == "int2" + and r[0]["label1"] == "is_source_of" + and r[0]["label2"] == "is_target_of" + and r[0]["label3"] == "is_source_of" + and r[0]["label4"] == "is_target_of" ) @pytest.mark.requires_neo4j def test_add_biocypher_interaction_as_BioCypherRelAsNode_generator(driver): # neo4j database needs to be running! 
- i1 = BioCypherNode('int1', 'Int1') - i2 = BioCypherNode('int2', 'Int2') + i1 = BioCypherNode("int1", "Int1") + i2 = BioCypherNode("int2", "Int2") driver.add_biocypher_nodes([i1, i2]) - e1 = BioCypherEdge('src', 'int1', 'is_source_of') - e2 = BioCypherEdge('tar', 'int1', 'is_target_of') - e3 = BioCypherEdge('src', 'int2', 'is_source_of') - e4 = BioCypherEdge('tar', 'int2', 'is_target_of') + e1 = BioCypherEdge("src", "int1", "is_source_of") + e2 = BioCypherEdge("tar", "int1", "is_target_of") + e3 = BioCypherEdge("src", "int2", "is_source_of") + e4 = BioCypherEdge("tar", "int2", "is_target_of") r1, r2 = BioCypherRelAsNode(i1, e1, e2), BioCypherRelAsNode(i2, e3, e4) relasnode_list = [r1, r2] @@ -288,40 +292,43 @@ def gen(lis): driver.add_biocypher_edges(gen(relasnode_list)) r, summary = driver._driver.query( - 'MATCH (n2)-[e4:is_target_of]->(i2:Int2)<-[e3:is_source_of]-' - '(n1)-[e1:is_source_of]->(i1:Int1)<-[e2:is_target_of]-(n2)' - 'WITH n1, n2, i1, i2, n1.id AS id1, n2.id AS id2, ' - 'i1.id AS id3, i2.id AS id4, ' - 'type(e1) AS label1, type(e2) AS label2, ' - 'type(e3) AS label3, type(e4) AS label4 ' - 'RETURN id1, id2, id3, id4, label1, label2, label3, label4', + "MATCH (n2)-[e4:is_target_of]->(i2:Int2)<-[e3:is_source_of]-" + "(n1)-[e1:is_source_of]->(i1:Int1)<-[e2:is_target_of]-(n2)" + "WITH n1, n2, i1, i2, n1.id AS id1, n2.id AS id2, " + "i1.id AS id3, i2.id AS id4, " + "type(e1) AS label1, type(e2) AS label2, " + "type(e3) AS label3, type(e4) AS label4 " + "RETURN id1, id2, id3, id4, label1, label2, label3, label4", ) assert ( - r[0]['id1'] == 'src' and r[0]['id2'] == 'tar' and - r[0]['id3'] == 'int1' and r[0]['id4'] == 'int2' and - r[0]['label1'] == 'is_source_of' and - r[0]['label2'] == 'is_target_of' and - r[0]['label3'] == 'is_source_of' and r[0]['label4'] == 'is_target_of' + r[0]["id1"] == "src" + and r[0]["id2"] == "tar" + and r[0]["id3"] == "int1" + and r[0]["id4"] == "int2" + and r[0]["label1"] == "is_source_of" + and r[0]["label2"] == "is_target_of" + and r[0]["label3"] == "is_source_of" + and r[0]["label4"] == "is_target_of" ) @pytest.mark.requires_neo4j def test_pretty_profile(driver): prof, printout = driver._driver.profile( - 'UNWIND [1,2,3,4,5] as id ' - 'MERGE (n:Test {id: id}) ' - 'MERGE (x:Test {id: id + 1})', + "UNWIND [1,2,3,4,5] as id " + "MERGE (n:Test {id: id}) " + "MERGE (x:Test {id: id + 1})", ) - assert 'args' in prof and 'ProduceResults' in printout[1] + assert "args" in prof and "ProduceResults" in printout[1] @pytest.mark.requires_neo4j def test_pretty_explain(driver): plan, printout = driver._driver.explain( - 'UNWIND [1,2,3,4,5] as id ' - 'MERGE (n:Test {id: id}) ' - 'MERGE (x:Test {id: id + 1})', + "UNWIND [1,2,3,4,5] as id " + "MERGE (n:Test {id: id}) " + "MERGE (x:Test {id: id + 1})", ) - assert 'args' in plan and 'ProduceResults' in printout[0] + assert "args" in plan and "ProduceResults" in printout[0] diff --git a/test/test_integration.py b/test/test_integration.py index 23d6e701..ea038118 100644 --- a/test/test_integration.py +++ b/test/test_integration.py @@ -3,7 +3,7 @@ import pytest -@pytest.mark.parametrize('l', [4], scope='function') +@pytest.mark.parametrize("l", [4], scope="function") def test_write_node_data_from_gen(core, _get_nodes): nodes = _get_nodes @@ -15,8 +15,8 @@ def node_gen(nodes): path = core._output_directory - p_csv = os.path.join(path, 'Protein-part000.csv') - m_csv = os.path.join(path, 'MicroRNA-part000.csv') + p_csv = os.path.join(path, "Protein-part000.csv") + m_csv = os.path.join(path, "MicroRNA-part000.csv") with 
open(p_csv) as f: pr = f.read() @@ -26,9 +26,9 @@ def node_gen(nodes): assert passed assert "p1;'StringProperty1';4.0;9606;'gene1|gene2';'p1';'uniprot'" in pr - assert 'BiologicalEntity' in pr + assert "BiologicalEntity" in pr assert "m1;'StringProperty1';9606;'m1';'mirbase'" in mi - assert 'ChemicalEntity' in mi + assert "ChemicalEntity" in mi def test_show_ontology_structure_kwargs(core): diff --git a/test/test_mapping.py b/test/test_mapping.py index 1169f74f..801ca017 100644 --- a/test/test_mapping.py +++ b/test/test_mapping.py @@ -2,27 +2,26 @@ def test_inheritance_loop(ontology_mapping): + assert "gene to variant association" in ontology_mapping.schema.keys() - assert 'gene to variant association' in ontology_mapping.schema.keys() - - assert 'gene to variant association' not in ontology_mapping.extended_schema.keys( + assert ( + "gene to variant association" + not in ontology_mapping.extended_schema.keys() ) def test_virtual_leaves_node(ontology_mapping): - - assert 'wikipathways.pathway' in ontology_mapping.extended_schema + assert "wikipathways.pathway" in ontology_mapping.extended_schema def test_getting_properties_via_config(ontology_mapping): - - assert 'name' in ontology_mapping.extended_schema['protein'].get( - 'properties' - ).keys() + assert ( + "name" + in ontology_mapping.extended_schema["protein"].get("properties").keys() + ) def test_preferred_id_optional(ontology_mapping): + pti = ontology_mapping.extended_schema.get("post translational interaction") - pti = ontology_mapping.extended_schema.get('post translational interaction') - - assert pti.get('preferred_id') == 'id' + assert pti.get("preferred_id") == "id" diff --git a/test/test_misc.py b/test/test_misc.py index ff6d7bc8..85e63e9d 100644 --- a/test/test_misc.py +++ b/test/test_misc.py @@ -4,52 +4,49 @@ from biocypher._misc import create_tree_visualisation inheritance_tree = { - 'B': 'A', - 'C': 'A', - 'D': 'B', - 'E': 'B', - 'F': 'C', - 'G': 'C', - 'H': 'E', - 'I': 'G', + "B": "A", + "C": "A", + "D": "B", + "E": "B", + "F": "C", + "G": "C", + "H": "E", + "I": "G", } disjoint_tree = { - 'B': 'A', - 'C': 'A', - 'D': 'B', - 'F': 'E', - 'G': 'E', - 'H': 'F', + "B": "A", + "C": "A", + "D": "B", + "F": "E", + "G": "E", + "H": "F", } def test_tree_vis(): - tree_vis = create_tree_visualisation(inheritance_tree) assert tree_vis.DEPTH == 1 assert tree_vis.WIDTH == 2 - assert tree_vis.root == 'A' + assert tree_vis.root == "A" def test_tree_vis_from_networkx(): - G = nx.DiGraph(inheritance_tree) tree_vis = create_tree_visualisation(G) assert tree_vis.DEPTH == 1 assert tree_vis.WIDTH == 2 - assert tree_vis.root == 'A' + assert tree_vis.root == "A" def test_disjoint_tree(): - with pytest.raises(ValueError): create_tree_visualisation(disjoint_tree) -if __name__ == '__main__': +if __name__ == "__main__": # to look at it print(create_tree_visualisation(nx.DiGraph(inheritance_tree)).show()) diff --git a/test/test_ontology.py b/test/test_ontology.py index f5f3027d..a40d1d02 100644 --- a/test/test_ontology.py +++ b/test/test_ontology.py @@ -1,47 +1,48 @@ import os -import networkx as nx import pytest +import networkx as nx from biocypher._ontology import Ontology def test_biolink_adapter(biolink_adapter): - assert biolink_adapter.get_root_label() == 'entity' + assert biolink_adapter.get_root_label() == "entity" assert biolink_adapter.get_nx_graph().number_of_nodes() > 100 - assert 'biological entity' in biolink_adapter.get_ancestors('gene') - assert 'macromolecular machine mixin' in biolink_adapter.get_ancestors( - 'macromolecular 
complex' + assert "biological entity" in biolink_adapter.get_ancestors("gene") + assert "macromolecular machine mixin" in biolink_adapter.get_ancestors( + "macromolecular complex" ) def test_so_adapter(so_adapter): - assert so_adapter.get_root_label() == 'sequence_variant' + assert so_adapter.get_root_label() == "sequence_variant" # here without underscores - assert 'sequence variant' in so_adapter.get_ancestors('lethal variant') + assert "sequence variant" in so_adapter.get_ancestors("lethal variant") def test_go_adapter(go_adapter): - assert go_adapter.get_root_label() == 'molecular_function' + assert go_adapter.get_root_label() == "molecular_function" - assert 'molecular function' in go_adapter.get_ancestors( - 'rna helicase activity' + assert "molecular function" in go_adapter.get_ancestors( + "rna helicase activity" ) def test_mondo_adapter(mondo_adapter): - assert mondo_adapter.get_root_label() == 'disease' + assert mondo_adapter.get_root_label() == "disease" - assert 'human disease' in mondo_adapter.get_ancestors('cystic fibrosis') + assert "human disease" in mondo_adapter.get_ancestors("cystic fibrosis") def test_ontology_functions(hybrid_ontology): assert isinstance(hybrid_ontology, Ontology) - first_tail_ontology = hybrid_ontology._tail_ontologies.get('so' - ).get_nx_graph() + first_tail_ontology = hybrid_ontology._tail_ontologies.get( + "so" + ).get_nx_graph() assert len(first_tail_ontology) == 6 assert nx.is_directed_acyclic_graph(first_tail_ontology) @@ -61,51 +62,51 @@ def test_ontology_functions(hybrid_ontology): assert hybrid_length - num_ext == combined_length - num_tail dgpl_ancestors = list( - hybrid_ontology.get_ancestors('decreased gene product level') + hybrid_ontology.get_ancestors("decreased gene product level") ) - assert 'decreased gene product level' in dgpl_ancestors - assert 'altered gene product level' in dgpl_ancestors - assert 'functional effect variant' in dgpl_ancestors - assert 'sequence variant' in dgpl_ancestors - assert 'biological entity' in dgpl_ancestors - assert 'named thing' in dgpl_ancestors - assert 'entity' in dgpl_ancestors - assert 'thing with taxon' in dgpl_ancestors - - lethal_var = hybrid_ontology._nx_graph.nodes['lethal variant'] - assert lethal_var['label'] == 'SO_0001773' + assert "decreased gene product level" in dgpl_ancestors + assert "altered gene product level" in dgpl_ancestors + assert "functional effect variant" in dgpl_ancestors + assert "sequence variant" in dgpl_ancestors + assert "biological entity" in dgpl_ancestors + assert "named thing" in dgpl_ancestors + assert "entity" in dgpl_ancestors + assert "thing with taxon" in dgpl_ancestors + + lethal_var = hybrid_ontology._nx_graph.nodes["lethal variant"] + assert lethal_var["label"] == "SO_0001773" # second tail ontology: here we don't merge the nodes, but attach 'human # disease' as a child of 'disease' - cf_ancestors = list(hybrid_ontology.get_ancestors('cystic fibrosis')) - assert 'cystic fibrosis' in cf_ancestors - assert 'autosomal recessive disease' in cf_ancestors - assert 'autosomal genetic disease' in cf_ancestors - assert 'hereditary disease' in cf_ancestors - assert 'human disease' in cf_ancestors - assert 'disease' in cf_ancestors - assert 'disease or phenotypic feature' in cf_ancestors - assert 'biological entity' in cf_ancestors - assert 'entity' in cf_ancestors + cf_ancestors = list(hybrid_ontology.get_ancestors("cystic fibrosis")) + assert "cystic fibrosis" in cf_ancestors + assert "autosomal recessive disease" in cf_ancestors + assert "autosomal genetic 
disease" in cf_ancestors + assert "hereditary disease" in cf_ancestors + assert "human disease" in cf_ancestors + assert "disease" in cf_ancestors + assert "disease or phenotypic feature" in cf_ancestors + assert "biological entity" in cf_ancestors + assert "entity" in cf_ancestors # mixins? # user extensions - dsdna_ancestors = list(hybrid_ontology.get_ancestors('dsDNA sequence')) - assert 'chemical entity' in dsdna_ancestors - assert 'association' in hybrid_ontology.get_ancestors( - 'mutation to tissue association' + dsdna_ancestors = list(hybrid_ontology.get_ancestors("dsDNA sequence")) + assert "chemical entity" in dsdna_ancestors + assert "association" in hybrid_ontology.get_ancestors( + "mutation to tissue association" ) # properties - protein = hybrid_ontology._nx_graph.nodes['protein'] - assert protein['label'] == 'Protein' - assert 'taxon' in protein['properties'].keys() + protein = hybrid_ontology._nx_graph.nodes["protein"] + assert protein["label"] == "Protein" + assert "taxon" in protein["properties"].keys() # synonyms - assert 'complex' in hybrid_ontology._nx_graph.nodes - assert 'macromolecular complex' not in hybrid_ontology._nx_graph.nodes + assert "complex" in hybrid_ontology._nx_graph.nodes + assert "macromolecular complex" not in hybrid_ontology._nx_graph.nodes def test_show_ontology(hybrid_ontology): @@ -123,20 +124,18 @@ def test_show_full_ontology(hybrid_ontology): def test_write_ontology(hybrid_ontology, tmp_path): passed = hybrid_ontology.show_ontology_structure(to_disk=tmp_path) - f = os.path.join(tmp_path, 'ontology_structure.graphml') + f = os.path.join(tmp_path, "ontology_structure.graphml") assert passed assert os.path.isfile(f) def test_disconnected_exception(disconnected_mapping): - with pytest.raises(ValueError): Ontology( head_ontology={ - 'url': 'test/so.owl', - 'root_node': 'sequence_variant', + "url": "test/so.owl", + "root_node": "sequence_variant", }, ontology_mapping=disconnected_mapping, ) - diff --git a/test/test_pandas.py b/test/test_pandas.py index 8d7329d0..9cd405d4 100644 --- a/test/test_pandas.py +++ b/test/test_pandas.py @@ -1,9 +1,11 @@ import pytest + def test_pandas(_pd): assert _pd.dfs == {} -@pytest.mark.parametrize('l', [4], scope='module') + +@pytest.mark.parametrize("l", [4], scope="module") def test_nodes(_pd, _get_nodes): _pd.add_tables(_get_nodes) assert "protein" in _pd.dfs.keys() @@ -14,7 +16,7 @@ def test_nodes(_pd, _get_nodes): assert "m2" in _pd.dfs["microRNA"]["node_id"].values -@pytest.mark.parametrize('l', [4], scope='module') +@pytest.mark.parametrize("l", [4], scope="module") def test_nodes_gen(_pd, _get_nodes): def node_gen(): for node in _get_nodes: @@ -23,19 +25,22 @@ def node_gen(): _pd.add_tables(node_gen()) assert "protein" in _pd.dfs.keys() -@pytest.mark.parametrize('l', [4], scope='module') + +@pytest.mark.parametrize("l", [4], scope="module") def test_duplicates(_pd, _get_nodes): nodes = _get_nodes + _get_nodes _pd.add_tables(nodes) assert len(_pd.dfs["protein"].node_id) == 4 -@pytest.mark.parametrize('l', [8], scope='module') + +@pytest.mark.parametrize("l", [8], scope="module") def test_two_step_add(_pd, _get_nodes): _pd.add_tables(_get_nodes[:4]) _pd.add_tables(_get_nodes[4:]) assert len(_pd.dfs["protein"].node_id) == 8 -@pytest.mark.parametrize('l', [4], scope='module') + +@pytest.mark.parametrize("l", [4], scope="module") def test_edges(_pd, _get_edges): _pd.add_tables(_get_edges) assert "PERTURBED_IN_DISEASE" in _pd.dfs.keys() @@ -46,11 +51,11 @@ def test_edges(_pd, _get_edges): assert "p1" in 
_pd.dfs["Is_Mutated_In"]["target_id"].values -@pytest.mark.parametrize('l', [4], scope='module') +@pytest.mark.parametrize("l", [4], scope="module") def test_edges_gen(_pd, _get_edges): def edge_gen(): for edge in _get_edges: yield edge _pd.add_tables(edge_gen()) - assert "PERTURBED_IN_DISEASE" in _pd.dfs.keys() \ No newline at end of file + assert "PERTURBED_IN_DISEASE" in _pd.dfs.keys() diff --git a/test/test_translate.py b/test/test_translate.py index 558700a9..bf113158 100644 --- a/test/test_translate.py +++ b/test/test_translate.py @@ -6,68 +6,68 @@ def test_translate_nodes(translator): id_type = [ ( - 'G9205', - 'protein', + "G9205", + "protein", { - 'taxon': 9606, + "taxon": 9606, }, ), ( - 'hsa-miR-132-3p', - 'mirna', + "hsa-miR-132-3p", + "mirna", { - 'taxon': 9606, + "taxon": 9606, }, ), ( - 'ASDB_OSBS', - 'complex', + "ASDB_OSBS", + "complex", { - 'taxon': 9606, + "taxon": 9606, }, ), - ('REACT:25520', 'reactome', {}), - ('agpl:001524', 'agpl', {}), + ("REACT:25520", "reactome", {}), + ("agpl:001524", "agpl", {}), ] t = translator.translate_nodes(id_type) assert all(type(n) == BioCypherNode for n in t) t = translator.translate_nodes(id_type) - assert next(t).get_label() == 'protein' - assert next(t).get_label() == 'microRNA' - assert next(t).get_label() == 'complex' - assert next(t).get_label() == 'reactome.pathway' - assert next(t).get_label() == 'altered gene product level' + assert next(t).get_label() == "protein" + assert next(t).get_label() == "microRNA" + assert next(t).get_label() == "complex" + assert next(t).get_label() == "reactome.pathway" + assert next(t).get_label() == "altered gene product level" def test_specific_and_generic_ids(translator): id_type = [ ( - 'CHAT', - 'hgnc', + "CHAT", + "hgnc", { - 'taxon': 9606, + "taxon": 9606, }, ), - ('REACT:25520', 'reactome', {}), + ("REACT:25520", "reactome", {}), ] t = list(translator.translate_nodes(id_type)) - assert t[0].get_id() == 'CHAT' - assert t[0].get_properties().get('preferred_id') == 'hgnc' - assert t[0].get_properties().get('id') == 'CHAT' - assert t[1].get_id() == 'REACT:25520' - assert t[1].get_properties().get('preferred_id') == 'reactome' - assert t[1].get_properties().get('id') == 'REACT:25520' + assert t[0].get_id() == "CHAT" + assert t[0].get_properties().get("preferred_id") == "hgnc" + assert t[0].get_properties().get("id") == "CHAT" + assert t[1].get_id() == "REACT:25520" + assert t[1].get_properties().get("preferred_id") == "reactome" + assert t[1].get_properties().get("id") == "REACT:25520" def test_translate_edges(translator): # edge type association (defined in `schema_config.yaml`) src_tar_type_edge = [ - ('G15258', 'MONDO1', 'gene_disease', {}), - ('G15258', 'MONDO2', 'protein_disease', {}), - ('G15258', 'G15242', 'phosphorylation', {}), + ("G15258", "MONDO1", "gene_disease", {}), + ("G15258", "MONDO2", "protein_disease", {}), + ("G15258", "G15242", "phosphorylation", {}), ] def gen_edges(): @@ -76,34 +76,34 @@ def gen_edges(): t = translator.translate_edges(gen_edges()) assert type(next(t)) == BioCypherEdge - assert next(t).get_label() == 'PERTURBED_IN_DISEASE' - assert next(t).get_label() == 'phosphorylation' + assert next(t).get_label() == "PERTURBED_IN_DISEASE" + assert next(t).get_label() == "phosphorylation" # node type association (defined in `schema_config.yaml`) src_tar_type_node = [ ( - 'G21058', - 'G50127', - 'post_translational', + "G21058", + "G50127", + "post_translational", { - 'prop1': 'test', + "prop1": "test", }, ), ( - 'G22418', - 'G50123', - 'post_translational', + "G22418", 
+ "G50123", + "post_translational", { - 'directed': 'arbitrary_string', + "directed": "arbitrary_string", }, ), ( - 'G15258', - 'G16347', - 'post_translational', + "G15258", + "G16347", + "post_translational", { - 'directed': True, - 'effect': -1, + "directed": True, + "effect": -1, }, ), ] @@ -114,16 +114,16 @@ def gen_edges(): n2 = t[1] n3 = t[2] - assert n1.get_source_edge().get_label() == 'IS_PART_OF' - assert n2.get_source_edge().get_label() == 'IS_PART_OF' - assert n3.get_target_edge().get_label() == 'IS_TARGET_OF' + assert n1.get_source_edge().get_label() == "IS_PART_OF" + assert n2.get_source_edge().get_label() == "IS_PART_OF" + assert n3.get_target_edge().get_label() == "IS_TARGET_OF" assert ( - type(n1.get_node()) == BioCypherNode and - type(n1.get_source_edge()) == BioCypherEdge and - type(n1.get_target_edge()) == BioCypherEdge + type(n1.get_node()) == BioCypherNode + and type(n1.get_source_edge()) == BioCypherEdge + and type(n1.get_target_edge()) == BioCypherEdge ) - assert n3.get_node().get_id() == 'G15258_G16347_True_-1' - assert n3.get_source_edge().get_source_id() == 'G15258' + assert n3.get_node().get_id() == "G15258_G16347_True_-1" + assert n3.get_source_edge().get_source_id() == "G15258" # def test_biolink_adapter(version_node, translator): @@ -177,17 +177,17 @@ def test_merge_multiple_inputs_node(ontology_mapping, translator): # define nodes id_type = [ ( - 'CHAT', - 'hgnc', + "CHAT", + "hgnc", { - 'taxon': 9606, + "taxon": 9606, }, ), ( - 'CHRNA4', - 'ensg', + "CHRNA4", + "ensg", { - 'taxon': 9606, + "taxon": 9606, }, ), ] @@ -197,26 +197,27 @@ def test_merge_multiple_inputs_node(ontology_mapping, translator): # check unique node type assert not any( - [s for s in ontology_mapping.extended_schema.keys() if '.gene' in s] + [s for s in ontology_mapping.extended_schema.keys() if ".gene" in s] ) assert any( - [s for s in ontology_mapping.extended_schema.keys() if '.pathway' in s] + [s for s in ontology_mapping.extended_schema.keys() if ".pathway" in s] ) # check translator.translate_nodes for unique return type assert all([type(n) == BioCypherNode for n in t]) - assert all([n.get_label() == 'gene' for n in t]) + assert all([n.get_label() == "gene" for n in t]) + def test_implicit_inheritance_node(translator): id_type = [ ( - 'snrna1', - 'intact_snrna', + "snrna1", + "intact_snrna", {}, ), ( - 'snrna2', - 'rnacentral_snrna', + "snrna2", + "rnacentral_snrna", {}, ), ] @@ -224,8 +225,8 @@ def test_implicit_inheritance_node(translator): t = list(translator.translate_nodes(id_type)) assert all([type(n) == BioCypherNode for n in t]) - assert t[0].get_label() == 'intact.snRNA sequence' - assert t[1].get_label() == 'rnacentral.snRNA sequence' + assert t[0].get_label() == "intact.snRNA sequence" + assert t[1].get_label() == "rnacentral.snRNA sequence" def test_merge_multiple_inputs_edge(ontology_mapping, translator): @@ -237,19 +238,19 @@ def test_merge_multiple_inputs_edge(ontology_mapping, translator): # define nodes src_tar_type = [ ( - 'CHAT', - 'AD', - 'gene_disease', + "CHAT", + "AD", + "gene_disease", { - 'taxon': 9606, + "taxon": 9606, }, ), ( - 'CHRNA4', - 'AD', - 'protein_disease', + "CHRNA4", + "AD", + "protein_disease", { - 'taxon': 9606, + "taxon": 9606, }, ), ] @@ -258,100 +259,106 @@ def test_merge_multiple_inputs_edge(ontology_mapping, translator): # check unique edge type assert not any( [ - s for s in ontology_mapping.extended_schema.keys() - if '.gene to disease association' in s + s + for s in ontology_mapping.extended_schema.keys() + if ".gene to disease 
association" in s ], ) assert any( [ - s for s in ontology_mapping.extended_schema.keys() - if '.sequence variant' in s + s + for s in ontology_mapping.extended_schema.keys() + if ".sequence variant" in s ], ) # check translator.translate_nodes for unique return type assert all([type(e) == BioCypherEdge for e in t]) - assert all([e.get_label() == 'PERTURBED_IN_DISEASE' for e in t]) + assert all([e.get_label() == "PERTURBED_IN_DISEASE" for e in t]) + def test_implicit_inheritance_edge(translator): src_tar_type = [ ( - 'mut1', - 'var1', - 'gene1', - 'VARIANT_FOUND_IN_GENE_Known_variant_Gene', + "mut1", + "var1", + "gene1", + "VARIANT_FOUND_IN_GENE_Known_variant_Gene", {}, ), ( - 'mut2', - 'var2', - 'gene2', - 'VARIANT_FOUND_IN_GENE_Somatic_mutation_Gene', + "mut2", + "var2", + "gene2", + "VARIANT_FOUND_IN_GENE_Somatic_mutation_Gene", {}, ), ] t = list(translator.translate_edges(src_tar_type)) assert all([type(e) == BioCypherEdge for e in t]) - assert t[0].get_label() == 'known.sequence variant.variant to gene association' - assert t[1].get_label() == 'somatic.sequence variant.variant to gene association' + assert ( + t[0].get_label() == "known.sequence variant.variant to gene association" + ) + assert ( + t[1].get_label() + == "somatic.sequence variant.variant to gene association" + ) -def test_virtual_leaves_inherit_is_a(ontology_mapping): - snrna = ontology_mapping.extended_schema.get('intact.snRNA sequence') +def test_virtual_leaves_inherit_is_a(ontology_mapping): + snrna = ontology_mapping.extended_schema.get("intact.snRNA sequence") - assert 'is_a' in snrna.keys() - assert snrna['is_a'] == ['snRNA sequence', 'nucleic acid entity'] + assert "is_a" in snrna.keys() + assert snrna["is_a"] == ["snRNA sequence", "nucleic acid entity"] - dsdna = ontology_mapping.extended_schema.get('intact.dsDNA sequence') + dsdna = ontology_mapping.extended_schema.get("intact.dsDNA sequence") - assert dsdna['is_a'] == [ - 'dsDNA sequence', - 'DNA sequence', - 'nucleic acid entity', + assert dsdna["is_a"] == [ + "dsDNA sequence", + "DNA sequence", + "nucleic acid entity", ] def test_virtual_leaves_inherit_properties(ontology_mapping): + snrna = ontology_mapping.extended_schema.get("intact.snRNA sequence") - snrna = ontology_mapping.extended_schema.get('intact.snRNA sequence') - - assert 'properties' in snrna.keys() - assert 'exclude_properties' in snrna.keys() + assert "properties" in snrna.keys() + assert "exclude_properties" in snrna.keys() def test_inherit_properties(ontology_mapping): + dsdna = ontology_mapping.extended_schema.get("intact.dsDNA sequence") - dsdna = ontology_mapping.extended_schema.get('intact.dsDNA sequence') - - assert 'properties' in dsdna.keys() - assert 'sequence' in dsdna['properties'] + assert "properties" in dsdna.keys() + assert "sequence" in dsdna["properties"] def test_properties_from_config(translator): id_type = [ ( - 'G49205', - 'protein', + "G49205", + "protein", { - 'taxon': 9606, - 'name': 'test', + "taxon": 9606, + "name": "test", }, ), ( - 'G92035', - 'protein', + "G92035", + "protein", { - 'taxon': 9606, + "taxon": 9606, }, ), ( - 'G92205', - 'protein', + "G92205", + "protein", { - 'taxon': 9606, - 'name': 'test2', - 'test': 'should_not_be_returned', + "taxon": 9606, + "name": "test2", + "test": "should_not_be_returned", }, ), ] @@ -359,32 +366,32 @@ def test_properties_from_config(translator): r = list(t) assert ( - 'name' in r[0].get_properties().keys() and - 'name' in r[1].get_properties().keys() and - 'test' not in r[2].get_properties().keys() + "name" in 
r[0].get_properties().keys() + and "name" in r[1].get_properties().keys() + and "test" not in r[2].get_properties().keys() ) src_tar_type = [ ( - 'G49205', - 'AD', - 'gene_gene', + "G49205", + "AD", + "gene_gene", { - 'directional': True, - 'score': 0.5, - 'id': 'should_not_be_returned', + "directional": True, + "score": 0.5, + "id": "should_not_be_returned", }, ), ( - 'G92035', - 'AD', - 'gene_gene', + "G92035", + "AD", + "gene_gene", { - 'directional': False, - 'curated': True, - 'score': 0.5, - 'test': 'should_not_be_returned', - 'id': 'should_not_be_returned', + "directional": False, + "curated": True, + "score": 0.5, + "test": "should_not_be_returned", + "id": "should_not_be_returned", }, ), ] @@ -393,32 +400,32 @@ def test_properties_from_config(translator): r = list(t) assert ( - 'directional' in r[0].get_properties().keys() and - 'directional' in r[1].get_properties().keys() and - 'curated' in r[1].get_properties().keys() and - 'score' in r[0].get_properties().keys() and - 'score' in r[1].get_properties().keys() and - 'test' not in r[1].get_properties().keys() and - 'id' not in r[0].get_properties().keys() and - 'id' not in r[1].get_properties().keys() + "directional" in r[0].get_properties().keys() + and "directional" in r[1].get_properties().keys() + and "curated" in r[1].get_properties().keys() + and "score" in r[0].get_properties().keys() + and "score" in r[1].get_properties().keys() + and "test" not in r[1].get_properties().keys() + and "id" not in r[0].get_properties().keys() + and "id" not in r[1].get_properties().keys() ) def test_exclude_properties(translator): id_type = [ ( - 'CHAT', - 'ensg', + "CHAT", + "ensg", { - 'taxon': 9606, - 'accession': 'should_not_be_returned', + "taxon": 9606, + "accession": "should_not_be_returned", }, ), ( - 'ACHE', - 'ensg', + "ACHE", + "ensg", { - 'taxon': 9606, + "taxon": 9606, }, ), ] @@ -426,29 +433,29 @@ def test_exclude_properties(translator): r = list(t) assert ( - 'taxon' in r[0].get_properties().keys() and - 'taxon' in r[1].get_properties().keys() and - 'accession' not in r[0].get_properties().keys() + "taxon" in r[0].get_properties().keys() + and "taxon" in r[1].get_properties().keys() + and "accession" not in r[0].get_properties().keys() ) src_tar_type = [ ( - 'G49205', - 'AD', - 'gene_disease', + "G49205", + "AD", + "gene_disease", { - 'directional': True, - 'score': 0.5, + "directional": True, + "score": 0.5, }, ), ( - 'G92035', - 'AD', - 'gene_disease', + "G92035", + "AD", + "gene_disease", { - 'directional': False, - 'score': 0.5, - 'accession': 'should_not_be_returned', + "directional": False, + "score": 0.5, + "accession": "should_not_be_returned", }, ), ] @@ -457,51 +464,51 @@ def test_exclude_properties(translator): r = list(t) assert ( - 'directional' in r[0].get_properties().keys() and - 'directional' in r[1].get_properties().keys() and - 'score' in r[0].get_properties().keys() and - 'score' in r[1].get_properties().keys() and - 'accession' not in r[1].get_properties().keys() + "directional" in r[0].get_properties().keys() + and "directional" in r[1].get_properties().keys() + and "score" in r[0].get_properties().keys() + and "score" in r[1].get_properties().keys() + and "accession" not in r[1].get_properties().keys() ) # we need to load the adapter because the mappings are passed from the adapter # to the translator def test_translate_term(translator): - assert translator.translate_term('hgnc') == 'Gene' + assert translator.translate_term("hgnc") == "Gene" assert ( - translator.translate_term('protein_disease') == 
'PERTURBED_IN_DISEASE' + translator.translate_term("protein_disease") == "PERTURBED_IN_DISEASE" ) def test_reverse_translate_term(translator): - assert 'hgnc' in translator.reverse_translate_term('Gene') - assert 'protein_disease' in translator.reverse_translate_term( - 'PERTURBED_IN_DISEASE', + assert "hgnc" in translator.reverse_translate_term("Gene") + assert "protein_disease" in translator.reverse_translate_term( + "PERTURBED_IN_DISEASE", ) def test_translate_query(translator): # we translate to PascalCase for cypher queries, not to internal # sentence case - query = 'MATCH (n:hgnc)-[r:gene_disease]->(d:Disease) RETURN n' + query = "MATCH (n:hgnc)-[r:gene_disease]->(d:Disease) RETURN n" assert ( - translator.translate(query) == - 'MATCH (n:Gene)-[r:PERTURBED_IN_DISEASE]->(d:Disease) RETURN n' + translator.translate(query) + == "MATCH (n:Gene)-[r:PERTURBED_IN_DISEASE]->(d:Disease) RETURN n" ) def test_reverse_translate_query(translator): # TODO cannot use sentence case in this context. include sentence to # pascal case and back in translation? - query = 'MATCH (n:Known.SequenceVariant)-[r:Known.SequenceVariant.VariantToGeneAssociation]->(g:Gene) RETURN n' + query = "MATCH (n:Known.SequenceVariant)-[r:Known.SequenceVariant.VariantToGeneAssociation]->(g:Gene) RETURN n" with pytest.raises(NotImplementedError): translator.reverse_translate(query) - query = 'MATCH (n:Known.SequenceVariant)-[r:Known.SequenceVariant.VariantToGeneAssociation]->(g:Protein) RETURN n' + query = "MATCH (n:Known.SequenceVariant)-[r:Known.SequenceVariant.VariantToGeneAssociation]->(g:Protein) RETURN n" assert ( - translator.reverse_translate(query) == - 'MATCH (n:Known_variant)-[r:VARIANT_FOUND_IN_GENE_Known_variant_Gene]->(g:protein) RETURN n' + translator.reverse_translate(query) + == "MATCH (n:Known_variant)-[r:VARIANT_FOUND_IN_GENE_Known_variant_Gene]->(g:protein) RETURN n" ) @@ -509,67 +516,64 @@ def test_log_missing_nodes(translator): tn = translator.translate_nodes( [ ( - 'G49205', - 'missing_protein', + "G49205", + "missing_protein", { - 'taxon': 9606, + "taxon": 9606, }, ), - ('G92035', 'missing_protein', {}), - ('REACT:25520', 'missing_pathway', {}), + ("G92035", "missing_protein", {}), + ("REACT:25520", "missing_pathway", {}), ], ) tn = list(tn) m = translator.get_missing_biolink_types() - assert m.get('missing_protein') == 2 - assert m.get('missing_pathway') == 1 + assert m.get("missing_protein") == 2 + assert m.get("missing_pathway") == 1 def test_strict_mode_error(translator): translator.strict_mode = True n1 = ( - 'n2', 'Test', { - 'prop': 'val', - 'source': 'test', - 'licence': 'test', - 'version': 'test' - } + "n2", + "Test", + {"prop": "val", "source": "test", "licence": "test", "version": "test"}, ) assert list(translator.translate_nodes([n1])) is not None # test 'license' instead of 'licence' n2 = ( - 'n2', 'Test', { - 'prop': 'val', - 'source': 'test', - 'license': 'test', - 'version': 'test' - } + "n2", + "Test", + {"prop": "val", "source": "test", "license": "test", "version": "test"}, ) assert list(translator.translate_nodes([n2])) is not None - n3 = ('n1', 'Test', {'prop': 'val'}) + n3 = ("n1", "Test", {"prop": "val"}) with pytest.raises(ValueError): list(translator.translate_nodes([n1, n2, n3])) e1 = ( - 'n1', 'n2', 'Test', { - 'prop': 'val', - 'source': 'test', - 'licence': 'test', - 'version': 'test', - } + "n1", + "n2", + "Test", + { + "prop": "val", + "source": "test", + "licence": "test", + "version": "test", + }, ) assert list(translator.translate_edges([e1])) is not None - e2 = ('n1', 
'n2', 'Test', {'prop': 'val'}) + e2 = ("n1", "n2", "Test", {"prop": "val"}) with pytest.raises(ValueError): list(translator.translate_edges([e1, e2])) @@ -579,16 +583,18 @@ def test_strict_mode_property_filter(translator): translator.strict_mode = True p1 = ( - 'p1', 'protein', { - 'taxon': 9606, - 'source': 'test', - 'licence': 'test', - 'version': 'test', - } + "p1", + "protein", + { + "taxon": 9606, + "source": "test", + "licence": "test", + "version": "test", + }, ) l = list(translator.translate_nodes([p1])) - assert 'source' in l[0].get_properties().keys() - assert 'licence' in l[0].get_properties().keys() - assert 'version' in l[0].get_properties().keys() + assert "source" in l[0].get_properties().keys() + assert "licence" in l[0].get_properties().keys() + assert "version" in l[0].get_properties().keys() diff --git a/test/test_write_arango.py b/test/test_write_arango.py index e7e0a38b..d3639b2b 100644 --- a/test/test_write_arango.py +++ b/test/test_write_arango.py @@ -3,7 +3,7 @@ import pytest -@pytest.mark.parametrize('l', [4], scope='function') +@pytest.mark.parametrize("l", [4], scope="function") def test_arango_write_data_headers_import_call( bw_arango, _get_nodes, @@ -25,19 +25,19 @@ def test_arango_write_data_headers_import_call( tmp_path = bw_arango.outdir - ph_csv = os.path.join(tmp_path, 'Protein-header.csv') - pp_1_csv = os.path.join(tmp_path, 'Protein-part000.csv') - pp_2_csv = os.path.join(tmp_path, 'Protein-part001.csv') - mh_csv = os.path.join(tmp_path, 'MicroRNA-header.csv') - mp_1_csv = os.path.join(tmp_path, 'MicroRNA-part000.csv') - mp_2_csv = os.path.join(tmp_path, 'MicroRNA-part001.csv') - dh_csv = os.path.join(tmp_path, 'PERTURBED_IN_DISEASE-header.csv') - dp_1_csv = os.path.join(tmp_path, 'PERTURBED_IN_DISEASE-part000.csv') - dp_2_csv = os.path.join(tmp_path, 'PERTURBED_IN_DISEASE-part001.csv') - muh_csv = os.path.join(tmp_path, 'Is_Mutated_In-header.csv') - mup_1_csv = os.path.join(tmp_path, 'Is_Mutated_In-part000.csv') - mup_2_csv = os.path.join(tmp_path, 'Is_Mutated_In-part001.csv') - call_csv = os.path.join(tmp_path, 'arangodb-import-call.sh') + ph_csv = os.path.join(tmp_path, "Protein-header.csv") + pp_1_csv = os.path.join(tmp_path, "Protein-part000.csv") + pp_2_csv = os.path.join(tmp_path, "Protein-part001.csv") + mh_csv = os.path.join(tmp_path, "MicroRNA-header.csv") + mp_1_csv = os.path.join(tmp_path, "MicroRNA-part000.csv") + mp_2_csv = os.path.join(tmp_path, "MicroRNA-part001.csv") + dh_csv = os.path.join(tmp_path, "PERTURBED_IN_DISEASE-header.csv") + dp_1_csv = os.path.join(tmp_path, "PERTURBED_IN_DISEASE-part000.csv") + dp_2_csv = os.path.join(tmp_path, "PERTURBED_IN_DISEASE-part001.csv") + muh_csv = os.path.join(tmp_path, "Is_Mutated_In-header.csv") + mup_1_csv = os.path.join(tmp_path, "Is_Mutated_In-part000.csv") + mup_2_csv = os.path.join(tmp_path, "Is_Mutated_In-part001.csv") + call_csv = os.path.join(tmp_path, "arangodb-import-call.sh") with open(ph_csv) as f: ph = f.read() @@ -66,23 +66,31 @@ def test_arango_write_data_headers_import_call( with open(call_csv) as f: call = f.read() - assert ph == '_key,name,score,taxon,genes,id,preferred_id' - assert mh == '_key,name,taxon,id,preferred_id' - assert '_from' in dh - assert '_key' in dh - assert '_to' in dh - assert '_from' in muh - assert '_key' in muh - assert '_to' in muh - assert len(pp_1) == len(pp_2) == len(mp_1) == len(mp_2) == len(dp_1) == len( - dp_2 - ) == len(mup_1) == len(mup_2) == 2 - assert 'arangoimp --type csv' in call - assert '--collection proteins' in call - assert 
'MicroRNA-part' in call + assert ph == "_key,name,score,taxon,genes,id,preferred_id" + assert mh == "_key,name,taxon,id,preferred_id" + assert "_from" in dh + assert "_key" in dh + assert "_to" in dh + assert "_from" in muh + assert "_key" in muh + assert "_to" in muh + assert ( + len(pp_1) + == len(pp_2) + == len(mp_1) + == len(mp_2) + == len(dp_1) + == len(dp_2) + == len(mup_1) + == len(mup_2) + == 2 + ) + assert "arangoimp --type csv" in call + assert "--collection proteins" in call + assert "MicroRNA-part" in call # custom import call executable path - bw_arango.import_call_bin_prefix = 'custom/path/to/' + bw_arango.import_call_bin_prefix = "custom/path/to/" os.remove(call_csv) bw_arango.write_import_call() @@ -90,4 +98,4 @@ def test_arango_write_data_headers_import_call( with open(call_csv) as f: call = f.read() - assert 'custom/path/to/arangoimp --type csv' in call + assert "custom/path/to/arangoimp --type csv" in call diff --git a/test/test_write_neo4j.py b/test/test_write_neo4j.py index c065e504..3092488d 100644 --- a/test/test_write_neo4j.py +++ b/test/test_write_neo4j.py @@ -8,12 +8,12 @@ def test_neo4j_writer_and_output_dir(bw): - tmp_path = bw.outdir assert ( - os.path.isdir(tmp_path) and isinstance(bw, _Neo4jBatchWriter) and - bw.delim == ';' + os.path.isdir(tmp_path) + and isinstance(bw, _Neo4jBatchWriter) + and bw.delim == ";" ) @@ -22,25 +22,25 @@ def test_create_import_call(bw): le = 4 for i in range(le): n = BioCypherNode( - f'i{i+1}', - 'post translational interaction', + f"i{i+1}", + "post translational interaction", ) e1 = BioCypherEdge( - source_id=f'i{i+1}', - target_id=f'p{i+1}', - relationship_label='IS_SOURCE_OF', + source_id=f"i{i+1}", + target_id=f"p{i+1}", + relationship_label="IS_SOURCE_OF", ) e2 = BioCypherEdge( - source_id=f'i{i}', - target_id=f'p{i+2}', - relationship_label='IS_TARGET_OF', + source_id=f"i{i}", + target_id=f"p{i+2}", + relationship_label="IS_TARGET_OF", ) mixed.append(BioCypherRelAsNode(n, e1, e2)) e3 = BioCypherEdge( - source_id=f'p{i+1}', - target_id=f'p{i+1}', - relationship_label='PERTURBED_IN_DISEASE', + source_id=f"p{i+1}", + target_id=f"p{i+1}", + relationship_label="PERTURBED_IN_DISEASE", ) mixed.append(e3) @@ -56,13 +56,25 @@ def gen(lis): assert passed assert 'bin/neo4j-admin import --database=neo4j --delimiter=";" ' in call assert '--array-delimiter="|" --quote="\'" --force=true ' in call - assert f'--nodes="{tmp_path}/PostTranslationalInteraction-header.csv,{tmp_path}/PostTranslationalInteraction-part.*" ' in call - assert f'--relationships="{tmp_path}/IS_SOURCE_OF-header.csv,{tmp_path}/IS_SOURCE_OF-part.*" ' in call - assert f'--relationships="{tmp_path}/IS_TARGET_OF-header.csv,{tmp_path}/IS_TARGET_OF-part.*" ' in call - assert f'--relationships="{tmp_path}/PERTURBED_IN_DISEASE-header.csv,{tmp_path}/PERTURBED_IN_DISEASE-part.*" ' in call + assert ( + f'--nodes="{tmp_path}/PostTranslationalInteraction-header.csv,{tmp_path}/PostTranslationalInteraction-part.*" ' + in call + ) + assert ( + f'--relationships="{tmp_path}/IS_SOURCE_OF-header.csv,{tmp_path}/IS_SOURCE_OF-part.*" ' + in call + ) + assert ( + f'--relationships="{tmp_path}/IS_TARGET_OF-header.csv,{tmp_path}/IS_TARGET_OF-part.*" ' + in call + ) + assert ( + f'--relationships="{tmp_path}/PERTURBED_IN_DISEASE-header.csv,{tmp_path}/PERTURBED_IN_DISEASE-part.*" ' + in call + ) -@pytest.mark.parametrize('l', [4], scope='module') +@pytest.mark.parametrize("l", [4], scope="module") def test_neo4j_write_node_data_headers_import_call(bw, _get_nodes): # four proteins, four miRNAs 
nodes = _get_nodes @@ -75,9 +87,9 @@ def test_neo4j_write_node_data_headers_import_call(bw, _get_nodes): tmp_path = bw.outdir - p_csv = os.path.join(tmp_path, 'Protein-header.csv') - m_csv = os.path.join(tmp_path, 'MicroRNA-header.csv') - call = os.path.join(tmp_path, 'neo4j-admin-import-call.sh') + p_csv = os.path.join(tmp_path, "Protein-header.csv") + m_csv = os.path.join(tmp_path, "MicroRNA-header.csv") + call = os.path.join(tmp_path, "neo4j-admin-import-call.sh") with open(p_csv) as f: p = f.read() @@ -86,20 +98,23 @@ def test_neo4j_write_node_data_headers_import_call(bw, _get_nodes): with open(call) as f: c = f.read() - assert p == ':ID;name;score:double;taxon:long;genes:string[];id;preferred_id;:LABEL' - assert m == ':ID;name;taxon:long;id;preferred_id;:LABEL' - assert 'bin/neo4j-admin import' in c - assert '--database=neo4j' in c + assert ( + p + == ":ID;name;score:double;taxon:long;genes:string[];id;preferred_id;:LABEL" + ) + assert m == ":ID;name;taxon:long;id;preferred_id;:LABEL" + assert "bin/neo4j-admin import" in c + assert "--database=neo4j" in c assert '--delimiter=";"' in c - assert '--force=true' in c + assert "--force=true" in c assert '--nodes="' in c - assert 'Protein-header.csv' in c + assert "Protein-header.csv" in c assert 'Protein-part.*"' in c - assert 'MicroRNA-header.csv' in c + assert "MicroRNA-header.csv" in c assert 'MicroRNA-part.*"' in c # custom import call executable path - bw.import_call_bin_prefix = 'custom/path/' + bw.import_call_bin_prefix = "custom/path/" os.remove(call) bw.write_import_call() @@ -107,7 +122,7 @@ def test_neo4j_write_node_data_headers_import_call(bw, _get_nodes): with open(call) as f: c = f.read() - assert 'custom/path/neo4j-admin import' in c + assert "custom/path/neo4j-admin import" in c # custom file prefix # TODO @@ -118,9 +133,9 @@ def test_write_hybrid_ontology_nodes(bw): for i in range(4): nodes.append( BioCypherNode( - node_id=f'agpl:000{i}', - node_label='altered gene product level', - properties={} + node_id=f"agpl:000{i}", + node_label="altered gene product level", + properties={}, ) ) @@ -130,8 +145,8 @@ def test_write_hybrid_ontology_nodes(bw): tmp_path = bw.outdir - h_csv = os.path.join(tmp_path, 'AlteredGeneProductLevel-header.csv') - p_csv = os.path.join(tmp_path, 'AlteredGeneProductLevel-part000.csv') + h_csv = os.path.join(tmp_path, "AlteredGeneProductLevel-header.csv") + p_csv = os.path.join(tmp_path, "AlteredGeneProductLevel-part000.csv") with open(h_csv) as f: header = f.read() @@ -139,23 +154,23 @@ def test_write_hybrid_ontology_nodes(bw): with open(p_csv) as f: part = f.read() - assert header == ':ID;id;preferred_id;:LABEL' + assert header == ":ID;id;preferred_id;:LABEL" assert "agpl:0000;'agpl:0000';'id'" in part - assert 'AlteredGeneProductLevel' in part - assert 'BiologicalEntity' in part + assert "AlteredGeneProductLevel" in part + assert "BiologicalEntity" in part def test_property_types(bw): nodes = [] for i in range(4): bnp = BioCypherNode( - node_id=f'p{i+1}', - node_label='protein', + node_id=f"p{i+1}", + node_label="protein", properties={ - 'score': 4 / (i + 1), - 'name': 'StringProperty1', - 'taxon': 9606, - 'genes': ['gene1', 'gene2'], + "score": 4 / (i + 1), + "name": "StringProperty1", + "taxon": 9606, + "genes": ["gene1", "gene2"], }, ) nodes.append(bnp) @@ -164,8 +179,8 @@ def test_property_types(bw): tmp_path = bw.outdir - d_csv = os.path.join(tmp_path, 'Protein-part000.csv') - h_csv = os.path.join(tmp_path, 'Protein-header.csv') + d_csv = os.path.join(tmp_path, "Protein-part000.csv") + h_csv = 
os.path.join(tmp_path, "Protein-header.csv") with open(d_csv) as f: data = f.read() @@ -174,12 +189,15 @@ def test_property_types(bw): header = f.read() assert passed - assert header == ':ID;name;score:double;taxon:long;genes:string[];id;preferred_id;:LABEL' + assert ( + header + == ":ID;name;score:double;taxon:long;genes:string[];id;preferred_id;:LABEL" + ) assert "p1;'StringProperty1';4.0;9606;'gene1|gene2';'p1';'id'" in data - assert 'BiologicalEntity' in data + assert "BiologicalEntity" in data -@pytest.mark.parametrize('l', [4], scope='module') +@pytest.mark.parametrize("l", [4], scope="module") def test_write_node_data_from_list(bw, _get_nodes): nodes = _get_nodes @@ -187,8 +205,8 @@ def test_write_node_data_from_list(bw, _get_nodes): tmp_path = bw.outdir - p_csv = os.path.join(tmp_path, 'Protein-part000.csv') - m_csv = os.path.join(tmp_path, 'MicroRNA-part000.csv') + p_csv = os.path.join(tmp_path, "Protein-part000.csv") + m_csv = os.path.join(tmp_path, "MicroRNA-part000.csv") with open(p_csv) as f: pr = f.read() @@ -198,12 +216,12 @@ def test_write_node_data_from_list(bw, _get_nodes): assert passed assert "p1;'StringProperty1';4.0;9606;'gene1|gene2';'p1';'uniprot'" in pr - assert 'BiologicalEntity' in pr + assert "BiologicalEntity" in pr assert "m1;'StringProperty1';9606;'m1';'mirbase'" in mi - assert 'ChemicalEntity' in mi + assert "ChemicalEntity" in mi -@pytest.mark.parametrize('l', [4], scope='module') +@pytest.mark.parametrize("l", [4], scope="module") def test_write_node_data_from_gen(bw, _get_nodes): nodes = _get_nodes @@ -214,8 +232,8 @@ def node_gen(nodes): tmp_path = bw.outdir - p_csv = os.path.join(tmp_path, 'Protein-part000.csv') - m_csv = os.path.join(tmp_path, 'MicroRNA-part000.csv') + p_csv = os.path.join(tmp_path, "Protein-part000.csv") + m_csv = os.path.join(tmp_path, "MicroRNA-part000.csv") with open(p_csv) as f: pr = f.read() @@ -225,9 +243,9 @@ def node_gen(nodes): assert passed assert "p1;'StringProperty1';4.0;9606;'gene1|gene2';'p1';'uniprot'" in pr - assert 'BiologicalEntity' in pr + assert "BiologicalEntity" in pr assert "m1;'StringProperty1';9606;'m1';'mirbase'" in mi - assert 'ChemicalEntity' in mi + assert "ChemicalEntity" in mi def test_write_node_data_from_gen_no_props(bw): @@ -235,19 +253,19 @@ def test_write_node_data_from_gen_no_props(bw): le = 4 for i in range(le): bnp = BioCypherNode( - node_id=f'p{i+1}', - node_label='protein', + node_id=f"p{i+1}", + node_label="protein", properties={ - 'score': 4 / (i + 1), - 'name': 'StringProperty1', - 'taxon': 9606, - 'genes': ['gene1', 'gene2'], + "score": 4 / (i + 1), + "name": "StringProperty1", + "taxon": 9606, + "genes": ["gene1", "gene2"], }, ) nodes.append(bnp) bnm = BioCypherNode( - node_id=f'm{i+1}', - node_label='microRNA', + node_id=f"m{i+1}", + node_label="microRNA", ) nodes.append(bnm) @@ -258,8 +276,8 @@ def node_gen(nodes): tmp_path = bw.outdir - p_csv = os.path.join(tmp_path, 'Protein-part000.csv') - m_csv = os.path.join(tmp_path, 'microRNA-part000.csv') + p_csv = os.path.join(tmp_path, "Protein-part000.csv") + m_csv = os.path.join(tmp_path, "microRNA-part000.csv") with open(p_csv) as f: pr = f.read() @@ -269,12 +287,12 @@ def node_gen(nodes): assert passed assert "p1;'StringProperty1';4.0;9606;'gene1|gene2';'p1';'id'" in pr - assert 'BiologicalEntity' in pr + assert "BiologicalEntity" in pr assert "m1;'m1';'id'" in mi - assert 'ChemicalEntity' in mi + assert "ChemicalEntity" in mi -@pytest.mark.parametrize('l', [int(1e4 + 4)], scope='module') +@pytest.mark.parametrize("l", [int(1e4 + 4)], 
scope="module") def test_write_node_data_from_large_gen(bw, _get_nodes): nodes = _get_nodes @@ -288,10 +306,10 @@ def node_gen(nodes): tmp_path = bw.outdir - p0_csv = os.path.join(tmp_path, 'Protein-part000.csv') - m0_csv = os.path.join(tmp_path, 'MicroRNA-part000.csv') - p1_csv = os.path.join(tmp_path, 'Protein-part001.csv') - m1_csv = os.path.join(tmp_path, 'MicroRNA-part001.csv') + p0_csv = os.path.join(tmp_path, "Protein-part000.csv") + m0_csv = os.path.join(tmp_path, "MicroRNA-part000.csv") + p1_csv = os.path.join(tmp_path, "Protein-part001.csv") + m1_csv = os.path.join(tmp_path, "MicroRNA-part001.csv") pr_lines = sum(1 for _ in open(p0_csv)) mi_lines = sum(1 for _ in open(m0_csv)) @@ -299,23 +317,26 @@ def node_gen(nodes): mi_lines1 = sum(1 for _ in open(m1_csv)) assert ( - passed and pr_lines == 1e4 and mi_lines == 1e4 and pr_lines1 == 4 and - mi_lines1 == 4 + passed + and pr_lines == 1e4 + and mi_lines == 1e4 + and pr_lines1 == 4 + and mi_lines1 == 4 ) -@pytest.mark.parametrize('l', [1], scope='module') +@pytest.mark.parametrize("l", [1], scope="module") def test_too_many_properties(bw, _get_nodes): nodes = _get_nodes bn1 = BioCypherNode( - node_id='p0', - node_label='protein', + node_id="p0", + node_label="protein", properties={ - 'p1': 'StringProperty1', - 'p2': 'StringProperty2', - 'p3': 'StringProperty3', - 'p4': 'StringProperty4', + "p1": "StringProperty1", + "p2": "StringProperty2", + "p3": "StringProperty3", + "p4": "StringProperty4", }, ) nodes.append(bn1) @@ -331,14 +352,14 @@ def node_gen(nodes): assert not passed -@pytest.mark.parametrize('l', [1], scope='module') +@pytest.mark.parametrize("l", [1], scope="module") def test_not_enough_properties(bw, _get_nodes): nodes = _get_nodes bn1 = BioCypherNode( - node_id='p0', - node_label='protein', - properties={'p1': 'StringProperty1'}, + node_id="p0", + node_label="protein", + properties={"p1": "StringProperty1"}, ) nodes.append(bn1) @@ -352,7 +373,7 @@ def node_gen(nodes): tmp_path = bw.outdir - p0_csv = os.path.join(tmp_path, 'Protein-part000.csv') + p0_csv = os.path.join(tmp_path, "Protein-part000.csv") assert not passed and not isfile(p0_csv) @@ -363,31 +384,31 @@ def test_write_none_type_property_and_order_invariance(bw): nodes = [] bnp1 = BioCypherNode( - node_id=f'p1', - node_label='protein', + node_id=f"p1", + node_label="protein", properties={ - 'taxon': 9606, - 'score': 1, - 'name': None, - 'genes': None, + "taxon": 9606, + "score": 1, + "name": None, + "genes": None, }, ) bnp2 = BioCypherNode( - node_id=f'p2', - node_label='protein', + node_id=f"p2", + node_label="protein", properties={ - 'name': None, - 'genes': ['gene1', 'gene2'], - 'score': 2, - 'taxon': 9606, + "name": None, + "genes": ["gene1", "gene2"], + "score": 2, + "taxon": 9606, }, ) bnm = BioCypherNode( - node_id=f'm1', - node_label='microRNA', + node_id=f"m1", + node_label="microRNA", properties={ - 'name': None, - 'taxon': 9606, + "name": None, + "taxon": 9606, }, ) nodes.append(bnp1) @@ -404,16 +425,16 @@ def node_gen(nodes): tmp_path = bw.outdir - p0_csv = os.path.join(tmp_path, 'Protein-part000.csv') + p0_csv = os.path.join(tmp_path, "Protein-part000.csv") with open(p0_csv) as f: p = f.read() assert passed assert "p1;;1;9606;;'p1';'id'" in p - assert 'BiologicalEntity' in p + assert "BiologicalEntity" in p -@pytest.mark.parametrize('l', [int(1e4)], scope='module') +@pytest.mark.parametrize("l", [int(1e4)], scope="module") def test_accidental_exact_batch_size(bw, _get_nodes): nodes = _get_nodes @@ -427,16 +448,16 @@ def node_gen(nodes): tmp_path = 
bw.outdir - p0_csv = os.path.join(tmp_path, 'Protein-part000.csv') - m0_csv = os.path.join(tmp_path, 'MicroRNA-part000.csv') - p1_csv = os.path.join(tmp_path, 'Protein-part001.csv') - m1_csv = os.path.join(tmp_path, 'MicroRNA-part001.csv') + p0_csv = os.path.join(tmp_path, "Protein-part000.csv") + m0_csv = os.path.join(tmp_path, "MicroRNA-part000.csv") + p1_csv = os.path.join(tmp_path, "Protein-part001.csv") + m1_csv = os.path.join(tmp_path, "MicroRNA-part001.csv") pr_lines = sum(1 for _ in open(p0_csv)) mi_lines = sum(1 for _ in open(m0_csv)) - ph_csv = os.path.join(tmp_path, 'Protein-header.csv') - mh_csv = os.path.join(tmp_path, 'MicroRNA-header.csv') + ph_csv = os.path.join(tmp_path, "Protein-header.csv") + mh_csv = os.path.join(tmp_path, "MicroRNA-header.csv") with open(ph_csv) as f: p = f.read() @@ -444,14 +465,18 @@ def node_gen(nodes): m = f.read() assert ( - passed and pr_lines == 1e4 and mi_lines == 1e4 and - not isfile(p1_csv) and not isfile(m1_csv) and p == - ':ID;name;score:double;taxon:long;genes:string[];id;preferred_id;:LABEL' - and m == ':ID;name;taxon:long;id;preferred_id;:LABEL' + passed + and pr_lines == 1e4 + and mi_lines == 1e4 + and not isfile(p1_csv) + and not isfile(m1_csv) + and p + == ":ID;name;score:double;taxon:long;genes:string[];id;preferred_id;:LABEL" + and m == ":ID;name;taxon:long;id;preferred_id;:LABEL" ) -@pytest.mark.parametrize('l', [4], scope='module') +@pytest.mark.parametrize("l", [4], scope="module") def test_write_edge_data_from_gen(bw, _get_edges): edges = _get_edges @@ -462,8 +487,8 @@ def edge_gen(edges): tmp_path = bw.outdir - pid_csv = os.path.join(tmp_path, 'PERTURBED_IN_DISEASE-part000.csv') - imi_csv = os.path.join(tmp_path, 'Is_Mutated_In-part000.csv') + pid_csv = os.path.join(tmp_path, "PERTURBED_IN_DISEASE-part000.csv") + imi_csv = os.path.join(tmp_path, "Is_Mutated_In-part000.csv") with open(pid_csv) as f: l = f.read() @@ -483,7 +508,7 @@ def edge_gen(edges): assert "4;" in l assert "p2;" in l assert "PERTURBED_IN_DISEASE" in l - assert '\n' in l + assert "\n" in l assert "m0;" in c assert "mrel0;" in c assert "'3-UTR';" in c @@ -496,12 +521,11 @@ def edge_gen(edges): assert "1;" in c assert "p2;" in c assert "Is_Mutated_In" in c - assert '\n' in c + assert "\n" in c -@pytest.mark.parametrize('l', [int(1e4 + 4)], scope='module') +@pytest.mark.parametrize("l", [int(1e4 + 4)], scope="module") def test_write_edge_data_from_large_gen(bw, _get_edges): - edges = _get_edges def edge_gen(edges): @@ -511,10 +535,10 @@ def edge_gen(edges): tmp_path = bw.outdir - apl0_csv = os.path.join(tmp_path, 'PERTURBED_IN_DISEASE-part000.csv') - ips0_csv = os.path.join(tmp_path, 'Is_Mutated_In-part000.csv') - apl1_csv = os.path.join(tmp_path, 'PERTURBED_IN_DISEASE-part001.csv') - ips1_csv = os.path.join(tmp_path, 'Is_Mutated_In-part001.csv') + apl0_csv = os.path.join(tmp_path, "PERTURBED_IN_DISEASE-part000.csv") + ips0_csv = os.path.join(tmp_path, "Is_Mutated_In-part000.csv") + apl1_csv = os.path.join(tmp_path, "PERTURBED_IN_DISEASE-part001.csv") + ips1_csv = os.path.join(tmp_path, "Is_Mutated_In-part001.csv") l_lines0 = sum(1 for _ in open(apl0_csv)) c_lines0 = sum(1 for _ in open(ips0_csv)) @@ -522,12 +546,15 @@ def edge_gen(edges): c_lines1 = sum(1 for _ in open(ips1_csv)) assert ( - passed and l_lines0 == 1e4 and c_lines0 == 1e4 and l_lines1 == 4 and - c_lines1 == 4 + passed + and l_lines0 == 1e4 + and c_lines0 == 1e4 + and l_lines1 == 4 + and c_lines1 == 4 ) -@pytest.mark.parametrize('l', [4], scope='module') +@pytest.mark.parametrize("l", [4], 
scope="module") def test_write_edge_data_from_list(bw, _get_edges): edges = _get_edges @@ -535,8 +562,8 @@ def test_write_edge_data_from_list(bw, _get_edges): tmp_path = bw.outdir - apl_csv = os.path.join(tmp_path, 'PERTURBED_IN_DISEASE-part000.csv') - ips_csv = os.path.join(tmp_path, 'Is_Mutated_In-part000.csv') + apl_csv = os.path.join(tmp_path, "PERTURBED_IN_DISEASE-part000.csv") + ips_csv = os.path.join(tmp_path, "Is_Mutated_In-part000.csv") with open(apl_csv) as f: l = f.read() @@ -547,7 +574,7 @@ def test_write_edge_data_from_list(bw, _get_edges): assert "p0;" in l assert "prel0;" in l assert "'T253';" in l - assert "4;" in l + assert "4;" in l assert "p1;" in l assert "PERTURBED_IN_DISEASE" in l assert "\n" in l @@ -559,9 +586,10 @@ def test_write_edge_data_from_list(bw, _get_edges): assert "p1;" in c assert "Is_Mutated_In" in c assert "m1;" in c - assert '\n' in c - -@pytest.mark.parametrize('l', [4], scope='module') + assert "\n" in c + + +@pytest.mark.parametrize("l", [4], scope="module") def test_write_edge_id_optional(bw, _get_edges): edges = _get_edges @@ -580,8 +608,8 @@ def test_write_edge_id_optional(bw, _get_edges): tmp_path = bw.outdir - pert_csv = os.path.join(tmp_path, 'PERTURBED_IN_DISEASE-part000.csv') - phos_csv = os.path.join(tmp_path, 'Phosphorylation-part000.csv') + pert_csv = os.path.join(tmp_path, "PERTURBED_IN_DISEASE-part000.csv") + phos_csv = os.path.join(tmp_path, "Phosphorylation-part000.csv") with open(pert_csv) as f: pertf = f.read() @@ -591,8 +619,8 @@ def test_write_edge_id_optional(bw, _get_edges): assert "prel0;" in pertf assert "phos1;" not in phosf - pert_header = os.path.join(tmp_path, 'PERTURBED_IN_DISEASE-header.csv') - phos_header = os.path.join(tmp_path, 'Phosphorylation-header.csv') + pert_header = os.path.join(tmp_path, "PERTURBED_IN_DISEASE-header.csv") + phos_header = os.path.join(tmp_path, "Phosphorylation-header.csv") with open(pert_header) as f: perth = f.read() @@ -602,20 +630,21 @@ def test_write_edge_id_optional(bw, _get_edges): assert "id;" in perth assert "id;" not in phosh + def test_write_edge_data_from_list_no_props(bw): le = 4 edges = [] for i in range(le): e1 = BioCypherEdge( - source_id=f'p{i}', - target_id=f'p{i + 1}', - relationship_label='PERTURBED_IN_DISEASE', + source_id=f"p{i}", + target_id=f"p{i + 1}", + relationship_label="PERTURBED_IN_DISEASE", ) edges.append(e1) e2 = BioCypherEdge( - source_id=f'm{i}', - target_id=f'p{i + 1}', - relationship_label='Is_Mutated_In', + source_id=f"m{i}", + target_id=f"p{i + 1}", + relationship_label="Is_Mutated_In", ) edges.append(e2) @@ -623,8 +652,8 @@ def test_write_edge_data_from_list_no_props(bw): tmp_path = bw.outdir - ptl_csv = os.path.join(tmp_path, 'PERTURBED_IN_DISEASE-part000.csv') - pts_csv = os.path.join(tmp_path, 'Is_Mutated_In-part000.csv') + ptl_csv = os.path.join(tmp_path, "PERTURBED_IN_DISEASE-part000.csv") + pts_csv = os.path.join(tmp_path, "Is_Mutated_In-part000.csv") with open(ptl_csv) as f: l = f.read() @@ -632,23 +661,23 @@ def test_write_edge_data_from_list_no_props(bw): c = f.read() assert passed - assert 'p0;' in l - assert 'p1;' in l - assert 'PERTURBED_IN_DISEASE' in l - assert 'p1;' in l - assert 'p2;' in l - assert 'PERTURBED_IN_DISEASE' in l - assert '\n' in l - assert 'm0;' in c - assert 'p1;' in c - assert 'Is_Mutated_In' in c - assert 'm1;' in c - assert 'p2;' in c - assert 'Is_Mutated_In' in c - assert '\n' in c - - -@pytest.mark.parametrize('l', [8], scope='module') + assert "p0;" in l + assert "p1;" in l + assert "PERTURBED_IN_DISEASE" in l + assert 
"p1;" in l + assert "p2;" in l + assert "PERTURBED_IN_DISEASE" in l + assert "\n" in l + assert "m0;" in c + assert "p1;" in c + assert "Is_Mutated_In" in c + assert "m1;" in c + assert "p2;" in c + assert "Is_Mutated_In" in c + assert "\n" in c + + +@pytest.mark.parametrize("l", [8], scope="module") def test_write_edge_data_headers_import_call(bw, _get_nodes, _get_edges): edges = _get_edges @@ -673,9 +702,9 @@ def edge_gen2(edges): tmp_path = bw.outdir - ptl_csv = os.path.join(tmp_path, 'PERTURBED_IN_DISEASE-header.csv') - pts_csv = os.path.join(tmp_path, 'Is_Mutated_In-header.csv') - call_csv = os.path.join(tmp_path, 'neo4j-admin-import-call.sh') + ptl_csv = os.path.join(tmp_path, "PERTURBED_IN_DISEASE-header.csv") + pts_csv = os.path.join(tmp_path, "Is_Mutated_In-header.csv") + call_csv = os.path.join(tmp_path, "neo4j-admin-import-call.sh") with open(ptl_csv) as f: l = f.read() @@ -684,19 +713,19 @@ def edge_gen2(edges): with open(call_csv) as f: call = f.read() - assert l == ':START_ID;id;residue;level:long;:END_ID;:TYPE' - assert c == ':START_ID;id;site;confidence:long;:END_ID;:TYPE' + assert l == ":START_ID;id;residue;level:long;:END_ID;:TYPE" + assert c == ":START_ID;id;site;confidence:long;:END_ID;:TYPE" - assert 'bin/neo4j-admin import' in call - assert '--database=neo4j' in call + assert "bin/neo4j-admin import" in call + assert "--database=neo4j" in call assert '--delimiter=";"' in call - assert '--force=true' in call + assert "--force=true" in call assert '--nodes="' in call - assert 'PERTURBED_IN_DISEASE' in call - assert 'Is_Mutated_In' in call + assert "PERTURBED_IN_DISEASE" in call + assert "Is_Mutated_In" in call -@pytest.mark.parametrize('l', [4], scope='module') +@pytest.mark.parametrize("l", [4], scope="module") def test_write_duplicate_edges(bw, _get_edges): edges = _get_edges edges.append(edges[0]) @@ -705,8 +734,8 @@ def test_write_duplicate_edges(bw, _get_edges): tmp_path = bw.outdir - ptl_csv = os.path.join(tmp_path, 'PERTURBED_IN_DISEASE-part000.csv') - pts_csv = os.path.join(tmp_path, 'Is_Mutated_In-part000.csv') + ptl_csv = os.path.join(tmp_path, "PERTURBED_IN_DISEASE-part000.csv") + pts_csv = os.path.join(tmp_path, "Is_Mutated_In-part000.csv") l = sum(1 for _ in open(ptl_csv)) c = sum(1 for _ in open(pts_csv)) @@ -724,9 +753,9 @@ def gen(lis): tmp_path = bw.outdir - iso_csv = os.path.join(tmp_path, 'IS_SOURCE_OF-part000.csv') - ito_csv = os.path.join(tmp_path, 'IS_TARGET_OF-part000.csv') - pmi_csv = os.path.join(tmp_path, 'PostTranslationalInteraction-part000.csv') + iso_csv = os.path.join(tmp_path, "IS_SOURCE_OF-part000.csv") + ito_csv = os.path.join(tmp_path, "IS_TARGET_OF-part000.csv") + pmi_csv = os.path.join(tmp_path, "PostTranslationalInteraction-part000.csv") with open(iso_csv) as f: s = f.read() @@ -736,39 +765,39 @@ def gen(lis): p = f.read() assert passed - assert 'i1;' in s - assert 'p1;' in s - assert 'IS_SOURCE_OF' in s - assert '\n' in s - assert 'i0;' in t - assert 'p2;' in t - assert 'IS_TARGET_OF' in t - assert '\n' in t + assert "i1;" in s + assert "p1;" in s + assert "IS_SOURCE_OF" in s + assert "\n" in s + assert "i0;" in t + assert "p2;" in t + assert "IS_TARGET_OF" in t + assert "\n" in t assert "i1;True;-1;'i1';'id'" in p - assert 'Association' in p - assert '\n' in p + assert "Association" in p + assert "\n" in p def _get_rel_as_nodes(l): rels = [] for i in range(l): n = BioCypherNode( - node_id=f'i{i+1}', - node_label='post translational interaction', + node_id=f"i{i+1}", + node_label="post translational interaction", properties={ - 
'directed': True, - 'effect': -1, + "directed": True, + "effect": -1, }, ) e1 = BioCypherEdge( - source_id=f'i{i+1}', - target_id=f'p{i+1}', - relationship_label='IS_SOURCE_OF', + source_id=f"i{i+1}", + target_id=f"p{i+1}", + relationship_label="IS_SOURCE_OF", ) e2 = BioCypherEdge( - source_id=f'i{i}', - target_id=f'p{i + 2}', - relationship_label='IS_TARGET_OF', + source_id=f"i{i}", + target_id=f"p{i + 2}", + relationship_label="IS_TARGET_OF", ) rels.append(BioCypherRelAsNode(n, e1, e2)) return rels @@ -790,7 +819,7 @@ def gen2(lis): tmp_path = bw.outdir - iso_csv = os.path.join(tmp_path, 'IS_SOURCE_OF-part001.csv') + iso_csv = os.path.join(tmp_path, "IS_SOURCE_OF-part001.csv") assert passed1 and passed2 and isfile(iso_csv) @@ -800,25 +829,25 @@ def test_write_mixed_edges(bw): le = 4 for i in range(le): e3 = BioCypherEdge( - source_id=f'p{i+1}', - target_id=f'p{i+1}', - relationship_label='PERTURBED_IN_DISEASE', + source_id=f"p{i+1}", + target_id=f"p{i+1}", + relationship_label="PERTURBED_IN_DISEASE", ) mixed.append(e3) n = BioCypherNode( - f'i{i+1}', - 'post translational interaction', + f"i{i+1}", + "post translational interaction", ) e1 = BioCypherEdge( - source_id=f'i{i+1}', - target_id=f'p{i+1}', - relationship_label='IS_SOURCE_OF', + source_id=f"i{i+1}", + target_id=f"p{i+1}", + relationship_label="IS_SOURCE_OF", ) e2 = BioCypherEdge( - source_id=f'i{i}', - target_id=f'p{i+2}', - relationship_label='IS_TARGET_OF', + source_id=f"i{i}", + target_id=f"p{i+2}", + relationship_label="IS_TARGET_OF", ) mixed.append(BioCypherRelAsNode(n, e1, e2)) @@ -829,24 +858,26 @@ def gen(lis): tmp_path = bw.outdir - pmi_csv = os.path.join(tmp_path, 'PostTranslationalInteraction-header.csv') - iso_csv = os.path.join(tmp_path, 'IS_SOURCE_OF-header.csv') - ito_csv = os.path.join(tmp_path, 'IS_TARGET_OF-header.csv') - ipt_csv = os.path.join(tmp_path, 'PERTURBED_IN_DISEASE-header.csv') + pmi_csv = os.path.join(tmp_path, "PostTranslationalInteraction-header.csv") + iso_csv = os.path.join(tmp_path, "IS_SOURCE_OF-header.csv") + ito_csv = os.path.join(tmp_path, "IS_TARGET_OF-header.csv") + ipt_csv = os.path.join(tmp_path, "PERTURBED_IN_DISEASE-header.csv") assert ( - passed and os.path.isfile(pmi_csv) and os.path.isfile(iso_csv) and - os.path.isfile(ito_csv) and os.path.isfile(ipt_csv) + passed + and os.path.isfile(pmi_csv) + and os.path.isfile(iso_csv) + and os.path.isfile(ito_csv) + and os.path.isfile(ipt_csv) ) def test_duplicate_id(bw): - nodes = [] tmp_path = bw.outdir - csv = os.path.join(tmp_path, 'Protein-part000.csv') + csv = os.path.join(tmp_path, "Protein-part000.csv") # remove csv file in path if os.path.exists(csv): @@ -855,13 +886,13 @@ def test_duplicate_id(bw): # add the same protein node twice to test duplicate handling for _ in range(2): bnp = BioCypherNode( - node_id=f'p1', - node_label='protein', + node_id=f"p1", + node_label="protein", properties={ - 'name': 'StringProperty1', - 'score': 4.32, - 'taxon': 9606, - 'genes': ['gene1', 'gene2'], + "name": "StringProperty1", + "score": 4.32, + "taxon": 9606, + "genes": ["gene1", "gene2"], }, ) nodes.append(bnp) @@ -874,12 +905,11 @@ def test_duplicate_id(bw): def test_write_synonym(bw): - nodes = [] tmp_path = bw.outdir - csv = os.path.join(tmp_path, 'Complex-part000.csv') + csv = os.path.join(tmp_path, "Complex-part000.csv") # remove csv file in path if os.path.exists(csv): @@ -887,12 +917,12 @@ # four complexes for _ in range(4): bnp = BioCypherNode( - node_id=f'p{_+1}', - node_label='complex', + node_id=f"p{_+1}", +
node_label="complex", properties={ - 'name': 'StringProperty1', - 'score': 4.32, - 'taxon': 9606, + "name": "StringProperty1", + "score": 4.32, + "taxon": 9606, }, ) nodes.append(bnp) @@ -904,21 +934,21 @@ def test_write_synonym(bw): assert passed and os.path.exists(csv) assert "p1;'StringProperty1';4.32;9606;'p1';'id'" in comp - assert 'Complex' in comp + assert "Complex" in comp -def test_write_strict(bw_strict): +def test_write_strict(bw_strict): n1 = BioCypherNode( - node_id='p1', - node_label='protein', + node_id="p1", + node_label="protein", properties={ - 'name': 'StringProperty1', - 'score': 4.32, - 'taxon': 9606, - 'genes': ['gene1', 'gene2'], - 'source': 'source1', - 'version': 'version1', - 'licence': 'licence1', + "name": "StringProperty1", + "score": 4.32, + "taxon": 9606, + "genes": ["gene1", "gene2"], + "source": "source1", + "version": "version1", + "licence": "licence1", }, ) @@ -928,30 +958,32 @@ def test_write_strict(bw_strict): tmp_path = bw_strict.outdir - csv = os.path.join(tmp_path, 'Protein-part000.csv') + csv = os.path.join(tmp_path, "Protein-part000.csv") with open(csv) as f: prot = f.read() - assert "p1;'StringProperty1';4.32;9606;'gene1|gene2';'p1';'id';'source1';'version1';'licence1'" in prot - assert 'BiologicalEntity' in prot + assert ( + "p1;'StringProperty1';4.32;9606;'gene1|gene2';'p1';'id';'source1';'version1';'licence1'" + in prot + ) + assert "BiologicalEntity" in prot -@pytest.mark.parametrize('l', [4], scope='module') +@pytest.mark.parametrize("l", [4], scope="module") def test_tab_delimiter(bw_tab, _get_nodes): - passed = bw_tab.write_nodes(_get_nodes) assert passed tmp_path = bw_tab.outdir - header = os.path.join(tmp_path, 'Protein-header.csv') + header = os.path.join(tmp_path, "Protein-header.csv") with open(header) as f: prot = f.read() - assert '\t' in prot + assert "\t" in prot call = bw_tab._construct_import_call() diff --git a/test/test_write_postgres.py b/test/test_write_postgres.py index 6ffd1930..cbad24c7 100644 --- a/test/test_write_postgres.py +++ b/test/test_write_postgres.py @@ -4,7 +4,7 @@ import pytest -@pytest.mark.parametrize('l', [4], scope='module') +@pytest.mark.parametrize("l", [4], scope="module") def test_write_node_data_from_gen_comma_postgresql( bw_comma_postgresql, _get_nodes ): @@ -20,8 +20,8 @@ def node_gen(nodes): tmp_path = bw_comma_postgresql.outdir - p_csv = os.path.join(tmp_path, 'Protein-part000.csv') - m_csv = os.path.join(tmp_path, 'MicroRNA-part000.csv') + p_csv = os.path.join(tmp_path, "Protein-part000.csv") + m_csv = os.path.join(tmp_path, "MicroRNA-part000.csv") with open(p_csv) as f: pr = f.read() @@ -30,15 +30,15 @@ def node_gen(nodes): mi = f.read() assert 'p1,"StringProperty1",4.0,9606' in pr - assert 'uniprot' in pr - assert 'BiologicalEntity' in pr - assert 'Polypeptide' in pr - assert 'Protein' in pr + assert "uniprot" in pr + assert "BiologicalEntity" in pr + assert "Polypeptide" in pr + assert "Protein" in pr assert 'm1,"StringProperty1",9606,"m1","mirbase"' in mi - assert 'ChemicalEntity' in mi + assert "ChemicalEntity" in mi -@pytest.mark.parametrize('l', [4], scope='module') +@pytest.mark.parametrize("l", [4], scope="module") def test_write_node_data_from_gen_tab_postgresql(bw_tab_postgresql, _get_nodes): nodes = _get_nodes @@ -49,8 +49,8 @@ def node_gen(nodes): tmp_path = bw_tab_postgresql.outdir - p_csv = os.path.join(tmp_path, 'Protein-part000.csv') - m_csv = os.path.join(tmp_path, 'MicroRNA-part000.csv') + p_csv = os.path.join(tmp_path, "Protein-part000.csv") + m_csv = os.path.join(tmp_path, 
"MicroRNA-part000.csv") with open(p_csv) as f: pr = f.read() @@ -61,19 +61,25 @@ def node_gen(nodes): assert passed assert 'p1\t"StringProperty1"\t4.0\t9606\t' in pr assert '\t"uniprot"\t' in pr - assert 'BiologicalEntity' in pr - assert 'Polypeptide' in pr - assert 'Protein' in pr + assert "BiologicalEntity" in pr + assert "Polypeptide" in pr + assert "Protein" in pr assert 'm1\t"StringProperty1"\t9606\t"m1"\t"mirbase"' in mi - assert 'ChemicalEntity' in mi + assert "ChemicalEntity" in mi @pytest.mark.requires_postgresql -@pytest.mark.parametrize('l', [4], scope='module') +@pytest.mark.parametrize("l", [4], scope="module") def test_database_import_node_data_from_gen_comma_postgresql( bw_comma_postgresql, _get_nodes, create_database_postgres ): - dbname, user, port, password, create_database_success = create_database_postgres + ( + dbname, + user, + port, + password, + create_database_success, + ) = create_database_postgres assert create_database_success nodes = _get_nodes @@ -88,8 +94,10 @@ def node_gen(nodes): # verify that all files have been created assert set(os.listdir(tmp_path)) == set( [ - 'protein-create_table.sql', 'Protein-part000.csv', - 'microrna-create_table.sql', 'MicroRNA-part000.csv' + "protein-create_table.sql", + "Protein-part000.csv", + "microrna-create_table.sql", + "MicroRNA-part000.csv", ] ) @@ -97,7 +105,8 @@ def node_gen(nodes): # verify that import call has been created import_scripts = [ name - for name in os.listdir(tmp_path) if name.endswith('-import-call.sh') + for name in os.listdir(tmp_path) + if name.endswith("-import-call.sh") ] assert len(import_scripts) == 1 @@ -112,32 +121,38 @@ def node_gen(nodes): assert result.returncode == 0 # check data in the databases - command = f'PGPASSWORD={password} psql -c \'SELECT COUNT(*) FROM protein;\' --dbname {dbname} --port {port} --user {user}' + command = f"PGPASSWORD={password} psql -c 'SELECT COUNT(*) FROM protein;' --dbname {dbname} --port {port} --user {user}" result = subprocess.run( command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT ) # subprocess success assert result.returncode == 0 # 4 entires in table - assert '4' in result.stdout.decode() + assert "4" in result.stdout.decode() # check data in the databases - command = f'PGPASSWORD={password} psql -c \'SELECT COUNT(*) FROM microrna;\' --dbname {dbname} --port {port} --user {user}' + command = f"PGPASSWORD={password} psql -c 'SELECT COUNT(*) FROM microrna;' --dbname {dbname} --port {port} --user {user}" result = subprocess.run( command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT ) # subprocess success assert result.returncode == 0 # 4 entires in table - assert '4' in result.stdout.decode() + assert "4" in result.stdout.decode() @pytest.mark.requires_postgresql -@pytest.mark.parametrize('l', [5], scope='module') +@pytest.mark.parametrize("l", [5], scope="module") def test_database_import_node_data_from_gen_tab_postgresql( bw_tab_postgresql, _get_nodes, create_database_postgres ): - dbname, user, port, password, create_database_success = create_database_postgres + ( + dbname, + user, + port, + password, + create_database_success, + ) = create_database_postgres assert create_database_success nodes = _get_nodes @@ -152,8 +167,10 @@ def node_gen(nodes): # verify that all files have been created assert set(os.listdir(tmp_path)) == set( [ - 'protein-create_table.sql', 'Protein-part000.csv', - 'microrna-create_table.sql', 'MicroRNA-part000.csv' + "protein-create_table.sql", + "Protein-part000.csv", + "microrna-create_table.sql", 
+ "MicroRNA-part000.csv", ] ) @@ -161,7 +178,8 @@ def node_gen(nodes): # verify that import call has been created import_scripts = [ name - for name in os.listdir(tmp_path) if name.endswith('-import-call.sh') + for name in os.listdir(tmp_path) + if name.endswith("-import-call.sh") ] assert len(import_scripts) == 1 @@ -176,32 +194,38 @@ def node_gen(nodes): assert result.returncode == 0 # check data in the databases - command = f'PGPASSWORD={password} psql -c \'SELECT COUNT(*) FROM protein;\' --dbname {dbname} --port {port} --user {user}' + command = f"PGPASSWORD={password} psql -c 'SELECT COUNT(*) FROM protein;' --dbname {dbname} --port {port} --user {user}" result = subprocess.run( command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT ) # subprocess success assert result.returncode == 0 # 5 entires in table - assert '5' in result.stdout.decode() + assert "5" in result.stdout.decode() # check data in the databases - command = f'PGPASSWORD={password} psql -c \'SELECT COUNT(*) FROM microrna;\' --dbname {dbname} --port {port} --user {user}' + command = f"PGPASSWORD={password} psql -c 'SELECT COUNT(*) FROM microrna;' --dbname {dbname} --port {port} --user {user}" result = subprocess.run( command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT ) # subprocess success assert result.returncode == 0 # 5 entires in table - assert '5' in result.stdout.decode() + assert "5" in result.stdout.decode() @pytest.mark.requires_postgresql -@pytest.mark.parametrize('l', [8], scope='module') +@pytest.mark.parametrize("l", [8], scope="module") def test_database_import_edge_data_from_gen_comma_postgresql( bw_comma_postgresql, _get_nodes, create_database_postgres, _get_edges ): - dbname, user, port, password, create_database_success = create_database_postgres + ( + dbname, + user, + port, + password, + create_database_success, + ) = create_database_postgres assert create_database_success edges = _get_edges @@ -227,7 +251,8 @@ def edge_gen2(edges): # verify that import call has been created import_scripts = [ name - for name in os.listdir(tmp_path) if name.endswith('-import-call.sh') + for name in os.listdir(tmp_path) + if name.endswith("-import-call.sh") ] assert len(import_scripts) == 1 @@ -237,9 +262,9 @@ def edge_gen2(edges): commands = f.readlines() assert len(commands) > 0 - assert str(tmp_path) in '\n'.join(commands) - assert 'protein-create_table.sql' in '\n'.join(commands) - assert '--user' in '\n'.join(commands) + assert str(tmp_path) in "\n".join(commands) + assert "protein-create_table.sql" in "\n".join(commands) + assert "--user" in "\n".join(commands) for command in commands: result = subprocess.run(command, shell=True) @@ -247,31 +272,37 @@ def edge_gen2(edges): assert result.returncode == 0 # check data in the databases - command = f'PGPASSWORD={password} psql -c \'SELECT COUNT(*) FROM is_mutated_in;\' --dbname {dbname} --port {port} --user {user}' + command = f"PGPASSWORD={password} psql -c 'SELECT COUNT(*) FROM is_mutated_in;' --dbname {dbname} --port {port} --user {user}" result = subprocess.run( command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT ) # subprocess success assert result.returncode == 0 # 2 entries in table - assert '8' in result.stdout.decode() + assert "8" in result.stdout.decode() - command = f'PGPASSWORD={password} psql -c \'SELECT COUNT(*) FROM perturbed_in_disease;\' --dbname {dbname} --port {port} --user {user}' + command = f"PGPASSWORD={password} psql -c 'SELECT COUNT(*) FROM perturbed_in_disease;' --dbname {dbname} --port 
{port} --user {user}" result = subprocess.run( command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT ) # subprocess success assert result.returncode == 0 # 2 entries in table - assert '8' in result.stdout.decode() + assert "8" in result.stdout.decode() @pytest.mark.requires_postgresql -@pytest.mark.parametrize('l', [8], scope='module') +@pytest.mark.parametrize("l", [8], scope="module") def test_database_import_edge_data_from_gen_tab_postgresql( bw_tab_postgresql, _get_nodes, create_database_postgres, _get_edges ): - dbname, user, port, password, create_database_success = create_database_postgres + ( + dbname, + user, + port, + password, + create_database_success, + ) = create_database_postgres assert create_database_success edges = _get_edges @@ -297,7 +328,8 @@ def edge_gen2(edges): # verify that import call has been created import_scripts = [ name - for name in os.listdir(tmp_path) if name.endswith('-import-call.sh') + for name in os.listdir(tmp_path) + if name.endswith("-import-call.sh") ] assert len(import_scripts) == 1 @@ -307,29 +339,29 @@ def edge_gen2(edges): commands = f.readlines() assert len(commands) > 1 - assert str(tmp_path) in '\n'.join(commands) - assert 'protein-create_table.sql' in '\n'.join(commands) - assert '--user' in '\n'.join(commands) + assert str(tmp_path) in "\n".join(commands) + assert "protein-create_table.sql" in "\n".join(commands) + assert "--user" in "\n".join(commands) for command in commands: result = subprocess.run(command, shell=True) assert result.returncode == 0 # check data in the databases - command = f'PGPASSWORD={password} psql -c \'SELECT COUNT(*) FROM is_mutated_in;\' --dbname {dbname} --port {port} --user {user}' + command = f"PGPASSWORD={password} psql -c 'SELECT COUNT(*) FROM is_mutated_in;' --dbname {dbname} --port {port} --user {user}" result = subprocess.run( command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT ) # subprocess success assert result.returncode == 0 # 2 entires in table - assert '8' in result.stdout.decode() + assert "8" in result.stdout.decode() - command = f'PGPASSWORD={password} psql -c \'SELECT COUNT(*) FROM perturbed_in_disease;\' --dbname {dbname} --port {port} --user {user}' + command = f"PGPASSWORD={password} psql -c 'SELECT COUNT(*) FROM perturbed_in_disease;' --dbname {dbname} --port {port} --user {user}" result = subprocess.run( command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT ) # subprocess success assert result.returncode == 0 # 2 entires in table - assert '8' in result.stdout.decode() + assert "8" in result.stdout.decode() diff --git a/tutorial/01__basic_import.py b/tutorial/01__basic_import.py index 7bfdb0ec..861f7abe 100644 --- a/tutorial/01__basic_import.py +++ b/tutorial/01__basic_import.py @@ -17,8 +17,8 @@ def node_generator(): # Create BioCypher driver bc = BioCypher( - biocypher_config_path='tutorial/01_biocypher_config.yaml', - schema_config_path='tutorial/01_schema_config.yaml', + biocypher_config_path="tutorial/01_biocypher_config.yaml", + schema_config_path="tutorial/01_schema_config.yaml", ) # Run the import bc.write_nodes(node_generator()) @@ -27,5 +27,5 @@ def node_generator(): bc.write_import_call() -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/tutorial/01__basic_import_pandas.py b/tutorial/01__basic_import_pandas.py index b0b26a4a..e12c4cc9 100644 --- a/tutorial/01__basic_import_pandas.py +++ b/tutorial/01__basic_import_pandas.py @@ -17,12 +17,13 @@ def node_generator(): # Create BioCypher driver bc = BioCypher( 
-        biocypher_config_path='tutorial/01_biocypher_config.yaml',
-        schema_config_path='tutorial/01_schema_config.yaml',
+        biocypher_config_path="tutorial/01_biocypher_config.yaml",
+        schema_config_path="tutorial/01_schema_config.yaml",
     )
     # Run the import
     bc.add(node_generator())
     bc.to_df()

-if __name__ == '__main__':
+
+if __name__ == "__main__":
     main()
diff --git a/tutorial/02__merge.py b/tutorial/02__merge.py
index 245be65f..7141f133 100644
--- a/tutorial/02__merge.py
+++ b/tutorial/02__merge.py
@@ -5,10 +5,12 @@ def main():
     # Setup: create a list of proteins to be imported
     proteins = [
-        p for sublist in zip(
+        p
+        for sublist in zip(
             [Protein() for _ in range(10)],
             [EntrezProtein() for _ in range(10)],
-        ) for p in sublist
+        )
+        for p in sublist
     ]

     # Extract id, label, and property dictionary
@@ -22,8 +24,8 @@ def node_generator():

     # Create BioCypher driver
     bc = BioCypher(
-        biocypher_config_path='tutorial/02_biocypher_config.yaml',
-        schema_config_path='tutorial/02_schema_config.yaml',
+        biocypher_config_path="tutorial/02_biocypher_config.yaml",
+        schema_config_path="tutorial/02_schema_config.yaml",
     )
     # Run the import
     bc.write_nodes(node_generator())
@@ -32,5 +34,5 @@ def node_generator():
     bc.write_import_call()


-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/tutorial/02__merge_pandas.py b/tutorial/02__merge_pandas.py
index a3583bb6..6f9aa126 100644
--- a/tutorial/02__merge_pandas.py
+++ b/tutorial/02__merge_pandas.py
@@ -5,10 +5,12 @@ def main():
     # Setup: create a list of proteins to be imported
     proteins = [
-        p for sublist in zip(
+        p
+        for sublist in zip(
             [Protein() for _ in range(10)],
             [EntrezProtein() for _ in range(10)],
-        ) for p in sublist
+        )
+        for p in sublist
     ]

     # Extract id, label, and property dictionary
@@ -22,8 +24,8 @@ def node_generator():

     # Create BioCypher driver
     bc = BioCypher(
-        biocypher_config_path='tutorial/02_biocypher_config.yaml',
-        schema_config_path='tutorial/02_schema_config.yaml',
+        biocypher_config_path="tutorial/02_biocypher_config.yaml",
+        schema_config_path="tutorial/02_schema_config.yaml",
     )
     # Run the import
     bc.add(node_generator())
@@ -31,5 +33,5 @@ def node_generator():
     print(bc.to_df())


-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/tutorial/03__implicit_subclass.py b/tutorial/03__implicit_subclass.py
index 7ba277be..1c43269f 100644
--- a/tutorial/03__implicit_subclass.py
+++ b/tutorial/03__implicit_subclass.py
@@ -5,10 +5,12 @@ def main():
     # Setup: create a list of proteins to be imported
     proteins = [
-        p for sublist in zip(
+        p
+        for sublist in zip(
             [Protein() for _ in range(10)],
             [EntrezProtein() for _ in range(10)],
-        ) for p in sublist
+        )
+        for p in sublist
     ]

     # Extract id, label, and property dictionary
@@ -22,8 +24,8 @@ def node_generator():

     # Create BioCypher driver
     bc = BioCypher(
-        biocypher_config_path='tutorial/03_biocypher_config.yaml',
-        schema_config_path='tutorial/03_schema_config.yaml',
+        biocypher_config_path="tutorial/03_biocypher_config.yaml",
+        schema_config_path="tutorial/03_schema_config.yaml",
     )
     # Run the import
     bc.write_nodes(node_generator())
@@ -32,5 +34,5 @@ def node_generator():
     bc.write_import_call()


-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/tutorial/03__implicit_subclass_pandas.py b/tutorial/03__implicit_subclass_pandas.py
index 0f7be14c..709984a1 100644
--- a/tutorial/03__implicit_subclass_pandas.py
+++ b/tutorial/03__implicit_subclass_pandas.py
@@ -5,10 +5,12 @@ def main():
     # Setup: create a list of proteins to be imported
     proteins = [
-        p for sublist in zip(
+        p
+        for sublist in zip(
             [Protein() for _ in range(10)],
             [EntrezProtein() for _ in range(10)],
-        ) for p in sublist
+        )
+        for p in sublist
     ]

     # Extract id, label, and property dictionary
@@ -22,8 +24,8 @@ def node_generator():

     # Create BioCypher driver
     bc = BioCypher(
-        biocypher_config_path='tutorial/03_biocypher_config.yaml',
-        schema_config_path='tutorial/03_schema_config.yaml',
+        biocypher_config_path="tutorial/03_biocypher_config.yaml",
+        schema_config_path="tutorial/03_schema_config.yaml",
     )
     # Run the import
     bc.add(node_generator())
@@ -33,5 +35,5 @@ def node_generator():
     print(df)


-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/tutorial/04__properties.py b/tutorial/04__properties.py
index 24a268be..9c3e683c 100644
--- a/tutorial/04__properties.py
+++ b/tutorial/04__properties.py
@@ -5,10 +5,12 @@ def main():
     # Setup: create a list of proteins to be imported
     proteins = [
-        p for sublist in zip(
+        p
+        for sublist in zip(
             [RandomPropertyProtein() for _ in range(10)],
             [EntrezProtein() for _ in range(10)],
-        ) for p in sublist
+        )
+        for p in sublist
     ]

     # Extract id, label, and property dictionary
@@ -22,8 +24,8 @@ def node_generator():

     # Create BioCypher driver
     bc = BioCypher(
-        biocypher_config_path='tutorial/04_biocypher_config.yaml',
-        schema_config_path='tutorial/04_schema_config.yaml',
+        biocypher_config_path="tutorial/04_biocypher_config.yaml",
+        schema_config_path="tutorial/04_schema_config.yaml",
     )
     # Run the import
     bc.write_nodes(node_generator())
@@ -32,5 +34,5 @@ def node_generator():
     bc.write_import_call()


-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/tutorial/04__properties_pandas.py b/tutorial/04__properties_pandas.py
index 052f1428..fc39e294 100644
--- a/tutorial/04__properties_pandas.py
+++ b/tutorial/04__properties_pandas.py
@@ -5,10 +5,12 @@ def main():
     # Setup: create a list of proteins to be imported
     proteins = [
-        p for sublist in zip(
+        p
+        for sublist in zip(
             [RandomPropertyProtein() for _ in range(10)],
             [EntrezProtein() for _ in range(10)],
-        ) for p in sublist
+        )
+        for p in sublist
     ]

     # Extract id, label, and property dictionary
@@ -22,8 +24,8 @@ def node_generator():

     # Create BioCypher driver
     bc = BioCypher(
-        biocypher_config_path='tutorial/04_biocypher_config.yaml',
-        schema_config_path='tutorial/04_schema_config.yaml',
+        biocypher_config_path="tutorial/04_biocypher_config.yaml",
+        schema_config_path="tutorial/04_schema_config.yaml",
     )
     # Run the import
     bc.add(node_generator())
@@ -33,5 +35,5 @@ def node_generator():
     print(df)


-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/tutorial/05__property_inheritance.py b/tutorial/05__property_inheritance.py
index 4baadbf0..2f7577a9 100644
--- a/tutorial/05__property_inheritance.py
+++ b/tutorial/05__property_inheritance.py
@@ -9,11 +9,13 @@ def main():
     # Setup: create a list of proteins to be imported
     proteins = [
-        p for sublist in zip(
+        p
+        for sublist in zip(
             [RandomPropertyProtein() for _ in range(10)],
             [RandomPropertyProteinIsoform() for _ in range(10)],
             [EntrezProtein() for _ in range(10)],
-        ) for p in sublist
+        )
+        for p in sublist
     ]

     # Extract id, label, and property dictionary
@@ -27,8 +29,8 @@ def node_generator():

     # Create BioCypher driver
     bc = BioCypher(
-        biocypher_config_path='tutorial/05_biocypher_config.yaml',
-        schema_config_path='tutorial/05_schema_config.yaml',
+        biocypher_config_path="tutorial/05_biocypher_config.yaml",
+        schema_config_path="tutorial/05_schema_config.yaml",
     )
     # Run the import
     bc.write_nodes(node_generator())
@@ -37,5 +39,5 @@ def node_generator():
     bc.write_import_call()


-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/tutorial/05__property_inheritance_pandas.py b/tutorial/05__property_inheritance_pandas.py
index 3e11f074..b0ccf2ec 100644
--- a/tutorial/05__property_inheritance_pandas.py
+++ b/tutorial/05__property_inheritance_pandas.py
@@ -9,11 +9,13 @@ def main():
     # Setup: create a list of proteins to be imported
     proteins = [
-        p for sublist in zip(
+        p
+        for sublist in zip(
             [RandomPropertyProtein() for _ in range(10)],
             [RandomPropertyProteinIsoform() for _ in range(10)],
             [EntrezProtein() for _ in range(10)],
-        ) for p in sublist
+        )
+        for p in sublist
     ]

     # Extract id, label, and property dictionary
@@ -27,8 +29,8 @@ def node_generator():

     # Create BioCypher driver
     bc = BioCypher(
-        biocypher_config_path='tutorial/05_biocypher_config.yaml',
-        schema_config_path='tutorial/05_schema_config.yaml',
+        biocypher_config_path="tutorial/05_biocypher_config.yaml",
+        schema_config_path="tutorial/05_schema_config.yaml",
     )
     # Run the import
     bc.add(node_generator())
@@ -38,5 +40,5 @@ def node_generator():
     print(df)


-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/tutorial/06__relationships.py b/tutorial/06__relationships.py
index 364377bc..768746ad 100644
--- a/tutorial/06__relationships.py
+++ b/tutorial/06__relationships.py
@@ -10,11 +10,13 @@ def main():
     # Setup: create a list of proteins to be imported
     proteins = [
-        p for sublist in zip(
+        p
+        for sublist in zip(
             [RandomPropertyProtein() for _ in range(10)],
             [RandomPropertyProteinIsoform() for _ in range(10)],
             [EntrezProtein() for _ in range(10)],
-        ) for p in sublist
+        )
+        for p in sublist
     ]

     # Extract id, label, and property dictionary
@@ -45,8 +47,8 @@ def edge_generator():

     # Create BioCypher driver
     bc = BioCypher(
-        biocypher_config_path='tutorial/06_biocypher_config.yaml',
-        schema_config_path='tutorial/06_schema_config.yaml',
+        biocypher_config_path="tutorial/06_biocypher_config.yaml",
+        schema_config_path="tutorial/06_schema_config.yaml",
     )
     # Run the import
     bc.write_nodes(node_generator())
@@ -59,5 +61,5 @@ def edge_generator():
     bc.show_ontology_structure()


-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/tutorial/06__relationships_pandas.py b/tutorial/06__relationships_pandas.py
index b839fef1..b67a5b4d 100644
--- a/tutorial/06__relationships_pandas.py
+++ b/tutorial/06__relationships_pandas.py
@@ -10,11 +10,13 @@ def main():
     # Setup: create a list of proteins to be imported
     proteins = [
-        p for sublist in zip(
+        p
+        for sublist in zip(
             [RandomPropertyProtein() for _ in range(10)],
             [RandomPropertyProteinIsoform() for _ in range(10)],
             [EntrezProtein() for _ in range(10)],
-        ) for p in sublist
+        )
+        for p in sublist
     ]

     # Extract id, label, and property dictionary
@@ -45,16 +47,17 @@ def edge_generator():

     # Create BioCypher driver
     bc = BioCypher(
-        biocypher_config_path='tutorial/06_biocypher_config.yaml',
-        schema_config_path='tutorial/06_schema_config_pandas.yaml',
+        biocypher_config_path="tutorial/06_biocypher_config.yaml",
+        schema_config_path="tutorial/06_schema_config_pandas.yaml",
     )
     # Run the import
     bc.add(node_generator())
     bc.add(edge_generator())
-
+
     for name, df in bc.to_df().items():
         print(name)
         print(df)


-if __name__ == '__main__':
+
+if __name__ == "__main__":
     main()
diff --git a/tutorial/07__synonyms.py b/tutorial/07__synonyms.py
index a6fa7cff..c7af403e 100644
--- a/tutorial/07__synonyms.py
+++ b/tutorial/07__synonyms.py
@@ -11,12 +11,14 @@ def main():
     # Setup: create a list of proteins to be imported
     proteins_complexes = [
-        p for sublist in zip(
+        p
+        for sublist in zip(
             [RandomPropertyProtein() for _ in range(10)],
             [RandomPropertyProteinIsoform() for _ in range(10)],
             [EntrezProtein() for _ in range(10)],
             [Complex() for _ in range(10)],
-        ) for p in sublist
+        )
+        for p in sublist
     ]

     # Extract id, label, and property dictionary
@@ -47,8 +49,8 @@ def edge_generator():

     # Create BioCypher driver
     bc = BioCypher(
-        biocypher_config_path='tutorial/07_biocypher_config.yaml',
-        schema_config_path='tutorial/07_schema_config.yaml',
+        biocypher_config_path="tutorial/07_biocypher_config.yaml",
+        schema_config_path="tutorial/07_schema_config.yaml",
     )
     # Run the import
     bc.write_nodes(node_generator())
@@ -61,5 +63,5 @@ def edge_generator():
     bc.summary()


-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/tutorial/07__synonyms_pandas.py b/tutorial/07__synonyms_pandas.py
index 9b694514..95ce0a13 100644
--- a/tutorial/07__synonyms_pandas.py
+++ b/tutorial/07__synonyms_pandas.py
@@ -11,12 +11,14 @@ def main():
     # Setup: create a list of proteins to be imported
     proteins_complexes = [
-        p for sublist in zip(
+        p
+        for sublist in zip(
             [RandomPropertyProtein() for _ in range(10)],
             [RandomPropertyProteinIsoform() for _ in range(10)],
             [EntrezProtein() for _ in range(10)],
             [Complex() for _ in range(10)],
-        ) for p in sublist
+        )
+        for p in sublist
     ]

     # Extract id, label, and property dictionary
@@ -47,8 +49,8 @@ def edge_generator():

     # Create BioCypher driver
     bc = BioCypher(
-        biocypher_config_path='tutorial/07_biocypher_config.yaml',
-        schema_config_path='tutorial/07_schema_config_pandas.yaml',
+        biocypher_config_path="tutorial/07_biocypher_config.yaml",
+        schema_config_path="tutorial/07_schema_config_pandas.yaml",
     )
     # Run the import
     bc.add(node_generator())
@@ -62,5 +64,5 @@ def edge_generator():
     bc.summary()


-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/tutorial/data_generator.py b/tutorial/data_generator.py
index 7f805252..27f18b48 100644
--- a/tutorial/data_generator.py
+++ b/tutorial/data_generator.py
@@ -6,14 +6,14 @@
 import string

 __all__ = [
-    'EntrezProtein',
-    'Interaction',
-    'InteractionGenerator',
-    'Node',
-    'Protein',
-    'ProteinProteinInteraction',
-    'RandomPropertyProtein',
-    'RandomPropertyProteinIsoform',
+    "EntrezProtein",
+    "Interaction",
+    "InteractionGenerator",
+    "Node",
+    "Protein",
+    "ProteinProteinInteraction",
+    "RandomPropertyProtein",
+    "RandomPropertyProteinIsoform",
 ]
@@ -21,6 +21,7 @@ class Node:
     """
     Base class for nodes.
     """
+
     def __init__(self):
         self.id = None
         self.label = None
@@ -49,9 +50,10 @@ class Protein(Node):
     """
     Generates instances of proteins.
""" + def __init__(self): self.id = self._generate_id() - self.label = 'uniprot_protein' + self.label = "uniprot_protein" self.properties = self._generate_properties() def _generate_id(self): @@ -62,7 +64,7 @@ def _generate_id(self): nums = [random.choice(string.digits) for _ in range(3)] # join alternating between lets and nums - return ''.join([x for y in zip(lets, nums) for x in y]) + return "".join([x for y in zip(lets, nums) for x in y]) def _generate_properties(self): properties = {} @@ -72,17 +74,17 @@ def _generate_properties(self): # random int between 50 and 250 l = random.randint(50, 250) - properties['sequence'] = ''.join( - [random.choice('ACDEFGHIKLMNPQRSTVWY') for _ in range(l)], + properties["sequence"] = "".join( + [random.choice("ACDEFGHIKLMNPQRSTVWY") for _ in range(l)], ) ## random description - properties['description'] = ' '.join( + properties["description"] = " ".join( [random.choice(string.ascii_lowercase) for _ in range(10)], ) ## taxon - properties['taxon'] = '9606' + properties["taxon"] = "9606" return properties @@ -91,9 +93,10 @@ class Complex(Node): """ Generates instances of complexes. """ + def __init__(self): self.id = self._generate_id() - self.label = 'complex' + self.label = "complex" self.properties = self._generate_properties() def _generate_id(self): @@ -109,12 +112,12 @@ def _generate_properties(self): properties = {} ## random description - properties['description'] = ' '.join( + properties["description"] = " ".join( [random.choice(string.ascii_lowercase) for _ in range(10)], ) ## taxon - properties['taxon'] = '9606' + properties["taxon"] = "9606" return properties @@ -123,6 +126,7 @@ class RandomPropertyProtein(Protein): """ Generates instances of proteins with random properties. """ + def _generate_properties(self): properties = {} @@ -131,21 +135,21 @@ def _generate_properties(self): # random int between 50 and 250 l = random.randint(50, 250) - properties['sequence'] = ''.join( - [random.choice('ACDEFGHIKLMNPQRSTVWY') for _ in range(l)], + properties["sequence"] = "".join( + [random.choice("ACDEFGHIKLMNPQRSTVWY") for _ in range(l)], ) ## random description - properties['description'] = ' '.join( + properties["description"] = " ".join( [random.choice(string.ascii_lowercase) for _ in range(10)], ) ## random taxon - properties['taxon'] = str(random.randint(0, 10000)) + properties["taxon"] = str(random.randint(0, 10000)) ## randomly add 'mass' if random.random() > 0.5: - properties['mass'] = random.randint(0, 10000) + properties["mass"] = random.randint(0, 10000) return properties @@ -154,19 +158,21 @@ class RandomPropertyProteinIsoform(RandomPropertyProtein): """ Generates instances of protein isoforms with random properties. """ + def __init__(self): super().__init__() - self.label = 'uniprot_isoform' + self.label = "uniprot_isoform" class EntrezProtein(Protein): """ Generates instances of proteins with Entrez IDs. """ + def __init__(self): super().__init__() self.id = self._generate_id() - self.label = 'entrez_protein' + self.label = "entrez_protein" def _generate_id(self): """ @@ -179,6 +185,7 @@ class Interaction: """ Base class for interactions. """ + def __init__(self): self.id = None self.source_id = None @@ -222,12 +229,13 @@ class ProteinProteinInteraction(Interaction): Simulates interactions between proteins given a source and target protein IDs. Occasionally generates an ID for the interaction itself. 
""" + def __init__(self, source, target): super().__init__() self.id = self._generate_id() self.source_id = source self.target_id = target - self.label = 'interacts_with' + self.label = "interacts_with" self.properties = self._generate_properties() def _generate_id(self): @@ -237,18 +245,18 @@ def _generate_id(self): if random.random() > 0.5: return None else: - return 'intact' + str(random.randint(1, 1000000)) + return "intact" + str(random.randint(1, 1000000)) def _generate_properties(self): properties = {} ## randomly add 'source' if random.random() > 0.5: - properties['source'] = random.choice(['intact', 'signor']) + properties["source"] = random.choice(["intact", "signor"]) ## randomly add 'method' if random.random() > 0.5: - properties['method'] = ' '.join( + properties["method"] = " ".join( [random.choice(string.ascii_lowercase) for _ in range(10)], ) @@ -260,6 +268,7 @@ class InteractionGenerator: Simulates interactions given a list of potential interactors based on an interaction probability or probability distribution. """ + def __init__(self, interactors: list, interaction_probability: float): self.interactors = interactors self.interaction_probability = interaction_probability