diff --git a/.mergify.yml b/.mergify.yml deleted file mode 100644 index 9d432bba0..000000000 --- a/.mergify.yml +++ /dev/null @@ -1,16 +0,0 @@ -pull_request_rules: - - name: Automatic update for PRs - conditions: - - -conflict # skip PRs with conflicts - - -draft # filter-out GH draft PRs - actions: - update: - - name: Warn author on conflicts - conditions: - - conflict - actions: - comment: - message: "@{{author}} esse pull request tem conflitos 😩" - label: - add: - - conflict \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f75737485..b0c342d2a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -21,20 +21,5 @@ repos: rev: v0.2.0 hooks: - id: ruff - args: [--fix] + args: [--fix, --select, I] - id: ruff-format - - - repo: https://github.com/returntocorp/semgrep - rev: v1.30.0 - hooks: - - id: semgrep - language: python - args: [ - "--error", - "--config", - "auto", - "--exclude-rule", - "python.lang.security.audit.subprocess-shell-true.subprocess-shell-true", - "--exclude-rule", - "yaml.github-actions.security.third-party-action-not-pinned-to-commit-sha.third-party-action-not-pinned-to-commit-sha", - ] diff --git a/docs/docs/api_reference_python.md b/docs/docs/api_reference_python.md index 8e98f5edf..a70b32eaa 100644 --- a/docs/docs/api_reference_python.md +++ b/docs/docs/api_reference_python.md @@ -4,8 +4,7 @@ Esta API é composta por funções com 2 tipos de funcionalidade: - Módulos para **requisição de dados**: para aquele(as) que desejam - somente consultar os dados e metadados do nosso projeto (ou qualquer outro - projeto no Google Cloud). + somente consultar os dados e metadados do nosso projeto. - Classes para **gerenciamento de dados** no Google Cloud: para aqueles(as) que desejam subir dados no nosso projeto (ou qualquer outro @@ -15,7 +14,7 @@ Esta API é composta por funções com 2 tipos de funcionalidade: ## Módulos (Requisição de dados) -::: basedosdados.download.download +::: basedosdados.download.metadata handler: python rendering: show_root_heading: no @@ -25,7 +24,7 @@ Esta API é composta por funções com 2 tipos de funcionalidade: docstring_options: replace_admonitions: no -::: basedosdados.download.metadata +::: basedosdados.download.download handler: python rendering: show_root_heading: no diff --git a/python-package/basedosdados/__init__.py b/python-package/basedosdados/__init__.py index 0fa9067c2..6e816a927 100644 --- a/python-package/basedosdados/__init__.py +++ b/python-package/basedosdados/__init__.py @@ -1,7 +1,6 @@ """ Importing the module will automatically import the submodules. """ -# flake8: noqa import os import sys @@ -14,15 +13,11 @@ from basedosdados.backend import Backend from basedosdados.constants import config, constants -from basedosdados.download.base import reauth from basedosdados.download.download import download, read_sql, read_table from basedosdados.download.metadata import ( - get_dataset_description, - get_table_columns, - get_table_description, - get_table_size, - list_dataset_tables, - list_datasets, + get_columns, + get_datasets, + get_tables, search, ) from basedosdados.upload.connection import Connection diff --git a/python-package/basedosdados/backend.py b/python-package/basedosdados/backend.py new file mode 100644 index 000000000..c5bb8429c --- /dev/null +++ b/python-package/basedosdados/backend.py @@ -0,0 +1,497 @@ +""" +Module for interacting with the backend. 
+""" +from typing import Any, Dict + +from loguru import logger +from requests import get + +try: + from gql import Client, gql + from gql.transport.requests import RequestsHTTPTransport + + _backend_dependencies = True +except ImportError: + _backend_dependencies = False + +from basedosdados.constants import constants +from basedosdados.exceptions import ( + BaseDosDadosException, + BaseDosDadosMissingDependencyException, +) + + +class SingletonMeta(type): + """Singleton Meta to avoid multiple instances of a class""" + + _instances = {} + + def __call__(cls, *args, **kwargs): + if cls not in cls._instances: + cls._instances[cls] = super().__call__(*args, **kwargs) + return cls._instances[cls] + + +class Backend(metaclass=SingletonMeta): + def __init__(self, search_url: str = None, graphql_url: str = None): + """ + Backend class to communicate with the backend. + + Args: + graphql_url (str): URL of the GraphQL endpoint. + """ + self.search_url: str = search_url or constants.BACKEND_SEARCH_URL.value + self.graphql_url: str = graphql_url or constants.BACKEND_GRAPHQL_URL.value + self.graphql_client: "Client" = self._get_client() + + def get_datasets( + self, + dataset_id: str = None, + dataset_name: str = None, + page: int = 1, + page_size: int = 10, + ): + """ + Get a list of available datasets, + either by `dataset_id` or `dataset_name` + + Args: + dataset_id(str): dataset slug in google big query (gbq). + dataset_name(str): dataset name in base dos dados metadata. + + page(int): page for pagination. + page_size(int): page size for pagination. + backend(Backend): backend instance, injected automatically. + + Returns: + list[dict]: List of datasets. + """ + + query = """ + query ($first: Int!, $offset: Int!) { + allDataset(first: $first, offset: $offset) { + edges { + node { + slug + name + description + organization { + name + } + tags { + edges { + node { + name + } + } + } + themes { + edges { + node { + name + } + } + } + createdAt + updatedAt + } + } + totalCount + } + } + """ + variables = {"first": page_size, "offset": (page - 1) * page_size} + + extra = None + if dataset_id: + extra = f'id: "{dataset_id}"' + if dataset_name: + extra = f'name_Icontains: "{dataset_name}"' + if extra: + query = query.replace("$offset)", f"$offset, {extra})") + + return self._execute_query(query, variables, page, page_size).get("allDataset") + + def get_tables( + self, + dataset_id: str = None, + table_id: str = None, + table_name: str = None, + page: int = 1, + page_size: int = 10, + ): + """ + Get a list of available tables, + either by `dataset_id`, `table_id` or `table_name` + + Args: + dataset_id(str): dataset slug in google big query (gbq). + table_id(str): table slug in google big query (gbq). + table_name(str): table name in base dos dados metadata. + + page(int): page for pagination. + page_size(int): page size for pagination. + backend(Backend): backend instance, injected automatically. + + Returns: + list[dict]: List of tables. + """ + + query = """ + query ($first: Int!, $offset: Int!) 
{ + allTable(first: $first, offset: $offset) { + edges { + node { + slug + name + description + numberRows + numberColumns + uncompressedFileSize + + } + } + totalCount + } + } + """ + variables = {"first": page_size, "offset": (page - 1) * page_size} + + extra = None + if table_id: + extra = f'id: "{table_id}"' + if dataset_id: + extra = f'dataset_id: "{dataset_id}"' + if table_name: + extra = f'name_Icontains: "{table_name}"' + if extra: + query = query.replace("$offset)", f"$offset, {extra})") + + return self._execute_query(query, variables, page, page_size).get("allTable") + + def get_columns( + self, + table_id: str = None, + column_id: str = None, + column_name: str = None, + page: int = 1, + page_size: int = 10, + ): + """ + Get a list of available columns, + either by `table_id`, `column_id` or `column_name` + + Args: + table_id(str): table slug in google big query (gbq). + column_id(str): column slug in google big query (gbq). + column_name(str): table name in base dos dados metadata. + + page(int): page for pagination. + page_size(int): page size for pagination. + backend(Backend): backend instance, injected automatically. + + Returns: + list[dict]: List of tables. + """ + + query = """ + query ($first: Int!, $offset: Int!) { + allColumn(first: $first, offset: $offset) { + edges { + node { + name + description + observations + bigqueryType { + name + } + } + } + totalCount + } + } + """ + variables = {"first": page_size, "offset": (page - 1) * page_size} + + extra = None + if column_id: + extra = f'id: "{column_id}"' + if table_id: + extra = f'table_id: "{table_id}"' + if column_name: + extra = f'name_Icontains: "{column_name}"' + if extra: + query = query.replace("$offset)", f"$offset, {extra})") + + return self._execute_query(query, variables, page, page_size).get("allColumn") + + def search(self, q: str = None, page: int = 1, page_size: int = 10) -> list[dict]: + """ + Search for datasets, querying all available metadata for the term `q` + + Args: + q(str): search term. + + page(int): page for pagination. + page_size(int): page size for pagination. + backend(Backend): backend instance, injected automatically. + + Returns: + dict: page of tables. + """ + response = get( + url=self.search_url, + params={"q": q, "page": page, "page_size": page_size}, + ) + if response.status_code not in [200]: + raise BaseDosDadosException(response.text) + return response.json() + + def get_dataset_config(self, dataset_id: str) -> Dict[str, Any]: + """ + Get dataset configuration. + Args: + dataset_id (str): The ID for the dataset. + Returns: + Dict: Dataset configuration. + """ + query = """ + query ($dataset_id: ID!){ + allDataset(id: $dataset_id) { + edges { + node { + slug + name + descriptionPt + createdAt + updatedAt + themes { + edges { + node { + namePt + } + } + } + tags { + edges { + node { + namePt + } + } + } + organization { + namePt + } + } + } + } + } + """ + dataset_id = self._get_dataset_id_from_name(dataset_id) + if dataset_id: + variables = {"dataset_id": dataset_id} + response = self._execute_query(query=query, variables=variables) + return self._simplify_graphql_response(response).get("allDataset")[0] + else: + return {} + + def get_table_config(self, dataset_id: str, table_id: str) -> Dict[str, Any]: + """ + Get table configuration. + Args: + dataset_id (str): The ID for the dataset. + table_id (str): The ID for the table. + Returns: + Dict: Table configuration. 
+ """ + + query = """ + query ($table_id: ID!){ + allTable(id: $table_id) { + edges { + node { + slug + dataset { + slug + organization { + slug + } + } + namePt + descriptionPt + columns { + edges { + node { + name + isInStaging + isPartition + descriptionPt + observations + bigqueryType { + name + } + } + } + } + } + } + } + } + """ + table_id = self._get_table_id_from_name( + gcp_dataset_id=dataset_id, gcp_table_id=table_id + ) + + if table_id: + variables = {"table_id": table_id} + response = self._execute_query(query=query, variables=variables) + return self._simplify_graphql_response(response).get("allTable")[0] + else: + return {} + + def _get_dataset_id_from_name(self, gcp_dataset_id): + query = """ + query ($gcp_dataset_id: String!){ + allCloudtable(gcpDatasetId: $gcp_dataset_id) { + edges { + node { + table { + dataset { + _id + } + } + } + } + } + } + """ + + variables = {"gcp_dataset_id": gcp_dataset_id} + response = self._execute_query(query=query, variables=variables) + r = {} if response is None else self._simplify_graphql_response(response) + if r.get("allCloudtable", []) != []: + return r.get("allCloudtable")[0].get("table").get("dataset").get("_id") + msg = f"{gcp_dataset_id} not found. Please create the metadata first in {self.graphql_url}" + logger.info(msg) + return None + + def _get_table_id_from_name(self, gcp_dataset_id, gcp_table_id): + query = """ + query ($gcp_dataset_id: String!, $gcp_table_id: String!){ + allCloudtable(gcpDatasetId: $gcp_dataset_id, gcpTableId: $gcp_table_id) { + edges { + node { + table { + _id + } + } + } + } + } + """ + + if gcp_dataset_id: + variables = { + "gcp_dataset_id": gcp_dataset_id, + "gcp_table_id": gcp_table_id, + } + + response = self._execute_query(query=query, variables=variables) + r = {} if response is None else self._simplify_graphql_response(response) + if r.get("allCloudtable", []) != []: + return r.get("allCloudtable")[0].get("table").get("_id") + msg = f"No table {gcp_table_id} found in {gcp_dataset_id}. Please create in {self.graphql_url}" + logger.info(msg) + return None + + def _get_client( + self, headers: Dict[str, str] = None, fetch_schema_from_transport: bool = False + ) -> "Client": + """ + Get a GraphQL client. + + Args: + headers (Dict[str, str], optional): Headers to be passed to the client. + Defaults to None. + fetch_schema_from_transport (bool, optional): Whether to fetch the schema + from the transport. Defaults to False. + + Returns: + Client: GraphQL client. + """ + if not _backend_dependencies: + raise BaseDosDadosMissingDependencyException( + "Optional dependencies for backend interaction are not installed. " + 'Please install basedosdados with the "upload" extra, such as:' + "\n\npip install basedosdados[upload]" + ) + transport = RequestsHTTPTransport( + url=self.graphql_url, headers=headers, use_json=True + ) + return Client( + transport=transport, fetch_schema_from_transport=fetch_schema_from_transport + ) + + def _execute_query( + self, + query: str, + variables: Dict[str, str] = None, + page: int = 1, + page_size: int = 10, + ) -> Dict[str, Any]: + """ + Execute a GraphQL query. + + Args: + query (str): GraphQL query. + variables (Dict[str, str], optional): Variables to be passed to the query. Defaults to None. + + Returns: + Dict: GraphQL response. 
+ """ + try: + response = self.graphql_client.execute( + gql(query), variable_values=variables + ) + except Exception as e: + logger.error( + f"The API URL in the config.toml file may be incorrect " + f"or the API might be temporarily unavailable!\n" + f"Error executing query: {e}." + ) + return self._simplify_response(response or {}, page, page_size) + + def _simplify_response( + self, response: dict, page: int = 1, page_size: int = 10 + ) -> dict: + """ + Simplify the graphql response + + Args: + response: the graphql response + + Returns: + dict: the simplified graphql response + """ + if response is None: + return {} + if response == {}: + return {} + + output_ = {} + for key, value in response.items(): + if isinstance(value, list) and key == "edges": + output_["items"] = [ + self._simplify_response(v).get("node") for v in value + ] + elif isinstance(value, dict): + output_[key] = self._simplify_response(value) + else: + output_[key] = value + + if "totalCount" in output_: + output_["page"] = page + output_["page_size"] = page_size + output_["page_total"] = int(output_.pop("totalCount") / page_size) + + return output_ diff --git a/python-package/basedosdados/backend/__init__.py b/python-package/basedosdados/backend/__init__.py deleted file mode 100644 index 4e862ed21..000000000 --- a/python-package/basedosdados/backend/__init__.py +++ /dev/null @@ -1,298 +0,0 @@ -""" -Module for interacting with the backend. -""" -from typing import Any, Dict - -from loguru import logger - -try: - from gql import Client, gql - from gql.transport.requests import RequestsHTTPTransport - - _backend_dependencies = True -except ImportError: - _backend_dependencies = False - -from basedosdados.exceptions import BaseDosDadosMissingDependencyException - - -class Backend: - def __init__(self, graphql_url: str): - """ - Backend class for interacting with the backend. - - Args: - graphql_url (str): URL of the GraphQL endpoint. - """ - self._graphql_url: str = graphql_url - - @property - def graphql_url(self) -> str: - """ - GraphQL endpoint URL. - """ - return self._graphql_url - - def _get_client( - self, headers: Dict[str, str] = None, fetch_schema_from_transport: bool = False - ) -> "Client": - """ - Get a GraphQL client. - - Args: - headers (Dict[str, str], optional): Headers to be passed to the client. Defaults to - None. - fetch_schema_from_transport (bool, optional): Whether to fetch the schema from the - transport. Defaults to False. - - Returns: - Client: GraphQL client. - """ - if not _backend_dependencies: - raise BaseDosDadosMissingDependencyException( - "Optional dependencies for backend interaction are not installed. " - 'Please install basedosdados with the "upload" extra, such as:' - "\n\npip install basedosdados[upload]" - ) - transport = RequestsHTTPTransport( - url=self.graphql_url, headers=headers, use_json=True - ) - return Client( - transport=transport, fetch_schema_from_transport=fetch_schema_from_transport - ) - - def _execute_query( - self, - query: str, - variables: Dict[str, str] = None, - client: "Client" = None, - headers: Dict[str, str] = None, - fetch_schema_from_transport: bool = False, - ) -> Dict[str, Any]: - """ - Execute a GraphQL query. - - Args: - query (str): GraphQL query. - variables (Dict[str, str], optional): Variables to be passed to the query. Defaults - to None. - client (Client, optional): GraphQL client. Defaults to None. - headers (Dict[str, str], optional): Headers to be passed to the client. Defaults to - None. 
- fetch_schema_from_transport (bool, optional): Whether to fetch the schema from the - transport. Defaults to False. - - Returns: - Dict: GraphQL response. - """ - if not _backend_dependencies: - raise BaseDosDadosMissingDependencyException( - "Optional dependencies for backend interaction are not installed. " - 'Please install basedosdados with the "upload" extra, such as:' - "\n\npip install basedosdados[upload]" - ) - if not client: - client = self._get_client( - headers=headers, fetch_schema_from_transport=fetch_schema_from_transport - ) - try: - return client.execute(gql(query), variable_values=variables) - except Exception as e: - msg = f"The API URL in the config.toml file may be incorrect or the API might be temporarily unavailable!\nError executing query: {e}." - logger.error(msg) - return None - - def _get_dataset_id_from_name(self, gcp_dataset_id): - query = """ - query ($gcp_dataset_id: String!){ - allCloudtable(gcpDatasetId: $gcp_dataset_id) { - edges { - node { - table { - dataset { - _id - } - } - } - } - } - } - """ - - variables = {"gcp_dataset_id": gcp_dataset_id} - response = self._execute_query(query=query, variables=variables) - r = {} if response is None else self._simplify_graphql_response(response) - if r.get("allCloudtable", []) != []: - return r.get("allCloudtable")[0].get("table").get("dataset").get("_id") - msg = f"{gcp_dataset_id} not found. Please create the metadata first in {self.graphql_url}" - logger.info(msg) - return None - - def _get_table_id_from_name(self, gcp_dataset_id, gcp_table_id): - query = """ - query ($gcp_dataset_id: String!, $gcp_table_id: String!){ - allCloudtable(gcpDatasetId: $gcp_dataset_id, gcpTableId: $gcp_table_id) { - edges { - node { - table { - _id - } - } - } - } - } - """ - - if gcp_dataset_id: - variables = { - "gcp_dataset_id": gcp_dataset_id, - "gcp_table_id": gcp_table_id, - } - - response = self._execute_query(query=query, variables=variables) - r = {} if response is None else self._simplify_graphql_response(response) - if r.get("allCloudtable", []) != []: - return r.get("allCloudtable")[0].get("table").get("_id") - msg = f"No table {gcp_table_id} found in {gcp_dataset_id}. Please create in {self.graphql_url}" - logger.info(msg) - return None - - def get_dataset_config(self, dataset_id: str) -> Dict[str, Any]: - """ - Get dataset configuration. - - Args: - dataset_id (str): The ID for the dataset. - - Returns: - Dict: Dataset configuration. - """ - query = """ - query ($dataset_id: ID!){ - allDataset(id: $dataset_id) { - edges { - node { - slug - name - descriptionPt - createdAt - updatedAt - themes { - edges { - node { - namePt - } - } - } - tags { - edges { - node { - namePt - } - } - } - organization { - namePt - } - } - } - } - } - - """ - dataset_id = self._get_dataset_id_from_name(dataset_id) - if dataset_id: - variables = {"dataset_id": dataset_id} - response = self._execute_query(query=query, variables=variables) - return self._simplify_graphql_response(response).get("allDataset")[0] - else: - return {} - - def get_table_config(self, dataset_id: str, table_id: str) -> Dict[str, Any]: - """ - Get table configuration. - - Args: - dataset_id (str): The ID for the dataset. - table_id (str): The ID for the table. - - Returns: - Dict: Table configuration. 
- """ - - query = """ - query ($table_id: ID!){ - allTable(id: $table_id) { - edges { - node { - slug - dataset { - slug - organization { - slug - } - } - namePt - descriptionPt - columns { - edges { - node { - name - isInStaging - isPartition - descriptionPt - observations - bigqueryType { - name - } - } - } - } - } - } - } - } - """ - table_id = self._get_table_id_from_name( - gcp_dataset_id=dataset_id, gcp_table_id=table_id - ) - - if table_id: - variables = {"table_id": table_id} - response = self._execute_query(query=query, variables=variables) - return self._simplify_graphql_response(response).get("allTable")[0] - else: - return {} - - def _simplify_graphql_response(self, response: dict) -> dict: - """ - Simplify the graphql response - Args: - response: the graphql response - Returns: - dict: the simplified graphql response - """ - if response == {}: # pragma: no cover - return {} - - output_ = {} - - for key in response: - try: - if ( - isinstance(response[key], dict) - and response[key].get("edges") is not None # noqa - ): - output_[key] = [ - v.get("node") - for v in list( - map(self._simplify_graphql_response, response[key]["edges"]) - ) - ] - elif isinstance(response[key], dict): - output_[key] = self._simplify_graphql_response(response[key]) - else: - output_[key] = response[key] - except TypeError as e: - logger.error(f"Erro({e}): {key} - {response[key]}") - return output_ diff --git a/python-package/basedosdados/configs/templates/dataset/README.md b/python-package/basedosdados/configs/templates/dataset/README.md deleted file mode 100644 index d6e62676c..000000000 --- a/python-package/basedosdados/configs/templates/dataset/README.md +++ /dev/null @@ -1,7 +0,0 @@ -Como capturar os dados de {{ dataset_id }}? - -1. Para capturar esses dados, basta verificar o link dos dados originais indicado em `dataset_config.yaml` no item `website`. - -2. Caso tenha sido utilizado algum código de captura ou tratamento, estes estarão contidos em `code/`. Se o dado publicado for em sua versão bruta, não existirá a pasta `code/`. - -Os dados publicados estão disponíveis em: https://basedosdados.org/dataset/{{ dataset_id | replace("_","-") }} diff --git a/python-package/basedosdados/configs/templates/dataset/dataset_description.txt b/python-package/basedosdados/configs/templates/dataset/dataset_description.txt deleted file mode 100644 index ec628262a..000000000 --- a/python-package/basedosdados/configs/templates/dataset/dataset_description.txt +++ /dev/null @@ -1,43 +0,0 @@ -{% macro input(var) -%} -{% if ( - (var is not none) and - (("<" not in var | string) and (">" not in var | string)) -) -%} -{{- caller() }} -{%- endif %} -{%- endmacro -%} - -{{ description }} - -Para saber mais acesse: -Website: {{ url_ckan }} -Github: https://github.com/basedosdados/mais/ - -Ajude a manter o projeto :) -Apoia-se: https://apoia.se/basedosdados - -Instituição (Quem mantém os dados oficiais?) 
------------ -Nome: {{ organization -}} - -{% if website is defined %} -{% call input(website[0]) -%} -Onde encontrar os dados ------------------------ -{% if (website is not none) -%} -{% for partition in website -%} -- {{ partition }} -{% endfor -%} -{%- endif %} -{% endcall -%} -{% endif %} - -{% call input(groups[0]) -%} -Grupos ------- -{% if (groups is not none) -%} -{% for partition in groups -%} -- {{ partition }} -{% endfor -%} -{%- endif %} -{% endcall -%} \ No newline at end of file diff --git a/python-package/basedosdados/configs/templates/table/publish.sql b/python-package/basedosdados/configs/templates/table/publish.sql deleted file mode 100644 index d7b86b4f1..000000000 --- a/python-package/basedosdados/configs/templates/table/publish.sql +++ /dev/null @@ -1,30 +0,0 @@ -/* - -Query para publicar a tabela. - -Esse é o lugar para: - - modificar nomes, ordem e tipos de colunas - - dar join com outras tabelas - - criar colunas extras (e.g. logs, proporções, etc.) - -Qualquer coluna definida aqui deve também existir em `table_config.yaml`. - -# Além disso, sinta-se à vontade para alterar alguns nomes obscuros -# para algo um pouco mais explícito. - -TIPOS: - - Para modificar tipos de colunas, basta substituir STRING por outro tipo válido. - - Exemplo: `SAFE_CAST(column_name AS NUMERIC) column_name` - - Mais detalhes: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types - -*/ -{% set project = project_id_prod %} -CREATE VIEW {{ project }}.{{ dataset_id }}.{{ table_id }} AS -SELECT -{% for column in columns|list + partition_columns|list -%} -{%- if not loop.last -%} - SAFE_CAST({{ column }} AS STRING) {{ column }}, -{% else -%} - SAFE_CAST({{ column }} AS STRING) {{ column }} -{% endif -%}{% endfor -%} -from {{ project_id }}.{{ dataset_id }}_staging.{{ table_id }} as t diff --git a/python-package/basedosdados/configs/templates/table/table_description.txt b/python-package/basedosdados/configs/templates/table/table_description.txt deleted file mode 100644 index c52f26948..000000000 --- a/python-package/basedosdados/configs/templates/table/table_description.txt +++ /dev/null @@ -1,107 +0,0 @@ -{% macro input(var) -%} -{% if ( - (var is not none) and - (("<" not in var | string) and (">" not in var | string)) -) -%} -{{- caller() }} -{%- endif %} -{%- endmacro -%} - -{{ description }} - -Para saber mais acesse: -Website: {{ url_ckan }} -Github: {{ url_github }} - -Ajude a manter o projeto :) -Apoia-se: https://apoia.se/basedosdados - -Publicado por -------------- -Nome: {{ published_by.name -}} -{% call input(published_by.code_url ) %} -Código: {{ published_by.code_url }} -{%- endcall -%} -{% call input(published_by.website ) %} -Website: {{ published_by.website }} -{%- endcall -%} -{% call input(published_by.email ) %} -Email: {{ published_by.email }} -{%- endcall -%} - -{% if data_cleaned_by is defined %} -Tratado por ------------ -Nome: {{ data_cleaned_by.name -}} -{% call input(data_cleaned_by.code_url) %} -Código: {{ data_cleaned_by.code_url }} -{%- endcall -%} -{% call input(data_cleaned_by.website) %} -Website: {{ data_cleaned_by.website }} -{%- endcall -%} -{% call input(data_cleaned_by.email) %} -Email: {{ data_cleaned_by.email }} -{%- endcall %} -{% endif %} - - -{% call input(partitions) -%} -Partições (Filtre a tabela por essas colunas para economizar dinheiro e tempo) ---------- -{% if (partitions is not none) -%} -{% for partition in partitions -%} -- {{ partition }} -{% endfor -%} -{%- endif %} -{% endcall -%} - -{% if identifying_columns is defined 
%} -{% call input(identifying_columns[0]) -%} -Colunas identificando linhas unicamente -------------------- -{% if (identifying_columns is not none) -%} -{% for partition in identifying_columns -%} -- {{ partition }} -{% endfor -%} -{%- endif %} -{% endcall -%} -{% endif %} - -{% if temporal_coverage is defined %} -{% call input(temporal_coverage[0]) -%} -Cobertura Temporal ------------------- -{% if (temporal_coverage is not none) -%} -{% for partition in temporal_coverage -%} -- {{ partition }} -{% endfor -%} -{%- endif %} -{% endcall -%} -{% endif %} - -{% if spatial_coverage is defined %} -{% call input(spatial_coverage[0]) -%} -Cobertura Espacial ------------------- -{% if (spatial_coverage is not none) -%} -{% for partition in spatial_coverage -%} -- {{ partition }} -{% endfor -%} -{%- endif %} -{% endcall -%} -{% endif %} - -{% if data_cleaning_description is defined %} -{% call input(data_cleaning_description) -%} -Tratamento ----------- -{{ data_cleaning_description }} -{% endcall %} -{% endif %} - -{% if update_frequency is defined %} -{%- call input(update_frequency) -%} -Frequencia de Atualização -------------------------- -{{ update_frequency }}{% endcall %} -{% endif %} \ No newline at end of file diff --git a/python-package/basedosdados/constants.py b/python-package/basedosdados/constants.py index 6154d649f..f6c3880a0 100644 --- a/python-package/basedosdados/constants.py +++ b/python-package/basedosdados/constants.py @@ -13,9 +13,10 @@ class config: Configuration for the project. """ - verbose: bool = True billing_project_id: str = None project_config_path: str = None + + verbose: bool = True from_file: bool = False @@ -33,3 +34,5 @@ class constants(Enum): REFRESH_TOKEN_URL: str = "/api/token/refresh/" VERIFY_TOKEN_URL: str = "/api/token/verify/" TEST_ENDPOINT: str = "/api/v1/private/bigquerytypes/" + BACKEND_SEARCH_URL: str = "https://backend.basedosdados.org/search" + BACKEND_GRAPHQL_URL: str = "https://backend.basedosdados.org/graphql" diff --git a/python-package/basedosdados/upload/base.py b/python-package/basedosdados/core/base.py similarity index 99% rename from python-package/basedosdados/upload/base.py rename to python-package/basedosdados/core/base.py index c9b1b5d0f..ceaf778d8 100644 --- a/python-package/basedosdados/upload/base.py +++ b/python-package/basedosdados/core/base.py @@ -20,7 +20,7 @@ from google.oauth2 import service_account from loguru import logger -from basedosdados.backend import Backend +from basedosdados import Backend from basedosdados.constants import config, constants warnings.filterwarnings("ignore") @@ -232,7 +232,7 @@ def _init_config(self, force): if (not config_file.exists()) or (force): # Load config file c_file = tomlkit.parse( - (Path(__file__).resolve().parents[1] / "configs" / "config.toml") + (Path(__file__).resolve().parents[1] / "templates" / "config.toml") .open("r", encoding="utf-8") .read() ) diff --git a/python-package/basedosdados/configs/config.toml b/python-package/basedosdados/core/config.toml similarity index 65% rename from python-package/basedosdados/configs/config.toml rename to python-package/basedosdados/core/config.toml index 59a364022..127733b17 100644 --- a/python-package/basedosdados/configs/config.toml +++ b/python-package/basedosdados/core/config.toml @@ -1,5 +1,5 @@ -# What is the bucket that you are saving all the data? It should be -# an unique name. +# What is the bucket that you are saving all the data? +# It should be an unique name. 
bucket_name = "" [gcloud-projects] @@ -13,4 +13,4 @@ bucket_name = "" credentials_path = "" [api] -url = "" \ No newline at end of file +url = "" diff --git a/python-package/basedosdados/download/base.py b/python-package/basedosdados/download/base.py deleted file mode 100644 index 190cfc39a..000000000 --- a/python-package/basedosdados/download/base.py +++ /dev/null @@ -1,68 +0,0 @@ -""" -Functions for manage auth and credentials -""" -import sys - -from functools import lru_cache - -import pydata_google_auth -from google.cloud import bigquery, storage - -from basedosdados.upload.base import Base - -SCOPES = [ - "https://www.googleapis.com/auth/cloud-platform", -] - - -def reauth(): - """ - Reauth user credentials - """ - - pydata_google_auth.get_user_credentials( - SCOPES, credentials_cache=pydata_google_auth.cache.REAUTH - ) - - -def credentials(from_file=False, reauth=False): - """ - Get user credentials - """ - - # check if is running in colab - if "google.colab" in sys.modules: - from google.colab import auth - - auth.authenticate_user() - return None - - if from_file: - return Base()._load_credentials(mode="prod") - - if reauth: - return pydata_google_auth.get_user_credentials( - SCOPES, credentials_cache=pydata_google_auth.cache.REAUTH - ) - - return pydata_google_auth.get_user_credentials( - SCOPES, - ) - - -@lru_cache(256) -def google_client(billing_project_id, from_file, reauth): - """ - Get Google Cloud client for bigquery and storage - """ - - return dict( - bigquery=bigquery.Client( - credentials=credentials(from_file=from_file, reauth=reauth), - project=billing_project_id, - ), - storage=storage.Client( - credentials=credentials(from_file=from_file, reauth=reauth), - project=billing_project_id, - ), - ) diff --git a/python-package/basedosdados/download/download.py b/python-package/basedosdados/download/download.py index 357d333a5..28807f778 100644 --- a/python-package/basedosdados/download/download.py +++ b/python-package/basedosdados/download/download.py @@ -5,18 +5,13 @@ import os import re import shutil +import sys import time -from functools import partialmethod - +from functools import lru_cache, partialmethod from pathlib import Path -import pandas_gbq -from google.cloud import bigquery, bigquery_storage_v1 -from pandas_gbq.gbq import GenericGBQException -from pydata_google_auth.exceptions import PyDataCredentialsError - from basedosdados.constants import config -from basedosdados.download.base import credentials, google_client +from basedosdados.core.base import Base from basedosdados.exceptions import ( BaseDosDadosAccessDeniedException, BaseDosDadosAuthorizationException, @@ -24,19 +19,11 @@ BaseDosDadosInvalidProjectIDException, BaseDosDadosNoBillingProjectIDException, ) - - -def _set_config_variables(billing_project_id, from_file): - """ - Set billing_project_id and from_file variables - """ - - # standard billing_project_id configuration - billing_project_id = billing_project_id or config.billing_project_id - # standard from_file configuration - from_file = from_file or config.from_file - - return billing_project_id, from_file +from google.cloud import bigquery, bigquery_storage_v1, storage +from pandas_gbq import read_gbq +from pandas_gbq.gbq import GenericGBQException +from pydata_google_auth import cache, get_user_credentials +from pydata_google_auth.exceptions import PyDataCredentialsError def read_sql( @@ -78,13 +65,12 @@ def read_sql( timeout=3600 * 2, ) - return pandas_gbq.read_gbq( + return read_gbq( query, - credentials=credentials(from_file=from_file, 
reauth=reauth), - project_id=billing_project_id, + project_id=config.billing_project_id, use_bqstorage_api=use_bqstorage_api, + credentials=_credentials(from_file=config.from_file, reauth=reauth), ) - except GenericGBQException as e: if "Reason: 403" in str(e): raise BaseDosDadosAccessDeniedException from e @@ -230,7 +216,7 @@ def download( "Either table_id, dataset_id or query should be filled." ) - client = google_client(billing_project_id, from_file, reauth) + client = _google_client(billing_project_id, from_file, reauth) # makes sure that savepath is a filepath and not a folder savepath = _sets_savepath(savepath) @@ -521,3 +507,45 @@ def _sets_savepath(savepath): ) return savepath + + +def _credentials( + from_file: bool = False, + reauth: bool = False, + scopes: list[str] = ["https://www.googleapis.com/auth/cloud-platform"], +): + """ + Get user credentials + """ + + if "google.colab" in sys.modules: + from google.colab import auth + + auth.authenticate_user() + return None + + if from_file: + return Base()._load_credentials(mode="prod") + + if reauth: + return get_user_credentials(scopes, credentials_cache=cache.REAUTH) + + return get_user_credentials(scopes) + + +@lru_cache(256) +def _google_client(billing_project_id: str, from_file: bool, reauth: bool): + """ + Get Google Cloud client for bigquery and storage + """ + + return dict( + bigquery=bigquery.Client( + credentials=_credentials(from_file=from_file, reauth=reauth), + project=billing_project_id, + ), + storage=storage.Client( + credentials=_credentials(from_file=from_file, reauth=reauth), + project=billing_project_id, + ), + ) diff --git a/python-package/basedosdados/download/metadata.py b/python-package/basedosdados/download/metadata.py index d7d387bf0..bffe1b509 100644 --- a/python-package/basedosdados/download/metadata.py +++ b/python-package/basedosdados/download/metadata.py @@ -1,415 +1,165 @@ """ Functions to get metadata from BD's API """ -import math +from functools import wraps -from collections import defaultdict +from basedosdados.backend import Backend -import pandas as pd -import requests -from google.cloud import bigquery +def check_input(f): + """Checks if the number of inputs is valid""" -def _safe_fetch(url: str): - """ - Safely fetchs urls and, if somehting goes wrong, informs user what is the possible cause - """ - response = None - try: - response = requests.get(url, timeout=10) - response.raise_for_status() - except requests.exceptions.HTTPError as errh: - print("Http Error:", errh) - except requests.exceptions.ConnectionError as errc: - print("Error Connecting:", errc) - except requests.exceptions.Timeout as errt: - print("Timeout Error:", errt) - except requests.exceptions.RequestException as err: - print("This url doesn't appear to exists:", err) - - return response + @wraps(f) + def wrapper(*args, **kwargs): + if sum([a is not None for a in args]) > 1: + raise ValueError("At most one of the inputs must be non null") + return f(*args, **kwargs) + return wrapper -def _dict_from_page(json_response): - """ - Generate a dict from BD's API response with dataset_id and description as keys - """ - temp_dict = { - "dataset_id": [ - dataset["name"] for dataset in json_response["result"]["datasets"] - ], - "description": [ - dataset["notes"] if "notes" in dataset.keys() else None - for dataset in json_response["result"]["datasets"] - ], - } - - return temp_dict +def inject_backend(f): + """Inject backend instance if doesn't exists""" -def _fix_size(s, step=80): - final = "" + @wraps(f) + def wrapper(*args, 
**kwargs): + if "backend" not in kwargs: + kwargs["backend"] = Backend() + return f(*args, **kwargs) - for l in s.split(" "): # noqa - final += (l + " ") if len(final.split("\n")[-1]) < step else "\n" + return wrapper - return final - -def _print_output(df): - """Prints dataframe contents as print blocks - Args: - df (pd.DataFrame): table to be printed +@check_input +@inject_backend +def get_datasets( + dataset_id: str = None, + dataset_name: str = None, + page: int = 1, + page_size: int = 10, + backend: Backend = None, +) -> list[dict]: """ + Get a list of available datasets, + either by `dataset_id` or `dataset_name` - columns = df.columns - step = 80 - print() - for _, row in df.iterrows(): - for c in columns: - print(_fix_size(f"{c}: \n\t{row[c]}")) - print("-" * (step + 15)) - print() - - -def _handle_output(verbose, output_type, df, col_name=None): - """Handles datasets and tables listing outputs based on user's choice. - Either prints it to the screen or returns it as a `list` object. Args: - verbose (bool): amount of verbosity - output_type (str): type of output - df (pd.DataFrame, bigquery.Dataset or bigquery.Table): table containing datasets metadata - col_name (str): name of column with id's data - """ + dataset_id(str): dataset slug in google big query (gbq). + dataset_name(str): dataset name in base dos dados metadata. - df_is_dataframe = isinstance(df, pd.DataFrame) - df_is_bq_dataset_or_table = isinstance(df, bigquery.Table) - df_is_bq_dataset_or_table |= isinstance(df, bigquery.Dataset) - - if verbose is True and df_is_dataframe: - _print_output(df) - - elif verbose is True and df_is_bq_dataset_or_table: - print(df.description) - - elif verbose is False: - if output_type == "list": - return df[col_name].to_list() - if output_type == "str": - return df.description - if output_type == "records": - return df.to_dict("records") - msg = '`output_type` argument must be set to "list", "str" or "records".' - raise ValueError(msg) - raise TypeError("`verbose` argument must be of `bool` type.") - - -def list_datasets(with_description=False, verbose=True): - """ - This function uses `bd_dataset_search` website API - enpoint to retrieve a list of available datasets. - - Args: - with_description (bool): Optional - If True, fetch short dataset description for each dataset. - verbose (bool): Optional. - If set to True, information is printed to the screen. If set to False, a list object is returned. + page(int): page for pagination. + page_size(int): page size for pagination. + backend(Backend): backend instance, injected automatically. 
Returns: - list | stdout - """ - # first request is made separately since we need to now the number of pages before the iteration - page_size = 100 # this function will only made more than one requisition if there are more than 100 datasets in the API response - url = f"https://basedosdados.org/api/3/action/bd_dataset_search?q=&resource_type=bdm_table&page=1&page_size={page_size}" - response = _safe_fetch(url) - json_response = response.json() - n_datasets = json_response["result"]["count"] - n_pages = math.ceil(n_datasets / page_size) - temp_dict = _dict_from_page(json_response) - - temp_dicts = [temp_dict] - for page in range(2, n_pages + 1): - url = f"https://basedosdados.org/api/3/action/bd_dataset_search?q=&resource_type=bdm_table&page={page}&page_size={page_size}" - response = _safe_fetch(url) - json_response = response.json() - temp_dict = _dict_from_page(json_response) - temp_dicts.append(temp_dict) - - dataset_dict = defaultdict(list) - - for d in temp_dicts: - for key, value in d.items(): - dataset_dict[key].append(value) - - # flat inner lists - dataset_dict["dataset_id"] = [ - item for sublist in dataset_dict["dataset_id"] for item in sublist - ] - dataset_dict["description"] = [ - item for sublist in dataset_dict["description"] for item in sublist - ] - # select desired output using dataset_id info. Note that the output is either a standardized string or a list #pylint: disable=C0301 - if verbose & (with_description is False): - return _print_output(pd.DataFrame.from_dict(dataset_dict)[["dataset_id"]]) - if verbose & with_description: - return _print_output( - pd.DataFrame.from_dict(dataset_dict)[["dataset_id", "description"]] - ) - if (verbose is False) & (with_description is False): - return dataset_dict["dataset_id"] - if (verbose is False) & with_description: - return [ - { - "dataset_id": dataset_dict["dataset_id"][k], - "description": dataset_dict["description"][k], - } - for k in range(len(dataset_dict["dataset_id"])) - ] - raise ValueError( - "`verbose` and `with_description` argument must be of `bool` type." - ) - - -def list_dataset_tables( - dataset_id, - with_description=False, - verbose=True, -): - """ - Fetch table_id for tables available at the specified dataset_id. Prints the information on screen or returns it as a list. + dict: List of datasets. + """ + result = backend.get_datasets(dataset_id, dataset_name, page, page_size) + for item in result.get("items", []) or []: + item["organization"] = item.get("organization", {}).get("name") + item["tags"] = [i.get("name") for i in item.get("tags", {}).get("items")] + item["themes"] = [i.get("name") for i in item.get("themes", {}).get("items")] + return result + + +@check_input +@inject_backend +def get_tables( + dataset_id: str = None, + table_id: str = None, + table_name: str = None, + page: int = 1, + page_size: int = 10, + backend: Backend = None, +) -> list[dict]: + """ + Get a list of available tables, + either by `dataset_id`, `table_id` or `table_name` Args: - dataset_id (str): Optional. - Dataset id returned by list_datasets function - limit (int): - Field to limit the number of results - with_description (bool): Optional - If True, fetch short table descriptions for each table that match the search criteria. - verbose (bool): Optional. - If set to True, information is printed to the screen. If set to False, a list object is returned. 
- - Returns: - stdout | list - """ - - dataset_id = dataset_id.replace( - "-", "_" - ) # The dataset_id pattern in the bd_dataset_search endpoint response uses a hyphen as a separator, while in the endpoint urls that specify the dataset_id parameter the separator used is an underscore. See issue #1079 - - url = f"https://basedosdados.org/api/3/action/bd_bdm_dataset_show?dataset_id={dataset_id}" - - response = _safe_fetch(url) - - json_response = response.json() - - dataset = json_response["result"] - # this dict has all information need to output the function - table_dict = { - "table_id": [ - dataset["resources"][k]["name"] - for k in range(len(dataset["resources"])) - if dataset["resources"][k]["resource_type"] == "bdm_table" - ], - "description": [ - dataset["resources"][k]["description"] - for k in range(len(dataset["resources"])) - if dataset["resources"][k]["resource_type"] == "bdm_table" - ], - } - # select desired output using table_id info. Note that the output is either a standardized string or a list - if verbose & (with_description is False): - return _print_output(pd.DataFrame.from_dict(table_dict)[["table_id"]]) - if verbose & with_description: - return _print_output( - pd.DataFrame.from_dict(table_dict)[["table_id", "description"]] - ) - if (verbose is False) & (with_description is False): - return table_dict["table_id"] - if (verbose is False) & with_description: - return [ - { - "table_id": table_dict["table_id"][k], - "description": table_dict["description"][k], - } - for k in range(len(table_dict["table_id"])) - ] + dataset_id(str): dataset slug in google big query (gbq). + table_id(str): table slug in google big query (gbq). + table_name(str): table name in base dos dados metadata. - raise ValueError( - "`verbose` and `with_description` argument must be of `bool` type." - ) - - -def get_dataset_description( - dataset_id, - verbose=True, -): - """ - Prints the full dataset description. - - Args: - dataset_id (str): Required. - Dataset id available in list_datasets. - verbose (bool): Optional. - If set to True, information is printed to the screen. If set to False, data is returned as a `str`. + page(int): page for pagination. + page_size(int): page size for pagination. + backend(Backend): backend instance, injected automatically. Returns: - stdout | str + dict: List of tables. """ - url = f"https://basedosdados.org/api/3/action/bd_bdm_dataset_show?dataset_id={dataset_id}" - - response = _safe_fetch(url) - json_response = response.json() + return backend.get_tables(dataset_id, table_id, table_name, page, page_size) - description = json_response["result"]["notes"] - if verbose: - return print(description) - return description - - -def get_table_description( - dataset_id, - table_id, - verbose=True, -): +@check_input +@inject_backend +def get_columns( + table_id: str = None, + column_id: str = None, + columns_name: str = None, + page: int = 1, + page_size: int = 10, + backend: Backend = None, +) -> list[dict]: """ - Prints the full table description. + Get a list of available columns, + either by `table_id`, `column_id` or `column_name` Args: - dataset_id (str): Required. - Dataset id available in list_datasets. - table_id (str): Required. - Table id available in list_dataset_tables - verbose (bool): Optional. - If set to True, information is printed to the screen. If set to False, data is returned as a `str`. - - Returns: - stdout | str - """ + table_id(str): table slug in google big query (gbq). + column_id(str): column slug in google big query (gbq). 
+ column_name(str): table name in base dos dados metadata. - url = f"https://basedosdados.org/api/3/action/bd_bdm_table_show?dataset_id={dataset_id}&table_id={table_id}" - - response = _safe_fetch(url) - - json_response = response.json() - - description = json_response["result"]["description"] - - if verbose: - return print(description) - return description - - -def get_table_columns( - dataset_id, - table_id, - verbose=True, -): - """ - Fetch the names, types and descriptions for the columns in the specified table. Prints - information on screen. - Args: - dataset_id (str): Required. - Dataset id available in list_datasets. - table_id (str): Required. - Table id available in list_dataset_tables - verbose (bool): Optional. - If set to True, information is printed to the screen. If set to False, data is returned as a `list` of `dict`s. + page(int): page for pagination. + page_size(int): page size for pagination. + backend(Backend): backend instance, injected automatically. Returns: - stdout | list + dict: List of tables. """ - url = f"https://basedosdados.org/api/3/action/bd_bdm_table_show?dataset_id={dataset_id}&table_id={table_id}" - - response = _safe_fetch(url) - - json_response = response.json() + result = backend.get_columns(table_id, column_id, columns_name, page, page_size) + for item in result.get("items", []) or []: + item["bigquery_type"] = item.pop("bigqueryType", {}).get("name") + return result - columns = json_response["result"]["columns"] - if verbose: - return _print_output(pd.DataFrame(columns)) - return columns - - -def get_table_size( - dataset_id, - table_id, - verbose=True, -): - """Use a query to get the number of rows and size (in Mb) of a table. - - WARNING: this query may cost a lot depending on the table. - - Args: - dataset_id (str): Optional. - Dataset id available in basedosdados. It should always come with table_id. - table_id (str): Optional. - Table id available in basedosdados.dataset_id. - It should always come with dataset_id. - verbose (bool): Optional. - If set to True, information is printed to the screen. If set to False, data is returned as a `list` of `dict`s. +@check_input +@inject_backend +def search( + q: str = None, + page: int = 1, + page_size: int = 10, + backend: Backend = None, +) -> list[dict]: """ - url = f"https://basedosdados.org/api/3/action/bd_bdm_table_show?dataset_id={dataset_id}&table_id={table_id}" - - response = _safe_fetch(url) - - json_response = response.json() - - size = json_response["result"]["size"] - - if size is None: - return print("Size not available") - if verbose: - return _print_output(pd.DataFrame(size)) - return size - - -def search(query, order_by): - """This function works as a wrapper to the `bd_dataset_search` website API - enpoint. + Search for datasets, querying all available metadata for the term `q` Args: - query (str): - String to search in datasets and tables' metadata. - order_by (str): score|popular|recent - Field by which the results will be ordered. + q(str): search term. + + page(int): page for pagination. + page_size(int): page size for pagination. + backend(Backend): backend instance, injected automatically. Returns: - pd.DataFrame: - Response from the API presented as a pandas DataFrame. Each row is - a table. Each column is a field identifying the table. + dict: List of datasets and metadata. """ - - # validate order_by input - if order_by not in ["score", "popular", "recent"]: - raise ValueError( - f'order_by must be score, popular or recent. 
Received "{order_by}"' + items = [] + for item in backend.search(q, page, page_size).get("results", []): + items.append( + { + "slug": item.get("slug"), + "name": item.get("name"), + "description": item.get("description"), + "n_tables": item.get("n_tables"), + "n_raw_data_sources": item.get("n_raw_data_sources"), + "n_information_requests": item.get("n_information_requests"), + "organization": { + "slug": item.get("organizations", [{}])[0].get("slug"), + "name": item.get("organizations", [{}])[0].get("name"), + }, + } ) - - url = f"https://basedosdados.org/api/3/action/bd_dataset_search?q={query}&order_by={order_by}&resource_type=bdm_table" - - response = _safe_fetch(url) - - json_response = response.json() - - dataset_dfs = [] - # first loop identify the number of the tables in each datasets - for dataset in json_response["result"]["datasets"]: - tables_dfs = [] - len(dataset["resources"]) - # second loop extracts tables' information for each dataset - for table in dataset["resources"]: - data_table = pd.DataFrame( - {k: str(table[k]) for k in list(table.keys())}, index=[0] - ) - tables_dfs.append(data_table) - # append tables' dataframes for each dataset - data_ds = tables_dfs[0].append(tables_dfs[1:]).reset_index(drop=True) - dataset_dfs.append(data_ds) - # append datasets' dataframes - df = dataset_dfs[0].append(dataset_dfs[1:]).reset_index(drop=True) - - return df + return items diff --git a/python-package/basedosdados/upload/connection.py b/python-package/basedosdados/upload/connection.py index b050fec78..a09371b32 100644 --- a/python-package/basedosdados/upload/connection.py +++ b/python-package/basedosdados/upload/connection.py @@ -16,7 +16,7 @@ GetConnectionRequest, ) -from basedosdados.upload.base import Base +from basedosdados.core.base import Base class Connection(Base): diff --git a/python-package/basedosdados/upload/dataset.py b/python-package/basedosdados/upload/dataset.py index 2ea8bf2a8..5584e5490 100644 --- a/python-package/basedosdados/upload/dataset.py +++ b/python-package/basedosdados/upload/dataset.py @@ -8,7 +8,7 @@ from google.cloud import bigquery from loguru import logger -from basedosdados.upload.base import Base +from basedosdados.core.base import Base class Dataset(Base): diff --git a/python-package/basedosdados/upload/storage.py b/python-package/basedosdados/upload/storage.py index 739c1854c..f6291fe0e 100644 --- a/python-package/basedosdados/upload/storage.py +++ b/python-package/basedosdados/upload/storage.py @@ -12,7 +12,7 @@ from tqdm import tqdm from basedosdados.exceptions import BaseDosDadosException -from basedosdados.upload.base import Base +from basedosdados.core.base import Base # google retryble exceptions. 
References: https://googleapis.dev/python/storage/latest/retry_timeout.html#module-google.cloud.storage.retry diff --git a/python-package/basedosdados/upload/table.py b/python-package/basedosdados/upload/table.py index 1e6af80c5..8257d5b52 100644 --- a/python-package/basedosdados/upload/table.py +++ b/python-package/basedosdados/upload/table.py @@ -17,7 +17,7 @@ from loguru import logger from basedosdados.exceptions import BaseDosDadosException -from basedosdados.upload.base import Base +from basedosdados.core.base import Base from basedosdados.upload.connection import Connection from basedosdados.upload.dataset import Dataset from basedosdados.upload.datatypes import Datatype @@ -298,20 +298,6 @@ def _get_cross_columns_from_bq_api(self): api = self._get_columns_metadata_from_api() api_columns = api.get("partition_columns") + api.get("columns") - # bq_columns_list = [col.get("name") for col in bq_columns] - # api_columns_list = [col.get("name") for col in api_columns] - - # not_in_api_columns = [ - # col for col in bq_columns_list if col not in api_columns_list - # ] - # not_in_bq_columns = [ - # col for col in api_columns_list if col not in bq_columns_list - # ] - # print("bq_columns_list", len(bq_columns_list)) - # print("api_columns_list", len(api_columns_list)) - # print("not_in_api_columns", not_in_api_columns) - # print("not_in_bq_columns", not_in_bq_columns) - if api_columns != []: for bq_col in bq_columns: for api_col in api_columns: diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml index 5181d9b3b..dffd6c91e 100644 --- a/python-package/pyproject.toml +++ b/python-package/pyproject.toml @@ -51,9 +51,6 @@ all = ["gql", "pandavro", "requests-toolbelt"] avro = ["pandavro"] upload = ["gql", "requests-toolbelt"] -[tool.taskipy.tasks] -lint = "semgrep scan --error --config auto --exclude-rule yaml.github-actions.security.third-party-action-not-pinned-to-commit-sha.third-party-action-not-pinned-to-commit-sha && ruff check ." - [pytest] addopts = "-p no:warnings" diff --git a/python-package/tests/conftest.py b/python-package/tests/conftest.py index 55a7f09cd..b62b2c8b7 100644 --- a/python-package/tests/conftest.py +++ b/python-package/tests/conftest.py @@ -13,7 +13,7 @@ from basedosdados import Metadata # TODO: deprecate from basedosdados import Dataset, Storage, Table -from basedosdados.upload.base import Base +from basedosdados.core.base import Base DATASET_ID = "pytest" TABLE_ID = "pytest"
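
Usage sketch for the reworked metadata API introduced in this diff (get_datasets, get_tables, get_columns and search, re-exported at the top of the basedosdados package). This is a minimal, illustrative example and not part of the change itself: it assumes the package is installed with its optional backend dependencies (pip install basedosdados[upload]) and that the default backend URLs in constants.py are reachable; the dataset, table and search terms below are hypothetical placeholders. The paginated result shape (items, page, page_size, page_total) follows the simplified GraphQL response produced by Backend._simplify_response above.

# Illustrative only: the slugs and search terms below are placeholders.
import basedosdados as bd

# Datasets come back as a page dict: {"items": [...], "page": ..., "page_size": ..., "page_total": ...}.
datasets = bd.get_datasets(dataset_name="rais", page=1, page_size=5)
for dataset in datasets.get("items", []):
    # get_datasets flattens organization, tags and themes into plain names.
    print(dataset["slug"], "-", dataset["organization"], dataset["themes"])

# Tables and columns follow the same pagination contract.
tables = bd.get_tables(dataset_id="br_me_rais", page_size=5)
for table in tables.get("items", []):
    print(table["slug"], table.get("numberRows"))

columns = bd.get_columns(table_id="microdados_vinculos", page_size=5)
for column in columns.get("items", []):
    # get_columns renames bigqueryType to a flat bigquery_type string.
    print(column["name"], column.get("bigquery_type"))

# Free-text search returns a plain list of dataset summaries.
for result in bd.search(q="populacao", page_size=3):
    print(result["name"], "-", result["organization"]["name"])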