diff --git a/.github/workflows/package.yml b/.github/workflows/package.yml index b80b4c8627..fa694bb009 100644 --- a/.github/workflows/package.yml +++ b/.github/workflows/package.yml @@ -55,22 +55,22 @@ jobs: use-mamba: true activate-environment: rubrix - - name: Set date for conda cache + - name: Get date for conda cache if: steps.filter.outputs.python_code == 'true' - id: set-date + id: get-date run: echo "::set-output name=today::$(/bin/date -u '+%Y%m%d')" shell: bash - name: Cache Conda env if: steps.filter.outputs.python_code == 'true' uses: actions/cache@v2 + id: cache with: path: ${{ env.CONDA }}/envs - key: conda-${{ runner.os }}--${{ runner.arch }}--${{ steps.set-date.outputs.today }}-${{ hashFiles('environment_dev.yml') }}-${{ env.CACHE_NUMBER }} + key: conda-${{ runner.os }}--${{ runner.arch }}--${{ steps.get-date.outputs.today }}-${{ hashFiles('environment_dev.yml') }}-${{ env.CACHE_NUMBER }} env: # Increase this value to reset cache if etc/example-environment.yml has not changed - CACHE_NUMBER: 0 - id: cache + CACHE_NUMBER: 2 - name: Update environment if: steps.filter.outputs.python_code == 'true' && steps.cache.outputs.cache-hit != 'true' diff --git a/docs/index.md b/docs/index.md index 4d4f83091a..13819923d2 100644 --- a/docs/index.md +++ b/docs/index.md @@ -172,6 +172,7 @@ You can join the conversation on Slack! We are a very friendly and inclusive com :caption: Reference :hidden: + reference/telemetry reference/python/index reference/webapp/index diff --git a/docs/reference/telemetry.md b/docs/reference/telemetry.md new file mode 100644 index 0000000000..7750ede7ee --- /dev/null +++ b/docs/reference/telemetry.md @@ -0,0 +1,54 @@ +# Telemetry +Rubrix uses telemetry to report anonymous usage and error information. As an open-source software, this type of information is important to improve and understand how the product is used. + +## How to opt-out +You can opt-out of telemetry reporting using the `ENV` variable `RUBRIX_ENABLE_TELEMETRY` before launching the server. Setting this variable to `0` will completely disable telemetry reporting. + +If you are a Linux/MacOs users you should run: + +```bash +export RUBRIX_ENABLE_TELEMETRY=0 +``` + +If you are Windows users you should run: + +```bash +set RUBRIX_ENABLE_TELEMETRY=0 +``` + +To opt-in again, you can set the variable to `1`. + +## Why reporting telemetry +Anonymous telemetry information enable us to continously improve the product and detect recurring problems to better serve all users. We collect aggregated information about general usage and errors. We do NOT collect any information of users' data records, datasets, or metadata information. + +## Sensitive data +We do not collect any piece of information related to the source data you store in Rubrix. We don't identify individual users. Your data does not leave your server at any time: + +* No dataset record is collected. +* No dataset names or metadata are collected. + +## Information reported +The following usage and error information is reported: + +* The code of the raised error +* The `user-agent` and `accept-language` http headers +* Task name and number of records for bulk operations +* An anonymous generated user uuid +* The rubrix version running the server +* The python version, e.g. `3.8.13` +* The system/OS name, such as `Linux`, `Darwin`, `Windows` +* The system’s release version, e.g. `Darwin Kernel Version 21.5.0: Tue Apr 26 21:08:22 PDT 2022; root:xnu-8020` +* The machine type, e.g. `AMD64` +* The underlying platform spec with as much useful information as possible. (ej. `macOS-10.16-x86_64-i386-64bit`) + + +This is performed by registering information from the following API methods: + +* `/api/me` +* `/api/dataset/{name}/{task}:bulk` +* Raised server API errors + + +For transparency, you can inspect the source code where this is performed here (add link to the source). + +If you have any doubts, don't hesitate to join our [Slack channel](https://join.slack.com/t/rubrixworkspace/shared_invite/zt-whigkyjn-a3IUJLD7gDbTZ0rKlvcJ5g) or open a GitHub issue. We'd be very happy to discuss about how we can improve this. diff --git a/pyproject.toml b/pyproject.toml index 88e29e50ff..45d743cb9f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,6 +66,8 @@ server = [ "passlib[bcrypt]~=1.7.4", # Info status "psutil ~= 5.8.0", + # Telemetry + "segment-analytics-python != 2.2.1" ] listeners = [ "schedule ~= 1.1.0", diff --git a/src/rubrix/server/apis/v0/handlers/text2text.py b/src/rubrix/server/apis/v0/handlers/text2text.py index d9f09fc6be..402c4b4c3f 100644 --- a/src/rubrix/server/apis/v0/handlers/text2text.py +++ b/src/rubrix/server/apis/v0/handlers/text2text.py @@ -69,7 +69,7 @@ response_model=BulkResponse, response_model_exclude_none=True, ) -def bulk_records( +async def bulk_records( name: str, bulk: Text2TextBulkRequest, common_params: CommonTaskHandlerDependencies = Depends(), @@ -100,7 +100,7 @@ def bulk_records( dataset.owner = owner datasets.create_dataset(user=current_user, dataset=dataset) - result = service.add_records( + result = await service.add_records( dataset=dataset, records=[ServiceText2TextRecord.parse_obj(r) for r in bulk.records], ) diff --git a/src/rubrix/server/apis/v0/handlers/text_classification.py b/src/rubrix/server/apis/v0/handlers/text_classification.py index cb18f787f3..2c68d9b917 100644 --- a/src/rubrix/server/apis/v0/handlers/text_classification.py +++ b/src/rubrix/server/apis/v0/handlers/text_classification.py @@ -126,7 +126,7 @@ async def bulk_records( user=current_user, dataset=dataset, records=records ) - result = service.add_records( + result = await service.add_records( dataset=dataset, records=records, ) diff --git a/src/rubrix/server/apis/v0/handlers/token_classification.py b/src/rubrix/server/apis/v0/handlers/token_classification.py index 969fb0e8b3..ea81e0e743 100644 --- a/src/rubrix/server/apis/v0/handlers/token_classification.py +++ b/src/rubrix/server/apis/v0/handlers/token_classification.py @@ -121,7 +121,7 @@ async def bulk_records( records=records, ) - result = service.add_records( + result = await service.add_records( dataset=dataset, records=records, ) diff --git a/src/rubrix/server/apis/v0/handlers/users.py b/src/rubrix/server/apis/v0/handlers/users.py index 14d5668b2b..f5e2b67b7a 100644 --- a/src/rubrix/server/apis/v0/handlers/users.py +++ b/src/rubrix/server/apis/v0/handlers/users.py @@ -13,8 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -from fastapi import APIRouter, Security +from fastapi import APIRouter, Request, Security +from rubrix.server.commons import telemetry from rubrix.server.security import auth from rubrix.server.security.model import User @@ -22,17 +23,18 @@ @router.get( - "/me", - response_model=User, - response_model_exclude_none=True, - operation_id="whoami", + "/me", response_model=User, response_model_exclude_none=True, operation_id="whoami" ) -async def whoami(current_user: User = Security(auth.get_user, scopes=[])): +async def whoami( + request: Request, current_user: User = Security(auth.get_user, scopes=[]) +): """ User info endpoint Parameters ---------- + request: + The original request current_user: The current request user @@ -41,4 +43,6 @@ async def whoami(current_user: User = Security(auth.get_user, scopes=[])): The current user """ + + await telemetry.track_login(request, username=current_user.username) return current_user diff --git a/src/rubrix/server/commons/telemetry.py b/src/rubrix/server/commons/telemetry.py new file mode 100644 index 0000000000..995a1e93f5 --- /dev/null +++ b/src/rubrix/server/commons/telemetry.py @@ -0,0 +1,122 @@ +import dataclasses +import logging +import platform +import uuid +from typing import Any, Dict, Optional + +import httpx +from fastapi import Request + +from rubrix.server.commons.models import TaskType +from rubrix.server.errors.base_errors import RubrixServerError +from rubrix.server.settings import settings + +try: + from analytics import Client +except ModuleNotFoundError: + # TODO: show some warning info + settings.enable_telemetry = False + Client = None + + +def _configure_analytics(disable_send: bool = False) -> Client: + API_KEY = settings.telemetry_key or "C6FkcaoCbt78rACAgvyBxGBcMB3dM3nn" + TELEMETRY_HOST = "https://api.segment.io" + + # Check host connection + httpx.options(TELEMETRY_HOST, timeout=1, verify=False) + + return Client( + write_key=API_KEY, + gzip=True, + host=TELEMETRY_HOST, + send=not disable_send, + max_retries=5, + ) + + +@dataclasses.dataclass +class _TelemetryClient: + + client: Client + + __INSTANCE__: "_TelemetryClient" = None + __server_id__: Optional[uuid.UUID] = dataclasses.field(init=False, default=None) + + @property + def server_id(self) -> uuid.UUID: + return self.__server_id__ + + @classmethod + def get(cls): + if settings.enable_telemetry: + if cls.__INSTANCE__ is None: + try: + cls.__INSTANCE__ = cls(client=_configure_analytics()) + except Exception as err: + logging.getLogger(__name__).warning( + f"Cannot initialize telemetry. Error: {err}. Disabling..." + ) + settings.enable_telemetry = False + return None + return cls.__INSTANCE__ + + def __post_init__(self): + + from rubrix import __version__ + + self.__server_id__ = uuid.UUID(int=uuid.getnode()) + self.__server_id_str__ = str(self.__server_id__) + self.__system_info__ = { + "system": platform.system(), + "machine": platform.machine(), + "platform": platform.platform(), + "python_version": platform.python_version(), + "sys_version": platform.version(), + "version": __version__, + } + + def track_data( + self, action: str, data: Dict[str, Any], include_system_info: bool = True + ): + event_data = data.copy() + self.client.track( + user_id=self.__server_id_str__, + event=action, + properties=event_data, + context=self.__system_info__ if include_system_info else {}, + ) + + +def _process_request_info(request: Request): + return { + header: request.headers.get(header) + for header in ["user-agent", "accept-language"] + } + + +async def track_error(error: RubrixServerError, request: Request): + client = _TelemetryClient.get() + if client: + client.track_data( + "ServerErrorFound", {"code": error.code, **_process_request_info(request)} + ) + + +async def track_bulk(task: TaskType, records: int): + client = _TelemetryClient.get() + if client: + client.track_data("LogRecordsRequested", {"task": task, "records": records}) + + +async def track_login(request: Request, username: str): + client = _TelemetryClient.get() + if client: + client.track_data( + "UserInfoRequested", + { + "is_default_user": username == "rubrix", + "user_hash": str(uuid.uuid5(namespace=client.server_id, name=username)), + **_process_request_info(request), + }, + ) diff --git a/src/rubrix/server/errors/api_errors.py b/src/rubrix/server/errors/api_errors.py index 688f49828d..7784d86224 100644 --- a/src/rubrix/server/errors/api_errors.py +++ b/src/rubrix/server/errors/api_errors.py @@ -5,6 +5,7 @@ from fastapi.exception_handlers import http_exception_handler from pydantic import BaseModel +from rubrix.server.commons import telemetry from rubrix.server.errors.adapter import exception_to_rubrix_error from rubrix.server.errors.base_errors import RubrixServerError @@ -40,6 +41,8 @@ class APIErrorHandler: async def common_exception_handler(request: Request, error: Exception): """Wraps errors as custom generic error""" rubrix_error = exception_to_rubrix_error(error) + await telemetry.track_error(rubrix_error, request=request) + return await http_exception_handler( request, RubrixServerHTTPException(rubrix_error) ) diff --git a/src/rubrix/server/server.py b/src/rubrix/server/server.py index d5f316cfc0..f69f8a2e37 100644 --- a/src/rubrix/server/server.py +++ b/src/rubrix/server/server.py @@ -16,7 +16,10 @@ """ This module configures the global fastapi application """ +import inspect import os +import sys +import warnings from pathlib import Path from brotli_asgi import BrotliMiddleware @@ -127,6 +130,35 @@ def configure_app_logging(app: FastAPI): version=str(rubrix_version), ) + +def configure_telemetry(app): + message = "\n" + message += inspect.cleandoc( + """ + Rubrix uses telemetry to report anonymous usage and error information. + + You can know more about what information is reported at: + + https://rubrix.readthedocs.io/en/stable/reference/telemetry.html + + Telemetry is currently enabled. If you want to disable it, you can configure + the environment variable before relaunching the server: + """ + ) + message += "\n\n " + message += ( + "#set RUBRIX_ENABLE_TELEMETRY=0" + if os.name == "nt" + else "$>export RUBRIX_ENABLE_TELEMETRY=0" + ) + message += "\n" + + @app.on_event("startup") + async def check_telemetry(): + if settings.enable_telemetry: + print(message, flush=True) + + for app_configure in [ configure_app_logging, configure_middleware, @@ -135,5 +167,6 @@ def configure_app_logging(app: FastAPI): configure_api_router, configure_app_statics, configure_app_storage, + configure_telemetry, ]: app_configure(app) diff --git a/src/rubrix/server/services/storage/service.py b/src/rubrix/server/services/storage/service.py index 72007fcc56..31883db7fe 100644 --- a/src/rubrix/server/services/storage/service.py +++ b/src/rubrix/server/services/storage/service.py @@ -2,6 +2,7 @@ from fastapi import Depends +from rubrix.server.commons import telemetry from rubrix.server.commons.config import TasksFactory from rubrix.server.daos.records import DatasetRecordsDAO from rubrix.server.services.datasets import ServiceDataset @@ -24,13 +25,15 @@ def get_instance( def __init__(self, dao: DatasetRecordsDAO): self.__dao__ = dao - def store_records( + async def store_records( self, dataset: ServiceDataset, records: List[ServiceRecord], record_type: Type[ServiceRecord], ) -> int: """Store a set of records""" + await telemetry.track_bulk(task=dataset.task, records=len(records)) + metrics = TasksFactory.get_task_metrics(dataset.task) if metrics: for record in records: diff --git a/src/rubrix/server/services/tasks/text2text/service.py b/src/rubrix/server/services/tasks/text2text/service.py index fdb503c8ab..b462aad6a2 100644 --- a/src/rubrix/server/services/tasks/text2text/service.py +++ b/src/rubrix/server/services/tasks/text2text/service.py @@ -60,12 +60,12 @@ def __init__( self.__storage__ = storage self.__search__ = search - def add_records( + async def add_records( self, dataset: ServiceText2TextDataset, records: List[ServiceText2TextRecord], ): - failed = self.__storage__.store_records( + failed = await self.__storage__.store_records( dataset=dataset, records=records, record_type=ServiceText2TextRecord, diff --git a/src/rubrix/server/services/tasks/text_classification/service.py b/src/rubrix/server/services/tasks/text_classification/service.py index cecc47c386..02e4fe3345 100644 --- a/src/rubrix/server/services/tasks/text_classification/service.py +++ b/src/rubrix/server/services/tasks/text_classification/service.py @@ -67,7 +67,7 @@ def __init__( self.__search__ = search self.__labeling__ = labeling - def add_records( + async def add_records( self, dataset: ServiceTextClassificationDataset, records: List[ServiceTextClassificationRecord], @@ -75,7 +75,7 @@ def add_records( # TODO(@frascuchon): This will moved to dataset settings validation once DatasetSettings join the game! self._check_multi_label_integrity(dataset, records) - failed = self.__storage__.store_records( + failed = await self.__storage__.store_records( dataset=dataset, records=records, record_type=ServiceTextClassificationRecord, diff --git a/src/rubrix/server/services/tasks/token_classification/service.py b/src/rubrix/server/services/tasks/token_classification/service.py index fc8bc24e2b..bb82a776e7 100644 --- a/src/rubrix/server/services/tasks/token_classification/service.py +++ b/src/rubrix/server/services/tasks/token_classification/service.py @@ -57,12 +57,12 @@ def __init__( self.__storage__ = storage self.__search__ = search - def add_records( + async def add_records( self, dataset: ServiceTokenClassificationDataset, records: List[ServiceTokenClassificationRecord], ): - failed = self.__storage__.store_records( + failed = await self.__storage__.store_records( dataset=dataset, records=records, record_type=ServiceTokenClassificationRecord, diff --git a/src/rubrix/server/settings.py b/src/rubrix/server/settings.py index 11068f38da..2a0e1707f7 100644 --- a/src/rubrix/server/settings.py +++ b/src/rubrix/server/settings.py @@ -75,24 +75,14 @@ class ApiSettings(BaseSettings): es_records_index_shards: int = 1 es_records_index_replicas: int = 0 - # TODO(@frascuchon): remove in v0.12.0 - disable_es_index_template_creation: bool = False metadata_fields_limit: int = Field( default=50, gt=0, le=100, description="Max number of fields in metadata" ) - @validator("disable_es_index_template_creation", always=True) - def check_index_template_creation_value(cls, value): + enable_telemetry: bool = True - if value is True: - cls.__LOGGER__.warning( - "The environment variable DISABLE_ES_INDEX_TEMPLATE_CREATION won't be used anymore.\n" - "If you want customize the dataset creation index, please refer documentation " - "https://rubrix.readthedocs.io/en/stable" - "/getting_started/advanced_setup_guides.html#change-elasticsearch-index-analyzers" - ) - return value + telemetry_key: Optional[str] = None @property def dataset_index_name(self) -> str: @@ -117,21 +107,32 @@ def obfuscated_elasticsearch(self) -> str: class Config: # TODO: include a common prefix for all rubrix env vars. + env_prefix = "RUBRIX_" fields = { - "elasticsearch_ca_path": { - "env": "RUBRIX_ELASTICSEARCH_CA_PATH", + # TODO(@frascuchon): Remove in 0.20.0 + "elasticsearch": { + "env": ["ELASTICSEARCH", f"{env_prefix}ELASTICSEARCH"], }, "elasticsearch_ssl_verify": { - "env": "RUBRIX_ELASTICSEARCH_SSL_VERIFY", + "env": [ + "ELASTICSEARCH_SSL_VERIFY", + f"{env_prefix}ELASTICSEARCH_SSL_VERIFY", + ] }, - "metadata_fields_limit": {"env": "RUBRIX_METADATA_FIELDS_LIMIT"}, - "namespace": { - "env": "RUBRIX_NAMESPACE", + "cors_origins": {"env": ["CORS_ORIGINS", f"{env_prefix}CORS_ORIGINS"]}, + "docs_enabled": {"env": ["DOCS_ENABLED", f"{env_prefix}DOCS_ENABLED"]}, + "es_records_index_shards": { + "env": [ + "ES_RECORDS_INDEX_SHARDS", + f"{env_prefix}ES_RECORDS_INDEX_SHARDS", + ] }, - "default_es_search_analyzer": { - "env": "RUBRIX_DEFAULT_ES_SEARCH_ANALYZER", + "es_records_index_replicas": { + "env": [ + "ES_RECORDS_INDEX_REPLICAS", + f"{env_prefix}ES_RECORDS_INDEX_SHARDS", + ] }, - "exact_es_search_analyzer": {"env": "RUBRIX_EXACT_ES_SEARCH_ANALYZER"}, } diff --git a/tests/conftest.py b/tests/conftest.py index 979135acc6..57520a1d67 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,6 +3,8 @@ from _pytest.logging import LogCaptureFixture from rubrix.client.sdk.users import api as users_api +from rubrix.server.commons import telemetry +from rubrix.server.settings import settings try: from loguru import logger @@ -17,7 +19,19 @@ @pytest.fixture -def mocked_client(monkeypatch) -> SecuredClient: +def telemetry_track_data(mocker): + + client = telemetry._TelemetryClient.get() + if client: + # Disable sending data for tests + client.client = telemetry._configure_analytics(disable_send=True) + spy = mocker.spy(client, "track_data") + + return spy + + +@pytest.fixture +def mocked_client(monkeypatch, telemetry_track_data) -> SecuredClient: with TestClient(app, raise_server_exceptions=False) as _client: client_ = SecuredClient(_client) diff --git a/tests/server/commons/__init__.py b/tests/server/commons/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/server/commons/test_telemetry.py b/tests/server/commons/test_telemetry.py new file mode 100644 index 0000000000..2ea25f6cc2 --- /dev/null +++ b/tests/server/commons/test_telemetry.py @@ -0,0 +1,51 @@ +import uuid + +import pytest +from fastapi import Request + +from rubrix.server.commons import telemetry +from rubrix.server.commons.models import TaskType +from rubrix.server.errors import RubrixServerError + +mock_request = Request(scope={"type": "http", "headers": {}}) + + +@pytest.mark.asyncio +async def test_track_login(telemetry_track_data): + await telemetry.track_login(request=mock_request, username="rubrix") + + current_server_id = telemetry._TelemetryClient.get().server_id + expected_event_data = { + "accept-language": None, + "is_default_user": True, + "user-agent": None, + "user_hash": str(uuid.uuid5(current_server_id, name="rubrix")), + } + telemetry_track_data.assert_called_once_with( + "UserInfoRequested", + expected_event_data, + ) + + +@pytest.mark.asyncio +async def test_track_bulk(telemetry_track_data): + task, records = TaskType.token_classification, 100 + + await telemetry.track_bulk(task=task, records=records) + telemetry_track_data.assert_called_once_with( + "LogRecordsRequested", {"task": task, "records": records} + ) + + +@pytest.mark.asyncio +async def test_track_error(telemetry_track_data): + error = RubrixServerError() + await telemetry.track_error(error, request=mock_request) + telemetry_track_data.assert_called_once_with( + "ServerErrorFound", + { + "accept-language": None, + "code": "rubrix.api.errors::RubrixServerError", + "user-agent": None, + }, + ) diff --git a/tests/server/conftest.py b/tests/server/conftest.py new file mode 100644 index 0000000000..878090c624 --- /dev/null +++ b/tests/server/conftest.py @@ -0,0 +1,3 @@ +import pytest + +from rubrix.server.commons import telemetry diff --git a/tests/server/text_classification/test_api.py b/tests/server/text_classification/test_api.py index 1e8a12248b..7db090985b 100644 --- a/tests/server/text_classification/test_api.py +++ b/tests/server/text_classification/test_api.py @@ -119,7 +119,7 @@ def test_create_records_for_text_classification_with_multi_label(mocked_client): assert results.records[0].predicted is None -def test_create_records_for_text_classification(mocked_client): +def test_create_records_for_text_classification(mocked_client, telemetry_track_data): dataset = "test_create_records_for_text_classification" assert mocked_client.delete(f"/api/datasets/{dataset}").status_code == 200 tags = {"env": "test", "class": "text classification"} @@ -178,6 +178,8 @@ def test_create_records_for_text_classification(mocked_client): "words": {"data": 1}, } + telemetry_track_data.assert_called_once() + def test_partial_record_update(mocked_client): name = "test_partial_record_update"