Skip to content

Commit

Permalink
refactor: 🔧 clean up config_hub.py for biothings sdk 0.11.x
Browse files Browse the repository at this point in the history
  • Loading branch information
newgene committed Jul 18, 2022
1 parent 017b1b2 commit 91f51fa
Showing 1 changed file with 25 additions and 164 deletions.
189 changes: 25 additions & 164 deletions src/config_hub.py
Original file line number Diff line number Diff line change
@@ -1,77 +1,27 @@
# ######### #
# HUB VARS #
# ######### #
from biothings.utils.loggers import setup_default_log
from biothings.utils.configuration import ConfigurationError, ConfigurationDefault, ConfigurationValue
import biothings.utils.jsondiff
import importlib
import logging
import os

# db containing the following (internal use)
DATA_HUB_DB_DATABASE = "gene_hubdb"
# for metadata of each src collections
DATA_SRC_MASTER_COLLECTION = 'src_master'
DATA_SRC_DUMP_COLLECTION = 'src_dump' # for src data download information
DATA_SRC_BUILD_COLLECTION = 'src_build' # for src data build information
DATA_PLUGIN_COLLECTION = 'data_plugin' # for data plugins information
# for api information (running under hub control)
API_COLLECTION = 'api'
CMD_COLLECTION = 'cmd' # for launched/running commands in shell
# for launched/running commands in shell
EVENT_COLLECTION = 'event'
# Refer to biothings.hub.default_config for all configurable settings

# where to store info about processes launched by the hub
RUN_DIR = './run'
DATA_SRC_SERVER = 'localhost'
DATA_SRC_PORT = 27017
DATA_SRC_DATABASE = 'mygene_src'

# Species table: common name -> NCBI taxonomy id ("tax_id") and, where one is
# listed, the genome assembly name ("assembly"). "thale-cress" has no assembly.
TAXONOMY = {
    "human": {"tax_id": "9606", "assembly": "hg38"},
    "mouse": {"tax_id": "10090", "assembly": "mm10"},
    "rat": {"tax_id": "10116", "assembly": "rn4"},
    "fruitfly": {"tax_id": "7227", "assembly": "dm3"},
    "nematode": {"tax_id": "6239", "assembly": "ce10"},
    "zebrafish": {"tax_id": "7955", "assembly": "zv9"},
    "thale-cress": {"tax_id": "3702"},
    "frog": {"tax_id": "8364", "assembly": "xenTro3"},
    "pig": {"tax_id": "9823", "assembly": "susScr2"},
}

# reporting diff results, number of IDs to consider (to avoid too much mem usage)
MAX_REPORTED_IDS = 1000
# for diff updates, number of IDs randomly picked as examples when rendering the report
MAX_RANDOMLY_PICKED = 10
# size of a diff file when in memory (used when merged/reduced)
MAX_DIFF_SIZE = 50 * 1024**2 # 50MiB (~1MiB on disk when compressed)

# cache file format ("": ascii/text uncompressed, or "gz|zip|xz")
CACHE_FORMAT = "xz"

# How much memory hub is allowed to use:
# - "auto", let hub decide (will use 50%-60% of available RAM)
# - None: no limit
# - otherwise specify a number in bytes
HUB_MAX_MEM_USAGE = None

# Max number of *processes* hub can access to run jobs: a quarter of the CPUs.
# os.cpu_count() may return None when the CPU count cannot be determined, and
# a quarter of fewer than 4 CPUs rounds down to 0, so fall back to 1 CPU and
# clamp the result to at least one worker.
HUB_MAX_WORKERS = max(1, (os.cpu_count() or 1) // 4)
# Sync jobs are given the same process budget as the hub itself.
MAX_SYNC_WORKERS = HUB_MAX_WORKERS
DATA_TARGET_SERVER = 'localhost'
DATA_TARGET_PORT = 27017
DATA_TARGET_DATABASE = 'mygene'

# Max queued jobs in job manager.
# This shouldn't be 0, to make sure a job is pending and ready to be processed
# at any time (avoiding job submission preparation), but also not a huge number
# (as any pending job will consume some memory).
# os.cpu_count() may return None when the CPU count cannot be determined;
# fall back to 1 CPU instead of failing with a TypeError at import time.
MAX_QUEUED_JOBS = (os.cpu_count() or 1) * 4
HUB_DB_BACKEND = {
"module": "biothings.utils.mongo",
"uri": "mongodb://localhost:27017",
}
DATA_HUB_DB_DATABASE = "mygene_hubdb"

# Hub environment (like, prod, dev, ...)
# Used to generate remote metadata file, like "latest.json", "versions.json"
# If non-empty, this constant will be used to generate those url, as a prefix
# with "-" between. So, if "dev", we'll have "dev-latest.json", etc...
# "" means production
HUB_ENV = ""

# Hub name/icon url/version, for display purpose
HUB_NAME = "MyGene"
HUB_ICON = "http://mygene.info/static/img/mygene-logo-shiny.svg"
HUB_NAME = "MyGene Hub (prod)"
HUB_ICON = "https://mygene.info/static/img/mygene-logo-shiny.svg"

# Pre-prod/test ES definitions
INDEX_CONFIG = {
Expand Down Expand Up @@ -205,19 +155,6 @@
}
}

SLACK_WEBHOOK = None

# SSH port for hub console
HUB_SSH_PORT = 7022
HUB_API_PORT = 7080
READONLY_HUB_API_PORT = 7081

################################################################################
# HUB_PASSWD
################################################################################
# The format is a dictionary of 'username': 'cryptedpassword'
# Generate crypted passwords with 'openssl passwd -crypt'
HUB_PASSWD = {"guest": "9RKfd8gDuNf0Q"}

# cached data (if None, caches won't be used at all)
CACHE_FOLDER = None
Expand Down Expand Up @@ -262,12 +199,6 @@
#AUTOHUB_ES_HOST = "localhost:9200"


# don't bother with elements order in a list when diffing,
# mygene optimized uploaders can't produce different results
# when parsing data (parallelization)
importlib.reload(biothings.utils.jsondiff)
biothings.utils.jsondiff.UNORDERED_LIST = True

########################################
# APP-SPECIFIC CONFIGURATION VARIABLES #
########################################
Expand All @@ -282,84 +213,14 @@
# *must* be defined
#

# Individual source database connection
DATA_SRC_SERVER = ConfigurationError("Define hostname for source database")
DATA_SRC_PORT = ConfigurationError("Define port for source database")
DATA_SRC_DATABASE = ConfigurationError("Define name for source database")
DATA_SRC_SERVER_USERNAME = ConfigurationError(
"Define username for source database connection (or None if not needed)")
DATA_SRC_SERVER_PASSWORD = ConfigurationError(
"Define password for source database connection (or None if not needed)")

# Target (merged collection) database connection
DATA_TARGET_SERVER = ConfigurationError(
"Define hostname for target database (merged collections)")
DATA_TARGET_PORT = ConfigurationError(
"Define port for target database (merged collections)")
DATA_TARGET_DATABASE = ConfigurationError(
"Define name for target database (merged collections)")
DATA_TARGET_SERVER_USERNAME = ConfigurationError(
"Define username for target database connection (or None if not needed)")
DATA_TARGET_SERVER_PASSWORD = ConfigurationError(
"Define password for target database connection (or None if not needed)")

HUB_DB_BACKEND = ConfigurationError("Define Hub DB connection")
# Internal backend. Default to mongodb
# For now, other options are: mongodb, sqlite3, elasticsearch
# HUB_DB_BACKEND = {
# "module" : "biothings.utils.sqlite3",
# "sqlite_db_folder" : "./db",
# }
# HUB_DB_BACKEND = {
# "module" : "biothings.utils.mongo",
# "uri" : "mongodb://localhost:27017",
# #"uri" : "mongodb://user:passwd@localhost:27017", # mongodb std URI
# }
# HUB_DB_BACKEND = {
# "module" : "biothings.utils.es",
# "host" : "localhost:9200",
# }

# List of package paths for active datasources (expect data-plugin based sources)
ACTIVE_DATASOURCES = ConfigurationDefault(
default=[],
desc="List of package paths for active datasources")

# Path to a folder to store all downloaded files, logs, caches, etc...
DATA_ARCHIVE_ROOT = ConfigurationError(
"Define path to folder which will contain all downloaded data, cache files, etc...")

# Path to a folder to store all 3rd party parsers, dumpers, etc...
DATA_PLUGIN_FOLDER = ConfigurationDefault(
default="./plugins",
desc="Define path to folder which will contain all 3rd party parsers, dumpers, etc...")

# Path to folder containing diff files
DIFF_PATH = ConfigurationDefault(
default=ConfigurationValue("""os.path.join(DATA_ARCHIVE_ROOT,"diff")"""),
desc="Define path to folder which will contain output files from diff")
# Usually inside DATA_ARCHIVE_ROOT
#DIFF_PATH = os.path.join(DATA_ARCHIVE_ROOT,"diff")

# Path to folder containing release note files
RELEASE_PATH = ConfigurationDefault(
default=ConfigurationValue(
"""os.path.join(DATA_ARCHIVE_ROOT,"release")"""),
desc="Define path to folder which will contain release files")

# Usually inside DATA_ARCHIVE_ROOT
#RELEASE_PATH = os.path.join(DATA_ARCHIVE_ROOT,"release")

# this dir must be created manually
LOG_FOLDER = ConfigurationDefault(
default=ConfigurationValue("""os.path.join(DATA_ARCHIVE_ROOT,"logs")"""),
desc="Define path to folder which will contain log files")
# Usually inside DATA_ARCHIVE_ROOT
#LOG_FOLDER = os.path.join(DATA_ARCHIVE_ROOT,'logs')

# default hub logger
logger = ConfigurationDefault(
default=logging,
desc="Provide a default hub logger instance (use setup_default_log(name,log_folder)")
# Usually use default setup
#logger = setup_default_log("hub", LOG_FOLDER)
# Species table: common name -> NCBI taxonomy id ("tax_id") and, where one is
# listed, the genome assembly name ("assembly"). "thale-cress" has no assembly.
TAXONOMY = {
    "human": {"tax_id": "9606", "assembly": "hg38"},
    "mouse": {"tax_id": "10090", "assembly": "mm10"},
    "rat": {"tax_id": "10116", "assembly": "rn4"},
    "fruitfly": {"tax_id": "7227", "assembly": "dm3"},
    "nematode": {"tax_id": "6239", "assembly": "ce10"},
    "zebrafish": {"tax_id": "7955", "assembly": "zv9"},
    "thale-cress": {"tax_id": "3702"},
    "frog": {"tax_id": "8364", "assembly": "xenTro3"},
    "pig": {"tax_id": "9823", "assembly": "susScr2"},
}

0 comments on commit 91f51fa

Please sign in to comment.