This notebook fetches BDM Trial specs from the Google Sheet and then generates documentations in quarto markdown format.

The generated markdown files are saved in the `assets/auto-generated/yml/` folder.

Install pandas, gspread, and PyYAML before running this notebook:

```bash
pip install pandas gspread PyYAML
```

You also need to set `TRIAL_SPEC_SHEET_ID` in the environment variable to the Google Sheet ID of the BDM Trial specs.

In [99]:
# Imports and setup

import os
from pathlib import Path
import pandas as pd
import yaml

In [100]:
# Parameters

OUTPUT_DIR = Path('assets/auto-generated/')  # where to write the generated files
TRIAL_SPEC_SHEET_ID = os.environ.get('TRIAL_SPEC_SHEET_ID', '')

In [101]:
if not (OUTPUT_DIR / '_WARNING_').exists():
    print('The output directory is manually edited.'
          'Skipping auto-generation.')
    exit(1)

The output directory is manually edited.Skipping auto-generation.


In [102]:
# Helper functions

def camel_to_dash(camel_case_string):
  """Converts a CamelCase string to a dash-separated lower case string.

  Args:
    camel_case_string: The CamelCase string to convert.

  Returns:
    The dash-separated lower-case string.
  """
  import re

  if pd.isna(camel_case_string):
    raise ValueError('camel_case_string cannot be None')

  if camel_case_string.strip() == '':
    return camel_case_string.strip()

  # replace spaces with empty string, and prefix uppercase letters with a dash
  s = re.sub('(?!^)([A-Z]+)', r'-\1', camel_case_string.replace(' ', ''))
  return s.lower()

def get_sheet(sheet_id: str, sheet_name, backend: str='requests'):
    """Download a Google Sheets table as a pandas DataFrame."""
    url = ('https://docs.google.com/spreadsheets/d/'
           '{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}')
    url = url.format(sheet_id=sheet_id, sheet_name=sheet_name)
    if backend.lower() == 'requests':
      import requests
      from io import StringIO
      content = requests.get(url).content
      data = pd.read_csv(StringIO(content.decode('utf-8')))
    else:
      data = pd.read_csv(url)
    return data

def convert_sheet_to_yaml(table_info: pd.Series, root_dir) -> Path:
    """Write a pandas DataFrame to a CSV file."""

    assert table_info['name'] is not None, 'table name is required'

    file_name = camel_to_dash(table_info['name']) + '.yml'
    output_path = root_dir / file_name

    # append the category to the file name if it exists
    if (pd.notna(table_info['category']) and len(table_info['category']) > 0):
      category = camel_to_dash(table_info['category'])
      output_path = root_dir / category / file_name

    output_path.parent.mkdir(parents=True, exist_ok=True)

    df = get_sheet(TRIAL_SPEC_SHEET_ID, table_info['name'])

    with open(output_path, 'w') as f:

      # to match quarto listing categories
      if 'category' in df.columns:
        df.rename(columns={'category': 'categories'}, inplace=True)
        df['categories'] = df['categories'].apply(
          lambda cats: cats.split(';') if pd.notna(cats) else [])

      df['description'] = df['description'].fillna('')
      df['notes'] = df['notes'].apply(lambda x:
        x.split('\n\n') if pd.notna(x) else None)
      items = df.to_dict(orient='records')

      # recursively drop nan values from the items
      for item in items:
        keys_to_delete = [key for key in item.keys()
                          if not isinstance(item[key], list)
                             and pd.isna(item[key])]
        for key in keys_to_delete:
          del item[key]

      yaml.safe_dump(
        items,
        f, indent=2, sort_keys=False)

    return output_path

In [103]:
# running gsheet2yaml pipeline

if OUTPUT_DIR.exists() and not (OUTPUT_DIR / '.lock').exists():
    raise Exception(
        'The output directory has been manually edited. '
        'Skipping auto-generation.')

OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

tables = get_sheet(TRIAL_SPEC_SHEET_ID, 'Tables')
tables.reset_index(drop=False, inplace=True)
yaml.safe_dump(tables.to_dict(orient='records'),
               open(OUTPUT_DIR / 'trial-tables.yml', 'w'), indent=2, sort_keys=False)
tables = tables.query('publish == True')
tables['category'] = tables['category'].str.split('; ')
tables = tables.explode('category').reset_index(drop=True)

with open(OUTPUT_DIR / '.lock', 'w') as f:
    f.write('WARNING: THIS IS AN AUTO-GENERATED DIRECTORY.\n'
            'EITHER DO NOT MANUALLY EDIT ITS CONTENT OR\n'
            'REMOVE THIS FILE TO DISABLE AUTO-GENERATION.')

tables.apply(convert_sheet_to_yaml, axis=1, root_dir=OUTPUT_DIR) # type: ignore

Exception: The output directory has been manually edited. Skipping auto-generation.

## YAML to QMD

In [104]:
qmd_template = \
"""---
title: '<i class="bi bi-grid-3x3"></i> {title}'
subtitle: "{subtitle}"
order: {order}
toc: false
listing: 
  template: {ejs_template_path}
  field-required: [variable_name, categories, description]
  filter-ui: true
  sort: false
  sort-ui: false
  categories: numbered
  page-size: 10000
  contents:
    - "{yml_path}"
---

"""

In [106]:
INPUT_DIR = Path('assets/auto-generated/trials/')
QMD_OUTPUT_DIR = Path('reference/trials/')

if QMD_OUTPUT_DIR.exists() and not (QMD_OUTPUT_DIR / '.lock').exists():
    raise Exception(
        'The output directory has been manually edited.'
        'Skipping auto-generation.')

tables_df = pd.DataFrame(
    yaml.safe_load(open('assets/auto-generated/trial-tables.yml'))
)
tables_df.set_index('name', inplace=True)

EJS_TEMPLATE = Path('assets/templates/trial-table.ejs')

for yml_file in Path(INPUT_DIR).rglob('*.yml'):
    table_name = yml_file.stem.title().replace('-', '')
    table_title = tables_df.loc[table_name, 'label']
    table_description = tables_df.loc[table_name, 'description']

    # handle @Table annotations in the description
    table_description = table_description.replace(
        '@', '<i class="bi bi-grid-3x3"></i> ')

    qmd_content = qmd_template.format(
        title=table_title,
        subtitle=table_description,
        order=tables_df.loc[table_name, 'index'],
        ejs_template_path=EJS_TEMPLATE.relative_to(QMD_OUTPUT_DIR, walk_up=True),
        yml_path=yml_file.relative_to(QMD_OUTPUT_DIR, walk_up=True)
    )

    qmd_file = QMD_OUTPUT_DIR / yml_file.with_suffix('.qmd').name
    qmd_file.parent.mkdir(parents=True, exist_ok=True)
    with open(qmd_file, 'w') as f:
        f.write(qmd_content)

# Write a warning file
with open(QMD_OUTPUT_DIR / '.lock', 'w') as f:
    f.write('WARNING: THIS IS AN AUTO-GENERATED DIRECTORY.\n'
            'EITHER DO NOT MANUALLY EDIT ITS CONTENT OR\n'
            'REMOVE THIS FILE TO DISABLE AUTO-GENERATION.')

print('Done!')

Exception: The output directory has been manually edited.Skipping auto-generation.