This notebook fetches the BDM Trial specs from a Google Sheet and generates the documentation data as YAML files for Quarto.

The generated YAML files are saved in the `assets/auto-generated/` folder, inside a per-category subfolder when the table has a category.

Install pandas, gspread, PyYAML, and requests before running this notebook:

```bash
pip install pandas gspread PyYAML requests
```


In [25]:
# Imports and setup

import os
from pathlib import Path
import pandas as pd
import yaml

In [26]:
# Parameters

# Root folder that receives every generated file.
OUTPUT_DIR = Path('assets/auto-generated/')

# Google Sheet id of the trial specs, read from the environment so it is not
# hard-coded in the notebook; falls back to an empty string when unset.
TRIAL_SPEC_SHEET_ID = os.getenv('TRIAL_SPEC_SHEET_ID', '')

In [27]:
# Helper functions

def camel_to_dash(camel_case_string):
  """Converts a CamelCase string to a dash-separated lower case string.

  Spaces are removed first, then a dash is inserted before every run of
  upper-case letters that follows a non-upper-case character. A run of
  upper-case letters is never split, so 'ABC' becomes 'abc' and
  'getHTTPResponse' becomes 'get-httpresponse'.

  Args:
    camel_case_string: The CamelCase string to convert.

  Returns:
    The dash-separated lower-case string. A blank (empty or
    whitespace-only) input yields an empty string.

  Raises:
    ValueError: If camel_case_string is None or NaN.
  """
  import re

  if pd.isna(camel_case_string):
    raise ValueError('camel_case_string cannot be None')

  if camel_case_string.strip() == '':
    return camel_case_string.strip()

  compact = camel_case_string.replace(' ', '')

  # The lookbehind requires a preceding character that is not upper case, so
  # neither the start of the string nor the middle of an upper-case run can
  # begin a match. (A plain "not at start" check would still match from the
  # second letter of a leading run, turning 'ABC' into 'a-bc'.)
  dashed = re.sub(r'(?<=[^A-Z])([A-Z]+)', r'-\1', compact)
  return dashed.lower()

def get_sheet(sheet_id: str, sheet_name, backend: str='requests',
              timeout: float=30):
    """Download one tab of a Google Sheets document as a pandas DataFrame.

    The tab is requested through the spreadsheet's ``gviz`` CSV export URL.

    Args:
        sheet_id: Identifier of the Google Sheets document.
        sheet_name: Name of the tab to download. It is URL-encoded, so names
            containing spaces or reserved characters are supported.
        backend: ``'requests'`` (case-insensitive) downloads the CSV with the
            ``requests`` package; any other value lets ``pd.read_csv`` fetch
            the URL itself.
        timeout: Seconds to wait for the server when using the ``requests``
            backend.

    Returns:
        The parsed sheet as a DataFrame.

    Raises:
        ValueError: If ``sheet_id`` is empty.
        requests.HTTPError: If the ``requests`` backend receives an HTTP
            error status.
    """
    from urllib.parse import quote

    if not sheet_id:
        raise ValueError('sheet_id is required')

    # Encode both parts so a tab name such as "My Sheet" or "A&B" cannot
    # break the query string.
    url = ('https://docs.google.com/spreadsheets/d/'
           '{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}')
    url = url.format(sheet_id=quote(str(sheet_id), safe=''),
                     sheet_name=quote(str(sheet_name), safe=''))

    if backend.lower() == 'requests':
        import requests
        from io import StringIO

        response = requests.get(url, timeout=timeout)
        # Fail loudly instead of parsing an error page as CSV.
        response.raise_for_status()
        data = pd.read_csv(StringIO(response.content.decode('utf-8')))
    else:
        data = pd.read_csv(url)
    return data

def _split_categories(cats) -> list:
    """Split a ';'-separated category cell into a list of trimmed names."""
    if pd.isna(cats):
        return []
    return [part.strip() for part in str(cats).split(';') if part.strip()]


def convert_sheet_to_yaml(table_info: pd.Series, root_dir) -> Path:
    """Fetch one spec sheet and write its rows to a YAML file.

    The sheet named ``table_info['name']`` is downloaded with ``get_sheet``
    (using ``TRIAL_SPEC_SHEET_ID``) and dumped as a list of records.

    The file is named after the dash-cased table name. When
    ``table_info['category']`` is set, the file is placed in a sub-folder
    named after the dash-cased category.

    Args:
        table_info: Row describing the table; reads ``'name'`` and
            ``'category'``.
        root_dir: Folder under which the YAML file is written.

    Returns:
        The path of the written YAML file.
    """

    assert table_info['name'] is not None, 'table name is required'

    root_dir = Path(root_dir)
    file_name = camel_to_dash(table_info['name']) + '.yml'
    output_path = root_dir / file_name

    # place the file in a category sub-folder if a category exists
    category = table_info.get('category')
    if pd.notna(category) and len(category) > 0:
        output_path = root_dir / camel_to_dash(category) / file_name

    df = get_sheet(TRIAL_SPEC_SHEET_ID, table_info['name'])

    # to match quarto listing categories
    if 'category' in df.columns:
        df = df.rename(columns={'category': 'categories'})
        df['categories'] = df['categories'].apply(_split_categories)

    # keep an (empty) description on every item when the column exists
    if 'description' in df.columns:
        df['description'] = df['description'].fillna('')

    # drop top-level nan values from each item; lists are always kept
    items = [
        {key: value for key, value in record.items()
         if isinstance(value, list) or not pd.isna(value)}
        for record in df.to_dict(orient='records')
    ]

    # Only touch the file system once the data is ready, so a failed download
    # or conversion does not leave an empty file behind.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.safe_dump(items, f, indent=2, sort_keys=False)

    return output_path

In [28]:
# Run the pipeline

tables_raw = get_sheet(TRIAL_SPEC_SHEET_ID, 'Tables')

# The output folder may not exist yet on a fresh checkout.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Index of all tables, written through a context manager so the file handle
# is flushed and closed.
with open(OUTPUT_DIR / 'trial-tables.yml', 'w', encoding='utf-8') as index_file:
    yaml.safe_dump(tables_raw.to_dict(orient='records'),
                   index_file, indent=2, sort_keys=False)

# One row per (published table, category). Splitting on ';' and trimming
# accepts both "a;b" and "a; b"; assign() avoids writing into a filtered view.
tables = (
    tables_raw
    .query('publish == True')
    .assign(category=lambda t: t['category'].str.split(';'))
    .explode('category')
    .assign(category=lambda t: t['category'].str.strip())
    .reset_index(drop=True)
)

tables.apply(convert_sheet_to_yaml, axis=1, root_dir=OUTPUT_DIR) # type: ignore

0           assets/auto-generated/trial/instrument.yml
1                assets/auto-generated/trial/trial.yml
2             assets/auto-generated/trial/stimulus.yml
3    assets/auto-generated/trial/stimulus-component...
4                assets/auto-generated/trial/click.yml
5               assets/auto-generated/trial/option.yml
6     assets/auto-generated/trial/option-component.yml
7                   assets/auto-generated/glossary.yml
dtype: object