This notebook fetches the BDM Trial specs from a Google Sheet and generates one YAML file per spec table; these files are the source for the Quarto documentation.

The generated YAML files are saved under the `assets/auto-generated/` folder (see `OUTPUT_DIR`), in a subfolder named after the table's category when it has one.

Install pandas, requests, gspread, and PyYAML before running this notebook:

```bash
pip install pandas requests gspread PyYAML
```


In [69]:
# Imports and setup

import os
from pathlib import Path
import pandas as pd
import yaml

In [70]:
# Parameters

# Directory that receives the generated files.
OUTPUT_DIR = Path('assets/auto-generated/')

# ID of the Google Sheet holding the trial specs, taken from the environment
# (empty string when the variable is not set).
TRIAL_SPEC_SHEET_ID = os.getenv('TRIAL_SPEC_SHEET_ID', '')

In [71]:
# Helper functions

def camel_to_dash(camel_case_string):
  """Converts a CamelCase string to a dash-separated lower case string.

  Spaces are removed first, then a dash is inserted at every word boundary.
  A run of capitals is treated as a single acronym, so 'HTMLParser' becomes
  'html-parser' (not 'h-tmlparser') while 'StimulusComponent' still becomes
  'stimulus-component'.

  Args:
    camel_case_string: The CamelCase string to convert.

  Returns:
    The dash-separated lower-case string. A blank (empty or whitespace-only)
    input yields an empty string.

  Raises:
    ValueError: If camel_case_string is missing (None or NaN).
  """
  import re

  if pd.isna(camel_case_string):
    raise ValueError('camel_case_string cannot be None')

  stripped = camel_case_string.strip()
  if stripped == '':
    return stripped

  # Drop spaces so that 'Option Component' and 'OptionComponent' agree.
  compact = camel_case_string.replace(' ', '')

  # Insert a dash at two kinds of zero-width boundaries:
  #   1. a non-capital followed by a capital       ("stimulus|Component")
  #   2. inside a capital run, before the capital
  #      that starts the next word                 ("HTML|Parser")
  # Both lookbehinds need a preceding character, so no dash is ever added
  # at the start of the string.
  s = re.sub(r'(?<=[^A-Z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])', '-', compact)
  return s.lower()

def get_sheet(sheet_id: str, table_name, backend: str='requests',
              timeout: float=30.0):
    """Download one tab of a Google Sheets document as a pandas DataFrame.

    The tab is requested through the spreadsheet's CSV export endpoint
    (``gviz/tq?tqx=out:csv``) and parsed with ``pd.read_csv``.

    Args:
        sheet_id: ID of the Google Sheets document.
        table_name: Name of the tab to download. It is URL-encoded, so names
            containing spaces, ``&`` or ``#`` are requested correctly.
        backend: ``'requests'`` (case-insensitive) downloads the CSV with the
            ``requests`` package; any other value lets pandas open the URL.
        timeout: Seconds to wait for the server when using the ``requests``
            backend.

    Returns:
        The parsed sheet as a DataFrame.

    Raises:
        ValueError: If ``sheet_id`` is empty.
        requests.HTTPError: If the ``requests`` backend receives an HTTP
            error status.
    """
    from urllib.parse import quote

    if not sheet_id:
        raise ValueError('sheet_id is required')

    url = ('https://docs.google.com/spreadsheets/d/'
           '{sheet_id}/gviz/tq?tqx=out:csv&sheet={table_name}')
    # safe='' also escapes '/', so the tab name can never alter the URL shape.
    url = url.format(sheet_id=sheet_id,
                     table_name=quote(str(table_name), safe=''))

    if backend.lower() == 'requests':
        import requests
        from io import StringIO
        response = requests.get(url, timeout=timeout)
        # Fail loudly rather than feeding an HTTP error page to the CSV parser.
        response.raise_for_status()
        data = pd.read_csv(StringIO(response.content.decode('utf-8')))
    else:
        data = pd.read_csv(url)
    return data


def convert_sheet_to_yaml(table_info: pd.Series, root_dir, sheet_id=None) -> Path:
    """Download one spec table and write it to a YAML file grouped by category.

    The sheet tab named ``table_info['table_name']`` is downloaded, its rows
    are grouped by their ``category`` column (in order of first appearance),
    and the result is dumped as a list of
    ``{'category': ..., 'variables': [row, ...]}`` mappings. The ``category``
    column itself is left out of each row.

    The file is named after the dash-cased table name, e.g.
    ``StimulusComponent`` -> ``stimulus-component.yml``. When
    ``table_info['category']`` is set, the file goes into a sub-directory
    named after the dash-cased category.

    Args:
        table_info: Row describing the table; must provide ``table_name`` and
            ``category`` entries.
        root_dir: Directory (``str`` or ``Path``) under which the file is
            written. Missing directories are created.
        sheet_id: Google Sheets document ID. Defaults to the module-level
            ``TRIAL_SPEC_SHEET_ID``.

    Returns:
        Path of the written YAML file.

    Raises:
        ValueError: If ``table_info['table_name']`` is missing (None or NaN).
    """
    table_name = table_info['table_name']
    if pd.isna(table_name):
        raise ValueError('table_name is required')

    root_dir = Path(root_dir)
    file_name = camel_to_dash(table_name) + '.yml'
    output_path = root_dir / file_name

    # Nest the file under its category directory when a category is given.
    table_category = table_info['category']
    if pd.notna(table_category) and len(table_category) > 0:
        output_path = root_dir / camel_to_dash(table_category) / file_name

    if sheet_id is None:
        sheet_id = TRIAL_SPEC_SHEET_ID
    df = get_sheet(sheet_id, table_name)

    # Build the whole payload before touching the file system, so a failed
    # download or a malformed sheet never leaves an empty/truncated file.
    # NOTE: groupby skips rows whose 'category' cell is empty (NaN).
    data = [
        {
            'category': category,
            'variables': rows.drop(columns='category').to_dict(orient='records'),
        }
        for category, rows in df.groupby('category', sort=False)
    ]

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(
            data,
            f, default_flow_style=False, indent=2, sort_keys=False)

    return output_path


In [72]:
# Run the pipeline

# The 'Tables' tab is the index of every spec table to export.
tables_index = get_sheet(TRIAL_SPEC_SHEET_ID, 'Tables')

# A table may list several categories separated by '; ':
# emit one row (and therefore one output file) per category.
tables = (
    tables_index
    .assign(category=tables_index['category'].str.split('; '))
    .explode('category')
    .reset_index(drop=True)
)

# Write one YAML file per row; the displayed result is the list of paths.
tables.apply(convert_sheet_to_yaml, axis=1, root_dir=OUTPUT_DIR) # type: ignore

0           assets/auto-generated/trial/instrument.yml
1                assets/auto-generated/trial/trial.yml
2             assets/auto-generated/trial/stimulus.yml
3    assets/auto-generated/trial/stimulus-component...
4                assets/auto-generated/trial/click.yml
5               assets/auto-generated/trial/option.yml
6     assets/auto-generated/trial/option-component.yml
7                 assets/auto-generated/vocabulary.yml
dtype: object