This notebook fetches the BDM Trial specs from the Google Sheet and generates documentation from them in Quarto Markdown (`.qmd`) format.

The generated markdown files are saved in the `spec/auto-generated` folder.

Install pandas, requests, gspread, and tabulate before running this notebook (requests is used for the download; tabulate is needed by `DataFrame.to_markdown`).

In [None]:
# Setup
import pandas as pd
from pathlib import Path


# Constants
# Folder the generated .qmd files are written to (category sub-folders are
# created beneath it by sheet_to_qmd).
OUTPUT_DIR = Path('spec/auto-generated/')  # where to write the generated files
# ID of the Google Sheets document that get_sheet downloads tabs from.
TRIAL_SPEC_SHEET_ID = '1yF34tvpxHtyNfVvXMdUXIVyS1UkDW7Bc6IaonI-I2vQ'

# Helpers

def camel_to_dash(camel_case_string):
  """Turn a CamelCase name into a lower-case, dash-separated slug.

  Spaces are removed first, then a dash is inserted in front of every run of
  upper-case letters that does not start at the very beginning of the string,
  and finally the whole result is lower-cased.

  Args:
    camel_case_string: The CamelCase text to convert.

  Returns:
    The slug, e.g. 'TrialSpec' -> 'trial-spec'. A blank or whitespace-only
    input yields an empty string.

  Raises:
    ValueError: If camel_case_string is None.
  """
  import re

  if camel_case_string is None:
    raise ValueError('camel_case_string should not be None')

  trimmed = camel_case_string.strip()
  if not trimmed:
    # nothing but whitespace: hand back the stripped (empty) text
    return trimmed

  # spaces never survive into the slug
  compact = camel_case_string.replace(' ', '')
  # (?!^) keeps the very first character from receiving a leading dash
  dashed = re.sub('(?!^)([A-Z]+)', r'-\1', compact)
  return dashed.lower()

def get_sheet(sheet_id: str, table_name: str, backend: str='requests'):
    """Download one tab of a Google Sheets document as a pandas DataFrame.

    The tab is requested through the document's ``gviz`` CSV export URL and
    the response is parsed with ``pd.read_csv``.

    Args:
      sheet_id: ID of the Google Sheets document (the URL segment after
        ``/spreadsheets/d/``).
      table_name: Name of the tab to download. It is percent-encoded before
        being placed in the query string, so names containing spaces, ``&``
        or ``#`` are requested correctly.
      backend: ``'requests'`` (case-insensitive) downloads the CSV with the
        requests library; any other value lets pandas open the URL itself.

    Returns:
      The tab contents as a DataFrame.

    Raises:
      requests.HTTPError: If the server answers with an error status
        (requests backend only).
      requests.Timeout: If the server does not respond within 30 seconds
        (requests backend only).
    """
    from io import StringIO
    from urllib.parse import quote

    url = ('https://docs.google.com/spreadsheets/d/'
           '{sheet_id}/gviz/tq?tqx=out:csv&sheet={table_name}')
    # safe='' so that '/' and '&' inside a tab name are encoded as well
    url = url.format(sheet_id=sheet_id, table_name=quote(table_name, safe=''))
    print(f'downloading from {url}')
    if backend.lower() == 'requests':
        import requests
        # without a timeout a stalled connection would hang the notebook
        response = requests.get(url, timeout=30)
        # fail loudly instead of parsing an HTTP error page as CSV
        response.raise_for_status()
        data = pd.read_csv(StringIO(response.content.decode('utf-8')))
    else:
        data = pd.read_csv(url)
    print('Done:', data.shape)
    return data


def _is_missing(value):
    """Return True when value is None or a scalar pandas treats as missing (e.g. NaN)."""
    return value is None or (pd.api.types.is_scalar(value) and bool(pd.isna(value)))


def sheet_to_qmd(table_info: pd.Series, root_dir):
    """Download one spec table and write it out as a Markdown table in a .qmd file.

    The file is named after the table (converted with camel_to_dash) and is
    placed directly in root_dir, or in a sub-folder named after the category
    when the row has one.

    Args:
      table_info: A row describing the table. Must provide 'table_name';
        'category' is optional and may be missing, None or NaN.
      root_dir: Folder under which the file is written (str or Path).
        Missing parent folders are created.

    Returns:
      The Path of the file that was written.

    Raises:
      AssertionError: If 'table_name' is None or NaN.
    """
    table_name = table_info['table_name']
    # pandas reads an empty cell as NaN, so a plain `is not None` check is not enough
    assert not _is_missing(table_name), 'table_name is required'

    file_name = camel_to_dash(table_name) + '.qmd'

    # an absent category means "write straight into root_dir"; camel_to_dash
    # cannot take None/NaN, so handle the missing case before calling it
    raw_category = table_info.get('category')
    category = '' if _is_missing(raw_category) else camel_to_dash(raw_category)

    root_dir = Path(root_dir)
    if category:
        output_path = root_dir / category / file_name
    else:
        output_path = root_dir / file_name

    output_path.parent.mkdir(parents=True, exist_ok=True)

    df = get_sheet(TRIAL_SPEC_SHEET_ID, table_name)
    df.to_markdown(output_path, index=False)

    return output_path


In [None]:
from IPython.display import display

# The 'Tables' tab is the index: one row per spec table to document.
tables_raw = get_sheet(TRIAL_SPEC_SHEET_ID, 'Tables')
display(tables_raw)

# A row may list several categories separated by '; '. Give each category its
# own row so the table is written once per category folder. Built as a new
# frame so the raw frame displayed above stays untouched.
tables_df = (
    tables_raw
    .assign(category=tables_raw['category'].str.split('; '))
    .explode('category')
    .reset_index(drop=True)
)
display(tables_df)

# Explicit loop rather than DataFrame.apply: each call downloads a tab and
# writes a file, so it is run for its side effects, not to build a frame.
generated_paths = [
    sheet_to_qmd(table_row, root_dir=OUTPUT_DIR)
    for _index, table_row in tables_df.iterrows()
]

# NOTE(review): the outline below looks like a planned per-page layout that is
# not generated yet — confirm before relying on it.
# page
    # name (data_type)
    # description
    # index_scope
    # enum_values
    # notes

downloading from https://docs.google.com/spreadsheets/d/1yF34tvpxHtyNfVvXMdUXIVyS1UkDW7Bc6IaonI-I2vQ/gviz/tq?tqx=out:csv&sheet=Tables


SSLError: HTTPSConnectionPool(host='docs.google.com', port=443): Max retries exceeded with url: /spreadsheets/d/1yF34tvpxHtyNfVvXMdUXIVyS1UkDW7Bc6IaonI-I2vQ/gviz/tq?tqx=out:csv&sheet=Tables (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1020)')))