In [1]:
import pandas as pd
import yaml

In [2]:
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/project_prioritization/"
FILE = "fake_data.xlsx"

In [3]:
df = pd.read_excel(f"{GCS_FILE_PATH}{FILE}", sheet_name="fake")

In [4]:
# Get lists of districts and counties sorted in the correct order
districts = sorted(df.district.fillna(0).unique().tolist())
# counties = sorted(df.full_county_name.fillna("Various").unique().tolist())

In [5]:
districts

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 74, 75]

In [6]:
# counties

In [7]:
# My yaml
MY_SITE_YAML = "./test2.yml"

In [8]:
# Open my old yaml file as analyses_data. 
# What's the difference between yaml load vs Loader?
with open(MY_SITE_YAML) as analyses:
    analyses_data = yaml.load(analyses, yaml.Loader)

In [9]:
analyses_data

{'title': 'Project Prioritization - 10 Year Non-SHOPP',
 'directory': './project_prioritization/',
 'readme': './project_prioritization/README.md',
 'notebook': './project_prioritization/county_landing_page.ipynb',
 'parts': [{'chapters': [{'caption': 'District 0',
     'params': {'district': 0},
     'sections': [{'county': 'Various'}]},
    {'caption': 'District 1',
     'params': {'district': 1},
     'sections': [{'county': 'Del Norte'},
      {'county': 'Humboldt'},
      {'county': 'Lake'},
      {'county': 'Mendocino'},
      {'county': 'Various'}]},
    {'caption': 'District 2',
     'params': {'district': 2},
     'sections': [{'county': 'Lassen'},
      {'county': 'Modoc'},
      {'county': 'Shasta'},
      {'county': 'Siskiyou'},
      {'county': 'Tehama'},
      {'county': 'Trinity'}]},
    {'caption': 'District 3',
     'params': {'district': 3},
     'sections': [{'county': 'Butte'},
      {'county': 'Nevada'},
      {'county': 'Placer'},
      {'county': 'Sacramento'},
 

In [10]:
# Del later - only grab a few districts
# test_districts = districts[:2]

In [11]:
# test_districts

In [12]:
# Loop through each district, grab the valid itp_ids
# populate each dict key (caption, params, sections) needed to go into analyses.yml
chapters_list = []
for district in sorted(districts):
    chapter_dict = {}
    
    subset = df[df.district == district]
    
    # Only grab the counties found in this district
    counties_in_district = sorted(subset.full_county_name.fillna("Various")
                                   .unique().tolist())
    
    chapter_dict['caption'] = f'District {district}'
    chapter_dict['params'] = {'district': district}
    chapter_dict['sections'] = [{'county': county} for county in 
                                counties_in_district]
    chapters_list += [chapter_dict]

# Make this into a list item
parts_list = [{'chapters': chapters_list}]

analyses_data['parts'] = parts_list

output = yaml.dump(analyses_data)

print(output)

directory: ./project_prioritization/
notebook: ./project_prioritization/county_landing_page.ipynb
parts:
- chapters:
  - caption: District 0
    params:
      district: 0
    sections:
    - county: Various
  - caption: District 1
    params:
      district: 1
    sections:
    - county: Del Norte
    - county: Humboldt
    - county: Lake
    - county: Mendocino
    - county: Various
  - caption: District 2
    params:
      district: 2
    sections:
    - county: Lassen
    - county: Modoc
    - county: Shasta
    - county: Siskiyou
    - county: Tehama
    - county: Trinity
  - caption: District 3
    params:
      district: 3
    sections:
    - county: Butte
    - county: Nevada
    - county: Placer
    - county: Sacramento
    - county: Various
    - county: Yolo
    - county: Yuba
  - caption: District 4
    params:
      district: 4
    sections:
    - county: Alameda
    - county: Contra Costa
    - county: Marin
    - county: Napa
    - county: San Francisco
    - county: San 

In [13]:
with open("./test2.yml", 'w') as analyses:
    analyses.write(output)