# About

Within the context of the CERISE project we have created several sreadsheets that list different entities and their properties. 
This code aims to automate the extraction of data from those spreadsheets to render them as pages in a quarto website. 

NOTE: 
I wasn't able to download the relevant online data with code. Instead, I download the spreadsheets manually as csv files (stored in the data folder).
In a future iteration the website generation could be fully automated using CI.



In [8]:
import shutil
import os
import pandas as pd
from pathlib import Path
import yaml

from slugify import slugify # this requires intsalling the python-slugify package

# Custom functions

In [9]:
usecases = [{
    'input_file': 'data/WP2A1-Greenskills initiatives - initiatives.csv',
    'output_dir': 'green-skills-initiatives/auto-generated/',
    'usecase': 'initiative'
}, {
    'input_file': 'data/WP2A2 - Green Skills Resources(Green Skills Resources).csv',
    'output_dir': 'green-skills-resources/auto-generated/',
    'usecase': 'resource'
}]

In [10]:
def to_qmd(row: pd.Series, usecase: dict):
    output_dir = Path(usecase['output_dir'])
    output_file = output_dir / f'{slugify(row["title"])}.qmd'

    output_dir.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        row = row.copy().dropna()
        frontmatter = yaml.safe_dump(row.to_dict())
        f.write(f'---\n{frontmatter}---\n\n')
        # title and description are automatically added to the page (quarto takes care of that)
        match usecase['usecase']:
            case 'initiative':
                if 'image' in row:
                    f.write('![{{< meta title >}}]({{< meta image >}})\n\n')
                if 'type' in row and pd.notna(row['type']):
                    f.write('## Type of Initiative\n\n')
                    f.write('{{< meta type >}}\n\n')
                if 'provider_name' in row and pd.notna(row['provider_name']):
                    f.write('## Provider\n\n')
                    f.write('{{< meta provider_name >}}\n\n')
                if 'creator_name' in row and pd.notna(row['creator_name']):
                    f.write('## Creator\n\n')
                    f.write('{{< meta creator_name >}}\n\n')
                for column in row.index:
                    if column in ['title', 'description', 'type', 'provider_name',
                                  'creator_name', 'id', 'image']:
                        continue
                    f.write(f'### {column.capitalize().replace('_',' ')}\n\n')
                    f.write(f'{{{{< meta {column} >}}}}\n\n')
            case _:
                if 'image' in row:
                    f.write('![]({{< meta image >}})\n\n')
                for column in row.index:
                    if column in ['title', 'description', 'id', 'image']:
                        continue
                    f.write(f'### {column.capitalize().replace('_',' ')}\n\n')
                    f.write(f'{{{{< meta {column} >}}}}\n\n')

for usecase in usecases:
    df = pd.read_csv(usecase['input_file'])

    df.columns = [slugify(c, separator='_').lower() for c in df.columns]

    df.rename({
        'name': 'title',
        'Description': 'description',
        'type': 'type_of_initiative',
        'image_url': 'image',
        'category_hei_green_certification_green_skills_courses_labor_market': 'hei_category',
        'language': 'languages',
    }, errors='ignore', axis=1, inplace=True)

    df.apply(to_qmd, axis=1, usecase=usecase)


# :warning: The rest of thare outdated

In [None]:

def save_qmd(content, output_dir, filename):

    Path(output_dir).mkdir(parents=True, exist_ok=True)
    output_file = Path(output_dir) / (slugify(filename) + '.qmd')
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(content)

In [39]:
# loading of the data from spreadsheet does not work because of csv formatting issue.


## Instructions google sheet:
# document_id = '1FE-NipeF_ns5yoiXXi0mHm0FF21VYoJfC5recSE441s'
# sheet_name = 'initiatives'
# sheet_url = f"https://docs.google.com/spreadsheets/d/{document_id}/gviz/tq?tqx=out:json&sheet={sheet_name}"

# initiatives = pd.read_csv(sheet_url)

In [40]:
# describes how data from the spreadsheet is converted to a page:

def convert_row_to_qmd(row, output_dir, usecase):

    # loop over variables to generate output if it variables have been entered.
    vars = row.index[6:]
    for v in vars:

        if (not pd.isna(row[v])):

            text += f'### {v}\n\n{row[v]}\n\n' # replace v by v_title to get from a separate table

    save_qmd(text, output_dir, filename)

    return text

In [41]:
def extract_data_for_website(data_filename, output_dir, data_page, usecase):
    
    
    
    # importing data 
    df = pd.read_csv(data_filename, encoding = "ISO-8859-1")
        
    # loop over rows and save each row as qmd file
    for i in range(len(df)):
        row = df.loc[i]
        convert_row_to_qmd(row, output_dir, usecase)
    
    # list files in the output folder (should yeield all the auto-generated qmd files):
    auto_files = os.listdir(output_dir)
    
    # add included statements in the initiative file
    text = '' 
    text += '\n\n'

    for i in auto_files:
        text += '{{'
        text += f'< include {output_dir}{i} >'
        text += '}}\n\n'

    save_qmd(text, output_dir, data_page)


# Data extraction and conversion


## 1. List of green skills initiatives


In [42]:
usecases = [
                {
                    'input_file': 'data/WP2A1-Greenskills initiatives - initiatives.csv',
                    'output_dir': 'green-skills-initiatives/auto-generated/',
                    'data_page':  'green-skills-initiatives-list',
                    'usecase': 'initiative'
                },
                {
                    'input_file': 'data/WP2A2 - Green Skills Resources(Green Skills Resources).csv',
                    'output_dir': 'green-skills-resources/auto-generated/',
                    'data_page': 'green-skills-resources-list',
                    'usecase': 'resource'
                }
            ]


In [43]:
for u in usecases:
    # process the input data
    extract_data_for_website(u['input_file'], 
                             u['output_dir'],
                             u['data_page'],
                             u['usecase'])
    
    

In [44]:
usecases[0]

{'input_file': 'data/WP2A1-Greenskills initiatives - initiatives.csv',
 'output_dir': 'green-skills-initiatives/auto-generated/',
 'data_page': 'green-skills-initiatives-list',
 'usecase': 'initiative'}

In [45]:


# importing data from manually downloaded spreadsheet
input_data = pd.read_csv('data/WP2A1-Greenskills initiatives - initiatives.csv')

# location to store the auto-generated qmd files 
output_dir = 'green-skills-initiatives/auto-generated/' 

# there is an intro page for that spreadsheet
data_page = 'green-skills-initiatives-list'

# process the input data
extract_data_for_website(input_data, output_dir, data_page, usecases[0])


TypeError: argument of type 'method' is not iterable

## 2. List of resources

In [13]:
# importing data from manually downloaded spreadsheet
input_data = pd.read_csv('data/WP2A2 - Green Skills Resources(Green Skills Resources).csv', encoding = "ISO-8859-1")

In [None]:

# importing data from manually downloaded spreadsheet
input_data = pd.read_csv('data/WP2A2 - Green Skills Resources(Green Skills Resources).csv')

# location to store the auto-generated qmd files 
output_dir = 'green-skills-resources/auto-generated/' 

# there is an intro page for that spreadsheet
data_page = 'green-skills-resources-list'

# process the input data
extract_data_for_website(input_data, output_dir, data_page)


In [None]:
input_data


In [10]:
for i in range(len(initiatives)):
    row = initiatives.loc[i,]
    convert_row_to_qmd(row)

In [11]:
# list files in the initiatives folder:
auto_files = os.listdir(output_dir)

In [12]:
# add included statements in the initiative file

text = '' # '## Green Skills Initiatives Repository'
text += '\n\n'

for i in auto_files:
    text += '{{'
    text += f'< include {output_dir}{i} >'
    text += '}}\n\n'

save_qmd(text, 'green-skills-initiatives-list')
