In [None]:
import ast
import glob
import json
import logging
import os
import re
import sys

from IPython.display import Markdown as md
from IPython.display import display, HTML

In [2]:
# Inject a small stylesheet into the notebook output: `canvas.marks`
# elements become centred blocks and `div.vega-embed` containers are
# stretched to the full available width.
HTML(
    """
<style>
    canvas.marks { display: block; margin: auto; }
    div.vega-embed { width: 100%; }
</style>
"""
)

In [3]:
# a literal tab, used to indent the nested markdown bullets built below
TAB = "\t"

# project details, written as cookiecutter template placeholders
project_slug = "{{ cookiecutter.project_slug }}"
project_short_description = "{{ cookiecutter.project_short_description }}"
title = "{{ cookiecutter.project_name }}"

# log INFO and above; the format wraps the calling function name and line
# number in ANSI colour escape codes ahead of the message itself
logging.basicConfig(
    level=logging.INFO,
    format="\033[1;36m(def %(funcName)s %(lineno)s): \033[1;37m %(message)s",
)
logger = logging.getLogger("root")

In [None]:
def return_doc_string(script):
    """
    Parses a python file for its module docstring and splits it
    into a make command and a description.

    The make command is whatever sits between the first ``_`` in the
    docstring and the next ``|`` after it, for example ``_make etl|``.

    Args:
        script: path to the python file to parse.

    Returns:
        A dictionary with two keys:

        - ``make``: the text between the markers, or ``None`` when the
          file has no docstring or the docstring has no ``_...|`` pair.
        - ``docs``: the docstring with the marker removed and trailing
          whitespace stripped, or ``""`` when the file has no docstring.

    Raises:
        OSError: if the file cannot be opened.
        SyntaxError: if the file is not valid python.
    """
    # python source is read as utf-8 rather than the platform default
    with open(script, "r", encoding="utf-8") as file:
        module = ast.parse(file.read())
    # get_docstring hands back None when the module has no docstring
    docstring = ast.get_docstring(module)
    if docstring is None:
        return {"make": None, "docs": ""}
    # the marker only counts when a "|" closes it after the opening "_";
    # a stray underscore in ordinary text must not be read as a command
    marker_start = docstring.find("_")
    marker_end = docstring.find("|", marker_start + 1) if marker_start >= 0 else -1
    if marker_end < 0:
        return {"make": None, "docs": docstring.rstrip()}
    make = docstring[marker_start + 1 : marker_end]
    # drop the marker from the text so only the description is left
    docs = docstring.replace(f"_{make}|", "").rstrip()
    return {"make": make, "docs": docs}

In [None]:
def extract_js_docstrings(directory=None):
    """
    Scans a directory for .js files and summarises each /** ... */
    comment block found in them.

    Args:
        directory: folder to scan. Defaults to the ``scripts`` folder
            inside the current working directory.

    Returns:
        A markdown string with one ``- **filename**: summary`` bullet per
        comment block, separated by blank lines. Files are visited in
        sorted order. The summary is the first line of the block that
        still has text once surrounding whitespace and ``*`` comment
        decoration are removed, or "No summary available" when there is
        none. Returns ``""`` when the directory does not exist or no
        comment blocks are found.
    """
    if directory is None:
        directory = os.path.join(os.getcwd(), "scripts")
    # nothing to document when the folder is missing
    if not os.path.isdir(directory):
        return ""
    # DOTALL lets a single comment block span several lines
    pattern = re.compile(r"/\*\*(.*?)\*/", re.DOTALL)
    markdown_output = []
    # listdir order is arbitrary, so sort to keep the output stable
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".js"):
            continue
        file_path = os.path.join(directory, filename)
        with open(file_path, "r", encoding="utf-8") as file:
            content = file.read()
        for docstring in pattern.findall(content):
            # peel the leading " * " and trailing "*" decoration off each line
            cleaned = (
                line.strip().strip("*").strip() for line in docstring.splitlines()
            )
            summary = next((line for line in cleaned if line), "No summary available")
            markdown_output.append(f"- **{filename}**: {summary}")

    return "\n\n".join(markdown_output)

In [None]:
def document_etl_files():
    """
    Attempt at autodocumentation: lists the python scripts in the
    ``etl`` directory and, for each one, writes out its filename,
    its make command and its description as nested markdown bullets.

    Side effect: when the current working directory is named
    ``analysis`` the working directory is changed to its parent
    before looking for the ``etl`` directory.

    Returns:
        A markdown string with one bullet group per script, in sorted
        filename order. Empty when there are no ``*.py`` files in ``etl``.
    """
    # os.path.basename copes with either path separator
    if os.path.basename(os.getcwd()) == "analysis":
        os.chdir(os.path.dirname(os.getcwd()))
    etl_dir = os.path.join(os.getcwd(), "etl")
    output = []
    for script in sorted(glob.glob(os.path.join(etl_dir, "*.py"))):
        scriptname = os.path.basename(script)
        docstring = return_doc_string(script)
        if docstring["make"] is not None:
            make_line = f"{TAB}- {docstring['make']}\n\n"
        else:
            make_line = f"{TAB}- **Make command**: None\n\n"
        output.append(
            f"- `{scriptname}`\n\n"
            f"{make_line}"
            f"{TAB}- **What it does**: {docstring['docs']}\n\n"
        )
    return "\n".join(output)

In [None]:
# build the markdown bullet list describing the etl scripts; note the call
# may change the working directory (see document_etl_files)
etl_docs = document_etl_files()

In [None]:
def _extract_abstract(notebook):
    """
    Pulls the ``abstract:`` line out of a loaded notebook's first cell,
    or returns "No abstract available" when there isn't one.
    """
    cells = notebook.get("cells") or []
    if not cells:
        return "No abstract available"
    source = cells[0].get("source", [])
    # a cell's source can be stored as one string or as a list of lines
    if isinstance(source, str):
        source = source.splitlines(keepends=True)
    # look the line up by its key rather than trusting a fixed position
    for line in source:
        if line.lstrip().startswith("abstract:"):
            return line.replace("abstract: ", "").replace('"', "")
    return "No abstract available"


def document_analysis_notebooks():
    """
    Attempt at autodocumentation: lists the notebooks in the
    ``analysis`` directory under the current working directory and,
    for each one, writes out its filename and the abstract found in
    its first cell as nested markdown bullets.

    Returns:
        A markdown string with one bullet group per notebook, in sorted
        filename order. A notebook with no cells, or with no
        ``abstract:`` line in its first cell, is listed with
        "No abstract available". Empty when there are no ``*.ipynb``
        files in ``analysis``.
    """
    # create a path to the analysis notebooks
    path = os.path.join(os.getcwd(), "analysis")
    # create a holding container
    output = []
    # loop through the notebooks alphabetically
    for notebook_path in sorted(glob.glob(os.path.join(path, "*.ipynb"))):
        # get the name of the notebook
        notebook_name = os.path.basename(notebook_path)
        # load the notebook content
        with open(notebook_path, "r", encoding="utf-8") as f:
            notebook = json.load(f)
        # access the abstract
        abstract = _extract_abstract(notebook)
        # filename first, then the abstract nested underneath it
        output.append(
            f"- `{notebook_name}`\n\n"
            f"{TAB}- **Prompt it answers**: {abstract}\n\n"
        )
    # return the output
    return "\n".join(output)

In [None]:
# build the markdown bullet list describing the notebooks found in the
# analysis folder under the current working directory
analysis_notebooks = document_analysis_notebooks()

In [4]:
# bring in data from top_level_analysis for AP findings.
# %run "analysis/top_level_analysis.ipynb"

In [6]:
# opening section of the README: title, maintainer, project overview
# and the staff contact block; only the title and description lines
# interpolate values, so the rest are plain string literals
overview = (
    f"# {title}\n\n"
    "*Current maintainer(s) Christopher L. Keller (<ckeller@ap.org>)*\n\n"
    "## Project Overview\n\n"
    f"{project_short_description}.\n\n"
    "## Project notes\n\n"
    "### Staff involved\n\n"
    "*Created by:*\n\n"
    "Christopher L. Keller<br />\n\n"
    "Data & Graphics Reporter<br />\n\n"
    "The Associated Press<br />\n\n"
    "**Email**: ckeller@ap.org<br />\n\n"
    "**Phone**: (505) 435-2921<br />\n\n"
    "*Reporter:*\n\n"
    "- *TK:*\n"
)

In [23]:
# data notes section of the README; every heading carries a *TK:*
# placeholder to be replaced with real content
data_notes = (
    "## Data notes\n\n"
    "### Data sources\n\n"
    "*TK:*\n\n"
    "### AP's findings\n\n"
    "*TK:*\n\n"
    "### Metadata\n\n"
    "*TK:*\n\n"
    "### Known limitations and Caveats\n\n"
    "*TK:*\n\n"
    "### Noteworthy Links\n\n"
    "*TK:*\n\n"
    "### Notes\n\n"
    "*TK:*\n\n"
    "### Sources\n\n"
    "*TK:*\n\n"
)

In [None]:
# scripts and notebooks section of the README: fixed headings wrapped
# around the two generated bullet lists
scripts_and_notebooks = "".join(
    [
        "## Scripts and Notebooks\n\n",
        "### ETL prep data for analysis\n\n",
        "These are found in `/etl`. They are:\n\n",
        etl_docs,
        "### Notebooks for analysis\n\n",
        "These are found in `/analysis`. They are:\n\n",
        analysis_notebooks,
    ]
)

In [None]:
# makefile section of the README: one bullet per make target, grouped by
# task type; the project slug is interpolated wherever a command needs it
# so nothing from another project is left hard-coded
makefile_commands = (
    f"### Makefile Commands\n\n"
    f"**datakit tasks**\n\n"
    f"- dkgitlab: runs `datakit gitlab integrate`\n\n"
    f"- dkdata: runs `datakit data init`\n\n"
    f"- dkdata_push: runs `datakit data push`\n\n"
    f"- dkdata_pull: runs `datakit data pull`\n\n"
    f"- run_notebook: runs `pipenv run jupyter lab --no-browser`\n\n"
    f"- analysis_files: runs `pipenv run quarto render analysis/*.ipynb --to html --execute`\n\n"
    f"- rmd_to_notebook: runs `pipenv run jupytext --set-formats Rmd,ipynb analysis/*.Rmd`\n\n"
    f"- notebook_to_rmd: runs `pipenv run jupytext --set-formats ipynb,Rmd analysis/*.ipynb`\n\n"
    f"- readme: Moves `readme.ipynb` from the root directory to the `analysis` directory and runs the `quarto render` to overwrite the README.md file.\n\n"
    f"**package management tasks**\n\n"
    f"- sync: runs `pipenv sync`\n\n"
    f"- install_linting: runs `pipenv install black flake8`\n\n"
    f"- install_etl: runs `pipenv install p_tqdm`\n\n"
    f"- install_analysis: runs `plotnine`\n\n"
    f"- install_web_scrape: runs `pipenv install requests beautifulsoup4`\n\n"
    f"- install_gis: runs `pipenv install geopandas`\n\n"
    f"**data tasks**\n\n"
    f'- check_s3: runs `aws s3 ls --recursive "s3://data.ap.org/projects/2024/{project_slug}/data" --profile default`\n\n'
    f"**data distribution tasks**\n\n"
    f"- copy_distributed_data: runs `cp <PATH_TO_LOCAL_FILE> data/public`\n\n"
    f"- create_distribution: runs `datakit dworld create --slug {project_slug}`\n\n"
    f"- upload_distribution: runs `datakit dworld push`\n\n"
    f"- update_documentation: runs `datakit dworld summary`\n\n"
)

In [None]:
# technical section of the README: setup assumptions, how to rebuild the
# project and how to close it out. Every nested bullet is written as
# TAB followed directly by "- " so all of them indent the same way; extra
# spaces after the tab can push a line far enough in to be rendered as a
# code block instead of a nested bullet.
technical = (
    f"## Technical\n\n"
    f"### Assumptions\n\n"
    f"- You've installed [Homebrew](https://brew.sh/).\n\n"
    f"- You've set up your [python environment](https://github.com/associatedpress/cookiecutter-python-project#full-virtual-environment-setup-from-package-management-to-rendering-analyses) with [Pyenv](https://github.com/pyenv/pyenv) to manage our python installations and [Pipenv](https://pipenv.pypa.io/en/latest/) to manage the python packages.\n\n"
    f"- You've setup datakit and configured the various datakit plugins.\n\n"
    f"### Rebuilding or updating the project\n\n"
    f"- Clone the repo down\n\n"
    f"{TAB}- `git clone git@gitlab.inside.ap.org:data/{project_slug}.git`\n\n"
    f"- Change into directory\n\n"
    f"{TAB}- `cd {project_slug}`\n\n"
    f"- Build the datakit config\n\n"
    f"{TAB}- `pipenv run python .first_install.py`\n\n"
    f"- Install the python dependencies from the Pipfile\n\n"
    f"{TAB}- `make sync`\n\n"
    f"- Retrieve the data files\n\n"
    f"{TAB}- `datakit data pull`\n\n"
    f"### Closing out\n\n"
    f"- When finished, push the data files back up\n\n"
    f"{TAB}- `datakit data push` to send the data files up to AWS.\n\n"
    f"- Check to see if the data is stored in the project's S3 bucket\n\n"
    f"{TAB}- `aws s3 ls --recursive 's3://data.ap.org/projects/2024/{project_slug}/data' --profile default`\n\n"
    # f"### Building the project from scratch\n\n"
    # f"- `datakit project create`\n\n"
    # f"{TAB}- You will be prompted to add:\n\n"
    # f"{TAB}- your first and last name\n\n"
    # f"{TAB}- email\n\n"
    # f"{TAB}- project name\n\n"
    # f"{TAB}- project_slug\n\n"
    # f"{TAB}- project_short_description\n\n"
    # f"- `datakit gitlab integrate`: Code is all tracked in version control and hosted on Gitlab. Important analysis code doesn't live on only one computer, and a detailed revision history is now an assumed feature of all projects. The integration automatically creates a project in gitlab, ready for the reporter to push commits. Issues can be quickly filed using gitlab issues add without having to use the web interface.\n\n"
    # f"- Initialize project to use S3 to store data files\n\n"
    # f"{TAB}- `datakit data init`\n\n"
)

In [None]:
# stitch the README sections together in the order they should appear
readme = "".join(
    [
        overview,
        data_notes,
        scripts_and_notebooks,
        makefile_commands,
        technical,
    ]
)

In [28]:
# write the assembled markdown to README.md in the current working
# directory, replacing any existing file; utf-8 is set explicitly so the
# result does not depend on the platform's default encoding
with open("./README.md", "w", encoding="utf-8") as file:
    file.write(readme)

In [None]:
# render the assembled README as markdown in the cell output for review
md(readme)