# Purpose
Use the GitHub API to fetch and analyze information about our GitHub Actions usage.

You must [create](https://docs.github.com/en/enterprise-server@3.6/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens) a GitHub personal access token and store it in the `GITHUB_ACTION_ANALYSIS_ACCESS_TOKEN` environment variable. You should only need the `public_repo` permission.

Calls to the GitHub API are fairly slow, and I think we are limited to [5000 requests per hour](https://docs.github.com/en/free-pro-team@latest/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user).

In [None]:
import requests
import os

from datetime import date, datetime

def get_jobs(url: str):
    """
    Get all jobs of a single workflow run.

    Note: authentication uses the module-level ``token`` variable, which
    must be defined before this function is called.

    Args:
        url: the ``jobs_url`` of a workflow run, as returned by the API.
    Returns:
        A list of job dictionaries taken from the ``"jobs"`` key of each
        response page.
    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    headers = {
        "Accept": "application/vnd.github+json",
        "Authorization": f"Bearer {token}",
        "X-GitHub-Api-Version": "2022-11-28"
    }
    # The API returns 30 jobs per page by default; ask for the maximum so
    # that most runs are covered by a single request.
    per_page = 100

    jobs = []
    page = 1
    while True:
        response = requests.get(
            url,
            headers=headers,
            params={"per_page": per_page, "page": page},
            timeout=30,
        )
        # Fail loudly instead of hitting a KeyError on an error payload.
        response.raise_for_status()
        batch = response.json()["jobs"]
        jobs.extend(batch)
        # A short page means there is nothing left to fetch.
        if len(batch) < per_page:
            return jobs
        page += 1


def get_all_runs(owner: str, repo: str, token: str, min_date: date = None) -> list[dict]:
    """
    Get a list of all action runs for a repository.

    Pages through the "list workflow runs" endpoint until it runs out of
    results, a request fails, or a run older than ``min_date`` is seen.

    Args:
        owner: the github organization name
        repo: the name of the repo
        token: personal github token
        min_date: function defaults to getting all workflow runs.
            Can use this arg to limit number of API calls: collection stops
            at the first run created before this date.
    Returns:
        Returns a list of json responses from the API as dictionaries.
        Each run appears once. If a request fails, the error is printed and
        the runs collected so far are returned.
    """
    headers = {
        "Accept": "application/vnd.github+json",
        "Authorization": f"Bearer {token}",
        "X-GitHub-Api-Version": "2022-11-28"
    }

    url = f"https://api.github.com/repos/{owner}/{repo}/actions/runs"
    # API documentation https://docs.github.com/en/rest/actions/workflow-runs?apiVersion=2022-11-28#list-workflow-runs-for-a-repository

    all_runs = []

    page = 1
    while True:
        # Request the maximum page size (default is 30) to reduce the
        # number of rate-limited API calls.
        response = requests.get(
            url,
            headers=headers,
            params={"page": page, "per_page": 100},
            timeout=30,
        )

        if response.status_code != 200:
            print("Request failed with status code:", response.status_code)
            print("Response content:", response.text)
            break

        data = response.json()["workflow_runs"]
        if not data:
            break  # No more data, exit the loop

        for run in data:
            if min_date is not None:
                run_date = datetime.strptime(run["created_at"], "%Y-%m-%dT%H:%M:%SZ").date()
                if run_date < min_date:
                    # Assumes the API lists runs newest-first, so everything
                    # after this run is older than min_date as well.
                    return all_runs
            # Runs are added here only; extending with the whole page as
            # well would duplicate every run and ignore min_date.
            all_runs.append(run)

        page += 1

    return all_runs

# Repository to analyze. Change owner/repo/min_date as needed; the access
# token is read from the environment rather than written in the notebook.
owner = "catalyst-cooperative"
repo = "pudl"
token = os.environ["GITHUB_ACTION_ANALYSIS_ACCESS_TOKEN"]
min_date = date(2023, 8, 31)

# Fetch the workflow runs for the configured repository.
all_runs = get_all_runs(owner=owner, repo=repo, token=token, min_date=min_date)

In [None]:
# tqdm is used for the progress bar but was not imported anywhere in the
# notebook, so import it here.
from tqdm import tqdm

# Collect the jobs of every workflow run; this issues at least one API
# request per run, so it can take a while.
all_jobs = []
for run in tqdm(all_runs):
    all_jobs.extend(get_jobs(run["jobs_url"]))

In [None]:
import pandas as pd
import json

In [None]:
from io import StringIO

# Round-trip through JSON so that pandas.read_json infers the column types
# (per the pandas docs its default date handling converts columns such as
# those ending in "_at"). The string is wrapped in StringIO because passing
# literal JSON text to read_json is deprecated in recent pandas versions.
jobs_df = pd.read_json(StringIO(json.dumps(all_jobs)), orient="records")
jobs_df.info()

In [None]:
# Duration of each job: completion time minus start time.
jobs_df["duration"] = jobs_df["completed_at"] - jobs_df["started_at"]

In [None]:
# Summary statistics of the job durations.
jobs_df.duration.describe()

In [None]:
# Number of recorded jobs per job name.
jobs_df["name"].value_counts()

In [None]:
# Summary statistics of the durations of one specific job.
job_name = "ci-integration"
(
    jobs_df
    .query("name == @job_name")["duration"]
    .describe()
)

In [None]:
# Percentage of the total job duration spent in each job name.
(jobs_df.groupby("name")["duration"].sum() / jobs_df["duration"].sum()) * 100

In [None]:
# How many jobs have a missing duration (True) versus a known one (False).
jobs_df["duration"].isna().value_counts()

In [None]:
# Total job minutes per calendar month, keyed on each job's creation time.
# NOTE: newer pandas versions deprecate the "M" alias in favour of "ME";
# "M" is kept here for compatibility with older versions.
monthly_minutes = (
    jobs_df.resample("M", on="created_at")["duration"]
    .sum()
    .dt.total_seconds()
    / 60
)

# Bar chart of the monthly totals.
monthly_minutes.plot.bar()