# CI Workflow YAML Parsing & Feature Extraction Pipeline

In [1]:
import yaml
import logging
import pandas as pd
from pathlib import Path

In [10]:
# ─── Setup ────────────────────────────────────────────────────────────────────
# Timestamped INFO-level logging so long-running cells report their progress.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger("ci_analysis_sample")

# Input: root folder scanned for per-repository sub-folders of workflow YAML files.
CONFIG_ROOT = Path("ci_configs")
# Output: one row per parsed workflow file.
OUT_CSV     = Path("workflow_features.csv")
# Output: one row per (owner, repo). The name must match, case included, the
# "aggregated_workflow_features.csv" file that the integration cell reads back;
# the previous mixed-case spelling broke that hand-off on case-sensitive filesystems.
AGG_OUT_CSV = Path("aggregated_workflow_features.csv")

### `extract_features(path: Path)`

Parses a single workflow YAML and returns:

- **`runs_on`**: list of distinct runner labels (e.g. `ubuntu-latest`)  
- **`n_jobs`** / **`n_steps`**: count of jobs and total steps  
- **`uses_cache`** / **`has_matrix`**: booleans for cache action and matrix strategy  

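Returns `None` (after logging a warning) for files that are not valid YAML or whose top level is not a mapping, and infers `owner`/`repo` from the parent folder name, which is expected to look like `owner--repo`.  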

In [3]:
# ─── Extraction function (as before) ──────────────────────────────────────────
def extract_features(path: Path):
    """Parse one workflow YAML file into a flat feature record.

    Parameters
    ----------
    path : Path
        Workflow file whose parent folder is named ``owner--repo``.

    Returns
    -------
    dict or None
        ``None`` (after a warning is logged) when the file cannot be read or
        parsed, when its top level is not a mapping, or when the parent folder
        name does not contain ``--``. Otherwise a dict with the keys:

        - ``owner``, ``repo``: the folder name split on the first ``--``
        - ``workflow_file``: the file name
        - ``runs_on``: sorted list of distinct string runner labels
        - ``n_jobs``: number of entries under ``jobs``
        - ``n_steps``: total length of every job's ``steps`` list
        - ``uses_cache``: True if any step's ``uses`` contains ``actions/cache``
        - ``has_matrix``: True if any job's ``strategy`` mapping has ``matrix``

    Notes
    -----
    Files that parse as YAML but are structurally malformed (``jobs`` or a
    single job that is not a mapping, ``steps: null``, a non-string ``uses``,
    non-string runner labels) no longer raise; the malformed part is ignored
    so one bad file cannot abort a scan over thousands of repositories.
    """
    try:
        # Decode explicitly as UTF-8 so results do not depend on the machine's locale.
        data = yaml.safe_load(path.read_text(encoding="utf-8"))
        if not isinstance(data, dict):
            raise ValueError("Top‐level not a mapping")
    except Exception as e:
        logger.warning(f"Skipping {path.name}: YAML error: {e}")
        return None

    # partition() never raises, unlike unpacking the result of split("--", 1).
    owner, separator, repo = path.parent.name.partition("--")
    if not separator:
        logger.warning(
            f"Skipping {path.name}: folder {path.parent.name!r} is not named 'owner--repo'"
        )
        return None

    jobs = data.get("jobs")
    if not isinstance(jobs, dict):
        jobs = {}
    # Only mapping-valued jobs can be inspected; e.g. an empty `build:` parses to None.
    job_defs = [job for job in jobs.values() if isinstance(job, dict)]

    # runs_on: accept both the real key and the underscore spelling
    runs_on = set()
    for job in job_defs:
        runner = job.get("runs-on") or job.get("runs_on")
        if isinstance(runner, str):
            runs_on.add(runner)
        elif isinstance(runner, list):
            # Keep string labels only: nested values are unhashable and mixed
            # types could not be sorted.
            runs_on.update(label for label in runner if isinstance(label, str))

    # counts
    n_jobs = len(jobs)
    step_lists = [job["steps"] for job in job_defs if isinstance(job.get("steps"), list)]
    n_steps = sum(len(steps) for steps in step_lists)

    # cache, matrix
    uses_cache = any(
        isinstance(step, dict)
        and isinstance(step.get("uses"), str)
        and "actions/cache" in step["uses"]
        for steps in step_lists
        for step in steps
    )
    has_matrix = any(
        isinstance(job.get("strategy"), dict) and "matrix" in job["strategy"]
        for job in job_defs
    )

    return {
        "owner": owner, "repo": repo,
        "workflow_file": path.name,
        # Sorted so the written CSV is identical from run to run (set order is not).
        "runs_on": sorted(runs_on),
        "n_jobs": n_jobs, "n_steps": n_steps,
        "uses_cache": uses_cache, "has_matrix": has_matrix
    }

In [7]:
# Walk every repository folder under CONFIG_ROOT and collect one feature record
# per workflow file that parses successfully.
records = []
# Only directories are repositories; stray files in CONFIG_ROOT must not be
# counted in the log line or scanned.
folders = sorted(entry for entry in CONFIG_ROOT.iterdir() if entry.is_dir())
logger.info(f"Sampling first {len(folders)} repos in {CONFIG_ROOT}")

for repo_dir in folders:
    # GitHub Actions accepts both the .yml and .yaml extensions; sorting keeps
    # the row order of the output independent of filesystem listing order.
    workflow_files = sorted([*repo_dir.glob("*.yml"), *repo_dir.glob("*.yaml")])
    for yml in workflow_files:
        rec = extract_features(yml)
        if rec:  # None means the file was skipped (a warning was already logged)
            records.append(rec)

2025-06-27 00:28:01,483 INFO Sampling first 14764 repos in ci_configs
  in "<unicode string>", line 3, column 1:
    on:
    ^
  in "<unicode string>", line 11, column 8:
       uses: actions/checkout@v2
           ^
  in "<unicode string>", line 47, column 21:
              group_id: {{ groupId }}                 #  ... 
                        ^
found unhashable key
  in "<unicode string>", line 47, column 22:
              group_id: {{ groupId }}                 # U ... 
                         ^
  in "<unicode string>", line 55, column 21:
              group_id: {{ groupId }}                 #  ... 
                        ^
found unhashable key
  in "<unicode string>", line 55, column 22:
              group_id: {{ groupId }}                 # U ... 
                         ^
  in "<unicode string>", line 14, column 2:
     respond-to-events:
     ^
expected <block end>, but found '-'
  in "<unicode string>", line 22, column 2:
     - name: Respond to new issues
     ^
  in "<u

In [9]:
# Turn the collected per-workflow records into a table (one row per workflow
# file) and persist it without the positional index.
df = pd.DataFrame(data=records)
df.to_csv(path_or_buf=OUT_CSV, index=False)
logger.info(f"Wrote {df.shape[0]} feature rows to {OUT_CSV}")

2025-06-27 00:33:15,545 INFO Wrote 27756 feature rows to workflow_features.csv


In [11]:
# aggregate to one row per (owner,repo)
repo_df = (
    df
    .groupby(["owner","repo"], as_index=False)
    .agg({
        # union of the runner labels of all the repo's workflows, sorted for stable output
        "runs_on":    lambda lists: sorted({label for labels in lists for label in labels}),
        "n_jobs":     "sum",
        "n_steps":    "sum",
        "uses_cache": "max",   # True if any workflow uses cache
        "has_matrix": "max"    # True if any workflow has matrix
    })
)

# write out the per‐repo features
repo_df.to_csv(AGG_OUT_CSV, index=False)
# Report the file that was actually written (AGG_OUT_CSV); the message used to name OUT_CSV.
logger.info(f"Wrote {len(repo_df)} repo‐level feature rows to {AGG_OUT_CSV}")


2025-06-27 00:41:42,115 INFO Wrote 13730 repo‐level feature rows to workflow_features.csv


### Integrating CI Runtime Metrics with Workflow Configuration Features

This script loads GitHub Actions run metadata (`build_runs.csv`) and aggregated workflow configuration features (`aggregated_workflow_features.csv`), then performs an inner join on `owner` and `repo` to retain only repositories present in both datasets, producing a unified table (`ci_runs_with_workflow_features.csv`) for further analysis.

In [12]:
import pandas as pd
import ast

# 1) Load CI run metadata and derive duration_minutes as the span between
#    run_started_at and updated_at, expressed in minutes
ci_runs_df = pd.read_csv(
    "build_runs.csv",    # replace with your actual CI runs filename
    parse_dates=["run_started_at", "updated_at"],
).assign(
    duration_minutes=lambda runs: (
        runs["updated_at"] - runs["run_started_at"]
    ).dt.total_seconds().div(60)
)

# 2) Load aggregated workflow features; the runs_on column is parsed back
#    from its literal text form with ast.literal_eval
workflow_features_df = pd.read_csv(
    "aggregated_workflow_features.csv",  # replace with your actual features filename
    converters=dict(runs_on=ast.literal_eval),
)

# 3) Keep only repos present in both datasets (inner join on owner + repo)
combined_df = ci_runs_df.merge(
    workflow_features_df,
    how="inner",
    on=["owner", "repo"],
    suffixes=("_run", "_workflow"),
)

# Row counts before and after the join
print(f"CI runs records:       {len(ci_runs_df)}")
print(f"Workflow features:     {len(workflow_features_df)}")
print(f"Records in both sets:  {len(combined_df)}")

# 4) Preview the merged table
print(combined_df.head())

# 5) Optionally save for further analysis
combined_df.to_csv("ci_runs_with_workflow_features.csv", index=False)


CI runs records:       12675
Workflow features:     13730
Records in both sets:  11800
           owner              repo       run_id  run_number  workflow_id  \
0        dgarijo            widoco  13308358456         395     24445829   
1   snowballstem  snowball-website  15717069293         155     70994949   
2         apache   cordova-android  15840192717        1313       457902   
3  sumeetchhetri              gatf  15052293010           2    155241087   
4        vert-x3        vertx-unit  15894194436          40    163702269   

                                       workflow_name     event head_branch  \
0                                                 CI      push      master   
1                                     Update website      push        main   
2                                            Node CI      push      master   
3  maven in /. for io.github.bonigarcia:webdriver...   dynamic      master   
4                            vertx-unit (5.x-stable)  schedule    