In [1]:
from collections import defaultdict, Counter
import requests
from datetime import datetime, timedelta
import os
import pandas as pd
GH_API_TOKEN = os.environ.get("GH_API_TOKEN")

## Setup
Methods for retrieving GitHub issues and then filtering and aggregating them.

In [2]:
# Maps each topic name to the keywords that assign an issue to that topic.
# filter_issues_by_keywords lower-cases both the keywords and the issue text and
# tests plain substring containment, so entries that differ only in case (e.g.
# "dataset" / "Dataset") are equivalent, and short keywords such as "fs" also
# match inside longer words. An issue can therefore land in several topics.
TOPIC_KEYWORDS_PYTHON = {
    "dataset": ["dataset", "Dataset", "open_dataset", "write_dataset"],
    "compute": ["compute", "expression", "filter", "join", "hash"],
    "ipc": ["ipc", "streaming format", "file format"],
    "feather": ["feather", "read_feather", "write_feather"],
    "tables": [".Table", ".RecordBatch", ".Schema"],
    "filesystem": ["LocalFileSystem", "azure", "fs", "filesystem", "S3", "HDFS"],
    "pandas": ["pandas", "to_pandas", "from_pandas"],
    "parquet": ["parquet", "ParquetFile", "ParquetDataset"],
    "acero": ["acero", "Declaration"],
    "extension": ["extension types", "ExtensionType", "ExtensionArray"],
    "install": ["pip install", "build", "wheel", "conda", "mamba"]
}

In [3]:
def fetch_component_issues(component, start_date="2024-01-01", end_date="2025-12-31", state="open"):
    """Fetch apache/arrow issues carrying a given label via the GitHub search API.

    The creation-date range is walked in consecutive, non-overlapping windows
    (each covering ``start`` through ``start + 30 days``) so that a single
    query stays below the search API's cap on results per query. Every window
    is paginated 100 items at a time.

    Parameters
    ----------
    component : str
        Label to search for. It is inserted verbatim after ``label:`` in the
        query, so a label containing spaces must already be wrapped in double
        quotes, e.g. ``'"Component: Python"'``.
    start_date, end_date : str
        ISO-format dates (parsed with ``datetime.fromisoformat``) bounding the
        issue creation date. Both bounds are included in the search.
    state : str
        Value placed after ``is:`` in the query, e.g. ``"open"`` or ``"closed"``.

    Returns
    -------
    list of dict
        The ``items`` entries of every response page, in retrieval order.

    Raises
    ------
    requests.HTTPError
        If GitHub answers with an error status (raised by ``raise_for_status``).
    requests.Timeout
        If a single request takes longer than the timeout set below.
    """
    request_timeout_s = 30  # never let one stalled request hang the notebook

    headers = {"Accept": "application/vnd.github+json"}
    # Only authenticate when a token is configured; sending "Bearer None"
    # is an invalid credential rather than an anonymous request.
    if GH_API_TOKEN:
        headers["Authorization"] = f"Bearer {GH_API_TOKEN}"
    base_url = "https://api.github.com/search/issues"
    results = []

    start = datetime.fromisoformat(start_date)
    end = datetime.fromisoformat(end_date)

    # Step through time in chunks to avoid the 1000 result cap.
    # `<=` (not `<`) so that a final window consisting of just `end` itself,
    # or a range where start_date == end_date, is still queried.
    while start <= end:
        chunk_end = min(start + timedelta(days=30), end)
        query = (
            f"repo:apache/arrow is:issue is:{state} "
            f"label:{component} "
            f"created:{start.date()}..{chunk_end.date()}"
        )
        page = 1

        while True:
            params = {"q": query, "per_page": 100, "page": page}
            response = requests.get(
                base_url, headers=headers, params=params, timeout=request_timeout_s
            )
            response.raise_for_status()
            items = response.json().get("items", [])
            results.extend(items)
            if len(items) < 100:
                break  # A short page is the last page of this window
            page += 1

        # The window's end date was already covered, so resume on the next day.
        start = chunk_end + timedelta(days=1)

    return results


In [4]:
# Requests are authenticated with GH_API_TOKEN, which the import cell reads from the
# GH_API_TOKEN environment variable; assign GH_API_TOKEN here if it was not exported.
# The label is passed wrapped in literal double quotes so that the search query
# receives it as a single quoted term.
issues = fetch_component_issues("\"Component: Python\"")

In [5]:
# ------ USAGE ------
# Report how many issues the fetch returned; with the default arguments these are
# open issues created in 2024 and 2025 together
print(f"Found {len(issues)} open issues created in 2024&2025.")

Found 434 open issues created in 2024&2025.


In [6]:
def filter_issues_by_keywords(issues, keywords_dict, required_label="Component: Python"):
    """Group issues by topic according to keywords found in their title and body.

    Matching is a case-insensitive substring test against the issue title and
    body joined by a space. Keywords are not matched on word boundaries, so a
    short keyword also matches inside longer words, and one issue may be
    assigned to several topics.

    Parameters
    ----------
    issues : iterable of dict
        Issue payloads. The ``title``, ``body`` and ``labels`` keys are read
        (``labels`` as a list of dicts with a ``name`` key); a missing or
        ``None`` value is treated as empty.
    keywords_dict : dict
        Maps a topic name to the list of keywords that select it.
    required_label : str or None
        Only issues carrying a label with exactly this name are considered.
        Pass ``None`` to consider every issue regardless of its labels.

    Returns
    -------
    dict
        One entry per topic in ``keywords_dict`` (in the same order), each a
        list of the matching issue dicts in input order. Every issue appears
        at most once per topic.
    """
    filtered_issues = {topic: [] for topic in keywords_dict}
    # Lower-case the keywords once instead of once per issue.
    lowered_keywords = {
        topic: [keyword.lower() for keyword in keywords]
        for topic, keywords in keywords_dict.items()
    }

    for issue in issues:
        if required_label is not None:
            # Decide membership once per issue, so an issue is never appended
            # more than once however many label entries match.
            label_names = {label["name"] for label in issue.get("labels") or []}
            if required_label not in label_names:
                continue

        # Build the searchable text once per issue; body may be None.
        text = f"{issue.get('title') or ''} {issue.get('body') or ''}".lower()
        for topic, keywords in lowered_keywords.items():
            if any(keyword in text for keyword in keywords):
                filtered_issues[topic].append(issue)
    return filtered_issues

In [7]:

# Bucket the fetched issues by topic, then report the size of every bucket
filtered_issues = filter_issues_by_keywords(issues, TOPIC_KEYWORDS_PYTHON)
print(f"Filtered issues by {len(filtered_issues)} keywords:")
for topic_name, matched in filtered_issues.items():
    print(f"Keyword {topic_name} got {len(matched)} issues.")

Filtered issues by 11 keywords:
Keyword dataset got 107 issues.
Keyword compute got 93 issues.
Keyword ipc got 21 issues.
Keyword feather got 15 issues.
Keyword tables got 110 issues.
Keyword filesystem got 82 issues.
Keyword pandas got 88 issues.
Keyword parquet got 124 issues.
Keyword acero got 12 issues.
Keyword extension got 9 issues.
Keyword install got 77 issues.


In [8]:
# Inspect the raw per-topic buckets. This echoes every matched issue payload in
# full, so the cell output is very long.
filtered_issues

{'dataset': [{'url': 'https://api.github.com/repos/apache/arrow/issues/39854',
   'repository_url': 'https://api.github.com/repos/apache/arrow',
   'labels_url': 'https://api.github.com/repos/apache/arrow/issues/39854/labels{/name}',
   'comments_url': 'https://api.github.com/repos/apache/arrow/issues/39854/comments',
   'events_url': 'https://api.github.com/repos/apache/arrow/issues/39854/events',
   'html_url': 'https://github.com/apache/arrow/issues/39854',
   'id': 2108036423,
   'node_id': 'I_kwDOAxgDSc59phVH',
   'number': 39854,
   'title': '[Python] cannot allocate memory in static TLS block exception raised when pyarrow is imported before _mysql',
   'user': {'login': 'mdobrzanski',
    'id': 807888,
    'node_id': 'MDQ6VXNlcjgwNzg4OA==',
    'avatar_url': 'https://avatars.githubusercontent.com/u/807888?v=4',
    'gravatar_id': '',
    'url': 'https://api.github.com/users/mdobrzanski',
    'html_url': 'https://github.com/mdobrzanski',
    'followers_url': 'https://api.github.c

Filter issues by the keywords included in the issue title and body

In [9]:

# Flatten the per-topic buckets into one row per (issue, topic, type label).
# Issues without any "Type: ..." label get the placeholder "Type: unlabeled".
# The loop variable is deliberately NOT named `issues`: that name holds the list
# returned by the fetch, and overwriting it would make a re-run of the filtering
# cell operate on the last topic's bucket only.
rows = []
for topic, topic_issues in filtered_issues.items():
    for issue in topic_issues:
        type_labels = [
            label["name"]
            for label in issue["labels"]
            if label["name"].lower().startswith("type:")
        ]
        if not type_labels:
            type_labels = ["Type: unlabeled"]
        # An issue with several type labels contributes one row per label.
        for type_label in type_labels:
            rows.append({
                "title": issue["title"],
                "url": issue["url"],
                "created_at": issue["created_at"],
                "topic": topic,
                "type_label": type_label,
            })
df = pd.DataFrame(rows)

In [10]:
# Preview the flattened table: one row per issue, topic and type label
df

Unnamed: 0,title,url,created_at,topic,type_label
0,[Python] cannot allocate memory in static TLS ...,https://api.github.com/repos/apache/arrow/issu...,2024-01-30T14:44:33Z,dataset,Type: bug
1,[Python] Parsing timestamp with microsecond i...,https://api.github.com/repos/apache/arrow/issu...,2024-01-29T21:24:33Z,dataset,Type: bug
2,[Python] Dataset.to_batches accumulates memory...,https://api.github.com/repos/apache/arrow/issu...,2024-01-26T15:44:18Z,dataset,Type: bug
3,[Python] Write_dataset() run time does not sca...,https://api.github.com/repos/apache/arrow/issu...,2024-01-23T19:09:01Z,dataset,Type: bug
4,[Python] Float value keys written as `x` inste...,https://api.github.com/repos/apache/arrow/issu...,2024-01-23T16:25:43Z,dataset,Type: bug
...,...,...,...,...,...
734,[Python] Skip test_gdb.py tests if PyArrow was...,https://api.github.com/repos/apache/arrow/issu...,2025-06-06T15:02:36Z,install,Type: enhancement
735,[Python] Some jobs are failing due to missing ...,https://api.github.com/repos/apache/arrow/issu...,2025-06-05T08:07:19Z,install,Type: bug
736,`tests` files shipped with python wheel,https://api.github.com/repos/apache/arrow/issu...,2025-05-22T09:13:44Z,install,Type: bug
737,[Python] PyArrow fails compiling without CSV e...,https://api.github.com/repos/apache/arrow/issu...,2025-06-30T14:41:30Z,install,Type: bug


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Cross-tabulate: rows are topics, columns are type labels, cells are issue counts
counts = df.groupby(["topic", "type_label"]).size().unstack(fill_value=0)

# Topics ordered from most to fewest issues overall
topic_order = counts.sum(axis=1).sort_values(ascending=False).index
counts_sorted = counts.loc[topic_order]

# Stacked bar chart drawn on an explicitly created figure and axes
fig, ax = plt.subplots(figsize=(12, 6))
counts_sorted.plot(kind="bar", stacked=True, ax=ax)
ax.set_ylabel("Number of issues")
ax.set_xlabel("Topic")
ax.set_title("Issue Counts by Topic and Type")
# Slant the topic names and anchor them at their right edge so they stay legible
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
fig.tight_layout()
plt.show()