# gathering metadata on assistive technology

Using the GitHub GraphQL API to gather metadata about
projects tagged with assistive technology topics.

This exploration looks specifically at repositories carrying assistive technology tags.
The results might be a good _training_ set for a more open, general search
that tries to differentiate between projects that merely present themselves as assistive tech and the real deal.

In [1]:
    import os, pandas
    from toolz.curried import *
    from pandas import DataFrame, Series, Index
    # Pull variables from a local .env file into the environment before the
    # token lookup below.
    __import__("dotenv").load_dotenv()
    # GraphQL client for the GitHub endpoint; GITHUB_TOKEN must be set or the
    # os.environ lookup raises KeyError.
    client = __import__("python_graphql_client").GraphqlClient(
        "https://api.github.com/graphql",
        {"Authorization": F"token {os.environ['GITHUB_TOKEN']}"},
    )


In [2]:
# GraphQL query template for a paginated repository search (50 results per page).
# It is filled in with the % operator and takes two placeholders, in order:
#   1. %s inside the quotes of `query:` -> the search expression
#   2. %s after `after:`                -> the pagination cursor, already
#      rendered as a GraphQL literal (a quoted string, or null)
# For each repository it requests url, id, star/fork counts, description,
# issue and pull request totals, up to 20 languages, license name, funding
# links, updatedAt, the text of HEAD:README.md, and up to 20 topics.
search_blob = """{
      search(query: "%s", type: REPOSITORY, first: 50, after: %s) {
        repositoryCount
    wikiCount
    pageInfo {
      hasNextPage
      endCursor
    }
    edges {
      node {
        ... on Repository {
          url
          id
          stargazerCount
          forkCount
          description
          issues {
            totalCount
          }
          languages(first: 20) {
            nodes {
              name
            }
          }
          licenseInfo {
            name
          }
          fundingLinks {
            platform
            url
          }      
          pullRequests {
            totalCount
          }
          updatedAt
          object(expression: "HEAD:README.md") {
            ... on Blob {
              text
            }
          }
          repositoryTopics(first: 20) {
            edges {
              node {
                topic {
                  name
                }
              }
            }
          }
        }
      }
    }
      }
}"""

In [3]:
    async def search_one(query, after=None):
        """Fetch one page of search results and store the response in the shelve file.

        The response is always fetched from the API; the shelve entry keyed by
        ``"<query>-<after>"`` is overwritten on every call.

        Parameters
        ----------
        query : str
            Search expression substituted into ``search_blob``.
        after : str or None
            Cursor of the previous page. A falsy value requests the first page.

        Returns
        -------
        The response returned by ``client.execute_async``.
        """
        import json
        import shelve

        search_cache.parent.mkdir(exist_ok=True, parents=True)
        # Escape quotes and backslashes so the query cannot terminate the
        # GraphQL string literal it is substituted into. [1:-1] strips the
        # quotes json adds because the template already supplies them.
        escaped_query = json.dumps(query, ensure_ascii=False)[1:-1]
        # The cursor is rendered as a complete literal: a quoted string or null.
        cursor_literal = json.dumps(after) if after else "null"
        # str(): dbm (used by shelve) only accepts path-like objects from
        # Python 3.11 onwards.
        with shelve.open(str(search_cache)) as cache:
            key = F"{query}-{after}"
            response = await client.execute_async(
                search_blob % (escaped_query, cursor_literal))
            cache[key] = response
            return response


In [4]:
    import platformdirs
    from pathlib import Path
    # Cache directory as reported by platformdirs for the "a11yhood" app ...
    cache = platformdirs.user_cache_path("a11yhood") / "github"
    # ... which is immediately replaced by a path relative to the working directory.
    cache = Path("data", "github")
    # File passed to shelve.open by the search cells.
    search_cache = cache.joinpath("search_responses.pkl")

In [5]:
    async def search(query, stop=20):
        """Collect paginated search responses for ``query``.

        The first page is always fetched. Further pages are requested while the
        latest response reports ``hasNextPage``, up to ``stop`` additional pages.

        Parameters
        ----------
        query : str
            Search expression passed to ``search_one``.
        stop : int, str or None
            Maximum number of follow-up pages. Values are converted with
            ``int`` so that strings (e.g. from ``os.environ``) work; ``None``
            means no limit; ``0`` or a negative value returns only the first page.

        Returns
        -------
        list
            One response per fetched page, in request order.
        """
        # An equality test against a str (or 0) would never match and the loop
        # would run until the API ran out of pages, so normalise the limit.
        limit = float("inf") if stop is None else int(stop)
        queries = [await search_one(query)]
        ct = 0
        while ct < limit:
            result = queries[-1]["data"]["search"]
            info = result.get("pageInfo") if result else None
            if not (info and info["hasNextPage"]):
                break
            cursor = info["endCursor"]
            print(cursor)
            queries.append(await search_one(query, cursor))
            print(F"done {ct}")
            ct += 1
        return queries

In [None]:
df = pandas.concat([
    DataFrame(results := await search("topic:assistive-technology", os.environ.get("PAGES", 1))),
    DataFrame(results := await search("topic:screen-reader", os.environ.get("PAGES", 1)))])

Y3Vyc29yOjUw
done 0
Y3Vyc29yOjUw


done 0


Y3Vyc29yOjUw


done 0


In [None]:
    # Copy every stored response out of the shelve file into a plain dict so it
    # stays usable after the file is closed.
    with __import__("shelve").open(search_cache) as db:
        data = dict(db.items())

In [None]:
    # For each stored response take data -> search -> edges (compose applies
    # right to left), emit one row per edge, then expand each edge's "node"
    # dict into columns.
    Series(data).apply(
        compose(get("edges"), get("search"), get("data"))
    ).explode().apply(
        compose(Series, get("node"))
    )

https://github.com/a11yhood/research/blob/main/2025-01-14-graphql.ipynb

In [None]:
# Flatten the page responses in df into one row per repository node, indexed
# by the node id.
data = df["data"].apply(Series
)["search"].apply(Series
)["edges"].explode(
).apply(Series)["node"].apply(Series).set_index("id")
# df concatenates two searches, so a repository matching both appears twice.
# Keep one row per id; otherwise the topic lists built below are duplicated.
data = data[~data.index.duplicated(keep="first")].copy()
data["pullRequests"] = data["pullRequests"].apply(operator.itemgetter("totalCount"))
data["issues"] = data["issues"].apply(operator.itemgetter("totalCount"))
# Extract the README text itself. `object` is presumably null when the
# repository has no HEAD:README.md blob, and `text` may be missing or null;
# all of those cases become "".
data["README"] = data.pop("object").apply(lambda blob: (blob or {}).get("text") or "")

# Collapse each repository's topic edges into a list of topic names.
data = data.join(
    data.pop("repositoryTopics").apply(Series)["edges"]
    .explode()
    .dropna()  # an empty edge list explodes to NaN, which cannot be indexed
    .apply(operator.itemgetter("node")).apply(operator.itemgetter("topic")).apply(operator.itemgetter("name"))
    .groupby(level=0).agg(list).rename("topics")
)

In [None]:
# Count how often each topic occurs and display the counts as a single-row
# table with one column per topic.
(
    data["topics"]
    .explode()
    .value_counts()
    .to_frame("count")
    .transpose()
    .style
    .set_caption(
        "the column titles are the tags of the returned search results and their counts are in the cells below"
    )
)