# Datasets to the rescue!

In [1]:
# !pip install requests
# !pip install python-dotenv

In [2]:
import requests
from dotenv import dotenv_values
import time
import math
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm
from datasets import load_dataset

In [3]:
url = "https://api.github.com/repos/huggingface/datasets/issues?page=1&per_page=1"

In [4]:
response = requests.get(url)

In [5]:
response.status_code

200

In [6]:
response.json()

[{'url': 'https://api.github.com/repos/huggingface/datasets/issues/6322',
  'repository_url': 'https://api.github.com/repos/huggingface/datasets',
  'labels_url': 'https://api.github.com/repos/huggingface/datasets/issues/6322/labels{/name}',
  'comments_url': 'https://api.github.com/repos/huggingface/datasets/issues/6322/comments',
  'events_url': 'https://api.github.com/repos/huggingface/datasets/issues/6322/events',
  'html_url': 'https://github.com/huggingface/datasets/pull/6322',
  'id': 1952947461,
  'node_id': 'PR_kwDODunzps5dT5vG',
  'number': 6322,
  'title': 'Fix regex `get_data_files` formatting for base paths',
  'user': {'login': 'ZachNagengast',
   'id': 1981179,
   'node_id': 'MDQ6VXNlcjE5ODExNzk=',
   'avatar_url': 'https://avatars.githubusercontent.com/u/1981179?v=4',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/ZachNagengast',
   'html_url': 'https://github.com/ZachNagengast',
   'followers_url': 'https://api.github.com/users/ZachNagengast/followers',


In [7]:
config = dotenv_values(".env")

In [8]:
headers = {"Authorization": f"token {config['GITHUB_TOKEN']}"}

In [9]:
def fetch_issues(
    owner="huggingface",
    repo="datasets",
    num_issues=10_000,
    rate_limit=5_000,
    issues_path=Path('.'),
):
    """Download issues (and pull requests) of a GitHub repo to a JSON Lines file.

    Pages through ``GET /repos/{owner}/{repo}/issues`` with ``state=all`` and
    writes every returned record to ``{issues_path}/{repo}-issues.jsonl``.
    Requests are authenticated with the notebook-level ``headers`` dict.

    Args:
        owner: Account or organisation that owns the repository.
        repo: Repository name.
        num_issues: Upper bound on the number of issues to request; it is
            rounded up to a whole number of 100-item pages.
        rate_limit: Number of API requests allowed per hour. After this many
            requests the function sleeps for one hour before continuing.
        issues_path: Directory in which the output file is written. It is
            created (including parents) when missing.

    Raises:
        requests.exceptions.HTTPError: If GitHub answers with an error status,
            so an error payload is never mistaken for a page of issues.
    """
    issues_path = Path(issues_path)
    issues_path.mkdir(parents=True, exist_ok=True)

    all_issues = []
    per_page = 100  # Number of issues to return per page (GitHub's maximum)
    num_pages = math.ceil(num_issues / per_page)
    base_url = "https://api.github.com/repos"
    requests_made = 0

    # GitHub's `page` parameter is 1-based, so iterate 1..num_pages.
    for page in tqdm(range(1, num_pages + 1)):
        # The rate limit is expressed in requests per hour, so count requests
        # (not downloaded issues) to decide when to pause.
        if rate_limit > 0 and requests_made and requests_made % rate_limit == 0:
            print("Reached Github rate limit. Sleeping for one hour ...")
            time.sleep(60 * 60 + 1)

        query = f"issues?page={page}&per_page={per_page}&state=all"
        response = requests.get(f"{base_url}/{owner}/{repo}/{query}", headers=headers)
        requests_made += 1
        # Fail loudly on 4xx/5xx: the error body is a dict, and extending the
        # record list with it would silently corrupt the dataset.
        response.raise_for_status()

        page_issues = response.json()
        if not page_issues:
            # An empty page means the repository has no more issues.
            break
        all_issues.extend(page_issues)

    output_file = f"{issues_path}/{repo}-issues.jsonl"
    df = pd.DataFrame.from_records(all_issues)
    df.to_json(output_file, orient="records", lines=True)
    print(f"Downloaded all the issues for {repo}! Dataset stored at {output_file}")

In [10]:
Path("./datasets-issues.jsonl").exists()

True

In [11]:
# Only hit the GitHub API when the local JSON Lines dump is missing.
if not Path("./datasets-issues.jsonl").exists():
    fetch_issues()

In [12]:
df = pd.read_json("./datasets-issues.jsonl", orient="records", lines=True)

In [13]:
df.transpose().head(30)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6269,6270,6271,6272,6273,6274,6275,6276,6277,6278
url,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...
repository_url,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets
labels_url,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...
comments_url,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...
events_url,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...
html_url,https://github.com/huggingface/datasets/issues...,https://github.com/huggingface/datasets/issues...,https://github.com/huggingface/datasets/pull/6301,https://github.com/huggingface/datasets/pull/6300,https://github.com/huggingface/datasets/issues...,https://github.com/huggingface/datasets/pull/6298,https://github.com/huggingface/datasets/pull/6297,https://github.com/huggingface/datasets/pull/6296,https://github.com/huggingface/datasets/pull/6295,https://github.com/huggingface/datasets/issues...,...,https://github.com/huggingface/datasets/pull/10,https://github.com/huggingface/datasets/pull/9,https://github.com/huggingface/datasets/pull/8,https://github.com/huggingface/datasets/pull/7,https://github.com/huggingface/datasets/issues/6,https://github.com/huggingface/datasets/issues/5,https://github.com/huggingface/datasets/issues/4,https://github.com/huggingface/datasets/issues/3,https://github.com/huggingface/datasets/issues/2,https://github.com/huggingface/datasets/pull/1
id,1943466532,1942096078,1940183999,1940153432,1939649238,1938797389,1938752707,1938453845,1937362102,1937359605,...,603909327,603894874,601783243,601780534,600330836,600295889,600185417,600180050,599767671,599457467
node_id,I_kwDODunzps5z1vIk,I_kwDODunzps5zwgjO,PR_kwDODunzps5cpPVh,PR_kwDODunzps5cpIoG,I_kwDODunzps5znLLW,PR_kwDODunzps5ckg6j,PR_kwDODunzps5ckXBa,PR_kwDODunzps5cjUs1,PR_kwDODunzps5cfiW8,I_kwDODunzps5zecL1,...,MDExOlB1bGxSZXF1ZXN0NDA2NjAxNzQ2,MDExOlB1bGxSZXF1ZXN0NDA2NTkwMDQw,MDExOlB1bGxSZXF1ZXN0NDA0OTg0NDUz,MDExOlB1bGxSZXF1ZXN0NDA0OTgyMzA2,MDU6SXNzdWU2MDAzMzA4MzY=,MDU6SXNzdWU2MDAyOTU4ODk=,MDU6SXNzdWU2MDAxODU0MTc=,MDU6SXNzdWU2MDAxODAwNTA=,MDU6SXNzdWU1OTk3Njc2NzE=,MDExOlB1bGxSZXF1ZXN0NDAzMDk1NDYw
number,6303,6302,6301,6300,6299,6298,6297,6296,6295,6294,...,10,9,8,7,6,5,4,3,2,1
title,Parquet uploads off-by-one naming scheme,ArrowWriter/ParquetWriter `write` method does ...,Unpin `tensorflow` maximum version,Unpin `jax` maximum version,Support for newer versions of JAX,Doc readme improvements,Fix ArrayXD cast,Move `exceptions.py` to `utils/exceptions.py`,Fix parquet columns argument in streaming mode,IndexError: Invalid key is out of bounds for s...,...,"Name json file ""squad.json"" instead of ""squad....",[Clean up] Datasets,Fix issue 6: error when the citation is missin...,Fix issue 5: allow empty datasets,Error when citation is not given in the Datase...,ValueError when a split is empty,[Feature] Keep the list of labels of a dataset...,[Feature] More dataset outputs,Issue to read a local dataset,changing nlp.bool to nlp.bool_


It seems there is an issue with `load_dataset`: https://github.com/huggingface/datasets/issues/5422

Therefore, let's use the workaround suggested here: https://github.com/huggingface/datasets/issues/5422#issuecomment-1718035537

In [14]:
from datasets import Dataset
issues_dataset = Dataset.from_pandas(df)
issues_dataset

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'body', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason', 'draft', 'pull_request'],
    num_rows: 6279
})

In [15]:
sample = issues_dataset.shuffle(seed=666).select(range(3))

In [16]:
for url, pr in zip(sample['html_url'], sample['pull_request']):
    print(f">> URL: {url}")
    print(f">> Pull request: {pr}\n")

>> URL: https://github.com/huggingface/datasets/issues/1965
>> Pull request: None

>> URL: https://github.com/huggingface/datasets/pull/3472
>> Pull request: {'diff_url': 'https://github.com/huggingface/datasets/pull/3472.diff', 'html_url': 'https://github.com/huggingface/datasets/pull/3472', 'merged_at': '2021-12-22T16:52:52Z', 'patch_url': 'https://github.com/huggingface/datasets/pull/3472.patch', 'url': 'https://api.github.com/repos/huggingface/datasets/pulls/3472'}

>> URL: https://github.com/huggingface/datasets/pull/892
>> Pull request: {'diff_url': 'https://github.com/huggingface/datasets/pull/892.diff', 'html_url': 'https://github.com/huggingface/datasets/pull/892', 'merged_at': '2020-11-27T18:08:44Z', 'patch_url': 'https://github.com/huggingface/datasets/pull/892.patch', 'url': 'https://api.github.com/repos/huggingface/datasets/pulls/892'}



In [17]:
# Rows whose `pull_request` field is populated are pull requests; the rest
# are plain issues. Store that distinction as a boolean column.
issues_dataset = issues_dataset.map(
    lambda example: {"is_pull_request": example["pull_request"] is not None}
)

Map:   0%|          | 0/6279 [00:00<?, ? examples/s]

In [18]:
# Keep closed items that are genuine issues (not pull requests).
issues_dataset_closed = issues_dataset.filter(
    lambda example: example["state"] == "closed" and not example["is_pull_request"]
)

Filter:   0%|          | 0/6279 [00:00<?, ? examples/s]

In [19]:
issues_dataset_closed.set_format("pandas")

In [20]:
issues_dataset_closed

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'body', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason', 'draft', 'pull_request', 'is_pull_request'],
    num_rows: 1946
})

In [21]:
issues_dataset_closed_df = issues_dataset_closed[:]

In [22]:
issues_dataset_closed_df.transpose().head(30)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1936,1937,1938,1939,1940,1941,1942,1943,1944,1945
url,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...
repository_url,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets
labels_url,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...
comments_url,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...
events_url,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...
html_url,https://github.com/huggingface/datasets/issues...,https://github.com/huggingface/datasets/issues...,https://github.com/huggingface/datasets/issues...,https://github.com/huggingface/datasets/issues...,https://github.com/huggingface/datasets/issues...,https://github.com/huggingface/datasets/issues...,https://github.com/huggingface/datasets/issues...,https://github.com/huggingface/datasets/issues...,https://github.com/huggingface/datasets/issues...,https://github.com/huggingface/datasets/issues...,...,https://github.com/huggingface/datasets/issues...,https://github.com/huggingface/datasets/issues...,https://github.com/huggingface/datasets/issues...,https://github.com/huggingface/datasets/issues...,https://github.com/huggingface/datasets/issues/38,https://github.com/huggingface/datasets/issues/6,https://github.com/huggingface/datasets/issues/5,https://github.com/huggingface/datasets/issues/4,https://github.com/huggingface/datasets/issues/3,https://github.com/huggingface/datasets/issues/2
id,1939649238,1937238047,1936129871,1932758192,1929551712,1927044546,1921354680,1921036328,1920329373,1914951043,...,618632573,618628264,618615855,618611310,611677656,600330836,600295889,600185417,600180050,599767671
node_id,I_kwDODunzps5znLLW,I_kwDODunzps5zd-gf,I_kwDODunzps5zZv9P,I_kwDODunzps5zM4yw,I_kwDODunzps5zAp9g,I_kwDODunzps5y3F3C,I_kwDODunzps5yhYu4,I_kwDODunzps5ygLAo,I_kwDODunzps5ydead,I_kwDODunzps5yI9WD,...,MDU6SXNzdWU2MTg2MzI1NzM=,MDU6SXNzdWU2MTg2MjgyNjQ=,MDU6SXNzdWU2MTg2MTU4NTU=,MDU6SXNzdWU2MTg2MTEzMTA=,MDU6SXNzdWU2MTE2Nzc2NTY=,MDU6SXNzdWU2MDAzMzA4MzY=,MDU6SXNzdWU2MDAyOTU4ODk=,MDU6SXNzdWU2MDAxODU0MTc=,MDU6SXNzdWU2MDAxODAwNTA=,MDU6SXNzdWU1OTk3Njc2NzE=
number,6299,6293,6291,6287,6284,6277,6275,6274,6270,6263,...,117,116,115,114,38,6,5,4,3,2
title,Support for newer versions of JAX,Choose columns to stream parquet data in strea...,Casting type from Array2D int to Array2D float...,"map() not recognizing ""text""",Add Belebele multiple-choice machine reading c...,FileNotFoundError: Couldn't find a module scri...,Would like to Contribute a dataset,FileNotFoundError for dataset with multiple bu...,Dataset.from_generator raises with sharded gen...,CI is broken: ImportError: cannot import name ...,...,❓ How to remove specific rows of a dataset ?,🐛 Trying to use ROUGE metric : pyarrow.lib.Arr...,AttributeError: 'dict' object has no attribute...,Couldn't reach CNN/DM dataset,[Checksums] Error for some datasets,Error when citation is not given in the Datase...,ValueError when a split is empty,[Feature] Keep the list of labels of a dataset...,[Feature] More dataset outputs,Issue to read a local dataset


In [23]:
(issues_dataset_closed_df['closed_at'] - issues_dataset_closed_df['created_at']).mean()

Timedelta('77 days 06:27:28.649023638')

In [24]:
(issues_dataset_closed_df['closed_at'] - issues_dataset_closed_df['created_at']).median()

Timedelta('5 days 17:36:28.500000')

In [25]:
(issues_dataset_closed_df['closed_at'] - issues_dataset_closed_df['created_at']).describe()

count                           1946
mean      77 days 06:27:28.649023638
std      175 days 16:54:51.213781774
min                  0 days 00:00:18
25%           0 days 20:17:49.500000
50%           5 days 17:36:28.500000
75%                 34 days 16:36:16
max               1081 days 08:05:48
dtype: object

In [26]:
# Keep closed items that are pull requests.
pull_request_dataset = issues_dataset.filter(
    lambda example: example["state"] == "closed" and example["is_pull_request"]
)

Filter:   0%|          | 0/6279 [00:00<?, ? examples/s]

In [27]:
pull_request_dataset.set_format("pandas")
pull_request_df = pull_request_dataset[:]

In [28]:
pull_request_df.transpose().head(30)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3696,3697,3698,3699,3700,3701,3702,3703,3704,3705
url,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...
repository_url,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets
labels_url,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...
comments_url,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...
events_url,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...
html_url,https://github.com/huggingface/datasets/pull/6301,https://github.com/huggingface/datasets/pull/6300,https://github.com/huggingface/datasets/pull/6298,https://github.com/huggingface/datasets/pull/6297,https://github.com/huggingface/datasets/pull/6295,https://github.com/huggingface/datasets/pull/6289,https://github.com/huggingface/datasets/pull/6286,https://github.com/huggingface/datasets/pull/6281,https://github.com/huggingface/datasets/pull/6278,https://github.com/huggingface/datasets/pull/6265,...,https://github.com/huggingface/datasets/pull/15,https://github.com/huggingface/datasets/pull/14,https://github.com/huggingface/datasets/pull/13,https://github.com/huggingface/datasets/pull/12,https://github.com/huggingface/datasets/pull/11,https://github.com/huggingface/datasets/pull/10,https://github.com/huggingface/datasets/pull/9,https://github.com/huggingface/datasets/pull/8,https://github.com/huggingface/datasets/pull/7,https://github.com/huggingface/datasets/pull/1
id,1940183999,1940153432,1938797389,1938752707,1937362102,1935628506,1932640128,1928456959,1927957877,1915651566,...,604906708,604761315,604547951,604518583,603921624,603909327,603894874,601783243,601780534,599457467
node_id,PR_kwDODunzps5cpPVh,PR_kwDODunzps5cpIoG,PR_kwDODunzps5ckg6j,PR_kwDODunzps5ckXBa,PR_kwDODunzps5cfiW8,PR_kwDODunzps5cZiay,PR_kwDODunzps5cPKNK,PR_kwDODunzps5cBQPd,PR_kwDODunzps5b_iKb,PR_kwDODunzps5bWDfc,...,MDExOlB1bGxSZXF1ZXN0NDA3NDEwOTk3,MDExOlB1bGxSZXF1ZXN0NDA3MjkzNjU5,MDExOlB1bGxSZXF1ZXN0NDA3MTIxMjkw,MDExOlB1bGxSZXF1ZXN0NDA3MDk3MzA4,MDExOlB1bGxSZXF1ZXN0NDA2NjExODk2,MDExOlB1bGxSZXF1ZXN0NDA2NjAxNzQ2,MDExOlB1bGxSZXF1ZXN0NDA2NTkwMDQw,MDExOlB1bGxSZXF1ZXN0NDA0OTg0NDUz,MDExOlB1bGxSZXF1ZXN0NDA0OTgyMzA2,MDExOlB1bGxSZXF1ZXN0NDAzMDk1NDYw
number,6301,6300,6298,6297,6295,6289,6286,6281,6278,6265,...,15,14,13,12,11,10,9,8,7,1
title,Unpin `tensorflow` maximum version,Unpin `jax` maximum version,Doc readme improvements,Fix ArrayXD cast,Fix parquet columns argument in streaming mode,testing doc-builder,Create DefunctDatasetError,Improve documentation of dataset.from_generator,No data files duplicates,Remove `apache_beam` import in `BeamBasedBuild...,...,[Tests] General Test Design for all dataset sc...,[Download] Only create dir if not already exist,[Make style],[Map Function] add assert statement if map fun...,[Convert TFDS to HFDS] Extend script to also a...,"Name json file ""squad.json"" instead of ""squad....",[Clean up] Datasets,Fix issue 6: error when the citation is missin...,Fix issue 5: allow empty datasets,changing nlp.bool to nlp.bool_


In [29]:
(pull_request_df['closed_at'] - pull_request_df['created_at']).describe()

count                          3706
mean     15 days 03:06:04.498920669
std      71 days 14:28:05.141772524
min                 0 days 00:00:06
25%          0 days 03:16:48.500000
50%          1 days 04:23:30.500000
75%                 5 days 05:20:17
max               984 days 16:05:16
dtype: object

In [30]:
issue_number = 6301
url = f"https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments"
response = requests.get(url, headers=headers)

In [31]:
response.json()

[{'url': 'https://api.github.com/repos/huggingface/datasets/issues/comments/1759794986',
  'html_url': 'https://github.com/huggingface/datasets/pull/6301#issuecomment-1759794986',
  'issue_url': 'https://api.github.com/repos/huggingface/datasets/issues/6301',
  'id': 1759794986,
  'node_id': 'IC_kwDODunzps5o5Fcq',
  'user': {'login': 'github-actions[bot]',
   'id': 41898282,
   'node_id': 'MDM6Qm90NDE4OTgyODI=',
   'avatar_url': 'https://avatars.githubusercontent.com/in/15368?v=4',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/github-actions%5Bbot%5D',
   'html_url': 'https://github.com/apps/github-actions',
   'followers_url': 'https://api.github.com/users/github-actions%5Bbot%5D/followers',
   'following_url': 'https://api.github.com/users/github-actions%5Bbot%5D/following{/other_user}',
   'gists_url': 'https://api.github.com/users/github-actions%5Bbot%5D/gists{/gist_id}',
   'starred_url': 'https://api.github.com/users/github-actions%5Bbot%5D/starred{/owner}{/repo}'

In [38]:
import requests
from retrying import retry

In [42]:
def _is_rate_limit_error(exception):
    """Return True only for HTTP errors that look like GitHub rate limiting."""
    response = getattr(exception, "response", None)
    return (
        isinstance(exception, requests.exceptions.HTTPError)
        and response is not None
        and response.status_code in (403, 429)
    )


@retry(
    stop_max_attempt_number=5,  # Number of max retries
    wait_fixed=60 * 60 * 1000,  # Wait for 1 hour (in milliseconds) between each retry
    # Waiting an hour only helps when we are rate limited; any other failure
    # (404, network error, bug) should surface immediately.
    retry_on_exception=_is_rate_limit_error,
)
def get_comments(issue_number):
    """Return the bodies of all comments on a huggingface/datasets issue.

    Follows the ``Link: rel="next"`` response header so that threads longer
    than a single page are returned in full. Requests are authenticated with
    the notebook-level ``headers`` dict.

    Args:
        issue_number: Number of the issue or pull request.

    Returns:
        A list with one comment body per comment, in the order the API
        returns them.

    Raises:
        requests.exceptions.HTTPError: If GitHub answers with an error status
            (after the retries are exhausted for rate-limit responses).
    """
    url = f"https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments"
    params = {"per_page": 100}
    bodies = []
    while url:
        response = requests.get(url, headers=headers, params=params)
        response.raise_for_status()
        bodies.extend(comment['body'] for comment in response.json())
        url = response.links.get("next", {}).get("url")
        # The "next" URL already carries its own query string.
        params = None
    return bodies

In [43]:
# Try the helper on a single issue number and display the returned comment bodies.
get_comments(2792)

["@albertvillanova my tests are failing here:\r\n```\r\ndataset_name = 'gooaq'\r\n\r\n    def test_load_dataset(self, dataset_name):\r\n        configs = self.dataset_tester.load_all_configs(dataset_name, is_local=True)[:1]\r\n>       self.dataset_tester.check_load_dataset(dataset_name, configs, is_local=True, use_local_dummy_data=True)\r\n\r\ntests/test_dataset_common.py:234: \r\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \r\ntests/test_dataset_common.py:187: in check_load_dataset\r\n    self.parent.assertTrue(len(dataset[split]) > 0)\r\nE   AssertionError: False is not true\r\n```\r\nWhen I try loading dataset on local machine it works fine. Any suggestions on how can I avoid this error?",
 'Thanks for the help, @albertvillanova! All tests are passing now.']

In [34]:
# Building the comments column costs one API request per issue, so it is only
# done when the CSV cache is not on disk yet.
# NOTE(review): `comments` holds a list per row; presumably CSV stores it as a
# plain string, so the reloaded column may not be a list any more — confirm.
if not Path('data-issues-comments.csv').exists():
    issues_with_comments_dataset = issues_dataset.map(
        lambda issue: {"comments": get_comments(issue["number"])}
    )
    issues_with_comments_dataset.to_csv('data-issues-comments.csv')

In [40]:
# Always reload from the CSV cache so this cell works whether or not the
# previous cell had to rebuild the file.
issues_with_comments_dataset = load_dataset(
    "csv",
    data_files="data-issues-comments.csv",
    split="train",
)

In [41]:
# Display the dataset summary (feature names and number of rows).
issues_with_comments_dataset

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'body', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason', 'draft', 'pull_request', 'is_pull_request'],
    num_rows: 6279
})

In [42]:
# Number of rows in the reloaded dataset.
len(issues_with_comments_dataset)

6279

In [44]:
# Authenticate against the Hugging Face Hub from inside the notebook; the call
# renders an interactive login widget.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [45]:
# Upload the dataset to the Hub under the repository name "github-issues".
issues_with_comments_dataset.push_to_hub("github-issues")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

Downloading metadata:   0%|          | 0.00/400 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


## nbdev GitHub issues

In [49]:
# Download the fastai/nbdev issues only when the JSON Lines file is not
# already on disk.
# NOTE(review): the return value of `fetch_issues` is not visible from here;
# the next cell reads the file from disk, so this name may just hold None —
# confirm.
if not Path("./nbdev-issues.jsonl").exists():
    nbdev_issues_dataset = fetch_issues(owner='fastai', repo='nbdev')

In [50]:
# Read the JSON Lines file (one JSON record per line) into a DataFrame.
df = pd.read_json("./nbdev-issues.jsonl", orient="records", lines=True)

In [52]:
# Transpose so every column of `df` becomes a row, then show the first 30 of
# those rows; this makes the wide frame easier to scan.
df.transpose().head(30)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1456,1457,1458,1459,1460,1461,1462,1463,1464,1465
url,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...
repository_url,https://api.github.com/repos/fastai/nbdev,https://api.github.com/repos/fastai/nbdev,https://api.github.com/repos/fastai/nbdev,https://api.github.com/repos/fastai/nbdev,https://api.github.com/repos/fastai/nbdev,https://api.github.com/repos/fastai/nbdev,https://api.github.com/repos/fastai/nbdev,https://api.github.com/repos/fastai/nbdev,https://api.github.com/repos/fastai/nbdev,https://api.github.com/repos/fastai/nbdev,...,https://api.github.com/repos/fastai/nbdev,https://api.github.com/repos/fastai/nbdev,https://api.github.com/repos/fastai/nbdev,https://api.github.com/repos/fastai/nbdev,https://api.github.com/repos/fastai/nbdev,https://api.github.com/repos/fastai/nbdev,https://api.github.com/repos/fastai/nbdev,https://api.github.com/repos/fastai/nbdev,https://api.github.com/repos/fastai/nbdev,https://api.github.com/repos/fastai/nbdev
labels_url,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...
comments_url,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...
events_url,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...,https://api.github.com/repos/fastai/nbdev/issu...
html_url,https://github.com/fastai/nbdev/pull/1374,https://github.com/fastai/nbdev/pull/1373,https://github.com/fastai/nbdev/issues/1372,https://github.com/fastai/nbdev/issues/1371,https://github.com/fastai/nbdev/issues/1370,https://github.com/fastai/nbdev/issues/1369,https://github.com/fastai/nbdev/issues/1368,https://github.com/fastai/nbdev/issues/1367,https://github.com/fastai/nbdev/issues/1366,https://github.com/fastai/nbdev/issues/1365,...,https://github.com/fastai/nbdev/issues/10,https://github.com/fastai/nbdev/issues/9,https://github.com/fastai/nbdev/issues/8,https://github.com/fastai/nbdev/pull/7,https://github.com/fastai/nbdev/pull/6,https://github.com/fastai/nbdev/pull/5,https://github.com/fastai/nbdev/pull/4,https://github.com/fastai/nbdev/pull/3,https://github.com/fastai/nbdev/pull/2,https://github.com/fastai/nbdev/pull/1
id,1953415859,1951064047,1950586692,1945085395,1924456725,1906570101,1898478453,1896648057,1887710163,1877574729,...,531889946,531358476,531082067,530500488,530496253,530493499,530490036,530150863,528462404,528258363
node_id,PR_kwDODUMwOs5dVgfQ,PR_kwDODUMwOs5dNiY8,I_kwDODUMwOs50Q5dE,I_kwDODUMwOs5z76XT,I_kwDODUMwOs5ytOEV,I_kwDODUMwOs5xo_N1,I_kwDODUMwOs5xKHt1,I_kwDODUMwOs5xDI15,I_kwDODUMwOs5whCvT,I_kwDODUMwOs5v6YRJ,...,MDU6SXNzdWU1MzE4ODk5NDY=,MDU6SXNzdWU1MzEzNTg0NzY=,MDU6SXNzdWU1MzEwODIwNjc=,MDExOlB1bGxSZXF1ZXN0MzQ3MTY5MDgz,MDExOlB1bGxSZXF1ZXN0MzQ3MTY1Nzcx,MDExOlB1bGxSZXF1ZXN0MzQ3MTYzODU2,MDExOlB1bGxSZXF1ZXN0MzQ3MTYxMjIy,MDExOlB1bGxSZXF1ZXN0MzQ2ODkwNDgy,MDExOlB1bGxSZXF1ZXN0MzQ1NTE5Mzk2,MDExOlB1bGxSZXF1ZXN0MzQ1MzU0ODU4
number,1374,1373,1372,1371,1370,1369,1368,1367,1366,1365,...,10,9,8,7,6,5,4,3,2,1
title,fixes #1364,add support for 3.11,In settings.ini `min_python = 3.11` causes pro...,[Question] Can I install nbdev in a virtual en...,nbdev_test having issues because of Python cac...,keyword only '*' does not work with documentation,`nbdev_test` fails to run tests when preceded ...,How to document a module and or constants?,`black` is not found when `nbdev_export` is ru...,nbdev_prepare and nbdev_test hang if I use the...,...,Link to generate repo broken,Link to a template in Getting Started,Docs not building,Tools,Docker build,Tools,added dockerfile,Update LICENSE,typos README,fix typo


In [53]:
# Convert the pandas frame into a `datasets.Dataset` and display its summary.
from datasets import Dataset
nbdev_issues_dataset = Dataset.from_pandas(df)
nbdev_issues_dataset

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'draft', 'pull_request', 'body', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason'],
    num_rows: 1466
})

In [82]:
# NOTE(review): the saved output under this cell is a list of label dicts, not
# a Dataset summary, so the cell was presumably edited after it last ran —
# re-run it to refresh the output.
nbdev_issues_dataset

[[],
 [{'color': 'a2eeef',
   'default': True,
   'description': 'New feature or request',
   'id': 1683921099,
   'name': 'enhancement',
   'node_id': 'MDU6TGFiZWwxNjgzOTIxMDk5',
   'url': 'https://api.github.com/repos/fastai/nbdev/labels/enhancement'}],
 [{'color': 'FBCA04',
   'default': True,
   'description': "Something isn't working",
   'id': 1683921090,
   'name': 'bug',
   'node_id': 'MDU6TGFiZWwxNjgzOTIxMDkw',
   'url': 'https://api.github.com/repos/fastai/nbdev/labels/bug'}],
 [],
 [{'color': 'FBCA04',
   'default': True,
   'description': "Something isn't working",
   'id': 1683921090,
   'name': 'bug',
   'node_id': 'MDU6TGFiZWwxNjgzOTIxMDkw',
   'url': 'https://api.github.com/repos/fastai/nbdev/labels/bug'}],
 [{'color': 'FBCA04',
   'default': True,
   'description': "Something isn't working",
   'id': 1683921090,
   'name': 'bug',
   'node_id': 'MDU6TGFiZWwxNjgzOTIxMDkw',
   'url': 'https://api.github.com/repos/fastai/nbdev/labels/bug'}],
 [{'color': 'FBCA04',
   'defau