## Analysis

In [2]:
pip install python-dateutil


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [330]:
import json
import pandas as pd
from dateutil.parser import parse
from datasets import load_dataset, Dataset

# Open the train split of the pull-request dataset in streaming mode
small_ds = load_dataset(
    "bigcode-data/the-stack-gh-pull-requests",
    use_auth_token=True,
    split="train",
    streaming=True,
)

# Number of samples to materialize
size = 500_000

# Seeded shuffle of the stream (buffer_size is the shuffle buffer — see the datasets docs)
shuffled_stream = small_ds.shuffle(seed=0, buffer_size=1_000_000)

# Take the first `size` shuffled samples and round-trip them through pandas
# to obtain an in-memory Dataset
ds = Dataset.from_pandas(pd.DataFrame(data=list(shuffled_stream.take(size))))

In [365]:
# Display the materialized dataset summary (column names and row count)
ds

Dataset({
    features: ['pull_request.guid', 'pull_request.code_review_events', 'pull_request.events', 'pull_request.issue_events', 'bucket', '__index_level_0__'],
    num_rows: 500000
})

In [444]:
# Merge the three per-PR event columns (PR events, code-review events, issue events) into one nested record

# Keys copied into the "pull_request_info" part of each merged record
pull_request_info_cols = [
    # repository / organisation
    "repo.name",
    "repo.id",
    "org.id",
    "public",
    # pull request identity and text
    "pull_request.id",
    "pull_request.guid",
    "pull_request.number",
    "pull_request.title",
    "pull_request.body",
    "pull_request.state",
    # author
    "pull_request.user.login",
    "pull_request.user.id",
    # user type of the head and base owners
    "pull_request.head.user.type",
    "pull_request.base.user.type",
    # lifecycle timestamps and merge author
    "pull_request.created_at",
    "pull_request.closed_at",
    "pull_request.merged_at",
    "pull_request.merged_by.login",
    # milestone
    "pull_request.milestone.title",
    "pull_request.milestone.description",
    "pull_request.milestone.number",
    # commits and line counts
    "pull_request.commits",
    "pull_request.additions",
    "pull_request.deletions",
    # changed files
    "pull_request.changed_files",
    # comment counters
    "pull_request.comments",
    "pull_request.review_comments",
]

# Keys copied into the "head_repo_info" part of each merged record.
# Each key is listed once, in first-occurrence order, so the dict built from this
# list has the same keys and ordering as before the duplicates were removed.
head_info_cols = [
    "pull_request.head.label",
    "pull_request.head.ref",
    "pull_request.head.user.login",
    "pull_request.head.user.type",
    "pull_request.head.repo.owner.login",
    "pull_request.head.repo.owner.type",
    "pull_request.head.repo.license.name",
    "pull_request.head.sha",
    "pull_request.head.repo.name",
    "pull_request.head.repo.homepage",
    "pull_request.head.repo.description",
    "pull_request.head.repo.language",
    "pull_request.head.repo.stargazers_count",
    "pull_request.head.repo.default_branch",
    "pull_request.head.repo.private",
]
# Keys copied into the "base_repo_info" part of each merged record.
# Each key is listed once, in first-occurrence order, so the dict built from this
# list has the same keys and ordering as before the duplicates were removed.
base_info_cols = [
    "pull_request.base.label",
    "pull_request.base.ref",
    "pull_request.base.sha",
    "pull_request.base.user.login",
    "pull_request.base.user.type",
    "pull_request.base.repo.owner.login",
    "pull_request.base.repo.owner.type",
    "pull_request.base.repo.license.name",
    "pull_request.base.repo.default_branch",
    "pull_request.base.repo.description",
    "pull_request.base.repo.language",
    "pull_request.base.repo.watchers_count",
    "pull_request.base.repo.open_issues_count",
    "pull_request.base.repo.forks_count",
    "pull_request.base.repo.name",
    "pull_request.base.repo.homepage",
    "pull_request.base.repo.stargazers_count",
    "pull_request.base.repo.private",
    # PR-level keys (not under pull_request.base.*); kept here so the emitted
    # base_repo_info schema stays unchanged
    "pull_request.comments",
    "pull_request.review_comments",
    "pull_request.label.name",
]

# Keys looked up on review / review-comment events
reviews_info = [
    # who triggered the event
    "actor.login",
    "actor.id",
    "user.login",
    "user.type",
    # review fields
    "review.state",
    "review.id",
    "review.body",
    "review.commit_id",
    "review.submitted_at",
    "review.author_association",
    # pull request state carried on the event
    "pull_request.state",
    "pull_request.merged",
    "pull_request.merged_by.login",
    "pull_request.merged_by.type",
    # comments
    "comment.id",
    "comment.diff_hunk",
    "comment.body",
    "comment.path",
    "comment.position",
    "comment.original_position",
    "comment.commit_id",
    "comment.original_commit_id",
    "comment.created_at",
    "comment.updated_at",
    "comment.author_association",
    "comment.start_line",
    "comment.original_start_line",
    "comment.start_side",
    "comment.line",
    "comment.original_line",
    "comment.side",
    "comment.in_reply_to_id",
]


# Keys looked up on issue events (re-exposed with an "issue." prefix by get_event_info)
issues_info = [
    "author",
    "comment",
    "comment_id",
]

# Every key looked up on an event: review keys first, then issue keys
event_info = [*reviews_info, *issues_info]

def get_event_info(review):
    """Project one event onto the keys listed in ``event_info``.

    Keys absent from ``review`` are filled with ``None``. Keys that come from
    ``issues_info`` are re-exposed under an ``"issue."`` prefix and placed
    after the remaining keys.
    """
    info = {}
    for key in event_info:
        info[key] = review[key] if key in review else None
    # Move the issue-only keys under their prefixed name
    for key in issues_info:
        info["issue." + key] = info.pop(key)
    return info

def load_json(data):
    """Decode a JSON-encoded column value into a list of records.

    Parameters
    ----------
    data : str | bytes | bytearray | None
        Raw JSON text. Any type ``json.loads`` rejects (e.g. ``None`` for an
        empty column) is treated as "no data".

    Returns
    -------
    list
        The decoded list; a single decoded object is wrapped in a one-element
        list; ``[]`` when the input cannot be passed to ``json.loads`` or
        decodes to JSON ``null``.

    Raises
    ------
    json.JSONDecodeError
        If ``data`` is text but not valid JSON (deliberately not swallowed).
    """
    try:
        decoded = json.loads(data)
    except TypeError:
        # json.loads only accepts str/bytes/bytearray; a missing column arrives as None
        return []
    if decoded is None:
        # JSON ``null`` carries no records; callers call len() on and iterate the result
        return []
    if isinstance(decoded, dict):
        return [decoded]
    return decoded

def update_datetime(e):
    """Replace ``e["created_at"]`` with the result of ``parse`` and return ``e``.

    The mapping is modified in place and the same object is returned, so the
    function can be used directly inside a comprehension.
    """
    raw_timestamp = e["created_at"]
    e["created_at"] = parse(raw_timestamp)
    return e

def merge_events(row):
    """Merge one row's PR, code-review and issue event columns into a nested record.

    Parameters
    ----------
    row : mapping
        Must provide the JSON-encoded columns ``pull_request.events``,
        ``pull_request.code_review_events`` and ``pull_request.issue_events``.

    Returns
    -------
    dict
        ``pull_request_info``, ``head_repo_info`` and ``base_repo_info`` (one key
        per configured column, ``None`` where the source data lacks it) and
        ``events`` (every event, ordered by ``created_at``).

    Raises
    ------
    AssertionError
        If the issue column decodes to more than one record.
    IndexError
        If the row has no PR events, no review events and no issue record.
    """
    pr_events = load_json(row["pull_request.events"])
    review_events = load_json(row["pull_request.code_review_events"])
    issue_records = load_json(row["pull_request.issue_events"])

    assert len(issue_records) <= 1

    # Issue events store their timestamp under "datetime"; move it to "created_at"
    # so all three event kinds can be sorted on the same key.
    issue_events = []
    if issue_records:
        issue_events = issue_records[0]["events"]
        for item in issue_events:
            item["created_at"] = parse(item.pop("datetime"))

    pr_events = list(map(update_datetime, pr_events))
    review_events = list(map(update_datetime, review_events))

    timeline = pr_events + review_events + issue_events
    timeline.sort(key=lambda ev: ev["created_at"])

    # PR-level metadata comes from the first PR event, else the first review event,
    # else it is rebuilt from the issue record.
    if pr_events:
        base_data = pr_events[0]
    elif review_events:
        base_data = review_events[0]
    elif issue_records:
        record = issue_records[0]
        base_data = {
            "pull_request.title": record["events"][0]["title"],
            "repo.name": record["repo"],
            "pull_request.number": record["pull_request"]["number"],
            "pull_request.user.login": record["pull_request"]["user_login"],
        }
        print("filling PR data from issue event")
    else:
        raise IndexError("No events for PR")

    def _pick(cols):
        # One entry per configured column; None when base_data lacks it
        return {col: base_data.get(col) for col in cols}

    # Flatten every event into a uniform record: type, action, timestamp and the
    # keys selected by get_event_info
    comments = []
    for ev in timeline:
        entry = {
            "type": ev["type"],
            "action": ev["action"],
            "created_at": ev["created_at"],
        }
        entry.update(get_event_info(ev))
        comments.append(entry)

    return {
        "pull_request_info": _pick(pull_request_info_cols),
        "head_repo_info": _pick(head_info_cols),
        "base_repo_info": _pick(base_info_cols),
        "events": comments,
    }

In [None]:
# Build the merged records; the three raw event columns are dropped once merged,
# together with the guid and "__index_level_0__" (presumably the index column
# left by the pandas round-trip — TODO confirm)
merged_ds = ds.map(
    merge_events,
    remove_columns=[
        "pull_request.events",
        "pull_request.code_review_events",
        "pull_request.issue_events",
        "__index_level_0__",
        "pull_request.guid",
    ],
)

In [449]:
# Upload the merged dataset to the Hub under this repository id
merged_ds.push_to_hub("loubnabnl/code_reviews_500k")

Creating parquet from Arrow format: 100%|██████████| 84/84 [00:02<00:00, 37.20ba/s]
Upload 1 LFS files: 100%|██████████| 1/1 [00:22<00:00, 22.10s/it]
Creating parquet from Arrow format: 100%|██████████| 84/84 [00:02<00:00, 33.12ba/s]s/it]
Upload 1 LFS files: 100%|██████████| 1/1 [00:09<00:00,  9.55s/it]
Creating parquet from Arrow format: 100%|██████████| 84/84 [00:02<00:00, 39.47ba/s]s/it]
Upload 1 LFS files: 100%|██████████| 1/1 [00:09<00:00,  9.99s/it]
Creating parquet from Arrow format: 100%|██████████| 84/84 [00:02<00:00, 37.45ba/s]s/it]
Upload 1 LFS files: 100%|██████████| 1/1 [00:23<00:00, 23.74s/it]
Creating parquet from Arrow format: 100%|██████████| 84/84 [00:02<00:00, 34.84ba/s]s/it]
Upload 1 LFS files: 100%|██████████| 1/1 [00:22<00:00, 22.48s/it]
Creating parquet from Arrow format: 100%|██████████| 84/84 [00:03<00:00, 26.04ba/s]s/it]
Upload 1 LFS files: 100%|██████████| 1/1 [00:22<00:00, 22.62s/it]
Pushing dataset shards to the dataset hub: 100%|██████████| 6/6 [02:10<00:0