In [1]:
import ast
import os
from bisect import bisect_right

import numpy as np
import pandas as pd
import plt as plt
from dateparser import parse as parse_date

In [2]:
# Dataset split to process; used as the sub-folder name inside both folders below.
SCOPE = "train_val"
# Root folder the combined input CSV is read from.
COMBINED_FOLDER = "combined_datasets"
# Root folder the augmented output CSV is written to.
FINAL_FOLDER = "final_datasets"

In [3]:
# Input and output CSV locations, assembled as <folder>/<scope>/<file name>.
COMBINED_CSV = "/".join((COMBINED_FOLDER, SCOPE, "combined.csv"))
FINAL_CSV = "/".join((FINAL_FOLDER, SCOPE, "augmented.csv"))

In [4]:
# Load the combined pull-request dataset for the selected scope.
# NOTE(review): the displayed frame contains an "Unnamed: 0" column, which suggests the
# CSV was written together with its index — confirm whether that column is wanted.
df = pd.read_csv(COMBINED_CSV)
df

Unnamed: 0,number,title,created_at,closed_at,merged_at,additions,deletions,changed_files,commits,author,...,max_multi_comments,min_blank,avg_blank,max_blank,title_length,description_length,files_with_content,is_bugfix,is_refactor,is_feature
0,20092,Fixed(ArrayField):Normalized IndexTransformati...,2025-11-13T14:45:04+00:00,2025-11-13T14:53:16+00:00,,3,-1,1,1,akshatsinha0,...,0.0,58.0,58.000000,58.0,70,1150.0,1,1,0,0
1,20090,Refs #31055 -- Augmented regression tests for ...,2025-11-13T09:30:52+00:00,2025-11-13T10:35:03+00:00,2025-11-13T10:35:03+00:00,54,4,4,2,felixxm,...,283.0,58.0,160.750000,310.0,69,,4,0,0,0
2,20082,Refs #35844 -- Checked for forkserver mode in ...,2025-11-11T21:38:10+00:00,2025-11-11T23:28:08+00:00,2025-11-11T23:28:08+00:00,5,2,2,2,jacobtylerwalls,...,8.0,10.0,49.000000,88.0,89,231.0,2,0,0,0
3,20087,Fixed #36730 -- Fixed constraint validation cr...,2025-11-12T19:50:09+00:00,2025-11-13T09:03:51+00:00,2025-11-13T09:03:51+00:00,39,2,3,1,adamchainz,...,172.0,45.0,239.333333,509.0,75,649.0,3,1,0,0
4,20081,Fixed #36731 -- Supported asynchronous get_con...,2025-11-11T11:20:44+00:00,2025-11-14T20:09:23+00:00,,20,-1,4,4,rroblf01,...,40.0,2.0,50.250000,134.0,67,1190.0,4,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8009,9800,üìù Update JSON Schema `examples` docs with Pyda...,2023-07-04T16:44:28+00:00,2023-07-04T16:50:05+00:00,2023-07-04T16:50:05+00:00,620,10,9,3,tiangolo,...,0.0,6.0,17.444444,99.0,53,53.0,9,0,0,0
8010,9795,‚úÖ Update tests to import Annotated from typing...,2023-07-04T11:14:52+00:00,2023-07-04T11:14:58+00:00,2023-07-04T11:14:58+00:00,2,4,2,1,tiangolo,...,0.0,13.0,13.000000,13.0,74,74.0,2,0,0,0
8011,9787,‚è™Ô∏è Revert usage of custom logic for TypeAdapte...,2023-07-03T19:32:01+00:00,2023-07-03T19:37:04+00:00,2023-07-03T19:37:04+00:00,2,10,1,1,tiangolo,...,0.0,89.0,89.000000,89.0,88,159.0,1,0,0,0
8012,9791,‚¨Ü [pre-commit.ci] pre-commit autoupdate,2023-07-04T01:40:24+00:00,2024-08-17T04:14:28+00:00,,21,21,6,3,pre-commit-ci[bot],...,183.0,0.0,134.500000,498.0,39,418.0,6,0,0,0


In [5]:
# Temporal features
# Parse the raw timestamp strings into UTC datetimes; unparseable values become NaT.
for source_col, parsed_col in (
    ("created_at", "created_at_dt"),
    ("merged_at", "merged_at_dt"),
    ("closed_at", "closed_at_dt"),
):
    df[parsed_col] = pd.to_datetime(df[source_col], errors='coerce', utc=True)

created = df["created_at_dt"]

# Calendar components of the creation time; -1 stands in for a missing creation date.
calendar_parts = {
    "hour_created": created.dt.hour,
    "day_of_month_created": created.dt.day,
    "day_of_week_created": created.dt.weekday,
    "month_created": created.dt.month,
    "year_created": created.dt.year,
}
for feature_name, component in calendar_parts.items():
    df[feature_name] = component.fillna(-1).astype(int)

# weekday is 5 or 6 on Saturday/Sunday; a missing date compares False and yields 0.
df["created_in_weekend"] = (created.dt.weekday >= 5).astype("int64")

# Hours between creation and closing; 0 when either timestamp is missing.
open_duration = df["closed_at_dt"] - created
df["time_to_close"] = (open_duration.dt.total_seconds() / 3600).fillna(0)

In [6]:
# Comments / reviewers
def parse_list_column(x):
    """Turn one CSV cell holding a stringified Python list into a real list.

    The cell is parsed with ``ast.literal_eval`` rather than ``eval`` so that
    text read from a data file can never execute code; only Python literals
    (lists, dicts, strings, numbers, ...) are accepted.

    Args:
        x: Raw cell value. Usually a string such as ``"[{'a': 1}]"`` or NaN
            for a missing cell. An existing list/tuple is passed through.

    Returns:
        list: The parsed list. An empty list is returned for missing values,
        non-string input, text that is not a valid literal, and literals that
        are not a list or tuple, so callers can always take ``len`` of it.
    """
    if isinstance(x, (list, tuple)):
        return list(x)
    if not isinstance(x, str):
        # NaN / None / numbers: nothing to parse.
        return []
    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Malformed or non-literal text is treated as "no entries".
        return []
    return list(parsed) if isinstance(parsed, (list, tuple)) else []

# Parse the stringified list columns first, then derive the count features from them.
parsed_columns = {
    "comments_list": "comments_list_parsed",
    "reviewers_list": "reviewers_list_parsed",
}
for raw_name, parsed_name in parsed_columns.items():
    df[parsed_name] = df[raw_name].apply(parse_list_column)

count_columns = {
    "num_comments": "comments_list_parsed",
    "num_reviewers": "reviewers_list_parsed",
}
for count_name, parsed_name in count_columns.items():
    df[count_name] = df[parsed_name].map(len)

# Currently a plain copy of num_comments rather than a separately derived value.
df["num_comment_round"] = df["num_comments"]

In [7]:
# Previous PRs / author experience
# Put each author's PRs in creation order so that "earlier rows" means "earlier PRs".
df = df.sort_values(by=["author", "created_at_dt"]).reset_index(drop=True)
prs_by_author = df.groupby("author")

# Number of the author's earlier PRs (by creation time) that have a merge timestamp.
# NOTE(review): an earlier PR counts even if it was merged after the current PR was
# opened — confirm this is acceptable for the downstream model.
earlier_merged = prs_by_author["merged_at_dt"].transform(
    lambda merged: merged.notna().cumsum().shift(fill_value=0)
)

df["previous_prs"] = prs_by_author.cumcount()
df["previous_accepted_prs"] = earlier_merged
# author_experience uses the same definition as previous_accepted_prs.
df["author_experience"] = earlier_merged

# First-time authors have 0 previous PRs; dividing by 1 instead keeps their rate at 0.
prior_pr_count = df["previous_prs"].replace(0,1)
df["author_acceptance_rate"] = df["previous_accepted_prs"] / prior_pr_count
df["core_contributor_flag"] = df["author_experience"].ge(10).astype(int)
# Constant placeholder value for every row.
df["main_author_ratio"] = 1.0

In [8]:
# Comments metrics
def compute_burstiness(comment_times):
    """Return the coefficient of variation of the gaps between comments.

    Args:
        comment_times: Iterable of timestamp strings. Non-string entries and
            strings that ``parse_date`` cannot parse are ignored.

    Returns:
        float: ``std(gaps) / mean(gaps)`` with the gaps measured in hours, or
        ``0.0`` when fewer than two timestamps are usable.
    """
    # Parse every string exactly once; parsing is by far the most expensive step here.
    parsed = (parse_date(t) for t in comment_times if isinstance(t, str))
    times_sorted = sorted(p for p in parsed if p)
    if len(times_sorted) < 2:
        return 0.0
    intervals = [
        (later - earlier).total_seconds() / 3600
        for earlier, later in zip(times_sorted[:-1], times_sorted[1:])
    ]
    # The small epsilon avoids a division by zero when all comments share one timestamp.
    return float(np.std(intervals) / (np.mean(intervals) + 1e-9))

def compute_review_activity_decay(comment_times):
    """Return the slope of cumulative comment count against elapsed hours.

    A straight line is fitted through the points
    ``(hours since the first comment, running comment count)`` and its slope
    (comments per hour) is returned.

    Args:
        comment_times: Iterable of timestamp strings. Non-string entries and
            strings that ``parse_date`` cannot parse are ignored.

    Returns:
        float: The fitted slope, or ``0.0`` when fewer than two timestamps are
        usable or when all usable timestamps are identical.
    """
    # Parse every string exactly once.
    parsed = (parse_date(t) for t in comment_times if isinstance(t, str))
    times_sorted = sorted(p for p in parsed if p)
    if len(times_sorted) < 2:
        return 0.0
    # The list is sorted, so its first element is the earliest comment.
    first_comment = times_sorted[0]
    hours_since_start = [(t - first_comment).total_seconds() / 3600 for t in times_sorted]
    if hours_since_start[-1] == 0:
        # Every comment has the same timestamp: x has no spread, so a linear
        # fit is undefined. Report "no trend" instead of a NaN or an error.
        return 0.0
    counts = np.arange(1, len(hours_since_start) + 1)
    slope = np.polyfit(hours_since_start, counts, 1)[0]
    return float(slope)

def compute_response_time_median(comment_times, commit_times):
    """Return the median delay, in hours, from a comment to the next commit.

    For every comment the first commit strictly later than it is looked up;
    comments with no later commit are skipped.

    Args:
        comment_times: Iterable of comment timestamp strings.
        commit_times: Iterable of commit timestamp strings.
            In both iterables non-string entries and strings that
            ``parse_date`` cannot parse are ignored.

    Returns:
        float: Median of the comment-to-next-commit delays in hours, or
        ``0.0`` when no comment is followed by a commit.
    """
    def _parse_sorted(raw_times):
        # Parse each string exactly once and drop the ones that fail to parse.
        parsed = (parse_date(t) for t in raw_times if isinstance(t, str))
        return sorted(p for p in parsed if p)

    comment_times_parsed = _parse_sorted(comment_times)
    commit_times_parsed = _parse_sorted(commit_times)

    response_times = []
    for ct in comment_times_parsed:
        # bisect_right skips commits equal to ct, so this is the first commit
        # strictly after the comment (commits are sorted ascending).
        next_commit_idx = bisect_right(commit_times_parsed, ct)
        if next_commit_idx < len(commit_times_parsed):
            delay = commit_times_parsed[next_commit_idx] - ct
            response_times.append(delay.total_seconds() / 3600)
    return float(np.median(response_times)) if response_times else 0.0


# commits_list is read from CSV, so its cells are strings (or NaN), never list
# objects. Parse them; the isinstance check keeps the cell safe to re-run.
df["commits_list"] = df["commits_list"].apply(
    lambda x: x if isinstance(x, list) else parse_list_column(x)
)

# Variation of the gaps between consecutive comments.
df["comments_burstiness"] = df["comments_list_parsed"].apply(
    lambda x: compute_burstiness([c["created_at"] for c in x if isinstance(c, dict)])
)

# Slope of cumulative comment count over time.
df["review_activity_decay"] = df["comments_list_parsed"].apply(
    lambda x: compute_review_activity_decay([c["created_at"] for c in x if isinstance(c, dict)])
)

# Median delay between a comment and the next commit.
# NOTE(review): assumes each commit dict stores its time under "timestamp" — TODO confirm.
# .get() is used so a commit without that key is skipped instead of raising KeyError.
df["response_time_median"] = df.apply(
    lambda row: compute_response_time_median(
        [c["created_at"] for c in row["comments_list_parsed"] if isinstance(c, dict)],
        [c.get("timestamp") for c in row["commits_list"] if isinstance(c, dict)]
    ), axis=1
)

In [9]:
def label_pr_time(pr_created_dt, pr_merged_dt, pr_closed_dt):
    """Classify a pull request by how long it took to get merged.

    Args:
        pr_created_dt: Creation timestamp (may be missing).
        pr_merged_dt: Merge timestamp (missing when the PR was not merged).
        pr_closed_dt: Closing timestamp (may be missing).

    Returns:
        ``"immediate"`` (< 1 day), ``"fast"`` (< 5 days), ``"moderate"``
        (< 14 days) or ``"long"`` for merged PRs; ``"never"`` for PRs that
        were closed without a merge; ``None`` when the creation time is
        missing or the PR is neither merged nor closed.
    """
    if pd.isna(pr_created_dt):
        return None

    if pd.isna(pr_merged_dt):
        # Not merged: either closed unmerged or still open.
        return "never" if pd.notna(pr_closed_dt) else None

    elapsed_days = (pr_merged_dt - pr_created_dt).total_seconds() / 86400
    # Exclusive upper bounds in days, checked from the shortest bucket upwards.
    for upper_bound, label in ((1, "immediate"), (5, "fast"), (14, "moderate")):
        if elapsed_days < upper_bound:
            return label
    return "long"

In [10]:
# Label every PR from its three parsed timestamps, walking the columns in lockstep.
df["pr_time_label"] = [
    label_pr_time(created, merged, closed)
    for created, merged, closed in zip(
        df["created_at_dt"], df["merged_at_dt"], df["closed_at_dt"]
    )
]

In [11]:
# to_csv does not create missing folders, so make sure the target directory exists
# before writing (relevant on a fresh checkout or for a new SCOPE).
output_dir = os.path.dirname(FINAL_CSV)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Write the augmented frame without the positional index.
df.to_csv(FINAL_CSV, index=False)
print(f"Saved final CSV to {FINAL_CSV}")

Saved final CSV to final_datasets/train_val/augmented.csv


In [12]:
# Show the augmented frame (rows are now ordered by author, then creation time).
df

Unnamed: 0,number,title,created_at,closed_at,merged_at,additions,deletions,changed_files,commits,author,...,previous_prs,previous_accepted_prs,author_experience,author_acceptance_rate,core_contributor_flag,main_author_ratio,comments_burstiness,review_activity_decay,response_time_median,pr_time_label
0,12695,Fixed #31449 Autocomplete dosnot work when For...,2020-04-10T05:57:58+00:00,2020-04-10T06:09:26+00:00,,11,4,2,1,007gzs,...,0,0,0,0.0,0,1.0,0.000000,0.000000,0.0,never
1,21394,Fix: Raise TypeError for invalid state_size in...,2025-06-17T23:25:09+00:00,2025-06-17T23:29:11+00:00,,1,1,1,1,00Harshh,...,0,0,0,0.0,0,1.0,0.000000,11.356467,0.0,never
2,21395,Fix: Raise TypeError for invalid state_size in...,2025-06-17T23:40:10+00:00,2025-06-19T18:21:50+00:00,,1,1,1,1,00Harshh,...,1,0,0,0.0,0,1.0,0.000000,0.000000,0.0,never
3,12079,üß™ Add test for segmentation fault when using `...,2024-08-27T07:49:33+00:00,2025-02-27T13:03:30+00:00,,30,-1,1,1,07pepa,...,0,0,0,0.0,0,1.0,0.000000,0.004663,0.0,never
4,11168,‚úèÔ∏è Fix import typo in reference example for `S...,2024-02-20T09:42:13+00:00,2024-08-17T06:53:53+00:00,2024-08-17T06:53:53+00:00,1,1,1,2,0shah0,...,0,0,0,0.0,0,1.0,0.000000,0.000000,0.0,long
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8009,17960,Fixed #35281 - Set HTTP 413 for RequestDataToo...,2024-03-09T13:42:13+00:00,2024-04-17T10:15:55+00:00,,31,5,4,1,zvyn,...,0,0,0,0.0,0,1.0,0.000000,0.000000,0.0,never
8010,5633,add --notimingintensive; block from github jobs,2020-10-07T03:38:26+00:00,2020-10-07T04:59:29+00:00,,20,19,5,1,zzzeek,...,0,0,0,0.0,0,1.0,0.998579,1.279620,0.0,never
8011,5865,Limit AsyncAdaptedQueue to Python 3.7,2021-01-23T19:23:02+00:00,2021-01-24T23:40:08+00:00,,8,10,2,2,zzzeek,...,1,0,0,0.0,0,1.0,2.063491,0.179768,0.0,never
8012,7474,Replace c extension with cython versions.,2021-12-18T15:58:17+00:00,2021-12-18T16:06:26+00:00,,2993,2999,43,1,zzzeek,...,2,0,0,0.0,0,1.0,0.000000,0.000000,0.0,never
