<a href="https://colab.research.google.com/github/ShantKhatri/aqa-triage-data/blob/filtered-pr-retrieve/Automated_Fine_Tuning_Data_Harvester.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Setup and Dependencies
The following cells install the required Python libraries and clone the OpenJ9 repository.


In [None]:
!pip install PyGithub python-dateutil -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/416.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m204.8/416.5 kB[0m [31m5.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m416.5/416.5 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/856.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m856.7/856.7 kB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os
if not os.path.exists('openj9'):
    !git clone https://github.com/eclipse-openj9/openj9.git

Cloning into 'openj9'...
remote: Enumerating objects: 292916, done.[K
remote: Counting objects: 100% (650/650), done.[K
remote: Compressing objects: 100% (341/341), done.[K
remote: Total 292916 (delta 510), reused 309 (delta 309), pack-reused 292266 (from 4)[K
Receiving objects: 100% (292916/292916), 192.70 MiB | 22.14 MiB/s, done.
Resolving deltas: 100% (222019/222019), done.
Updating files: 100% (10335/10335), done.


In [None]:
import os
import re
import json
import subprocess
from github import Github, RateLimitExceededException
from getpass import getpass
from datetime import datetime, timedelta
from dateutil import tz

# --- Configuration ---
# Directory of the local OpenJ9 checkout that the git commands run against.
REPO_PATH = "openj9"

# File name reserved for the generated training examples.
# NOTE(review): the ".jsonl" suffix suggests JSON Lines - confirm with the writer code.
TRAINING_DATA_FILE = "training_data.jsonl"

# Text log that receives one line per PR in which a culprit SHA was found.
PROCESSED_LOG_FILE = "processed_prs_log.txt"

# "<owner>/<name>" of the GitHub repository that is queried through the API.
GITHUB_REPO = "eclipse-openj9/openj9"

# Keywords to find "fix" pull requests (matched as substrings of the
# lower-cased PR title).
FIX_KEYWORDS = [
    'fix',
    'fixes',
    'revert',
    'reverts',
    'corrects',
    'resolves',
]

In [None]:
# --- GitHub Authentication ---
# To search 10k or more PRs, a token is MANDATORY.
# Create a Personal Access Token (PAT) here: https://github.com/settings/tokens
#
# Token lookup order:
#   1. a `github_token` variable left over from an earlier run of this cell,
#   2. the GITHUB_TOKEN environment variable,
#   3. an interactive prompt (input is not echoed).
github_token = globals().get('github_token') or os.environ.get('GITHUB_TOKEN')
if github_token:
    print("Authenticated with existing token.")
else:
    print("A GitHub Personal Access Token is REQUIRED to search 10,000 PRs.")
    github_token = getpass("Enter your GitHub Token: ")
g = Github(github_token)

# Connecting to the repository and reading the rate limit are handled
# separately so that a problem with the (purely informational) rate-limit
# query is not reported as a connection failure.
try:
    repo = g.get_repo(GITHUB_REPO)
    print(f"Successfully connected to the {GITHUB_REPO} repository.")
except Exception as e:
    print(f"Failed to connect to repository. Please check your token and repository name. Error: {e}")
else:
    try:
        rate_overview = g.get_rate_limit()
        # Newer PyGithub releases return a RateLimitOverview whose per-resource
        # limits live under `.resources`; older releases expose `.core`
        # directly on the returned object. Support both.
        core_limit = getattr(rate_overview, "resources", rate_overview).core
        print(f"API Rate Limit: {core_limit.remaining}/{core_limit.limit} requests remaining.")
        if core_limit.remaining < 1000:
            print("WARNING: Your remaining API requests are low. The script may fail.")
    except Exception as e:
        print(f"Connected, but the API rate limit could not be read. Error: {e}")

Authenticated with existing token.
✅ Successfully connected to the eclipse-openj9/openj9 repository.
❌ Failed to connect to repository. Please check your token and repository name. Error: 'RateLimitOverview' object has no attribute 'core'


In [None]:
#@title 3. Data Generation Script

def get_commit_author_date(sha):
    """Return the author date of commit `sha` as a timezone-aware UTC datetime.

    The date is read with `git show` from the checkout in REPO_PATH.

    Args:
        sha: Full or abbreviated commit hash to look up.

    Returns:
        A `datetime` in UTC (tzinfo `dateutil.tz.tzutc()`), or None when the
        commit cannot be resolved, git cannot be run, or the date cannot be
        parsed.
    """
    # %ai prints the author date as "YYYY-MM-DD HH:MM:SS +hhmm".
    cmd = ["git", "show", "-s", "--format=%ai", sha]
    try:
        # `cwd=` runs git inside the checkout without changing the notebook's
        # working directory, so no directory has to be restored on failure.
        raw_output = subprocess.check_output(
            cmd, cwd=REPO_PATH, stderr=subprocess.DEVNULL
        )
        date_str = raw_output.decode('utf-8').strip()
        # %z parses the signed "+hhmm" offset, producing an aware datetime.
        # (Hand-rolled slicing previously read the sign from the wrong
        # character and shifted the time in the wrong direction.)
        local_dt = datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S %z')
    except (subprocess.CalledProcessError, OSError, ValueError):
        # Unknown SHA, missing git/checkout, or unexpected output: this is a
        # best-effort lookup, so report "no date" instead of raising.
        return None
    return local_dt.astimezone(tz.tzutc())

def find_commits_for_day(commit_date):
    """Pick a (good_sha, bad_sha) pair from the commits of one New York day.

    The calendar day containing `commit_date` (evaluated in the
    America/New_York timezone) is used as the `git log` window on the
    checkout in REPO_PATH.

    Args:
        commit_date: Timezone-aware datetime; falsy values yield (None, None).

    Returns:
        * (oldest, newest) commit of the day when the day has several commits,
        * (first parent, commit) when the day has exactly one commit,
        * (None, None) when nothing is found, the single commit has no parent,
          or git fails.
    """
    if not commit_date:
        return None, None

    ny_tz = tz.gettz('America/New_York')
    day_start = commit_date.astimezone(ny_tz).replace(
        hour=0, minute=0, second=0, microsecond=0
    )
    day_end = day_start + timedelta(days=1) - timedelta(seconds=1)

    # Format for git log command
    stamp_format = '%Y-%m-%d %H:%M:%S %z'
    # NOTE(review): git's --since/--until compare against the committer date,
    # while callers pass a date derived from the author date - confirm that
    # this is the intended window.
    log_cmd = [
        "git", "log", "--pretty=%H",
        f"--since={day_start.strftime(stamp_format)}",
        f"--until={day_end.strftime(stamp_format)}",
        "--reverse",
    ]

    try:
        # Arguments are passed as a list (no shell), so the date strings need
        # no quoting; `cwd=` avoids changing the process working directory.
        commit_list = subprocess.check_output(
            log_cmd, cwd=REPO_PATH, stderr=subprocess.DEVNULL
        ).decode('utf-8').strip().splitlines()

        if not commit_list:
            return None, None

        if len(commit_list) > 1:
            # --reverse lists oldest first: first = "good", last = "bad".
            return commit_list[0], commit_list[-1]

        # Exactly one commit that day: pair it with its first parent.
        parent_cmd = ["git", "log", "-n", "1", "--pretty=%P", commit_list[0]]
        parent_sha_list = subprocess.check_output(
            parent_cmd, cwd=REPO_PATH, stderr=subprocess.DEVNULL
        ).decode('utf-8').strip().split()
    except (subprocess.CalledProcessError, OSError, ValueError):
        # Best-effort lookup: a git failure means "no pair", not a crash.
        return None, None

    if parent_sha_list:
        return parent_sha_list[0], commit_list[0]
    return None, None


def find_culprit_sha_in_body(body):
    """Extract a likely commit SHA from a pull-request description.

    Lookup order:
      1. A 7-40 character hex string directly following a context word
         ("fix", "fixes", "revert", "reverts", "commit", "sha"; optional
         ':' and '#' in between), case-insensitive.
      2. Otherwise, the first standalone 7-40 character hex string that
         contains at least one letter a-f.
      3. Otherwise, the first standalone run of 7-40 decimal digits.

    Args:
        body: PR body text; may be None or empty.

    Returns:
        The matched SHA exactly as written in the body, or None when no
        candidate exists.
    """
    if not body:
        return None

    # Regex to find a 7 to 40 character hexadecimal string preceded by context words
    keyed_match = re.search(
        r'(?:fixe?s?|revert?s?|commit|sha)\s*:?\s*#?\s*([0-9a-f]{7,40})\b',
        body,
        re.IGNORECASE,
    )
    if keyed_match:
        return keyed_match.group(1)

    # Broader search if the first one fails. It is case-insensitive like the
    # keyed search, so SHAs written in upper case are found as well.
    candidates = re.findall(r'\b[0-9a-f]{7,40}\b', body, re.IGNORECASE)

    # Plain decimal numbers (dates, build numbers, ...) also satisfy the hex
    # pattern, so a candidate containing a hex letter is preferred over them.
    for candidate in candidates:
        if not candidate.isdigit():
            return candidate
    return candidates[0] if candidates else None


# Tell the reader that the helper definitions above have been executed.
helpers_ready_message = "Helper functions are defined. Proceed to the final step to run the script."
print(helpers_ready_message)

Helper functions are defined. Proceed to the final step to run the script.


In [None]:
# Upper bound on the number of closed PRs examined in a single run.
MAX_PRS_TO_SCAN = 10000

print("Starting the data harvesting process...")
print(f"Searching for PRs with keywords: {FIX_KEYWORDS}")

# Clear previous log file
if os.path.exists(PROCESSED_LOG_FILE):
    os.remove(PROCESSED_LOG_FILE)

# Counters are initialised before the `try` so that the summary printed in
# `finally` never hits an unbound name, even if the first API call fails.
scanned_count = 0
training_examples_count = 0
culprit_found_count = 0

try:
    pulls = repo.get_pulls(state='closed', sort='updated', direction='desc')

    for pr in pulls:
        if scanned_count >= MAX_PRS_TO_SCAN:
            print(f"\nReached search limit of {MAX_PRS_TO_SCAN:,} PRs.")
            break

        if scanned_count and scanned_count % 100 == 0:
            print(f"...scanned {scanned_count} PRs...")
        scanned_count += 1

        # `merged_at` is delivered with the PR list itself, whereas reading
        # `pr.merged` makes PyGithub fetch every PR individually (one extra
        # API request per PR). An unmerged PR has no merge timestamp.
        if pr.merged_at is None:
            continue

        # Check if the title contains any of our keywords
        title_lower = pr.title.lower()
        if not any(keyword in title_lower for keyword in FIX_KEYWORDS):
            continue

        culprit_sha = find_culprit_sha_in_body(pr.body)
        if not culprit_sha:
            continue

        # Log this PR as it contains a culprit SHA
        culprit_found_count += 1
        pr_info = f"PR #{pr.number}: {pr.title} (Culprit SHA Found: {culprit_sha[:10]}) - URL: {pr.html_url}\n"
        # Appending per record keeps the log usable if the run is interrupted.
        with open(PROCESSED_LOG_FILE, 'a', encoding='utf-8') as log_file:
            log_file.write(pr_info)

        culprit_date = get_commit_author_date(culprit_sha)
        if not culprit_date:
            continue

        good_sha, bad_sha = find_commits_for_day(culprit_date)
        if not good_sha or not bad_sha:
            continue

        # NOTE(review): the (good_sha, bad_sha) pair is resolved here but
        # nothing is written to TRAINING_DATA_FILE and
        # `training_examples_count` is never incremented, so the summary
        # below always reports 0 training examples. TODO: persist the
        # example once the record format is decided.

except RateLimitExceededException:
    print("\n GITHUB API RATE LIMIT EXCEEDED.")
    print("The script has been stopped. Please wait for an hour or use a different token.")
except Exception as e:
    print(f"\nAn unexpected error occurred: {e}")

finally:
    print(f"\n\nHarvesting complete. Scanned approximately {scanned_count} PRs.")
    print(f"Found {culprit_found_count} PRs with a potential culprit SHA.")
    print(f"Successfully generated {training_examples_count} training examples.")

    print(f"\n- A full log of all PRs with culprit SHAs has been saved to '{PROCESSED_LOG_FILE}'.")


Starting the data harvesting process...
Searching for PRs with keywords: ['fix', 'fixes', 'revert', 'reverts', 'corrects', 'resolves']
...scanned 100 PRs...
...scanned 200 PRs...
...scanned 300 PRs...
...scanned 400 PRs...
...scanned 500 PRs...
...scanned 600 PRs...
...scanned 700 PRs...
...scanned 800 PRs...
...scanned 900 PRs...
...scanned 1000 PRs...
...scanned 1100 PRs...
...scanned 1200 PRs...
...scanned 1300 PRs...
...scanned 1400 PRs...
...scanned 1500 PRs...
...scanned 1600 PRs...
...scanned 1700 PRs...
...scanned 1800 PRs...
...scanned 1900 PRs...
...scanned 2000 PRs...
...scanned 2100 PRs...
...scanned 2200 PRs...
...scanned 2300 PRs...
...scanned 2400 PRs...
...scanned 2500 PRs...
...scanned 2600 PRs...
...scanned 2700 PRs...
...scanned 2800 PRs...
...scanned 2900 PRs...
...scanned 3000 PRs...
...scanned 3100 PRs...
...scanned 3200 PRs...
...scanned 3300 PRs...
...scanned 3400 PRs...
...scanned 3500 PRs...
...scanned 3600 PRs...
...scanned 3700 PRs...
...scanned 3800 PRs...


Request GET /repos/eclipse-openj9/openj9/pulls/12215 failed with 403: Forbidden
INFO:github.GithubRetry:Request GET /repos/eclipse-openj9/openj9/pulls/12215 failed with 403: Forbidden
Setting next backoff to 11.486647s
INFO:github.GithubRetry:Setting next backoff to 11.486647s


...scanned 6200 PRs...
...scanned 6300 PRs...
...scanned 6400 PRs...
...scanned 6500 PRs...
...scanned 6600 PRs...
...scanned 6700 PRs...
...scanned 6800 PRs...
...scanned 6900 PRs...
...scanned 7000 PRs...
...scanned 7100 PRs...
...scanned 7200 PRs...
...scanned 7300 PRs...
...scanned 7400 PRs...
...scanned 7500 PRs...
...scanned 7600 PRs...
...scanned 7700 PRs...
...scanned 7800 PRs...
...scanned 7900 PRs...
...scanned 8000 PRs...
...scanned 8100 PRs...
...scanned 8200 PRs...
...scanned 8300 PRs...
...scanned 8400 PRs...
...scanned 8500 PRs...
...scanned 8600 PRs...
...scanned 8700 PRs...
...scanned 8800 PRs...
...scanned 8900 PRs...
...scanned 9000 PRs...
...scanned 9100 PRs...
...scanned 9200 PRs...
...scanned 9300 PRs...
...scanned 9400 PRs...
...scanned 9500 PRs...
...scanned 9600 PRs...
...scanned 9700 PRs...
...scanned 9800 PRs...
...scanned 9900 PRs...

Reached search limit of 10,000 PRs.


Harvesting complete. Scanned approximately 10001 PRs.
Found 131 PRs with a potential cu