# 1. Getting Git Commit messages with their corresponding URLs

## What the Code Does
- **Session with Retry Logic:**  
  Creates an HTTP session configured with a retry strategy to handle common HTTP errors and API rate limits.

- **Helper Function (`safe_get`):**  
  Implements a GET request helper that retries on errors and waits for the GitHub API rate-limit reset if needed.

- **Fetching Commits (`fetch_commits`):**  
  Retrieves commit data from the specified GitHub repository using paginated API calls.

- **Filtering Commits (`is_python_only_commit`):**  
  Checks whether a commit modifies only Python files, i.e. whether the name of every file changed by the commit ends with `.py`.

- **CSV Writing:**  
  Opens (or creates) a CSV file (`python_only_commits.csv`), writes a header if the file is new, and appends commits that have:
  - A commit message between 50 and 300 characters.
  - Only Python file changes.
  After each appended row, the file is flushed so that data is saved incrementally.

## How to Use It
1. **Configure Your Environment:**
   - Replace the placeholder GitHub API token with your own token.
   - Optionally, update the `repos` list to include any additional repositories in the `"owner/repo"` format.

2. **Run the Code:**
   - Execute the cell to begin fetching commits from the listed repositories.
   - Monitor the progress via the `tqdm` progress bars for repositories and commits.

3. **Review the Output:**
   - The CSV file named `python_only_commits.csv` will be created (or updated), containing the commit URL and commit message for each valid Python-only commit.


In [None]:
import requests
import csv
import os
from tqdm import tqdm
import time
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# GitHub API token. Paste a personal access token here; when left empty the
# requests are sent unauthenticated.
GITHUB_TOKEN = ''

# Headers for GitHub API requests
headers = {
    'Accept': 'application/vnd.github.v3+json'
}
# Only attach credentials when a token is configured: with an empty token the
# header would read "Authorization: token ", which is a malformed credential
# rather than an anonymous request.
if GITHUB_TOKEN:
    headers['Authorization'] = f'token {GITHUB_TOKEN}'

repos = [

    # Store list of repos

    #EXAMPLE:
    # 'discord/flask-oauthlib',
    # 'discord/chromium-build',
    # 'discord/node-gyp',
    # 'discord/luigi',
    # 'discord/hyper-h2'

]

# Shared HTTP session: idempotent requests that hit a 500/502/503/504 response
# are retried (up to 5 times, with backoff) before the failure is surfaced.
retry_strategy = Retry(
    allowed_methods=["HEAD", "GET", "OPTIONS"],
    status_forcelist=[500, 502, 503, 504],
    backoff_factor=1,
    total=5
)
adapter = HTTPAdapter(max_retries=retry_strategy)
session = requests.Session()
# The same adapter serves both URL schemes.
for _prefix in ("https://", "http://"):
    session.mount(_prefix, adapter)

def _rate_limit_wait_seconds(response):
    """
    Work out how long to pause before retrying a throttled (403/429) response.

    Returns:
        None if the response carries no rate-limit information,
        0 if the primary limit is exhausted but its reset time has already passed,
        otherwise the number of seconds to wait.
    """
    # Secondary rate limits announce the pause via Retry-After (in seconds).
    retry_after = response.headers.get("Retry-After")
    if retry_after is not None:
        try:
            return max(int(retry_after), 0)
        except ValueError:
            pass  # not a plain number of seconds; fall back to the reset header

    # Primary rate limit: quota exhausted until the advertised reset timestamp.
    if response.headers.get("X-RateLimit-Remaining") == "0":
        try:
            reset_timestamp = int(response.headers.get("X-RateLimit-Reset", 0))
        except ValueError:
            return 0
        return max(reset_timestamp - int(time.time()) + 5, 0)  # add a small buffer

    return None


def safe_get(url, headers, retries=3, delay=2):
    """
    Perform a GET request through the shared session with GitHub rate-limit handling.

    The request is attempted up to 'retries' times. When GitHub throttles the
    request (HTTP 403 or 429 carrying a Retry-After header, or
    X-RateLimit-Remaining "0"), the function sleeps for the advertised time
    before the next attempt; any other failure is followed by a pause of
    'delay' seconds.

    Args:
        url: URL to fetch.
        headers: Request headers to send.
        retries: Maximum number of attempts (rate-limit waits count as attempts).
        delay: Seconds to sleep after a failed attempt that was not throttled.

    Returns:
        The response object on HTTP 200, or None when every attempt failed.
    """
    for attempt in range(retries):
        # Keep the try body to the network call so unrelated bugs are not hidden.
        try:
            response = session.get(url, headers=headers, timeout=10)
        except requests.RequestException as e:
            print(f"Attempt {attempt+1}: Error fetching {url}: {e}")
            time.sleep(delay)
            continue

        status = response.status_code
        if status == 200:
            return response

        wait_time = None
        if status in (403, 429):
            wait_time = _rate_limit_wait_seconds(response)

        if wait_time:
            print(f"Rate limit exceeded. Waiting for {wait_time} seconds before retrying {url}...")
            time.sleep(wait_time)
            continue  # The rate-limit pause replaces the normal delay

        if wait_time is None:
            if status == 403:
                print(f"Attempt {attempt+1}: Received 403 for {url} (not due to rate limit).")
            else:
                print(f"Attempt {attempt+1}: Non-200 response ({status}) for {url}")
        # wait_time == 0: the limit window has already reset, so just retry quietly.
        time.sleep(delay)
    return None

def fetch_commits(repo, per_page=100, max_pages=100):
    """
    Fetch commit data for the given repository using safe_get for rate-limit handling.

    Pages are requested in order until a page comes back empty, a request
    fails, or 'max_pages' pages have been read.

    Args:
        repo: Repository in "owner/repo" format.
        per_page: Number of commits requested per page.
        max_pages: Upper bound on the number of pages requested.

    Returns:
        A list with the decoded commit entries of every page fetched so far
        (possibly partial when a later page failed).
    """
    commits = []
    for page in range(1, max_pages + 1):
        url = f'https://api.github.com/repos/{repo}/commits?per_page={per_page}&page={page}'
        response = safe_get(url, headers)

        # Decode the body once per page; an undecodable body is treated like
        # a failed request instead of aborting the whole run.
        try:
            page_commits = response.json() if response else None
        except ValueError:
            page_commits = None

        # Stop fetching if no more commits or an error occurred.
        if not page_commits:
            print(f"No more commits in {repo} or encountered an error.")
            break

        commits.extend(page_commits)
        time.sleep(0.5)  # Small delay to reduce rate-limit issues further
    return commits

def is_python_only_commit(commit_url):
    """
    Tell whether the commit behind 'commit_url' touches nothing but .py files.

    The commit details are fetched with safe_get (which takes care of rate
    limits). A commit whose details cannot be fetched, or that lists no files,
    is reported as not Python-only.
    """
    response = safe_get(commit_url, headers)
    if response and response.status_code == 200:
        changed_files = response.json().get('files', [])
        if changed_files:
            for entry in changed_files:
                if not entry['filename'].endswith('.py'):
                    return False
            return True
    return False

csv_filename = 'python_only_commits.csv'
print("Fetching commits that only include Python files...")

# Check if the CSV file exists already (to avoid rewriting headers).
# An existing but empty file (e.g. left behind by a run that stopped before
# anything reached the disk) still needs the header row.
file_exists = os.path.isfile(csv_filename)
needs_header = not file_exists or os.path.getsize(csv_filename) == 0

# Open CSV file in append mode to save results incrementally
with open(csv_filename, 'a', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)

    if needs_header:
        writer.writerow(['commit_url', 'commit_message'])
        csvfile.flush()  # Persist the header right away, like every data row

    # Iterate over each repository
    for repo in tqdm(repos, desc="Repositories"):
        commits = fetch_commits(repo)

        for commit_info in tqdm(commits, desc=f"Commits in {repo}"):
            commit_url = commit_info['url']
            commit_message = commit_info['commit']['message']

            # Check the cheap length filter first so the per-commit API call
            # is only made for messages of acceptable size.
            if not 50 <= len(commit_message) <= 300:
                continue
            if not is_python_only_commit(commit_url):
                continue

            writer.writerow([commit_url, commit_message])
            csvfile.flush()  # Ensure that each commit is saved immediately

print("Incremental fetching complete.")


# 2. Enriching Commits with Diff Data via Concurrent Fetching

## What the Code Does

- **Session & Retry Setup:**  
  - Creates an HTTP session with a retry strategy (using `HTTPAdapter` and `Retry`) that automatically retries transient server errors (HTTP 500, 502, 503 and 504).

- **Diff Fetching Function:**  
  - **`fetch_diff`:**  
    - Fetches the diff (the code changes) for a given commit URL.
    - Implements retry logic and handles rate limiting by checking for HTTP 403 responses. If the rate limit is exceeded, it waits until the reset time (plus a small buffer) before retrying the request.

- **Data Loading & Processing:**  
  - Loads previously saved commit details from the CSV file (`python_only_commits.csv`) generated in Cell 1.
  - **`process_commit`:**  
    - Retrieves diff information for each commit by using the `fetch_diff` function.
    - Packages the commit message along with its corresponding diff.

- **Concurrent Execution & Output:**  
  - Uses a `ThreadPoolExecutor` to fetch diff data concurrently for all commits, which significantly improves processing speed.
  - Writes the resulting commit messages and diffs incrementally to a new CSV file (`python_commit_dataset.csv`), ensuring that each row is immediately flushed to disk.

## How to Use It

1. **Input Requirements:**  
   - Make sure that the CSV file `python_only_commits.csv` (produced by Cell 1) is present.
   - Update the `GITHUB_TOKEN` variable with a valid GitHub token if needed.

2. **Execution:**  
   - Run this cell after Cell 1 has been executed and the initial dataset has been created.
   - The code will concurrently process each commit, fetching its diff and combining it with the commit message.

3. **Output:**  
   - A new CSV file named `python_commit_dataset.csv` is generated.
   - This file contains two columns: `commit_message` and `diff`, providing deeper insights into the code changes for each commit.

   


In [None]:
import requests
import csv
from tqdm import tqdm
import concurrent.futures  # For faster parallel processing
import time
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# GitHub API token
GITHUB_TOKEN = ''

# Shared HTTP session: idempotent requests that hit a 500/502/503/504 response
# are retried (up to 5 times, with backoff) before the failure is surfaced.
retry_strategy = Retry(
    allowed_methods=["HEAD", "GET", "OPTIONS"],
    status_forcelist=[500, 502, 503, 504],
    backoff_factor=1,
    total=5
)
adapter = HTTPAdapter(max_retries=retry_strategy)
session = requests.Session()
# The same adapter serves both URL schemes.
for _prefix in ("https://", "http://"):
    session.mount(_prefix, adapter)

def _rate_limit_wait_seconds(response):
    """
    Work out how long to pause before retrying a throttled (403/429) response.

    Returns:
        None if the response carries no rate-limit information,
        0 if the primary limit is exhausted but its reset time has already passed,
        otherwise the number of seconds to wait.
    """
    # Secondary rate limits announce the pause via Retry-After (in seconds).
    retry_after = response.headers.get("Retry-After")
    if retry_after is not None:
        try:
            return max(int(retry_after), 0)
        except ValueError:
            pass  # not a plain number of seconds; fall back to the reset header

    # Primary rate limit: quota exhausted until the advertised reset timestamp.
    if response.headers.get("X-RateLimit-Remaining") == "0":
        try:
            reset_timestamp = int(response.headers.get("X-RateLimit-Reset", 0))
        except ValueError:
            return 0
        return max(reset_timestamp - int(time.time()) + 5, 0)  # add a small buffer

    return None


def fetch_diff(commit_url, retries=3, delay=2):
    """
    Fetch the diff text of a commit, with retries and rate-limit handling.

    The commit URL is requested with the diff media type. When GitHub
    throttles the request (HTTP 403 or 429 carrying a Retry-After header, or
    X-RateLimit-Remaining "0"), the function sleeps for the advertised time
    before the next attempt; any other failure is followed by a pause of
    'delay' seconds.

    Args:
        commit_url: API URL of the commit.
        retries: Maximum number of attempts (rate-limit waits count as attempts).
        delay: Seconds to sleep after a failed attempt that was not throttled.

    Returns:
        The response body (the diff) as text on HTTP 200, or None when every
        attempt failed.
    """
    headers_diff = {
        'Accept': 'application/vnd.github.v3.diff'
    }
    # Only attach credentials when a token is configured; an empty token would
    # produce a malformed "Authorization: token " header.
    if GITHUB_TOKEN:
        headers_diff['Authorization'] = f'token {GITHUB_TOKEN}'

    for attempt in range(retries):
        # Keep the try body to the network call so unrelated bugs are not hidden.
        try:
            response = session.get(commit_url, headers=headers_diff, timeout=10)
        except requests.RequestException as e:
            print(f"Attempt {attempt+1}: Error fetching {commit_url}: {e}")
            time.sleep(delay)
            continue

        status = response.status_code
        if status == 200:
            return response.text

        wait_time = None
        if status in (403, 429):
            wait_time = _rate_limit_wait_seconds(response)

        if wait_time:
            print(f"Rate limit exceeded. Waiting for {wait_time} seconds before retrying {commit_url}...")
            time.sleep(wait_time)
            continue  # The rate-limit pause replaces the normal delay

        if wait_time is None:
            print(f"Attempt {attempt+1}: Non-200 response ({status}) for {commit_url}")
        # wait_time == 0: the limit window has already reset, so just retry quietly.
        time.sleep(delay)
    return None

# Load previously saved commits from CSV file.
# newline='' leaves line-ending handling to the csv module, so line breaks
# inside quoted multi-line commit messages are read back unchanged.
with open('python_only_commits.csv', 'r', newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    # One dict per row, keyed by the header names written by the first cell.
    commit_data = list(reader)

def process_commit(commit):
    """
    Pair a commit's message with its diff.

    'commit' is a row with 'commit_url' and 'commit_message' keys. Returns a
    dict with 'commit_message' and 'diff', or None when no diff was obtained.
    """
    diff = fetch_diff(commit['commit_url'])
    if not diff:
        return None
    return {'commit_message': commit['commit_message'], 'diff': diff}

print("Fetching commit diffs...")

# Rows are written as soon as their diff arrives (in completion order, not
# input order), so an interrupted run keeps everything fetched so far.
with open('python_commit_dataset.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile, quoting=csv.QUOTE_ALL, lineterminator='\n')
    writer.writerow(['commit_message', 'diff'])

    # Fan the downloads out over a pool of 10 worker threads.
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        futures = []
        for commit in commit_data:
            futures.append(executor.submit(process_commit, commit))

        # tqdm tracks progress as each download finishes.
        finished = concurrent.futures.as_completed(futures)
        for future in tqdm(finished, total=len(futures)):
            result = future.result()
            if not result:
                continue  # no diff could be fetched for this commit
            writer.writerow([result['commit_message'], result['diff']])
            csvfile.flush()  # Flush after writing each row to save incrementally

print("Final dataset created with git diffs and commits.")


# 3. Cleaning Commit Messages for Transformer Training

## What the Code Does
- **Set CSV Field Limit:**  
  - Adjusts the field size limit so large text fields are processed without issues.
- **Load Dataset:**  
  - Reads `python_commit_dataset.csv` using pandas.
- **Clean Commit Messages:**  
  - Removes merge pull request phrases (e.g., "Merge pull request #… from …").
  - Eliminates lines containing "PiperOrigin-RevId" and "Signed-off-by" to strip out non-informative metadata.
- **Save Cleaned Data:**  
  - Writes the cleaned dataset to `cleaned_python_commit_dataset.csv` with proper quoting.

## How to Use It
1. Make sure `python_commit_dataset.csv` is in your working directory.
2. Run this cell to clean the commit messages.
3. Use the resulting `cleaned_python_commit_dataset.csv` for transformer training.


In [None]:
import pandas as pd
import re
import sys
import csv

# Increase CSV field size limit to handle large fields properly.
# sys.maxsize does not fit into a C long on every platform (e.g. 64-bit
# Windows), where the call raises OverflowError; fall back to the largest
# 32-bit value there.
try:
    csv.field_size_limit(sys.maxsize)
except OverflowError:
    csv.field_size_limit(2**31 - 1)

# Load your original dataset using pandas robustly.
# NOTE: on_bad_lines='skip' silently drops rows that cannot be parsed.
df = pd.read_csv(
    'python_commit_dataset.csv',
    engine='python',
    on_bad_lines='skip'
)

# Matches "Merge pull request", whitespace, '#' followed by non-whitespace,
# whitespace, "from", whitespace and a non-whitespace branch name.
_MERGE_PR_PATTERN = re.compile(
    r"Merge\s+pull\s+request\s+#\S+\s+from\s+\S+", re.IGNORECASE
)

# Lower-cased markers of metadata lines; compared against lower-cased lines so
# every capitalisation ("Signed-off-by", "Signed-Off-By", ...) is caught.
_METADATA_MARKERS = ("piperorigin-revid", "signed-off-by", "signed-off by")


def clean_message(msg):
    """
    Strip boilerplate from a commit message.

    Removes "Merge pull request #... from ..." phrases wherever they occur and
    drops every line containing a metadata marker (PiperOrigin-RevId,
    Signed-off-by and its spelling variants, matched case-insensitively).

    Args:
        msg: The commit message; non-string values are converted with str().

    Returns:
        The cleaned message with surrounding whitespace removed, or "" when
        'msg' is missing (None/NaN).
    """
    if pd.isna(msg):
        return ""

    # Ensure we're working with a string before removing merge phrases.
    msg = _MERGE_PR_PATTERN.sub("", str(msg))

    kept_lines = [
        line
        for line in msg.splitlines()
        if not any(marker in line.lower() for marker in _METADATA_MARKERS)
    ]
    return "\n".join(kept_lines).strip()

# Replace every commit message with its cleaned form.
cleaned_messages = df['commit_message'].apply(clean_message)
df['commit_message'] = cleaned_messages

# Write the result with every field quoted and without the index column.
df.to_csv(
    'cleaned_python_commit_dataset.csv',
    quoting=csv.QUOTE_ALL,
    index=False,
)


# 4. Verifying Dataset Integrity

- **Loads Both Datasets:**  
  - Reads the original dataset (`python_commit_dataset.csv`) and the cleaned dataset (`cleaned_python_commit_dataset.csv`) using pandas.
- **Compares Row Counts:**  
  - Prints the row count for each to verify no rows were lost or duplicated during cleaning.



In [None]:
import pandas as pd
import sys
import csv

# Increase CSV field size limit (helps pandas handle large fields).
# sys.maxsize does not fit into a C long on every platform (e.g. 64-bit
# Windows), where the call raises OverflowError; fall back to the largest
# 32-bit value there.
try:
    csv.field_size_limit(sys.maxsize)
except OverflowError:
    csv.field_size_limit(2**31 - 1)

# Load original dataset robustly (using pandas' reliable parsing options).
# NOTE: on_bad_lines='skip' drops unparseable rows from both counts below.
original_df = pd.read_csv(
    'python_commit_dataset.csv',
    engine='python',
    on_bad_lines='skip'
)

# Load modified (cleaned) dataset using pandas
modified_df = pd.read_csv(
    'cleaned_python_commit_dataset.csv',
    engine='python',
    on_bad_lines='skip'
)

# Display row counts clearly
print("Row count comparison (using pandas):")
print(f"- Original dataset: {original_df.shape[0]} rows")
print(f"- Modified dataset: {modified_df.shape[0]} rows")

# Cleaning only rewrites messages, so both files should hold the same number of rows.
if original_df.shape[0] == modified_df.shape[0]:
    print("Row count matches. Cleaning did not alter row count.")
else:
    print("Row count mismatch. Cleaning altered row count!")


# 5. Download the Cleaned Dataset

In [None]:
from google.colab import files

# Download cleaned dataset.
# Uses the google.colab `files` helper imported above, so this cell only
# works when the notebook runs inside Google Colab.
files.download('cleaned_python_commit_dataset.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>