In [None]:
import pandas as pd
import re
import statsmodels.api as sm
from statsmodels.formula.api import ols
import numpy as np

from scipy import stats
from scipy.stats import f_oneway, mannwhitneyu, kruskal
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

if "__file__" in globals():
    script_dir = os.path.dirname(os.path.abspath(__file__))
else:
    script_dir = os.getcwd() 

# Path to parent/Data Analysis
data_analysis_path = os.path.abspath(os.path.join(script_dir, "..", "Data Analysis"))
data_analysis_path = os.path.abspath(data_analysis_path)

sys.path.append(data_analysis_path)

import bug_fix_list
import internal_list
import external_list
import functional_list
import code_smell_list

bug_words = bug_fix_list.bug_words
internal_words = internal_list.internal_words
external_words = external_list.external_words
functional_words = functional_list.functional_words
smell_words = code_smell_list.smell_words


Mounted at /content/drive


In [2]:
# How do time-to-first-review and time-to-merge differ between PRs with vs. without SAR patterns?
# -- How do these metrics vary by agent?
# -- How do they compare against non-SAR PRs?
# Compare individual agent times for SAR vs non-SAR

print('Starting script.')
pd.set_option('display.max_rows', None)
pull_request_df = pd.read_parquet("hf://datasets/hao-li/AIDev/pull_request.parquet")
comments_df = pd.read_parquet("hf://datasets/hao-li/AIDev/pr_comments.parquet")
pr_task_type_df = pd.read_parquet("hf://datasets/hao-li/AIDev/pr_task_type.parquet")
print('Finished querying parquets.')
print(len(pull_request_df))
pull_request_df.rename(columns={'id': 'pr_id'}, inplace=True)

Starting script.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Finished querying parquets.
33596


In [3]:
# need body from pr_comments
combined_df = pull_request_df.merge(
    comments_df[['id', 'body']].add_prefix('comment_'),
    how='left',
    left_on='pr_id',
    right_on='comment_id',
    suffixes=('', '_task')
)
combined_df.drop(columns=['comment_id'])

# only look at refactor
combined_df = combined_df.merge(
    pr_task_type_df[['id', 'type']],
    how='left',
    left_on='pr_id',
    right_on='id'
)
combined_df.rename(columns={'type': 'pr_type'}, inplace=True)
combined_df = combined_df.loc[combined_df['pr_type'].str.contains('refactor', na=False)].copy()

print(combined_df['pr_id'].nunique())

print(f"Length of combined_df: {len(combined_df)}")

2288
Length of combined_df: 2288


In [4]:
print('Beginning regex searches.')

def to_regex_pattern(word):
    return re.escape(word).replace(r'\*', '.*')

bug_patterns = re.compile('|'.join(to_regex_pattern(w) for w in bug_words), re.IGNORECASE)
internal_patterns = re.compile('|'.join(to_regex_pattern(w) for w in internal_words), re.IGNORECASE)
external_patterns = re.compile('|'.join(to_regex_pattern(w) for w in external_words), re.IGNORECASE)
functional_patterns = re.compile('|'.join(to_regex_pattern(w) for w in functional_words), re.IGNORECASE)
smell_patterns = re.compile('|'.join(to_regex_pattern(w) for w in smell_words), re.IGNORECASE)

all_patterns = re.compile(
    '|'.join(to_regex_pattern(w) for w in (bug_words + internal_words + external_words + functional_words + smell_words)),
    re.IGNORECASE
)

print('Finished assembling regex patterns.')


Beginning regex searches.
Finished assembling regex patterns.


In [5]:

print('Searching for sar patterns.')
# search for any pattern
combined_df['is_sar'] = (
    combined_df['body'].str.contains(all_patterns, na=False) |
    combined_df['comment_body'].str.contains(all_patterns, na=False) |
    combined_df['title'].str.contains(all_patterns, na=False)
)
print('Finished searching any pattern.')

# search for specific category of pattern (bug, internal, external, functional, smell)
combined_df['bug'] = (
    combined_df['body'].str.contains(bug_patterns, na=False) |
    combined_df['comment_body'].str.contains(bug_patterns, na=False) |
    combined_df['title'].str.contains(bug_patterns, na=False)
)
print('Finished searching bug patterns.')
combined_df['internal'] = (
    combined_df['body'].str.contains(internal_patterns, na=False) |
    combined_df['comment_body'].str.contains(internal_patterns, na=False) |
    combined_df['title'].str.contains(internal_patterns, na=False)
)
print('Finished searching internal patterns.')
combined_df['external'] = (
    combined_df['body'].str.contains(external_patterns, na=False) |
    combined_df['comment_body'].str.contains(external_patterns, na=False) |
    combined_df['title'].str.contains(external_patterns, na=False)
)
print('Finished searching external patterns.')
combined_df['functional'] = (
    combined_df['body'].str.contains(functional_patterns, na=False) |
    combined_df['comment_body'].str.contains(functional_patterns, na=False) |
    combined_df['title'].str.contains(functional_patterns, na=False)
)
print('Finished searching functional patterns.')
combined_df['smell'] = (
    combined_df['body'].str.contains(smell_patterns, na=False) |
    combined_df['comment_body'].str.contains(smell_patterns, na=False) |
    combined_df['title'].str.contains(smell_patterns, na=False)
)
print('Finished searching smell patterns.')

# search any category pattern in specific locations in the PR
combined_df['sar_in_pr_title'] = combined_df['title'].str.contains(all_patterns, na=False)
print('Finished searching for sar in title.')
combined_df['sar_in_pr_body'] = combined_df['body'].str.contains(all_patterns, na=False)
print('Finished searching for sar in body.')
combined_df['sar_in_pr_comment'] = combined_df['comment_body'].str.contains(all_patterns, na=False)
print('Finished searching for sar in comment.')

# add column for merge time in DAYS instead of seconds
combined_df['merge_time'] = (pd.to_datetime(combined_df['merged_at']) - pd.to_datetime(combined_df['created_at'])).dt.total_seconds() / 86400

Searching for sar patterns.
Finished searching any pattern.
Finished searching bug patterns.
Finished searching internal patterns.
Finished searching external patterns.
Finished searching functional patterns.
Finished searching smell patterns.
Finished searching for sar in title.
Finished searching for sar in body.
Finished searching for sar in comment.


In [6]:

unique_prs = combined_df.drop_duplicates(subset=['pr_id'])

total_requests = (
    unique_prs
    .groupby(['agent', 'is_sar'])
    .size()
    # ** issue with my python LSP, not a problem at runtime **
    .reset_index(name='total_requests') # type: ignore
)

agents = unique_prs['agent'].unique()
sar_flags = [True, False]
all_groups = pd.MultiIndex.from_product([agents, sar_flags], names=['agent', 'is_sar'])

total_merged = (
    unique_prs[unique_prs['merged_at'].notna()]
    .groupby(['agent', 'is_sar'])['merge_time']
    .size()
    .reindex(all_groups, fill_value=0)
    .reset_index(name='total_merged') # type: ignore
)

# find average of both sar and non_sar, and drop unmerged PRs
average_merged = (
    unique_prs[unique_prs['merged_at'].notna()]
    .groupby(['agent', 'is_sar'])['merge_time']
    .mean()
    .round(2)
    .reindex(all_groups, fill_value='n/a')
    .reset_index(name='average_merge_time(days)') # type: ignore
)

sar_categories = (
    unique_prs
    .groupby(['agent', 'is_sar'])
    [['bug', 'internal', 'external', 'functional', 'smell']]
    .sum()
    .reset_index()
)

sar_locations = (
    unique_prs
    .groupby(['agent', 'is_sar'])
    [['sar_in_pr_title', 'sar_in_pr_body', 'sar_in_pr_comment']]
    .sum()
    .reset_index()
)


summary = (
    total_requests
    .merge(total_merged, on=['agent', 'is_sar'], how='left')
    .merge(average_merged, on=['agent', 'is_sar'], how='left')
    .merge(sar_categories, on=['agent', 'is_sar'], how='left')
    .merge(sar_locations, on=['agent', 'is_sar'], how='left')
)
summary['merge_rate(%)'] = ((summary['total_merged'] / summary['total_requests']).round(2) * 100).astype(int)

In [7]:
combined_df

Unnamed: 0,pr_id,number,title,body,agent,user_id,user,state,created_at,closed_at,...,is_sar,bug,internal,external,functional,smell,sar_in_pr_title,sar_in_pr_body,sar_in_pr_comment,merge_time
1,3265118634,2,ファイルパス参照を相対パスに統一し、doc/からdocs/に統一,## 背景\n\n現在、本プロジェクトにおいて以下のパス構成の不整合が生じています：\n\n...,Claude_Code,61827001,cm-kojimat,closed,2025-07-26T04:56:55Z,2025-07-26T22:12:24Z,...,False,False,False,False,False,False,False,False,False,0.719086
20,3214782537,1538,Major Architecture Refactor - Configuration Sy...,### **User description**\r\nResolves #1529 \r\...,Claude_Code,1206,delano,closed,2025-07-09T07:05:44Z,2025-07-17T18:34:41Z,...,False,False,False,False,False,False,False,False,False,8.478438
30,3165023078,100,Convert to typescript,All JavaScript source files have been successf...,Claude_Code,14286938,jefago,closed,2025-06-21T11:39:26Z,2025-06-23T14:18:25Z,...,False,False,False,False,False,False,False,False,False,2.110405
57,3222101465,2520,♻️ Refactor database schema design workflow to...,## Issue\r\n\r\n- resolve: #2504\r\n\r\n## Why...,Claude_Code,31152321,MH4GF,closed,2025-07-11T08:45:31Z,2025-07-14T03:57:14Z,...,False,False,False,False,False,False,False,False,False,2.799792
80,3262310380,713,refactor: rename chain_dag to chain_let for se...,## Summary\n\nRenames chain_dag to chain_let t...,Claude_Code,4249447,lmeyerov,closed,2025-07-25T07:41:02Z,2025-07-28T07:42:25Z,...,False,False,False,False,False,False,False,False,False,
97,3165677582,5,Refactor: Transform to GitHub Issue Management...,Co-authored-by: Claude <noreply@anthropic.com>,Claude_Code,883228,nakamasato,closed,2025-06-22T05:31:32Z,2025-06-22T05:31:45Z,...,False,False,False,False,False,False,False,False,False,
111,3250200794,394,Refactor matrix functions for fixed dimension ...,Removes the `axis` parameter from matrix funct...,Claude_Code,5635139,max-sixty,closed,2025-07-21T22:25:43Z,2025-07-21T22:29:10Z,...,False,False,False,False,False,False,False,False,False,0.002396
124,3114327280,639,Rename ResourceReference to ResourceTemplateRe...,This change renames the ResourceReference inte...,Claude_Code,167242713,dsp-ant,closed,2025-06-03T14:53:41Z,2025-06-05T11:53:04Z,...,False,False,False,False,False,False,False,False,False,1.874572
137,3245746342,390,Migrate code generator to Scala 3,## Summary\nMigrated the DNP3 code generator f...,Claude_Code,305813,jadamcrain,closed,2025-07-19T22:54:38Z,2025-07-19T23:09:43Z,...,False,False,False,False,False,False,False,False,False,0.010475
160,3132983801,341,"migrate m_evn/m_odd to k{Even,Odd}Parity",Complete migration from m_evn/m_odd to descrip...,Claude_Code,130992531,jons-pf,closed,2025-06-10T10:14:35Z,2025-06-10T17:26:23Z,...,False,False,False,False,False,False,False,False,False,0.299861


In [8]:
summary

Unnamed: 0,agent,is_sar,total_requests,total_merged,average_merge_time(days),bug,internal,external,functional,smell,sar_in_pr_title,sar_in_pr_body,sar_in_pr_comment,merge_rate(%)
0,Claude_Code,False,23,13,1.87,0,0,0,0,0,0,0,0,56
1,Claude_Code,True,3,0,,2,0,1,0,0,0,3,0,0
2,Copilot,False,207,102,2.96,0,0,0,0,0,0,0,0,49
3,Copilot,True,94,42,4.43,74,19,22,3,1,12,93,0,45
4,Cursor,False,94,63,1.44,0,0,0,0,0,0,0,0,67
5,Cursor,True,17,9,0.91,13,1,4,0,0,3,15,0,53
6,Devin,False,346,208,0.98,0,0,0,0,0,0,0,0,60
7,Devin,True,91,40,0.91,71,12,10,1,4,9,90,0,44
8,OpenAI_Codex,False,1301,1036,0.45,0,0,0,0,0,0,0,0,80
9,OpenAI_Codex,True,112,91,0.24,79,17,5,15,1,19,103,0,81


# Testing


## Two-Sample test of SAR and non-SAR PRs

In [9]:
rq_testing = unique_prs[unique_prs['merged_at'].notna()][['agent', 'is_sar', 'merge_time']].copy()

# Check normality assumption to see if can use ANOVA test
# Shapiro-Wilk test
for agent in rq_testing['agent'].unique():
    for sar in [True, False]:
        data = rq_testing[(rq_testing['agent'] == agent) &
                          (rq_testing['is_sar'] == sar)]['merge_time']
        if len(data) >= 3:  # Need at least 3 samples
            stat, p = stats.shapiro(data)
            print(f"  {agent} (SAR={sar}): W={stat:.4f}, p={p:.4f}")

# Levene's test
print("\nLevene's test for homogeneity of variance:")
groups = [group['merge_time'].values for name, group in
          rq_testing.groupby(['agent', 'is_sar'])]
stat, p = stats.levene(*groups)
print(f"  Statistic={stat:.4f}, p={p:.4f}")

  Claude_Code (SAR=False): W=0.7677, p=0.0029
  Copilot (SAR=True): W=0.5173, p=0.0000
  Copilot (SAR=False): W=0.4360, p=0.0000
  OpenAI_Codex (SAR=True): W=0.2964, p=0.0000
  OpenAI_Codex (SAR=False): W=0.1369, p=0.0000
  Cursor (SAR=True): W=0.6445, p=0.0003
  Cursor (SAR=False): W=0.4519, p=0.0000
  Devin (SAR=True): W=0.3941, p=0.0000
  Devin (SAR=False): W=0.5754, p=0.0000

Levene's test for homogeneity of variance:
  Statistic=11.4696, p=0.0000


# Normality Assumptions:
- All Shapiro-Wilk p-values are less than 0.05 and Levene's p is less than 0.05, so normality and homogeneity assumptions are violated!!!
Use Kruskal-Wallis test instead

In [10]:
rq_testing['agent_sar'] = rq_testing['agent'] + '_' + rq_testing['is_sar'].astype(str)
groups_kw = [group['merge_time'].values for name, group in
             rq_testing.groupby('agent_sar')]
stat, p = kruskal(*groups_kw)
print(f"Kruskal-Wallis H-statistic: {stat:.4f}, p-value: {p:.4f}")

Kruskal-Wallis H-statistic: 558.7417, p-value: 0.0000


Kruskal-Wallis H: 558.7417 and p < 0.05 means there is significant differences across the agent-SAR combos

In [11]:
agent_results = []

for agent in rq_testing['agent'].unique():
    print(f"\n{agent}")
    print("-" * 60)

    agent_data = rq_testing[rq_testing['agent'] == agent]
    sar_data = agent_data[agent_data['is_sar'] == True]['merge_time']
    non_sar_data = agent_data[agent_data['is_sar'] == False]['merge_time']

    n_sar = len(sar_data)
    n_non_sar = len(non_sar_data)

    print(f"  SAR PRs: n={n_sar}, mean={sar_data.mean():.4f}, median={sar_data.median():.4f}")
    print(f"  Non-SAR PRs: n={n_non_sar}, mean={non_sar_data.mean():.4f}, median={non_sar_data.median():.4f}")

    if n_sar >= 3 and n_non_sar >= 3:
        # Use Mann-Whitney U (non-parametric)
        stat, p_val = mannwhitneyu(sar_data, non_sar_data, alternative='two-sided')
        test_used = "Mann-Whitney U test"

        # Calculate effect size (Cohen's d)
        pooled_std = np.sqrt(((n_sar-1)*sar_data.std()**2 +
                              (n_non_sar-1)*non_sar_data.std()**2) /
                             (n_sar + n_non_sar - 2))
        cohens_d = (sar_data.mean() - non_sar_data.mean()) / pooled_std

        print(f"  statistic={stat:.4f}, p-value={p_val:.4f}")
        print(f"  Cohen's d (effect size): {cohens_d:.4f}")

        agent_results.append({
            'agent': agent,
            'n_sar': n_sar,
            'n_non_sar': n_non_sar,
            'mean_sar': sar_data.mean(),
            'mean_non_sar': non_sar_data.mean(),
            'median_sar': sar_data.median(),
            'median_non_sar': non_sar_data.median(),
            'test': test_used,
            'p_value': p_val,
            'cohens_d': cohens_d,
            'significant': p_val < 0.05
        })
    else:
        print(f"  Insufficient data for statistical testing (need n>=3 for both groups)")

results_df = pd.DataFrame(agent_results)
if len(results_df) > 0:
    print(results_df.to_string(index=False))


Claude_Code
------------------------------------------------------------
  SAR PRs: n=0, mean=nan, median=nan
  Non-SAR PRs: n=13, mean=1.8745, median=0.7191
  Insufficient data for statistical testing (need n>=3 for both groups)

Copilot
------------------------------------------------------------
  SAR PRs: n=42, mean=4.4269, median=0.4706
  Non-SAR PRs: n=102, mean=2.9632, median=0.7525
  statistic=2016.0000, p-value=0.5812
  Cohen's d (effect size): 0.1879

OpenAI_Codex
------------------------------------------------------------
  SAR PRs: n=91, mean=0.2449, median=0.0023
  Non-SAR PRs: n=1036, mean=0.4508, median=0.0009
  statistic=55384.0000, p-value=0.0056
  Cohen's d (effect size): -0.0728

Cursor
------------------------------------------------------------
  SAR PRs: n=9, mean=0.9149, median=0.2703
  Non-SAR PRs: n=63, mean=1.4429, median=0.0855
  statistic=294.0000, p-value=0.8648
  Cohen's d (effect size): -0.1567

Devin
----------------------------------------------------

- Only OpenAI-Codex had significant results, as the p-value was below 0.05
  - SAR PRs merge faster than non-SAR PRs
  - The mean for a SAR is 0.24 days while the mean for a non-SAR was 0.45 days
  - Median for SAR: 0.002 days (which is about 3 minutes), median of non-SAR: 0.001 days (about 1.4 minutes)
  - SAR patterns have slightly faster merges
  - Cohen's D = 0.07 which means it is a very small effect though

- Other agents did not have significant results (P > 0.05), but Copilot's SAR PRs tended to be slower. Claude also couldn't be tested as there were no merged SAR PRs in the dataset. All PRs that had SAR patterns were not merged, so no merge time

# Threats to Validity
- Failed normality: The medians are much lower, meaning data is right-skewed with outliers
- Very large sample size imbalance for all agents
- All Cohen's d values are very small, suggesting that any differences are not very meaningful :(


# Results
- Time-to-merge does not differ meaningfully between PRs with and without SAR patterns overall (p = 1.000).
- Significant variation exists across agents (p < 0.001), but the agent-SAR interaction is not significant (p = 0.221).
- Only OpenAI_Codex shows a statistically significant difference (p = 0.006), where SAR PRs merge slightly faster than non-SAR PRs (median: 0.002 vs 0.001 days). However, this difference is not practically meaningful (Cohen's d = -0.07).