In [None]:
import pandas as pd
import re
import statsmodels.api as sm
from statsmodels.formula.api import ols
import numpy as np

from scipy import stats
from scipy.stats import f_oneway, mannwhitneyu, kruskal, chi2_contingency
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

if "__file__" in globals():
    script_dir = os.path.dirname(os.path.abspath(__file__))
else:
    script_dir = os.getcwd() 

# Path to parent/Data Analysis
data_analysis_path = os.path.abspath(os.path.join(script_dir, "..", "Data Analysis"))
data_analysis_path = os.path.abspath(data_analysis_path)

sys.path.append(data_analysis_path)

import bug_fix_list
import internal_list
import external_list
import functional_list
import code_smell_list

bug_words = bug_fix_list.bug_words
internal_words = internal_list.internal_words
external_words = external_list.external_words
functional_words = functional_list.functional_words
smell_words = code_smell_list.smell_words


Mounted at /content/drive


In [2]:
# How do time-to-first-review and time-to-merge differ between PRs with vs. without SAR patterns?
# -- How do these metrics vary by agent?
# -- How do they compare against non-SAR PRs?
# Compare individual agent times for SAR vs non-SAR

print('Starting script.')
pd.set_option('display.max_rows', None)
pull_request_df = pd.read_parquet("hf://datasets/hao-li/AIDev/pull_request.parquet")
comments_df = pd.read_parquet("hf://datasets/hao-li/AIDev/pr_comments.parquet")
pr_task_type_df = pd.read_parquet("hf://datasets/hao-li/AIDev/pr_task_type.parquet")
print('Finished querying parquets.')
print(len(pull_request_df))
pull_request_df.rename(columns={'id': 'pr_id'}, inplace=True)

Starting script.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Finished querying parquets.
33596


In [3]:
# need body from pr_comments
combined_df = pull_request_df.merge(
    comments_df[['id', 'body']].add_prefix('comment_'),
    how='left',
    left_on='pr_id',
    right_on='comment_id',
    suffixes=('', '_task')
)
combined_df.drop(columns=['comment_id'])

# only look at refactor
combined_df = combined_df.merge(
    pr_task_type_df[['id', 'type']],
    how='left',
    left_on='pr_id',
    right_on='id'
)
combined_df.rename(columns={'type': 'pr_type'}, inplace=True)
combined_df = combined_df.loc[combined_df['pr_type'].str.contains('refactor', na=False)].copy()

print(combined_df['pr_id'].nunique())

print(f"Length of combined_df: {len(combined_df)}")

2288
Length of combined_df: 2288


In [4]:
print('Beginning regex searches.')

def to_regex_pattern(word):
    return re.escape(word).replace(r'\*', '.*')

bug_patterns = re.compile('|'.join(to_regex_pattern(w) for w in bug_words), re.IGNORECASE)
internal_patterns = re.compile('|'.join(to_regex_pattern(w) for w in internal_words), re.IGNORECASE)
external_patterns = re.compile('|'.join(to_regex_pattern(w) for w in external_words), re.IGNORECASE)
functional_patterns = re.compile('|'.join(to_regex_pattern(w) for w in functional_words), re.IGNORECASE)
smell_patterns = re.compile('|'.join(to_regex_pattern(w) for w in smell_words), re.IGNORECASE)

all_patterns = re.compile(
    '|'.join(to_regex_pattern(w) for w in (bug_words + internal_words + external_words + functional_words + smell_words)),
    re.IGNORECASE
)

print('Finished assembling regex patterns.')


Beginning regex searches.
Finished assembling regex patterns.


In [5]:

print('Searching for sar patterns.')
# search for any pattern
combined_df['is_sar'] = (
    combined_df['body'].str.contains(all_patterns, na=False) |
    combined_df['comment_body'].str.contains(all_patterns, na=False) |
    combined_df['title'].str.contains(all_patterns, na=False)
)
print('Finished searching any pattern.')

# search for specific category of pattern (bug, internal, external, functional, smell)
combined_df['bug'] = (
    combined_df['body'].str.contains(bug_patterns, na=False) |
    combined_df['comment_body'].str.contains(bug_patterns, na=False) |
    combined_df['title'].str.contains(bug_patterns, na=False)
)
print('Finished searching bug patterns.')
combined_df['internal'] = (
    combined_df['body'].str.contains(internal_patterns, na=False) |
    combined_df['comment_body'].str.contains(internal_patterns, na=False) |
    combined_df['title'].str.contains(internal_patterns, na=False)
)
print('Finished searching internal patterns.')
combined_df['external'] = (
    combined_df['body'].str.contains(external_patterns, na=False) |
    combined_df['comment_body'].str.contains(external_patterns, na=False) |
    combined_df['title'].str.contains(external_patterns, na=False)
)
print('Finished searching external patterns.')
combined_df['functional'] = (
    combined_df['body'].str.contains(functional_patterns, na=False) |
    combined_df['comment_body'].str.contains(functional_patterns, na=False) |
    combined_df['title'].str.contains(functional_patterns, na=False)
)
print('Finished searching functional patterns.')
combined_df['smell'] = (
    combined_df['body'].str.contains(smell_patterns, na=False) |
    combined_df['comment_body'].str.contains(smell_patterns, na=False) |
    combined_df['title'].str.contains(smell_patterns, na=False)
)
print('Finished searching smell patterns.')

# search any category pattern in specific locations in the PR
combined_df['sar_in_pr_title'] = combined_df['title'].str.contains(all_patterns, na=False)
print('Finished searching for sar in title.')
combined_df['sar_in_pr_body'] = combined_df['body'].str.contains(all_patterns, na=False)
print('Finished searching for sar in body.')
combined_df['sar_in_pr_comment'] = combined_df['comment_body'].str.contains(all_patterns, na=False)
print('Finished searching for sar in comment.')

# add column for merge time in DAYS instead of seconds
combined_df['merge_time'] = (pd.to_datetime(combined_df['merged_at']) - pd.to_datetime(combined_df['created_at'])).dt.total_seconds() / 86400

Searching for sar patterns.
Finished searching any pattern.
Finished searching bug patterns.
Finished searching internal patterns.
Finished searching external patterns.
Finished searching functional patterns.
Finished searching smell patterns.
Finished searching for sar in title.
Finished searching for sar in body.
Finished searching for sar in comment.


In [6]:

unique_prs = combined_df.drop_duplicates(subset=['pr_id'])

total_requests = (
    unique_prs
    .groupby(['agent', 'is_sar'])
    .size()
    # ** issue with my python LSP, not a problem at runtime **
    .reset_index(name='total_requests') # type: ignore
)

agents = unique_prs['agent'].unique()
sar_flags = [True, False]
all_groups = pd.MultiIndex.from_product([agents, sar_flags], names=['agent', 'is_sar'])

total_merged = (
    unique_prs[unique_prs['merged_at'].notna()]
    .groupby(['agent', 'is_sar'])['merge_time']
    .size()
    .reindex(all_groups, fill_value=0)
    .reset_index(name='total_merged') # type: ignore
)

# find average of both sar and non_sar, and drop unmerged PRs
average_merged = (
    unique_prs[unique_prs['merged_at'].notna()]
    .groupby(['agent', 'is_sar'])['merge_time']
    .mean()
    .round(2)
    .reindex(all_groups, fill_value='n/a')
    .reset_index(name='average_merge_time(days)') # type: ignore
)

sar_categories = (
    unique_prs
    .groupby(['agent', 'is_sar'])
    [['bug', 'internal', 'external', 'functional', 'smell']]
    .sum()
    .reset_index()
)

sar_locations = (
    unique_prs
    .groupby(['agent', 'is_sar'])
    [['sar_in_pr_title', 'sar_in_pr_body', 'sar_in_pr_comment']]
    .sum()
    .reset_index()
)


summary = (
    total_requests
    .merge(total_merged, on=['agent', 'is_sar'], how='left')
    .merge(average_merged, on=['agent', 'is_sar'], how='left')
    .merge(sar_categories, on=['agent', 'is_sar'], how='left')
    .merge(sar_locations, on=['agent', 'is_sar'], how='left')
)
summary['merge_rate(%)'] = ((summary['total_merged'] / summary['total_requests']).round(2) * 100).astype(int)

In [7]:
combined_df.head()

Unnamed: 0,pr_id,number,title,body,agent,user_id,user,state,created_at,closed_at,...,is_sar,bug,internal,external,functional,smell,sar_in_pr_title,sar_in_pr_body,sar_in_pr_comment,merge_time
1,3265118634,2,ファイルパス参照を相対パスに統一し、doc/からdocs/に統一,## 背景\n\n現在、本プロジェクトにおいて以下のパス構成の不整合が生じています：\n\n...,Claude_Code,61827001,cm-kojimat,closed,2025-07-26T04:56:55Z,2025-07-26T22:12:24Z,...,False,False,False,False,False,False,False,False,False,0.719086
20,3214782537,1538,Major Architecture Refactor - Configuration Sy...,### **User description**\r\nResolves #1529 \r\...,Claude_Code,1206,delano,closed,2025-07-09T07:05:44Z,2025-07-17T18:34:41Z,...,False,False,False,False,False,False,False,False,False,8.478438
30,3165023078,100,Convert to typescript,All JavaScript source files have been successf...,Claude_Code,14286938,jefago,closed,2025-06-21T11:39:26Z,2025-06-23T14:18:25Z,...,False,False,False,False,False,False,False,False,False,2.110405
57,3222101465,2520,♻️ Refactor database schema design workflow to...,## Issue\r\n\r\n- resolve: #2504\r\n\r\n## Why...,Claude_Code,31152321,MH4GF,closed,2025-07-11T08:45:31Z,2025-07-14T03:57:14Z,...,False,False,False,False,False,False,False,False,False,2.799792
80,3262310380,713,refactor: rename chain_dag to chain_let for se...,## Summary\n\nRenames chain_dag to chain_let t...,Claude_Code,4249447,lmeyerov,closed,2025-07-25T07:41:02Z,2025-07-28T07:42:25Z,...,False,False,False,False,False,False,False,False,False,


In [8]:
combined_df.columns

Index(['pr_id', 'number', 'title', 'body', 'agent', 'user_id', 'user', 'state',
       'created_at', 'closed_at', 'merged_at', 'repo_id', 'repo_url',
       'html_url', 'comment_id', 'comment_body', 'id', 'pr_type', 'is_sar',
       'bug', 'internal', 'external', 'functional', 'smell', 'sar_in_pr_title',
       'sar_in_pr_body', 'sar_in_pr_comment', 'merge_time'],
      dtype='object')

In [9]:
summary

Unnamed: 0,agent,is_sar,total_requests,total_merged,average_merge_time(days),bug,internal,external,functional,smell,sar_in_pr_title,sar_in_pr_body,sar_in_pr_comment,merge_rate(%)
0,Claude_Code,False,23,13,1.87,0,0,0,0,0,0,0,0,56
1,Claude_Code,True,3,0,,2,0,1,0,0,0,3,0,0
2,Copilot,False,207,102,2.96,0,0,0,0,0,0,0,0,49
3,Copilot,True,94,42,4.43,74,19,22,3,1,12,93,0,45
4,Cursor,False,94,63,1.44,0,0,0,0,0,0,0,0,67
5,Cursor,True,17,9,0.91,13,1,4,0,0,3,15,0,53
6,Devin,False,346,208,0.98,0,0,0,0,0,0,0,0,60
7,Devin,True,91,40,0.91,71,12,10,1,4,9,90,0,44
8,OpenAI_Codex,False,1301,1036,0.45,0,0,0,0,0,0,0,0,80
9,OpenAI_Codex,True,112,91,0.24,79,17,5,15,1,19,103,0,81


# Testing

RQ4: How do SAR patterns relate to PR acceptance outcomes?
- RQ4:1. What is the acceptance rate for PRs with vs. without
SAR patterns?
- RQ4:2. How do acceptance rates vary by agent?
- RQ4:3. Do specific SAR intent categories show different accep-
tance patterns?


# Acceptance rate for PRs with vs without SAR patterns


In [10]:
prs_with_sar_patterns = combined_df[combined_df['is_sar'] == True]
prs_without_sar_patterns = combined_df[combined_df['is_sar'] == False]

print(prs_with_sar_patterns.size)
print(prs_without_sar_patterns.size)

8876
55188


In [11]:
prs_with_merged = prs_with_sar_patterns[prs_with_sar_patterns["merged_at"].notna()]
prs_without_merged = prs_without_sar_patterns[prs_without_sar_patterns["merged_at"].notna()]

# Build the 2×2 contingency table:
# rows: [with_sar, without_sar]
# cols: [merged, not_merged]
merged_with = prs_with_merged.shape[0]
merged_without = prs_without_merged.shape[0]

not_merged_with = prs_with_sar_patterns.shape[0] - merged_with
not_merged_without = prs_without_sar_patterns.shape[0] - merged_without

table = np.array([
    [merged_with, not_merged_with],
    [merged_without, not_merged_without]
])

chi2, p, dof, expected = chi2_contingency(table)

# effect size
N = table.sum()
phi = np.sqrt(chi2 / N)

signed_deviation = table - expected
residuals = signed_deviation / np.sqrt(expected)

print("Contingency table:\n", table)
print("Chi-square:", chi2)
print("p-value:", p)
print("Degrees of freedom:", dof)
print("Expected counts:\n", expected)
print("Signed deviations (O - E):\n", signed_deviation)
print("Standardized residuals:\n", residuals)
print("Phi effect size:", phi)

Contingency table:
 [[ 182  135]
 [1422  549]]
Chi-square: 27.583785980082986
p-value: 1.504345299433526e-07
Degrees of freedom: 1
Expected counts:
 [[ 222.23251748   94.76748252]
 [1381.76748252  589.23251748]]
Signed deviations (O - E):
 [[-40.23251748  40.23251748]
 [ 40.23251748 -40.23251748]]
Standardized residuals:
 [[-2.6988168   4.13282996]
 [ 1.08233007 -1.65742489]]
Phi effect size: 0.10979913713581968


As the p value < 0.05 (p = 1.504345299433526e-07), there is a very significant difference between the two groups

# How do acceptance rates vary by agent?

In [12]:
 # chi square test of indepdndence per agent x merged/not merged
for agent in summary['agent'].unique():
  agent_prs_with_sar_patterns = prs_with_sar_patterns[prs_with_sar_patterns['agent'] == agent]
  agent_prs_without_sar_patterns = prs_without_sar_patterns[prs_without_sar_patterns['agent'] == agent]

  agent_prs_with_merged = agent_prs_with_sar_patterns[agent_prs_with_sar_patterns["merged_at"].notna()]
  agent_prs_without_merged = agent_prs_without_sar_patterns[agent_prs_without_sar_patterns["merged_at"].notna()]

  # Build the 2×2 contingency table:
  # rows: [with_sar, without_sar]
  # cols: [merged, not_merged]
  agent_merged_with = agent_prs_with_merged.shape[0]
  agent_merged_without = agent_prs_without_merged.shape[0]

  agent_not_merged_with = agent_prs_with_sar_patterns.shape[0] - agent_merged_with
  agent_not_merged_without = agent_prs_without_sar_patterns.shape[0] - agent_merged_without

  table = np.array([
      [agent_merged_with, agent_not_merged_with],
      [agent_merged_without, agent_not_merged_without]
  ])

  chi2, p, dof, expected = chi2_contingency(table)

  # effect size
  N = table.sum()
  phi = np.sqrt(chi2 / N)

  signed_deviation = table - expected
  residuals = signed_deviation / np.sqrt(expected)

  print(f"Agent: {agent}")
  print("Chi-square:", chi2)
  print("p-value:", p)
  print("Degrees of freedom:", dof)
  print("Expected counts:\n", expected)
  print("Phi effect size:", phi)
  print("Signed deviations (O - E):\n", signed_deviation)
  print("Standardized residuals:\n", residuals)
  print("\n")

Agent: Claude_Code
Chi-square: 1.507246376811594
p-value: 0.2195597461121137
Degrees of freedom: 1
Expected counts:
 [[ 1.5  1.5]
 [11.5 11.5]]
Phi effect size: 0.2407717061715384
Signed deviations (O - E):
 [[-1.5  1.5]
 [ 1.5 -1.5]]
Standardized residuals:
 [[-1.22474487  1.22474487]
 [ 0.44232587 -0.44232587]]


Agent: Copilot
Chi-square: 0.3782405599068448
p-value: 0.5385463662179483
Degrees of freedom: 1
Expected counts:
 [[ 44.97009967  49.02990033]
 [ 99.02990033 107.97009967]]
Phi effect size: 0.03544873983425495
Signed deviations (O - E):
 [[-2.97009967  2.97009967]
 [ 2.97009967 -2.97009967]]
Standardized residuals:
 [[-0.44290349  0.42417056]
 [ 0.29846118 -0.28583754]]


Agent: Cursor
Chi-square: 0.7107035838066811
p-value: 0.39921033804185546
Degrees of freedom: 1
Expected counts:
 [[11.02702703  5.97297297]
 [60.97297297 33.02702703]]
Phi effect size: 0.08001709185698627
Signed deviations (O - E):
 [[-2.02702703  2.02702703]
 [ 2.02702703 -2.02702703]]
Standardized residu

# Do specific SAR intent categories have different acceptance patterns?

In [13]:
# run for each acceptance pattern
for pattern in ['bug', 'internal', 'external', 'functional', 'smell']:
  pattern_prs_with_sar_patterns = prs_with_sar_patterns[prs_with_sar_patterns[pattern]]
  pattern_prs_without_sar_patterns = prs_without_sar_patterns[~prs_without_sar_patterns[pattern]]

  pattern_prs_with_merged = pattern_prs_with_sar_patterns[pattern_prs_with_sar_patterns["merged_at"].notna()]
  pattern_prs_without_merged = pattern_prs_without_sar_patterns[pattern_prs_without_sar_patterns["merged_at"].notna()]

  # Build the 2×2 contingency table:
  # rows: [with_sar, without_sar]
  # cols: [merged, not_merged]
  pattern_merged_with = pattern_prs_with_merged.shape[0]
  pattern_merged_without = pattern_prs_without_merged.shape[0]

  pattern_not_merged_with = pattern_prs_with_sar_patterns.shape[0] - pattern_merged_with
  pattern_not_merged_without = pattern_prs_without_sar_patterns.shape[0] - pattern_merged_without

  table = np.array([
      [pattern_merged_with, pattern_not_merged_with],
      [pattern_merged_without, pattern_not_merged_without]
  ])

  chi2, p, dof, expected = chi2_contingency(table)
  # effect size
  N = table.sum()
  phi = np.sqrt(chi2 / N)

  signed_deviation = table - expected
  residuals = signed_deviation / np.sqrt(expected)

  print(f"Pattern: {pattern}")
  print("Chi-square:", chi2)
  print("p-value:", p)
  print("Degrees of freedom:", dof)
  print("Expected counts:\n", expected)
  print("Phi effect size:", phi)
  print("Signed deviations (O - E):\n", signed_deviation)
  print("Standardized residuals:\n", residuals)
  print("\n")

Pattern: bug
Chi-square: 19.435497745425614
p-value: 1.0405493705205562e-05
Degrees of freedom: 1
Expected counts:
 [[ 168.81402715   70.18597285]
 [1392.18597285  578.81402715]]
Phi effect size: 0.09377815775132986
Signed deviations (O - E):
 [[-29.81402715  29.81402715]
 [ 29.81402715 -29.81402715]]
Standardized residuals:
 [[-2.2946496   3.55873361]
 [ 0.79904643 -1.23922772]]


Pattern: internal
Chi-square: 13.707194768349442
p-value: 0.00021363449085801104
Degrees of freedom: 1
Expected counts:
 [[  35.0519802   13.9480198]
 [1409.9480198  561.0519802]]
Phi effect size: 0.08237560308933348
Signed deviations (O - E):
 [[-12.0519802  12.0519802]
 [ 12.0519802 -12.0519802]]
Standardized residuals:
 [[-2.03564541  3.22702353]
 [ 0.3209644  -0.50881143]]


Pattern: external
Chi-square: 40.13225409581345
p-value: 2.3733773877014543e-10
Degrees of freedom: 1
Expected counts:
 [[  29.89865872   12.10134128]
 [1403.10134128  567.89865872]]
Phi effect size: 0.14119681136941775
Signed deviat

In [14]:
sample_df = combined_df.sample(n=330)
sample_df.iloc[0]['body']

'This PR removes all arc4/rc4 references as specified in the attachment file, based on https://github.com/anhu/wolfssl/pull/new/rm_arc4.\n\nLink to Devin run: https://app.devin.ai/sessions/fc59bb680e2f43f5b4c568578da17a0b\nRequested by: Anthony H (anthony@wolfssl.com)'