In [1]:
import pandas as pd
import re
from scipy.stats import fisher_exact
from scipy.stats.contingency import odds_ratio
from bug_fix_list import bug_words
from internal_list import internal_words
from external_list import external_words
from functional_list import functional_words
from code_smell_list import smell_words

print('starting script.')
pd.set_option('display.max_rows', None)

pull_request_df = pd.read_parquet("hf://datasets/hao-li/AIDev/pull_request.parquet")
comments_df = pd.read_parquet("hf://datasets/hao-li/AIDev/pr_comments.parquet")
pr_task_type_df = pd.read_parquet("hf://datasets/hao-li/AIDev/pr_task_type.parquet")
print('finished querying parquets.')

pull_request_df.rename(columns={'id': 'pr_id'}, inplace=True)

# need comment bodies
combined_df = pull_request_df.merge(
    comments_df[['id', 'body']].add_prefix('comment_'),
    how='left',
    left_on='pr_id',
    right_on='comment_id'
)

# only keep refactor prs
combined_df = combined_df.merge(
    pr_task_type_df[['id', 'type']],
    how='left',
    left_on='pr_id',
    right_on='id'
)
combined_df.rename(columns={'type': 'pr_type'}, inplace=True)
combined_df = combined_df.loc[combined_df['pr_type'].str.contains('refactor', na=False)].copy()

print(combined_df['pr_id'].nunique())
print(f"length of combined_df: {len(combined_df)}")

print('building regex.')

def to_regex_pattern(word):
    return re.escape(word).replace(r'\*', '.*')

all_patterns = re.compile(
    '|'.join(to_regex_pattern(w) for w in (
        bug_words + internal_words + external_words + functional_words + smell_words
    )),
    re.IGNORECASE
)

print('searching sar patterns.')

# check title/body for sar
combined_df['sar_in_pr_title'] = combined_df['title'].str.contains(all_patterns, na=False)
combined_df['sar_in_pr_body'] = combined_df['body'].str.contains(all_patterns, na=False)

# drop dupes so 1 row per pr
unique_prs = combined_df.drop_duplicates(subset=['pr_id'])

# count combos
title_only = unique_prs[(unique_prs['sar_in_pr_title']) & (~unique_prs['sar_in_pr_body'])]
body_only = unique_prs[(unique_prs['sar_in_pr_body']) & (~unique_prs['sar_in_pr_title'])]
both = unique_prs[(unique_prs['sar_in_pr_title']) & (unique_prs['sar_in_pr_body'])]
neither = unique_prs[(~unique_prs['sar_in_pr_title']) & (~unique_prs['sar_in_pr_body'])]

print("----- sar pattern location summary -----")
print(f"title only:  {len(title_only)}")
print(f"body only:   {len(body_only)}")
print(f"both:        {len(both)}")
print(f"neither:     {len(neither)}")
print("----------------------------------------")

# statistical analysis with fisher's exact test
table = [
    [len(both), len(title_only)],
    [len(body_only), len(neither)]
]

odds_ratio, p_value = fisher_exact(table)

print("\n----- fisher exact test (title vs body) -----")
print(f"2x2 table: {table}")
print(f"odds ratio: {odds_ratio}")
print(f"p-value: {p_value}")
print("---------------------------------------------")

starting script.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


finished querying parquets.
2288
length of combined_df: 2288
building regex.
searching sar patterns.
----- sar pattern location summary -----
title only:  13
body only:   274
both:        30
neither:     1971
----------------------------------------

----- fisher exact test (title vs body) -----
2x2 table: [[30, 13], [274, 1971]]
odds ratio: 16.600224592925322
p-value: 9.989071974289105e-18
---------------------------------------------
