RQ1: Is there an association between repository activity characteristics (number of commits and project age) and the intensity of performance testing configurations (num-
ber of users and test duration) in open-source repositories?


RQ2: How do repositories that implement both GUI and performance end-to-end tests differ from repositories that implement only one type of end-to-end test in terms of project activity metrics (number of commits, contributors, total issues, and total pull requests)?

In [83]:
!pip install pandas

import pandas as pd
from scipy import stats
import numpy as np



In [84]:
df_repository_general = pd.read_csv('E2EGit\\repository.csv')


df_repository_general = df_repository_general.rename(columns={'name': 'repository_name'})

# Keep only the columns you want
df_repository_general = df_repository_general[['repository_name', 'commits','contributors', 'total_issues', 'total_pull_requests']]

#df_repository_general = df_repository_general.drop_duplicates().reset_index(drop=True)  

# Apply filtering conditions
df_filtered = df_repository_general[
    (df_repository_general['commits'] >= 2000) &
    (df_repository_general['contributors'] >= 10) &
    (df_repository_general['total_issues'] >= 100) &
    (df_repository_general['total_pull_requests'] >= 50)
].reset_index(drop=True)

df_filtered.head(10)

  df_repository_general = pd.read_csv('E2EGit\\repository.csv')


Unnamed: 0,repository_name,commits,contributors,total_issues,total_pull_requests
0,sparklemotion/nokogiri,7305.0,224.0,1953.0,1095.0
1,junit-team/junit4,2509.0,146.0,829.0,903.0
2,unclebob/fitnesse,6054.0,114.0,763.0,727.0
3,connectbot/connectbot,2207.0,57.0,527.0,855.0
4,bndtools/bndtools,3563.0,28.0,1291.0,643.0
5,caelum/vraptor,3448.0,60.0,455.0,209.0
6,maxcom/lorsource,8917.0,75.0,301.0,758.0
7,rzwitserloot/lombok,3243.0,106.0,2458.0,354.0
8,voldemort/voldemort,4263.0,57.0,136.0,371.0
9,jdbi/jdbi,5615.0,131.0,1051.0,1555.0


In [85]:
df_repository_non_trivial = pd.read_csv('E2EGit\\non_trivial_repository.csv')

df_repository_non_trivial = df_repository_non_trivial.rename(columns={'name': 'repository_name'})

df_repository_non_trivial = df_repository_non_trivial[['repository_name']]

In [86]:
df_repository = pd.merge(df_filtered, df_repository_non_trivial, left_on='repository_name', right_on='repository_name', how='inner')

print(len(df_repository))

print(df_repository)

10727
                         repository_name  commits  contributors  total_issues  \
0                 sparklemotion/nokogiri   7305.0         224.0        1953.0   
1                      junit-team/junit4   2509.0         146.0         829.0   
2                      unclebob/fitnesse   6054.0         114.0         763.0   
3                  connectbot/connectbot   2207.0          57.0         527.0   
4                      bndtools/bndtools   3563.0          28.0        1291.0   
...                                  ...      ...           ...           ...   
10722             jeff-regier/Celeste.jl   2316.0          11.0         313.0   
10723                 denizyuret/Knet.jl   3086.0          33.0         495.0   
10724                numenta/htmresearch   5346.0          27.0         115.0   
10725          huaweicloud/ModelArts-Lab   2541.0         277.0         171.0   
10726  girlscript/winter-of-contributing  10156.0         416.0        4098.0   

       total_pull_req

In [87]:
numeric_cols = ['commits', 'contributors', 'total_issues', 'total_pull_requests']

for col in numeric_cols:
    df_repository[col] = pd.to_numeric(df_repository[col], errors='coerce')

# Drop rows with NaN (caused by non-numeric values)
df_repository = df_repository.dropna(subset=numeric_cols)

print(len(df_repository))

10727


In [88]:
df_gui_repo_details = pd.read_csv('E2EGit\gui_testing_repo_details.csv')

# Keep only the columns you want
df_gui_repo_details = df_gui_repo_details[['repository_name']]
#df_gui_repo_details = df_gui_repo_details.drop_duplicates().reset_index(drop=True)
#df_gui_repo_details.head(10)

print(df_gui_repo_details)
print(len(df_gui_repo_details))

                        repository_name
0                       quantumlib/cirq
1                           wandb/weave
2    insightsoftwareconsortium/itk-wasm
3              codyogden/killedbygoogle
4                 mattermost/mattermost
..                                  ...
467               wp-graphql/wp-graphql
468               reduxjs/redux-toolkit
469           facebook/create-react-app
470                            zkoss/zk
471                   grafana/pyroscope

[472 rows x 1 columns]
472


In [89]:
df_performance_test_details = pd.read_csv('E2EGit\performance_testing_test_details.csv')


# Keep only the columns you want
df_performance_test_details = df_performance_test_details[['repository_name']]
#df_performance_test_details = df_performance_test_details.drop_duplicates().reset_index(drop=True)  
df_performance_test_details.head(10)

print(df_performance_test_details)
print(len(df_performance_test_details))

              repository_name
0               apache/roller
1    nysenate/openlegislation
2    nysenate/openlegislation
3    nysenate/openlegislation
4    nysenate/openlegislation
..                        ...
405  HumanSignal/label-studio
406  HumanSignal/label-studio
407  HumanSignal/label-studio
408       jetty/jetty.project
409       jetty/jetty.project

[410 rows x 1 columns]
410


In [90]:
df_both_tests = pd.merge(df_performance_test_details, df_gui_repo_details, left_on='repository_name', right_on='repository_name', how='inner')

df_both_tests.head(10)

print(df_both_tests)
print(len(df_both_tests))


             repository_name
0              apache/roller
1          apache/tapestry-5
2        zkoss/zkspreadsheet
3           eugenp/tutorials
4           eugenp/tutorials
..                       ...
85  HumanSignal/label-studio
86  HumanSignal/label-studio
87  HumanSignal/label-studio
88  HumanSignal/label-studio
89  HumanSignal/label-studio

[90 rows x 1 columns]
90


In [91]:
# Not merged repositories (exist in GUI but not in Performance)
df_gui_only = pd.merge(df_gui_repo_details, df_performance_test_details, on='repository_name', how='left', indicator=True)
df_gui_only = df_gui_only[df_gui_only['_merge'] == 'left_only'][['repository_name']]

# Not merged repositories (exist in Performance but not in GUI)
df_perf_only = pd.merge(df_performance_test_details, df_gui_repo_details, on='repository_name', how='left', indicator=True)
df_perf_only = df_perf_only[df_perf_only['_merge'] == 'left_only'][['repository_name']]


# Combine the two "not merged" dataframes
not_merged_all = pd.concat([df_gui_only, df_perf_only]).reset_index(drop=True)

print(not_merged_all)

print(len(not_merged_all))

                                    repository_name
0                                   quantumlib/cirq
1                                       wandb/weave
2                insightsoftwareconsortium/itk-wasm
3                          codyogden/killedbygoogle
4                             mattermost/mattermost
..                                              ...
774  ballerina-platform/ballerina-performance-cloud
775  ballerina-platform/ballerina-performance-cloud
776  ballerina-platform/ballerina-performance-cloud
777                             jetty/jetty.project
778                             jetty/jetty.project

[779 rows x 1 columns]
779


In [103]:
# merged repositories (exist in both GUI and Performance) with repository details

repo_both_with_repository_details = pd.merge(df_both_tests, df_repository, on='repository_name', how='inner')

repo_both_with_repository_details.head(10)

Unnamed: 0,repository_name,commits,contributors,total_issues,total_pull_requests
0,eugenp/tutorials,15392.0,321.0,733.0,15512.0
1,eugenp/tutorials,15392.0,321.0,733.0,15512.0
2,eugenp/tutorials,15392.0,321.0,733.0,15512.0
3,eugenp/tutorials,15392.0,321.0,733.0,15512.0
4,eugenp/tutorials,15392.0,321.0,733.0,15512.0
5,eugenp/tutorials,15392.0,321.0,733.0,15512.0
6,eugenp/tutorials,15392.0,321.0,733.0,15512.0
7,eugenp/tutorials,15392.0,321.0,733.0,15512.0
8,eugenp/tutorials,15392.0,321.0,733.0,15512.0
9,eugenp/tutorials,15392.0,321.0,733.0,15512.0


In [104]:
# not merged repositories (exist in GUI or Performance but not in both) with repository details
# repo_not_merged_with_repository_details = pd.merge(not_merged_all, df_repository, on='repository_name', how='inner')

# repo_not_merged_with_repository_details.head(10)

repo_gui_only_with_repository_details = pd.merge(df_gui_only, df_repository, on='repository_name', how='inner')

repo_perf_only_with_repository_details = pd.merge(df_perf_only, df_repository, on='repository_name', how='inner')

print(len(repo_gui_only_with_repository_details))
print(len(repo_perf_only_with_repository_details))


387
200


In [94]:
repo_both_with_repository_details['test_type'] = 'Both'
repo_gui_only_with_repository_details['test_type'] = 'Only'
repo_perf_only_with_repository_details['test_type'] = 'Only'

In [105]:
df_all = pd.concat([repo_both_with_repository_details, repo_gui_only_with_repository_details, repo_perf_only_with_repository_details], ignore_index=True)

print(len(df_all))

674


##### Normality Tests

In [96]:
normality_results = {}

for test_type in df_all['test_type'].unique():
    print(f"{test_type}:")
    subset = df_all[df_all['test_type'] == test_type]
    normality_results[test_type] = {}
    
    for metric in numeric_cols:
        stat, p_value = stats.shapiro(subset[metric])
        is_normal = p_value > 0.05
        normality_results[test_type][metric] = is_normal
        print(f"  {metric:15s}: W={stat:.4f}, p={p_value:.4f} | "
              f"{'Normal ✓' if is_normal else 'Non-normal ✗'}")
    print()

# Check if we should use parametric or non-parametric tests
all_normal = all(all(normality_results[tt].values()) for tt in normality_results)
print(f"Recommendation: Use {'Parametric (t-test)' if all_normal else 'Non-parametric (Mann-Whitney U)'} tests")
print("However, we'll perform BOTH tests for comprehensive analysis.\n")

Both:
  commits        : W=0.6977, p=0.0000 | Non-normal ✗
  contributors   : W=0.6164, p=0.0000 | Non-normal ✗
  total_issues   : W=0.5144, p=0.0000 | Non-normal ✗
  total_pull_requests: W=0.6631, p=0.0000 | Non-normal ✗

Only:
  commits        : W=0.6663, p=0.0000 | Non-normal ✗
  contributors   : W=0.8306, p=0.0000 | Non-normal ✗
  total_issues   : W=0.2725, p=0.0000 | Non-normal ✗
  total_pull_requests: W=0.6317, p=0.0000 | Non-normal ✗

Recommendation: Use Non-parametric (Mann-Whitney U) tests
However, we'll perform BOTH tests for comprehensive analysis.



In [97]:
summary_stats = df_all.groupby('test_type').agg({
    'commits': ['mean', 'median', 'std', 'count'],
    'contributors': ['mean', 'median', 'std'],
    'total_issues': ['mean', 'median', 'std'],
    'total_pull_requests': ['mean', 'median', 'std']
}).round(2)

print(summary_stats)

            commits                         contributors                 \
               mean  median       std count         mean median     std   
test_type                                                                 
Both       10960.68  5690.0  11288.33    87       103.24   74.0   92.21   
Only       10602.38  7239.0  10931.03   587       142.67   87.0  126.86   

          total_issues                  total_pull_requests                   
                  mean  median      std                mean  median      std  
test_type                                                                     
Both           2047.72  1353.0  1974.08             3933.71  2307.0  4951.84  
Only           3400.16  1506.0  8385.07             4412.81  2798.0  5628.31  


In [98]:
for metric in numeric_cols:
    # Separate data by test type
    gui_only = df_all[df_all['test_type'] == 'GUI Only'][metric]
    perf_only = df_all[df_all['test_type'] == 'Performance Only'][metric]
    both = df_all[df_all['test_type'] == 'Both'][metric]
    
    # Perform one-way ANOVA
    f_stat, p_value = stats.f_oneway(gui_only, perf_only, both)
    
    print(f"{metric.upper()}")
    print(f"  F-statistic: {f_stat:.4f}")
    print(f"  P-value: {p_value:.4f}")
    print(f"  Significant at α=0.05: {'YES' if p_value < 0.05 else 'NO'}")
    print()

COMMITS
  F-statistic: nan
  P-value: nan
  Significant at α=0.05: NO

CONTRIBUTORS
  F-statistic: nan
  P-value: nan
  Significant at α=0.05: NO

TOTAL_ISSUES
  F-statistic: nan
  P-value: nan
  Significant at α=0.05: NO

TOTAL_PULL_REQUESTS
  F-statistic: nan
  P-value: nan
  Significant at α=0.05: NO



  f_stat, p_value = stats.f_oneway(gui_only, perf_only, both)


In [99]:
for metric in numeric_cols:
    # Separate data by test type
    single  = df_all[df_all['test_type'] != 'Both'][metric]
    both = df_all[df_all['test_type'] == 'Both'][metric]
    
    # Perform one-way ANOVA
    f_stat, p_value = stats.f_oneway(single, both)
    
    print(f"{metric.upper()}")
    print(f"  F-statistic: {f_stat:.4f}")
    print(f"  P-value: {p_value:.4f}")
    print(f"  Significant at α=0.05: {'YES' if p_value < 0.05 else 'NO'}")
    print()

COMMITS
  F-statistic: 0.0807
  P-value: 0.7764
  Significant at α=0.05: NO

CONTRIBUTORS
  F-statistic: 7.7902
  P-value: 0.0054
  Significant at α=0.05: YES

TOTAL_ISSUES
  F-statistic: 2.2422
  P-value: 0.1348
  Significant at α=0.05: NO

TOTAL_PULL_REQUESTS
  F-statistic: 0.5654
  P-value: 0.4524
  Significant at α=0.05: NO



In [100]:
# Separate data by test type
both_tests = df_all[df_all['test_type'] == 'Both']
single_test = df_all[df_all['test_type'] != 'Both']

print(f"Sample sizes:")
print(f"  Both GUI & Performance: {len(both_tests)} repositories")
print(f"  Single test type (GUI or Performance): {len(single_test)} repositories")
print("\n" + "-" * 80 + "\n")

for metric in numeric_cols:
    both_data = both_tests[metric]
    single_data = single_test[metric]
    
    # Perform independent samples t-test
    t_stat, p_value = stats.ttest_ind(both_data, single_data)
    
    # Calculate means and effect size (Cohen's d)
    mean_both = both_data.mean()
    mean_single = single_data.mean()
    std_both = both_data.std()
    std_single = single_data.std()
    
    # Pooled standard deviation for Cohen's d
    n1, n2 = len(both_data), len(single_data)
    pooled_std = np.sqrt(((n1-1)*std_both**2 + (n2-1)*std_single**2) / (n1+n2-2))
    cohens_d = (mean_both - mean_single) / pooled_std
    
    # Percentage difference
    pct_diff = ((mean_both - mean_single) / mean_single) * 100
    
    print(f"METRIC: {metric.upper()}")
    print(f"  Both Tests Mean:        {mean_both:.2f}")
    print(f"  Single Test Mean:       {mean_single:.2f}")
    print(f"  Difference:             {mean_both - mean_single:.2f} ({pct_diff:+.1f}%)")
    print(f"  T-statistic:            {t_stat:.4f}")
    print(f"  P-value:                {p_value:.6f}")
    print(f"  Significant (α=0.05):   {'YES ✓' if p_value < 0.05 else 'NO ✗'}")
    print(f"  Cohen's d (effect size): {cohens_d:.3f}", end="")
    
    # Interpret effect size
    if abs(cohens_d) < 0.2:
        effect = "(negligible)"
    elif abs(cohens_d) < 0.5:
        effect = "(small)"
    elif abs(cohens_d) < 0.8:
        effect = "(medium)"
    else:
        effect = "(large)"
    print(f" {effect}")
    print()

Sample sizes:
  Both GUI & Performance: 87 repositories
  Single test type (GUI or Performance): 587 repositories

--------------------------------------------------------------------------------

METRIC: COMMITS
  Both Tests Mean:        10960.68
  Single Test Mean:       10602.38
  Difference:             358.30 (+3.4%)
  T-statistic:            0.2841
  P-value:                0.776409
  Significant (α=0.05):   NO ✗
  Cohen's d (effect size): 0.033 (negligible)

METRIC: CONTRIBUTORS
  Both Tests Mean:        103.24
  Single Test Mean:       142.67
  Difference:             -39.43 (-27.6%)
  T-statistic:            -2.7911
  P-value:                0.005402
  Significant (α=0.05):   YES ✓
  Cohen's d (effect size): -0.321 (small)

METRIC: TOTAL_ISSUES
  Both Tests Mean:        2047.72
  Single Test Mean:       3400.16
  Difference:             -1352.43 (-39.8%)
  T-statistic:            -1.4974
  P-value:                0.134761
  Significant (α=0.05):   NO ✗
  Cohen's d (effect size

In [101]:
mw_test_results = {}

for metric in numeric_cols:
    both_data = both_tests[metric]
    single_data = single_test[metric]
    
    # Perform Mann-Whitney U test
    u_stat, p_value = stats.mannwhitneyu(both_data, single_data, alternative='two-sided')
    
    # Calculate medians and rank-biserial correlation (effect size for Mann-Whitney)
    median_both = both_data.median()
    median_single = single_data.median()
    
    # Rank-biserial correlation as effect size
    n1, n2 = len(both_data), len(single_data)
    rank_biserial = 1 - (2*u_stat) / (n1 * n2)
    
    # Percentage difference in medians
    pct_diff_median = ((median_both - median_single) / median_single) * 100
    
    mw_test_results[metric] = {
        'median_both': median_both,
        'median_single': median_single,
        'u_stat': u_stat,
        'p_value': p_value,
        'rank_biserial': rank_biserial,
        'pct_diff': pct_diff_median
    }
    
    print(f"METRIC: {metric.upper()}")
    print(f"  Both Tests Median:      {median_both:.2f}")
    print(f"  Single Test Median:     {median_single:.2f}")
    print(f"  Median Difference:      {median_both - median_single:.2f} ({pct_diff_median:+.1f}%)")
    print(f"  U-statistic:            {u_stat:.4f}")
    print(f"  P-value:                {p_value:.6f}")
    print(f"  Significant (α=0.05):   {'YES ✓' if p_value < 0.05 else 'NO ✗'}")
    print(f"  Rank-biserial (effect): {rank_biserial:.3f}", end="")
    
    # Interpret effect size
    if abs(rank_biserial) < 0.1:
        effect = "(negligible)"
    elif abs(rank_biserial) < 0.3:
        effect = "(small)"
    elif abs(rank_biserial) < 0.5:
        effect = "(medium)"
    else:
        effect = "(large)"
    print(f" {effect}")
    print()

METRIC: COMMITS
  Both Tests Median:      5690.00
  Single Test Median:     7239.00
  Median Difference:      -1549.00 (-21.4%)
  U-statistic:            23193.5000
  P-value:                0.167251
  Significant (α=0.05):   NO ✗
  Rank-biserial (effect): 0.092 (negligible)

METRIC: CONTRIBUTORS
  Both Tests Median:      74.00
  Single Test Median:     87.00
  Median Difference:      -13.00 (-14.9%)
  U-statistic:            22453.0000
  P-value:                0.069033
  Significant (α=0.05):   NO ✗
  Rank-biserial (effect): 0.121 (small)

METRIC: TOTAL_ISSUES
  Both Tests Median:      1353.00
  Single Test Median:     1506.00
  Median Difference:      -153.00 (-10.2%)
  U-statistic:            26652.0000
  P-value:                0.509817
  Significant (α=0.05):   NO ✗
  Rank-biserial (effect): -0.044 (negligible)

METRIC: TOTAL_PULL_REQUESTS
  Both Tests Median:      2307.00
  Single Test Median:     2798.00
  Median Difference:      -491.00 (-17.5%)
  U-statistic:            21199