In [6]:
# cell 0
import pandas as pd
import os

# Print the working directory: the CSV path below is relative to it, so this
# makes a failed lookup easy to diagnose.
print("cwd:", os.getcwd())
# Load the run table into `df`; adjust the path if the file lives elsewhere.
df = pd.read_csv("run_table.csv")
# Last expression of the cell: shows the first rows as the cell output.
df.head()

cwd: /Users/liuruyi/github/Green_Lab_group1/green-lab-group1/data-analysis


Unnamed: 0,__run_id,__done,_compiler,_benchmark,execution_time,cpu_usage,memory_usage,energy_consumption
0,run_7_repetition_5,DONE,pure_python,regex,0.401,4.521,8537481.0,7.214
1,run_17_repetition_19,DONE,cython,regex,0.401,6.899,8519287.0,7.261
2,run_3_repetition_8,DONE,pure_python,fft,0.201,13.341,8496410.667,3.896
3,run_16_repetition_11,DONE,cython,quick_sort,0.401,4.488,8508329.0,5.935
4,run_28_repetition_19,DONE,swig,sieve,0.201,3.279,8518120.0,1.549


In [9]:
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

In [11]:
# Sanity-check the loaded table: print its dimensions and the distinct values
# of the two factor columns ('_compiler', '_benchmark') that the normality
# testing function iterates over.
print("Data shape:", df.shape)
print("\nCompiler types:", df['_compiler'].unique())
print("\nBenchmark types:", df['_benchmark'].unique())

Data shape: (600, 8)

Compiler types: ['pure_python' 'cython' 'swig']

Benchmark types: ['regex' 'fft' 'quick_sort' 'sieve' 'dense_matrix' 'nbody' 'convex'
 'k_means' 'bfs' 'json_bench']


In [None]:
def comprehensive_shapiro_wilk_testing(df, alpha=0.05):
    """
    Perform Shapiro-Wilk normality tests for all (benchmark × compiler × metric) combinations.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns '_compiler', '_benchmark', 'execution_time',
        'cpu_usage', 'memory_usage' and 'energy_consumption'.
    alpha : float, default 0.05
        Significance level. A combination is flagged as normal when its
        Shapiro-Wilk p-value is strictly greater than ``alpha``.

    Returns
    -------
    pandas.DataFrame
        One row per (compiler, benchmark, metric) combination with the columns
        'compiler', 'benchmark', 'metric', 'sample_size', 'shapiro_statistic',
        'shapiro_p_value', 'is_normal', 'skewness', 'kurtosis', 'mean', 'std',
        'min' and 'max'.

    Notes
    -----
    Combinations with fewer than 3 non-null observations are not passed to
    ``scipy.stats.shapiro`` (depending on the SciPy version it either raises
    ``ValueError`` or warns and returns NaN for such input). They are reported
    with NaN statistic / p-value and ``is_normal`` set to False, so downstream
    counts treat them as "not shown to be normal".
    """
    min_shapiro_n = 3  # smallest sample size the Shapiro-Wilk test accepts
    nan = float('nan')

    results = []

    compilers = df['_compiler'].unique()
    benchmarks = df['_benchmark'].unique()
    metrics = ['execution_time', 'cpu_usage', 'memory_usage', 'energy_consumption']

    print("Performing Shapiro-Wilk normality tests for all combinations...")
    print(f"Compilers: {list(compilers)}")
    print(f"Benchmarks: {list(benchmarks)}")
    print(f"Metrics: {metrics}")
    print(f"Total combinations: {len(compilers)} × {len(benchmarks)} × {len(metrics)} = {len(compilers)*len(benchmarks)*len(metrics)}")
    print()

    for compiler in compilers:
        compiler_rows = df[df['_compiler'] == compiler]
        for benchmark in benchmarks:
            # The row selection does not depend on the metric, so do it once
            # per (compiler, benchmark) pair instead of once per metric.
            subset = compiler_rows[compiler_rows['_benchmark'] == benchmark]

            for metric in metrics:
                values = subset[metric].dropna()
                sample_size = len(values)

                # Shapiro-Wilk is only defined for at least 3 observations.
                if sample_size >= min_shapiro_n:
                    shapiro_stat, shapiro_p = stats.shapiro(values)
                else:
                    shapiro_stat, shapiro_p = nan, nan

                # Shape statistics are meaningless for an empty sample.
                if sample_size > 0:
                    skew_val = stats.skew(values)
                    kurtosis_val = stats.kurtosis(values)
                else:
                    skew_val, kurtosis_val = nan, nan

                results.append({
                    'compiler': compiler,
                    'benchmark': benchmark,
                    'metric': metric,
                    'sample_size': sample_size,
                    'shapiro_statistic': shapiro_stat,
                    'shapiro_p_value': shapiro_p,
                    # NaN > alpha is False, so untestable groups are never "normal".
                    'is_normal': bool(shapiro_p > alpha),
                    'skewness': skew_val,
                    'kurtosis': kurtosis_val,
                    'mean': values.mean(),
                    'std': values.std(),
                    'min': values.min(),
                    'max': values.max()
                })

    results_df = pd.DataFrame(results)
    return results_df


In [44]:

def analyze_rq1(normality_df):
    """
    RQ1: Energy Consumption - Python vs C++

    Report, for every compiler, how many of its energy_consumption normality
    tests were flagged normal, followed by a per-benchmark breakdown. The
    report is printed; the per-compiler totals are returned as a dict mapping
    compiler -> {'total_tests', 'normal_tests', 'normal_ratio'}.
    """
    rule = "=" * 70
    print(rule)
    print("RQ1 ANALYSIS: Energy Consumption Normality (Python vs C++)")
    print(rule)

    # Keep only the rows that belong to the energy metric.
    is_energy_row = normality_df['metric'] == 'energy_consumption'
    energy_rows = normality_df[is_energy_row]

    summary = {}
    for name, rows in energy_rows.groupby('compiler'):
        n_total = len(rows)
        n_normal = rows['is_normal'].sum()
        share = n_normal / n_total

        summary[name] = {
            'total_tests': n_total,
            'normal_tests': n_normal,
            'normal_ratio': share
        }

        print(f"\n{name.upper():<12}: {n_normal}/{n_total} normal ({share:.1%})")

        # Per-benchmark detail for this compiler.
        print("  By Benchmark:")
        for bench, bench_rows in rows.groupby('benchmark'):
            hits = bench_rows['is_normal'].sum()
            count = len(bench_rows)
            print(f"    {bench:<15}: {hits}/{count} normal")

    return summary


In [45]:

def analyze_rq2(normality_df):
    """
    RQ2: Performance Metrics - Python vs C++

    Report how many normality tests on execution_time, cpu_usage and
    memory_usage were flagged normal: first per compiler, then per metric
    with a per-compiler breakdown. The report is printed; only the
    per-compiler totals are returned, as a dict mapping
    compiler -> {'total_tests', 'normal_tests', 'normal_ratio'}.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("RQ2 ANALYSIS: Performance Metrics Normality (Python vs C++)")
    print(rule)

    # Restrict the table to the three performance metrics.
    wanted_metrics = ['execution_time', 'cpu_usage', 'memory_usage']
    is_perf_row = normality_df['metric'].isin(wanted_metrics)
    perf_rows = normality_df[is_perf_row]

    per_compiler = {}

    # Part 1: totals per compiler (these are what the function returns).
    print("Overall by Compiler:")
    for name, rows in perf_rows.groupby('compiler'):
        n_total = len(rows)
        n_normal = rows['is_normal'].sum()
        share = n_normal / n_total

        per_compiler[name] = {
            'total_tests': n_total,
            'normal_tests': n_normal,
            'normal_ratio': share
        }

        print(f"  {name.upper():<12}: {n_normal}/{n_total} normal ({share:.1%})")

    # Part 2: totals per metric, each followed by its per-compiler split.
    print("\nBy Metric (across all compilers):")
    for metric_name, metric_rows in perf_rows.groupby('metric'):
        n_total = len(metric_rows)
        n_normal = metric_rows['is_normal'].sum()
        share = n_normal / n_total

        print(f"  {metric_name:<15}: {n_normal}/{n_total} normal ({share:.1%})")

        for name, rows in metric_rows.groupby('compiler'):
            hits = rows['is_normal'].sum()
            count = len(rows)
            print(f"    {name:<12}: {hits}/{count} normal")

    return per_compiler


In [46]:

def analyze_rq3(normality_df):
    """
    RQ3: Energy Consumption - Cython vs SWIG

    Report how many energy_consumption normality tests were flagged normal,
    looking only at the 'cython' and 'swig' compilers. One line per compiler
    is printed and the same totals are returned as a dict mapping
    compiler -> {'total_tests', 'normal_tests', 'normal_ratio'}.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("RQ3 ANALYSIS: Energy Consumption Normality (Cython vs SWIG)")
    print(rule)

    # Two conditions: the energy metric, and one of the two compiled variants.
    is_energy_row = normality_df['metric'] == 'energy_consumption'
    is_compiled = normality_df['compiler'].isin(['cython', 'swig'])
    energy_rows = normality_df[is_energy_row & is_compiled]

    summary = {}
    for name, rows in energy_rows.groupby('compiler'):
        n_total = len(rows)
        n_normal = rows['is_normal'].sum()
        share = n_normal / n_total

        summary[name] = {
            'total_tests': n_total,
            'normal_tests': n_normal,
            'normal_ratio': share
        }

        print(f"{name.upper():<12}: {n_normal}/{n_total} normal ({share:.1%})")

    return summary


In [47]:

def analyze_rq4(normality_df):
    """
    RQ4: Performance Metrics - Cython vs SWIG

    Report how many normality tests on execution_time, cpu_usage and
    memory_usage were flagged normal, looking only at the 'cython' and 'swig'
    compilers: first per compiler, then per metric. The report is printed;
    only the per-compiler totals are returned, as a dict mapping
    compiler -> {'total_tests', 'normal_tests', 'normal_ratio'}.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("RQ4 ANALYSIS: Performance Metrics Normality (Cython vs SWIG)")
    print(rule)

    # Two conditions: a performance metric, and one of the two compiled variants.
    wanted_metrics = ['execution_time', 'cpu_usage', 'memory_usage']
    is_perf_row = normality_df['metric'].isin(wanted_metrics)
    is_compiled = normality_df['compiler'].isin(['cython', 'swig'])
    perf_rows = normality_df[is_perf_row & is_compiled]

    per_compiler = {}

    # Part 1: totals per compiler (these are what the function returns).
    print("Overall by Compiler:")
    for name, rows in perf_rows.groupby('compiler'):
        n_total = len(rows)
        n_normal = rows['is_normal'].sum()
        share = n_normal / n_total

        per_compiler[name] = {
            'total_tests': n_total,
            'normal_tests': n_normal,
            'normal_ratio': share
        }

        print(f"  {name.upper():<12}: {n_normal}/{n_total} normal ({share:.1%})")

    # Part 2: totals per metric, in the order the metrics are listed above.
    print("\nBy Metric (Cython and SWIG only):")
    for metric_name in wanted_metrics:
        metric_rows = perf_rows[perf_rows['metric'] == metric_name]
        n_total = len(metric_rows)
        n_normal = metric_rows['is_normal'].sum()
        share = n_normal / n_total

        print(f"  {metric_name:<15}: {n_normal}/{n_total} normal ({share:.1%})")

    return per_compiler


In [48]:

def _mean_normal_ratio(ratios):
    """Unweighted mean of a list of ratios; NaN (without a warning) when the list is empty."""
    # np.mean([]) emits "RuntimeWarning: Mean of empty slice"; make the empty
    # case explicit instead.
    return np.mean(ratios) if ratios else float('nan')


def generate_final_recommendations(rq1_results, rq2_results, rq3_results, rq4_results):
    """
    Generate final test method recommendations based on normality results.

    Parameters
    ----------
    rq1_results, rq2_results, rq3_results, rq4_results : dict
        Each maps a compiler name to a dict that contains at least the key
        'normal_ratio'. An empty dict is accepted.

    Returns
    -------
    (recommendations, primary_method) : tuple
        ``recommendations`` maps 'RQ1'..'RQ4' to {'method', 'normal_ratio'},
        where 'method' is 'parametric' when the unweighted mean of that RQ's
        per-compiler ratios exceeds 0.5 and 'permutation' otherwise.
        ``primary_method`` is 'parametric' or 'non-parametric', decided the
        same way from the ratios of all four inputs pooled together.

    Notes
    -----
    A research question with no entries gets a NaN ratio; since NaN > 0.5 is
    False it falls back to the non-parametric choice.

    NOTE(review): when the inputs come from analyze_rq1..analyze_rq4, the
    Cython/SWIG tests appear in both RQ1/RQ3 and RQ2/RQ4, and every compiler
    entry counts equally regardless of its 'total_tests'. The "overall" figure
    is therefore a mean of ratios, not the share of all tests that were normal.
    """
    print("\n" + "=" * 70)
    print("FINAL STATISTICAL TEST RECOMMENDATIONS")
    print("=" * 70)

    results_by_rq = {
        'RQ1': rq1_results,
        'RQ2': rq2_results,
        'RQ3': rq3_results,
        'RQ4': rq4_results
    }

    # Per-compiler ratios for each research question, in input order.
    ratios_by_rq = {
        rq: [data['normal_ratio'] for data in rq_results.values()]
        for rq, rq_results in results_by_rq.items()
    }

    # Overall recommendation based on majority across every collected ratio.
    all_normal_ratios = [ratio for ratios in ratios_by_rq.values() for ratio in ratios]
    overall_normal_ratio = _mean_normal_ratio(all_normal_ratios)

    print(f"Overall normal distribution ratio: {overall_normal_ratio:.1%}")

    if overall_normal_ratio > 0.5:
        print("RECOMMENDATION: Use PARAMETRIC tests (paired t-tests)")
        primary_method = "parametric"
    else:
        print("RECOMMENDATION: Use NON-PARAMETRIC tests (permutation tests)")
        primary_method = "non-parametric"

    # Detailed recommendations for each RQ
    print("\nDetailed Recommendations by Research Question:")
    print("-" * 50)

    recommendations = {}
    for rq, ratios in ratios_by_rq.items():
        rq_normal_ratio = _mean_normal_ratio(ratios)
        recommendations[rq] = {
            'method': 'parametric' if rq_normal_ratio > 0.5 else 'permutation',
            'normal_ratio': rq_normal_ratio
        }

    for rq, rec in recommendations.items():
        print(f"{rq}: {rec['method'].upper()} tests (normal ratio: {rec['normal_ratio']:.1%})")

    return recommendations, primary_method


In [49]:
# Run the Shapiro-Wilk tests for every (compiler, benchmark, metric)
# combination found in `df`.
normality_results = comprehensive_shapiro_wilk_testing(df)

# Write the per-combination results to a CSV in the working directory
# (without the DataFrame index).
normality_results.to_csv('comprehensive_normality_results.csv', index=False)
print(f"\nDetailed normality results saved to 'comprehensive_normality_results.csv'")
print(f"Total normality tests performed: {len(normality_results)}")

# Summarise the normality flags per research question; each call prints its
# own report and returns the per-compiler totals.
rq1_results = analyze_rq1(normality_results)
rq2_results = analyze_rq2(normality_results) 
rq3_results = analyze_rq3(normality_results)
rq4_results = analyze_rq4(normality_results)

# Turn the four per-RQ summaries into a parametric / non-parametric
# recommendation (overall and per research question).
recommendations, primary_method = generate_final_recommendations(
    rq1_results, rq2_results, rq3_results, rq4_results
)

# Assemble a summary report as an in-memory dict.
# NOTE(review): nothing in this cell writes `summary_report` to disk or
# displays it; persist it explicitly if it is meant to be saved.
# 'overall_normal_ratio' here is the share of all tests flagged normal.
summary_report = {
    'total_tests': len(normality_results),
    'overall_normal_ratio': normality_results['is_normal'].mean(),
    'primary_method': primary_method,
    'rq1_results': rq1_results,
    'rq2_results': rq2_results,
    'rq3_results': rq3_results, 
    'rq4_results': rq4_results,
    'recommendations': recommendations
}

print(f"\nAnalysis complete! Primary method recommendation: {primary_method.upper()} tests")


Performing Shapiro-Wilk normality tests for all combinations...
Compilers: ['pure_python', 'cython', 'swig']
Benchmarks: ['regex', 'fft', 'quick_sort', 'sieve', 'dense_matrix', 'nbody', 'convex', 'k_means', 'bfs', 'json_bench']
Metrics: ['execution_time', 'cpu_usage', 'memory_usage', 'energy_consumption']
Total combinations: 3 × 10 × 4 = 120


Detailed normality results saved to 'comprehensive_normality_results.csv'
Total normality tests performed: 120
RQ1 ANALYSIS: Energy Consumption Normality (Python vs C++)

CYTHON      : 2/10 normal (20.0%)
  By Benchmark:
    bfs            : 0/1 normal
    convex         : 0/1 normal
    dense_matrix   : 0/1 normal
    fft            : 0/1 normal
    json_bench     : 0/1 normal
    k_means        : 1/1 normal
    nbody          : 1/1 normal
    quick_sort     : 0/1 normal
    regex          : 0/1 normal
    sieve          : 0/1 normal

PURE_PYTHON : 5/10 normal (50.0%)
  By Benchmark:
    bfs            : 0/1 normal
    convex         : 1/1 norma