In [7]:
import pandas as pd
import scipy.stats as stats

# Sample data for relevant models (extracted from the paper's table)
data = {
    'Model': [
        'GPT-4', 'Phind-CodeLlama', 'WizardCoder-CodeLlama', 'ChatGPT', 'CODELLAMA_34B', 
        'CODELLAMA_13B', 'CODELLAMA_7B', 'StarCoder', 'CodeGen_16B', 'CodeGen_6B', 
        'CodeGen_2B', 'CODET5+_16B', 'MISTRAL_7B', 'CodeGen2_16B', 'CodeGen2_7B', 
        'CodeGen2_3B', 'CodeGen2_1B', 'VICUNA_13B', 'VICUNA_7B', 'SantaCoder_1.1B', 
        'INCODER_6.7B', 'INCODER_1.3B', 'GPT-J_6B', 'GPT-NEO_2.7B', 'PolyCoder_2.7B', 'StableLM_7B'
    ],
    'Size': [
        '999999', '34B', '34B', '99999', '34B', '13B', '7B', '15B', '16B', '6B', '2B', '16B', '7B', 
        '16B', '7B', '3B', '1B', '13B', '7B', '1.1B', '6.7B', '1.3B', '6B', '2.7B', '2.7B', '7B'
    ],
    'Base_k=1*': [
        88.4, 71.3, 73.2, 73.2, 51.8, 42.7, 37.8, 34.1, 32.9, 29.3, 24.4, 31.7, 28.7, 19.5, 18.3, 
        15.9, 11.0, 16.5, 11.6, 14.6, 15.9, 12.2, 12.2, 7.9, 6.1, 2.4
    ],
    'Extra_k=1*': [
        76.2, 67.1, 64.6, 63.4, 42.7, 36.6, 34.1, 29.3, 26.8, 25.6, 20.7, 26.2, 23.8, 16.5, 16.5, 
        12.8, 9.1, 15.2, 11.0, 12.8, 12.2, 10.4, 10.4, 6.7, 5.5, 2.4
    ],
    'Base_k=1': [
        None, 71.6, 61.6, 69.4, 52.0, 44.6, 39.2, 32.2, 32.2, 27.7, 18.4, 32.2, 28.1, None, 17.9, 
        15.2, 10.2, 15.3, 10.9, 16.6, 15.6, 10.0, 11.3, 6.5, 5.9, 2.7
    ],
    'Extra_k=1': [
        None, 67.0, 54.5, 62.5, 43.1, 37.4, 34.5, 27.8, 27.2, 23.6, 15.1, 27.4, 23.7, None, 15.9, 
        12.9, 8.7, 13.9, 10.3, 14.2, 12.4, 7.9, 9.5, 6.0, 5.3, 2.6
    ],
    'Base_k=10': [
        None, 90.5, 85.2, 88.6, 82.4, 77.6, 69.1, 56.7, 56.0, 46.9, 39.8, 58.5, 55.2, None, 30.9, 
        23.9, 15.1, 30.1, 23.8, 29.2, 27.7, 15.9, 17.7, 11.8, 10.2, 7.5
    ],
    'Extra_k=10': [
        None, 85.0, 78.6, 82.1, 73.7, 69.4, 61.4, 50.3, 48.4, 41.0, 34.8, 51.1, 48.5, None, 27.1, 
        21.2, 13.7, 25.8, 20.3, 26.2, 22.2, 13.5, 15.2, 9.0, 7.9, 6.2
    ],
    'Base_k=100': [
        None, 96.2, 94.5, 94.0, 95.0, 92.7, 89.7, 84.2, 81.5, 72.7, 66.8, 83.5, 83.8, None, 50.9, 
        38.6, 24.7, 54.8, 42.3, 45.4, 45.0, 25.2, 31.8, 20.7, 17.1, 15.8
    ],
    'Extra_k=100': [
        None, 92.5, 88.9, 91.1, 89.4, 88.2, 82.9, 75.4, 71.4, 64.6, 55.8, 76.4, 76.4, None, 45.4, 
        34.3, 21.2, 46.7, 35.0, 40.6, 38.9, 20.7, 25.9, 16.8, 13.6, 11.9
    ]
}



# Convert the dictionary to a DataFrame
df = pd.DataFrame(data)


# Calculate performance drops and relative drops
for k in ['1*', '1', '10', '100']:
    df[f'Drop_k={k}'] = df[f'Base_k={k}'] - df[f'Extra_k={k}']
    df[f'Relative_Drop_k={k}'] = (df[f'Drop_k={k}'] / df[f'Base_k={k}']) * 100

# Filter models <= 13B
df_less_equal_13B = df[df['Size'].apply(lambda x: float(x[:-1]) <= 13)].copy()

# Manually input ChatGPT's relative performance drops (example values, adjust based on actual data)
chatgpt_relative_drops = {
    'Relative_Drop_k=1*': 13.1,  # example value: calculated from ChatGPT data
    'Relative_Drop_k=1': 9.0,    # example value: calculated from ChatGPT data
    'Relative_Drop_k=10': 7.3,   # example value: calculated from ChatGPT data
    'Relative_Drop_k=100': 3.1   # example value: calculated from ChatGPT data
}

# Perform t-tests comparing mean relative drops for models <= 13B against ChatGPT
ttest_results = {}
for k in ['1*', '1', '10', '100']:
    relative_drop_col = f'Relative_Drop_k={k}'
    ttest_results[relative_drop_col] = stats.ttest_1samp(df_less_equal_13B[relative_drop_col], chatgpt_relative_drops[relative_drop_col], alternative='greater')

# Function to calculate Cohen's d for one-sample t-test
def cohen_d_one_sample(sample, popmean):
    n = len(sample)
    mean_sample = sample.mean()
    std_sample = sample.std(ddof=1)  # Standard deviation with degrees of freedom = 1
    return (mean_sample - popmean) / std_sample

# Calculate Cohen's d for each k value
effect_sizes = {k: cohen_d_one_sample(df_less_equal_13B[f'Relative_Drop_k={k}'], chatgpt_relative_drops[f'Relative_Drop_k={k}']) for k in ['1*', '1', '10', '100']}

# Display results
print("T-test results:", ttest_results)
print("Effect sizes (Cohen's d):", effect_sizes)


T-test results: {'Relative_Drop_k=1*': TtestResult(statistic=np.float64(-0.17456514664692194), pvalue=np.float64(0.5681944399609618), df=np.int64(16)), 'Relative_Drop_k=1': TtestResult(statistic=np.float64(3.6168189328049536), pvalue=np.float64(0.0011577307439439213), df=np.int64(16)), 'Relative_Drop_k=10': TtestResult(statistic=np.float64(6.856535923295437), pvalue=np.float64(1.9290866137053954e-06), df=np.int64(16)), 'Relative_Drop_k=100': TtestResult(statistic=np.float64(8.922453275129158), pvalue=np.float64(6.546790921245238e-08), df=np.int64(16))}
Effect sizes (Cohen's d): {'1*': np.float64(-0.042338266951570336), '1': np.float64(0.8772074405110923), '10': np.float64(1.6629542257405294), '100': np.float64(2.1640127819409263)}
