In [1]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import euclidean
from scipy.stats import pearsonr, ttest_rel

In [2]:
# Similarity measure that main() forwards to compute_correlations_and_test();
# that function has branches for "pearson" and "euclidean".
SIMILARITY = "euclidean"

In [3]:
def euclidean_similarity(array1, array2):
    """Turn the Euclidean distance between two vectors into a similarity score.

    The distance is divided by ``sqrt(len(array1))`` and the quotient is
    subtracted from 1. That divisor is the largest distance two equal-length
    vectors can have when every component lies in [0, 1], so for such inputs
    the score lies in [0, 1] and equals 1 for identical vectors.

    Parameters
    ----------
    array1, array2 : sequence of numbers
        Vectors of equal length.

    Returns
    -------
    float
        ``1 - distance / sqrt(len(array1))``.
    """
    normaliser = np.sqrt(len(array1))
    return 1 - euclidean(array1, array2) / normaliser

def min_max_scaling(group):
    """Return a copy of ``group`` with a min-max scaled ``Response_scaled`` column.

    ``Response`` is mapped linearly so that the smallest value in ``group``
    becomes 0 and the largest becomes 1. The input frame is left untouched;
    functions handed to ``groupby(...).apply`` should not mutate the object
    they receive.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame containing a numeric ``Response`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and columns plus ``Response_scaled``.
        When all responses in ``group`` are equal (zero range), the scaled
        value is 0.0 for every row rather than the NaN that 0/0 would give.
    """
    responses = group['Response']
    lowest = responses.min()
    span = responses.max() - lowest
    shifted = responses - lowest
    if span == 0:
        # Constant group: every shifted value is already 0, so skip the 0/0 division.
        scaled = shifted.astype(float)
    else:
        scaled = shifted / span
    # assign() builds a new frame, so the caller's object is never modified.
    return group.assign(Response_scaled=scaled)

def prepare_data(filename, exclude_models=("gpt2-xl",)):
    """Load a response CSV, harmonise its columns and scale responses per model.

    Parameters
    ----------
    filename : str or path-like
        CSV file passed to ``pandas.read_csv``. It must provide ``model`` and
        ``Response`` columns. If it has no ``item_id`` column it is treated as
        the alternative layout and must provide ``Confederate``, ``ItemNum``
        and ``Condition`` instead.
    exclude_models : iterable of str, optional
        Model names whose rows are dropped before scaling. Defaults to
        ``("gpt2-xl",)``.

    Returns
    -------
    pandas.DataFrame
        The remaining rows, in their original order, with an added
        ``Response_scaled`` column holding ``Response`` min-max scaled within
        each model (used for the Euclidean similarity).
    """
    df = pd.read_csv(filename)
    if "item_id" not in df.columns:
        # Alternative layout: derive ``context`` from ``Confederate`` (0 -> "Context")
        # and rename columns/labels to the names the rest of the notebook uses.
        df['context'] = df['Confederate'].apply(lambda x: "Context" if x == 0 else "No context")
        df = df.rename(columns={"ItemNum": "item_id", "Condition": "condition"})
        df['condition'] = df['condition'].replace('SemAnom', 'S.Anom')
    df = df[~df['model'].isin(exclude_models)]

    # Scale each model separately by iterating over the groups instead of using
    # groupby(...).apply(...): the apply form emits a pandas warning about
    # ``group_keys`` and the index of its result depends on the pandas version.
    scaled_groups = [min_max_scaling(group) for _, group in df.groupby('model')]
    if not scaled_groups:
        # Nothing left to scale; still return the documented column.
        return df.assign(Response_scaled=pd.Series(dtype=float))
    # Groups come back ordered by model name; restore the original row order.
    df_scaled = pd.concat(scaled_groups).sort_index()
    return df_scaled

In [4]:
def _condition_vector(frame, item_id, context, response_variable, condition_rank):
    """Return one item's responses for one context, ordered by condition rank."""
    rows = frame[(frame['item_id'] == item_id) & (frame['context'] == context)]
    ordered = rows.sort_values(by='condition', key=lambda col: col.map(condition_rank))
    return ordered[response_variable].values


def compute_correlations_and_test(data, similarity="pearson"):
    """Score every model against the human baseline and test for context matching.

    For each non-human model and each item, the model's response vector
    (ordered Control, Critical, S.Anom) is compared with the human vector for
    all four pairings of human context and model context. The per-item scores
    are averaged per model, and a paired t-test (``ttest_rel``) compares the
    same-context scores with the mismatched-context scores.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ``model``, ``item_id``, ``context`` (values
        ``'Context'`` / ``'No context'``), ``condition`` and the response
        column selected by ``similarity``. Rows with ``model == 'human'``
        form the baseline; all other rows are treated as models.
        Each (model, item, context) is assumed to have one row per condition
        so that the compared vectors line up; this is not checked here.
    similarity : {"pearson", "euclidean"}, optional
        ``"pearson"`` applies ``pearsonr`` to the ``Response`` column;
        ``"euclidean"`` applies ``euclidean_similarity`` to ``Response_scaled``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``correlation_df`` with one row per model and the mean score for each
        of the four pairings (the columns are named ``corr_*`` for both
        similarity measures), and ``test_results_df`` with one row per model
        holding ``t_stat`` and ``p_value``.

    Raises
    ------
    ValueError
        If ``similarity`` is not one of the supported names.
    """
    # Resolve the measure once, up front, so an unsupported name fails with a
    # clear message instead of an unbound-variable error deep inside the loops.
    if similarity == "pearson":
        response_variable = "Response"

        def score(human_vector, model_vector):
            return pearsonr(human_vector, model_vector)[0]
    elif similarity == "euclidean":
        response_variable = "Response_scaled"

        def score(human_vector, model_vector):
            return euclidean_similarity(human_vector, model_vector)
    else:
        raise ValueError(
            f"Unknown similarity {similarity!r}; expected 'pearson' or 'euclidean'"
        )

    item_ids = data['item_id'].unique()

    human_data = data[data['model'] == 'human']
    non_human_data = data[data['model'] != 'human']

    # Conditions in the desired order
    condition_rank = {cond: i for i, cond in enumerate(['Control', 'Critical', 'S.Anom'])}

    # The human baseline does not depend on the model, so build it only once.
    human_vectors = [
        (
            _condition_vector(human_data, item_id, 'Context', response_variable, condition_rank),
            _condition_vector(human_data, item_id, 'No context', response_variable, condition_rank),
        )
        for item_id in item_ids
    ]

    correlations = []
    test_results = []

    for model_name in non_human_data['model'].unique():
        model_data = non_human_data[non_human_data['model'] == model_name]

        # Per-item scores for each human-context / model-context pairing
        corr_human_context_model_context = []
        corr_human_no_context_model_no_context = []
        corr_human_context_model_no_context = []
        corr_human_no_context_model_context = []

        for item_id, (human_context, human_no_context) in zip(item_ids, human_vectors):
            model_context = _condition_vector(
                model_data, item_id, 'Context', response_variable, condition_rank)
            model_no_context = _condition_vector(
                model_data, item_id, 'No context', response_variable, condition_rank)

            corr_human_context_model_context.append(score(human_context, model_context))
            corr_human_no_context_model_no_context.append(score(human_no_context, model_no_context))
            corr_human_context_model_no_context.append(score(human_context, model_no_context))
            corr_human_no_context_model_context.append(score(human_no_context, model_context))

        # Average scores across all items for this model
        correlations.append({
            'model': model_name,
            'corr_human_context_model_context': np.mean(corr_human_context_model_context),
            'corr_human_no_context_model_no_context': np.mean(corr_human_no_context_model_no_context),
            'corr_human_context_model_no_context': np.mean(corr_human_context_model_no_context),
            'corr_human_no_context_model_context': np.mean(corr_human_no_context_model_context)
        })

        # Paired test: element k of each array shares the same item and the
        # same human context; only the model's context differs.
        same_context_corrs = np.array(
            corr_human_context_model_context + corr_human_no_context_model_no_context)
        mismatched_context_corrs = np.array(
            corr_human_context_model_no_context + corr_human_no_context_model_context)

        t_stat, p_value = ttest_rel(same_context_corrs, mismatched_context_corrs)

        test_results.append({
            'model': model_name,
            't_stat': t_stat,
            'p_value': p_value
        })

    correlation_df = pd.DataFrame(correlations)
    test_results_df = pd.DataFrame(test_results)

    return correlation_df, test_results_df

In [5]:
def main(filename):
    """Run the whole analysis for one CSV file.

    Loads and scales the data with ``prepare_data`` and scores it with
    ``compute_correlations_and_test`` using the notebook-level ``SIMILARITY``
    setting.

    Returns
    -------
    tuple
        ``(correlation_df, test_results_df)`` as produced by
        ``compute_correlations_and_test``.
    """
    scaled_responses = prepare_data(filename)
    per_model_scores, per_model_tests = compute_correlations_and_test(
        scaled_responses,
        similarity=SIMILARITY,
    )
    return per_model_scores, per_model_tests

In [6]:
def get_avg_match_noMatch(dataframe):
    """Average the four pairing scores into matched and unmatched columns.

    ``matched`` is the mean of the two same-context columns
    (``corr_human_context_model_context`` and
    ``corr_human_no_context_model_no_context``); ``unmatched`` is the mean of
    the two cross-context columns.

    The input frame is not modified, so a frame displayed in an earlier
    notebook cell keeps the columns it was shown with.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``model`` and the four ``corr_*`` columns named above.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns ``model``, ``matched`` and ``unmatched``.
    """
    matched = (dataframe["corr_human_context_model_context"]
               + dataframe["corr_human_no_context_model_no_context"]) / 2
    unmatched = (dataframe["corr_human_context_model_no_context"]
                 + dataframe["corr_human_no_context_model_context"]) / 2
    # Selecting with a list yields a new frame; assign() then adds the averages to it.
    return dataframe[["model"]].assign(matched=matched, unmatched=unmatched)

# LogProbs

In [7]:
# Run the pipeline defined by main() on the first input CSV.
# NOTE(review): the `ll_` prefix presumably refers to the log-probability
# ("WordLL") measurements named in the section heading and filename - confirm.
ll_filename = "ContextDependency.SocialN400.WordLLComparison.ByModel.csv"
ll_correlation_df, ll_test_results_df = main(ll_filename)

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df_scaled = df.groupby('model').apply(min_max_scaling)


In [8]:
# Per-model mean score for each human-context / model-context pairing.
# The columns are named corr_* but hold whichever measure SIMILARITY selects.
ll_correlation_df

Unnamed: 0,model,corr_human_context_model_context,corr_human_no_context_model_no_context,corr_human_context_model_no_context,corr_human_no_context_model_context
0,Mistral-7B-Instruct-v0.1,0.500207,0.307681,0.374596,0.233508
1,Mistral-7B-v0.1,0.50336,0.319483,0.369799,0.248483
2,falcon-7b-instruct,0.569692,0.450995,0.421235,0.374781
3,falcon-7b,0.581156,0.432421,0.43261,0.352706
4,mpt-7b-instruct,0.565039,0.401555,0.440572,0.306511
5,mpt-7b,0.575146,0.43376,0.434566,0.332693


In [9]:
# Collapse the four pairings into two averages per model:
# matched (same context) and unmatched (different context).
ll_match_df = get_avg_match_noMatch(ll_correlation_df)
ll_match_df

Unnamed: 0,model,matched,unmatched
0,Mistral-7B-Instruct-v0.1,0.403944,0.304052
1,Mistral-7B-v0.1,0.411422,0.309141
2,falcon-7b-instruct,0.510344,0.398008
3,falcon-7b,0.506788,0.392658
4,mpt-7b-instruct,0.483297,0.373541
5,mpt-7b,0.504453,0.38363


In [10]:
# Per-model paired t-test (ttest_rel) of same-context vs mismatched-context scores.
ll_test_results_df

Unnamed: 0,model,t_stat,p_value
0,Mistral-7B-Instruct-v0.1,8.523552,3.84049e-15
1,Mistral-7B-v0.1,8.494988,4.600625e-15
2,falcon-7b-instruct,9.691246,1.944534e-18
3,falcon-7b,10.834789,8.453288e-22
4,mpt-7b-instruct,10.701331,2.111941e-21
5,mpt-7b,11.803975,1.017634e-24


# Prompting

In [11]:
# Run the same pipeline on the second input CSV.
# NOTE(review): judging by the heading and filename these are prompted
# sentence-judgement responses - confirm.
prompt_filename = "ContextDependency.SocialN400.SentenceJudgeComparison.ByModel.csv" 
prompt_correlation_df, prompt_test_results_df = main(prompt_filename)

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df_scaled = df.groupby('model').apply(min_max_scaling)


In [12]:
# Per-model mean score for each human-context / model-context pairing.
# The columns are named corr_* but hold whichever measure SIMILARITY selects.
prompt_correlation_df

Unnamed: 0,model,corr_human_context_model_context,corr_human_no_context_model_no_context,corr_human_context_model_no_context,corr_human_no_context_model_context
0,Mistral-7B-Instruct-v0.1,0.291066,0.293315,0.099332,0.189953
1,Mistral-7B-v0.1,0.163078,-0.049325,0.178559,-0.009274
2,falcon-7b-instruct,0.2323,0.207382,0.077463,0.085385
3,falcon-7b,0.097235,-0.018494,0.065519,0.011038
4,mpt-7b-chat,0.173244,-0.074671,0.172509,-0.084074
5,mpt-7b-instruct,0.170581,0.097644,0.094874,0.040895
6,mpt-7b,-0.205939,0.111499,-0.208859,0.114004


In [13]:
# Collapse the four pairings into two averages per model:
# matched (same context) and unmatched (different context).
prompt_match_df = get_avg_match_noMatch(prompt_correlation_df)
prompt_match_df

Unnamed: 0,model,matched,unmatched
0,Mistral-7B-Instruct-v0.1,0.29219,0.144642
1,Mistral-7B-v0.1,0.056877,0.084643
2,falcon-7b-instruct,0.219841,0.081424
3,falcon-7b,0.03937,0.038279
4,mpt-7b-chat,0.049286,0.044218
5,mpt-7b-instruct,0.134113,0.067884
6,mpt-7b,-0.04722,-0.047427


In [14]:
# Per-model paired t-test (ttest_rel) of same-context vs mismatched-context scores.
prompt_test_results_df

Unnamed: 0,model,t_stat,p_value
0,Mistral-7B-Instruct-v0.1,4.810007,3e-06
1,Mistral-7B-v0.1,-1.430651,0.154099
2,falcon-7b-instruct,4.687167,5e-06
3,falcon-7b,0.039468,0.968557
4,mpt-7b-chat,0.580388,0.56231
5,mpt-7b-instruct,2.836148,0.005038
6,mpt-7b,0.009561,0.992381
