In [None]:
import pandas as pd
import numpy as np
import ast
import json
from tqdm import tqdm
from src.topicbench.label_topics import label_topics
from src.topicbench.validate import score_similarity, compute_alignment

In [None]:
# Specify the models that will be benchmarked
models_config = {
    'llama3.2:latest': {'api_key_path': None, 'type': 'local'},
    # 'gpt-oss:20b': {'api_key_path': None, 'type': 'local'},
    # 'gpt-3.5-turbo': {'api_key_path': '../../openai_key.txt', 'type': 'api'},
}

In [None]:
# Load the TopicBench dataset
df = pd.read_csv('../data/data_cleaned.csv')

In [50]:
# Label topics using each model and store results
for model_name, config in models_config.items():
    print(f"Processing with {model_name}...")
    results_df = label_topics(df, model_name=model_name, API_KEY_PATH=config['api_key_path'])
    print(f"Completed {model_name}\n")

Processing with llama3.2:latest...


Labeling topics with llama3.2:latest: 100%|██████████| 5/5 [00:47<00:00,  9.50s/it]

Completed llama3.2:latest






In [None]:
# Compute similarity scores for author vs alt_human and author vs each AI model

# Define the metric to use
metric = 'cosine'

def safe_parse_labels(label_string):
    """Safely parse label strings that may contain apostrophes or other special characters."""
    try:
        # First try ast.literal_eval
        return ast.literal_eval(label_string)
    except (ValueError, SyntaxError):
        try:
            # If that fails, try json.loads (need to replace single quotes with double quotes)
            return json.loads(label_string.replace("'", '"'))
        except json.JSONDecodeError:
            # If both fail, return the string as-is wrapped in a list
            print(f"Warning: Could not parse: {label_string[:100]}")
            return [label_string]

# For each model in the config, compute similarity scores
for model_name in models_config.keys():
    all_scores = []
    
    for idx in tqdm(range(len(results_df)), desc=f"Computing similarity for {model_name}"):
        row = results_df.iloc[idx]
        
        # Parse the label lists (they are stored as strings)
        author_labels = safe_parse_labels(row['author_label'])
        comparison_labels = safe_parse_labels(row[model_name])
        
        # Compute pairwise similarities between corresponding labels
        # Each topic's label is compared to the corresponding author label
        label_similarities = []
        for author_label, comp_label in zip(author_labels, comparison_labels):
            score = score_similarity(author_label, comp_label, metric=metric)
            label_similarities.append(float(score))
        
        # Store the list of scores for this row
        all_scores.append(label_similarities)
    
    # Add scores as a new column with format: model_name_metric_scores
    results_df[f'{model_name}_{metric}_similarity'] = all_scores

# Also compute for alt_human
alt_human_scores = []
for idx in tqdm(range(len(results_df)), desc="Computing similarity for alt_human"):
    row = results_df.iloc[idx]
    
    # Parse the label lists
    author_labels = safe_parse_labels(row['author_label'])
    alt_human_labels = safe_parse_labels(row['alt_human'])
    
    # Compute pairwise similarities
    label_similarities = []
    for author_label, alt_label in zip(author_labels, alt_human_labels):
        score = score_similarity(author_label, alt_label, metric=metric)
        label_similarities.append(float(score))
    
    alt_human_scores.append(label_similarities)

results_df[f'alt_human_{metric}_similarity'] = alt_human_scores

print("Similarity computations complete!")

Computing similarity for llama3.2:latest: 100%|██████████| 5/5 [00:14<00:00,  2.83s/it]
Computing similarity for alt_human: 100%|██████████| 5/5 [00:14<00:00,  2.86s/it]

Similarity computations complete!





In [53]:
results_df.head()

Unnamed: 0,paper_title,field,keywords,author_label,alt_human,llama3.2:latest,llama3.2:latest_cosine_similarity,alt_human_cosine_similarity
0,almquist_bagozzi_2019,sociology,"[['one', 'made', 'anoth', 'everi', 'side', 'ti...","['Inspirational Language', 'Group Identity Deb...","['Motivational Rhetoric', 'Collective Identity...","['Social Movement', 'Activism', 'Protest', 'Re...","[0.22654280066490173, 0.4005390405654907, 0.34...","[0.4601529836654663, 0.857661247253418, 0.7370..."
1,dinsa_2024,medicine,"[['body', 'came', 'dries up', 'rocking', 'time...","['Nervous disease', 'Gynecology', 'Mental illn...","['Fatigue & General Body Weakness', 'Pregnancy...","['Gastrointestinal Issues', 'Menstrual Cycle',...","[0.15294265747070312, 0.41162168979644775, 0.3...","[0.2469634711742401, 0.31719857454299927, 0.11..."
2,farrell_2015,environmental science,"[['one', 'will', 'peopl', 'can', 'just', 'get'...","[""People's Knowledge"", 'Skeptical of IPCC Scie...","['General Discourse', 'Climate Science & IPCC ...","['Environmental Impact', 'Climate Change', 'Gl...","[0.14470866322517395, 0.19482237100601196, 0.4...","[0.28594285249710083, 0.7122806906700134, 0.35..."
3,liao_2022,hci,"[['can', 'get', 'us', 'cant', 'work', 'please'...","['Problem solving', 'Desperate effort to log i...","['Technical Support & DNS Issues', 'App Login ...","['error', 'problem', 'network', 'server', 'tes...","[0.2958523631095886, 0.28478139638900757, 0.13...","[0.21905887126922607, 0.4882417917251587, 0.13..."
4,paul_girju_2009,nlp,"[['pragmatics', 'attitudes', 'meaning', 'seman...","['Pragmatics', 'Prosody', 'Psycholinguistics',...","['Pragmatics & Communicative Inference', 'Pros...","['pragmatics', 'attitudes', 'meaning', 'semant...","[1.0000001192092896, 0.22162874042987823, 0.26...","[0.766014575958252, 0.8201093673706055, 0.8852..."


In [54]:
# Apply compute_alignment element-wise to each score in the lists for all models
for model_name in models_config.keys():
    alignment_scores = []
    tau_values = []
    
    for idx, row in results_df.iterrows():
        human_scores = row['alt_human_cosine_similarity']
        ai_scores = row[f'{model_name}_cosine_similarity']
        
        # Ensure both are lists and have the same length
        if not isinstance(human_scores, list):
            human_scores = list(human_scores) if hasattr(human_scores, '__iter__') else [human_scores]
        if not isinstance(ai_scores, list):
            ai_scores = list(ai_scores) if hasattr(ai_scores, '__iter__') else [ai_scores]
        
        # Handle potential length mismatch
        min_len = min(len(human_scores), len(ai_scores))
        human_scores = human_scores[:min_len]
        ai_scores = ai_scores[:min_len]
        
        # Create a temporary dataframe with the individual scores
        temp_df = pd.DataFrame({
            'human_similarity': human_scores,
            'ai_similarity': ai_scores
        })
        
        # Apply compute_alignment to this temporary dataframe
        aligned_temp = compute_alignment(temp_df, human_col='human_similarity', ai_col='ai_similarity', tau=0)
        
        # Extract the alignment results and tau value as lists
        alignment_list = aligned_temp['AI_alignment'].tolist()
        tau_value = aligned_temp['tau'].iloc[0]
        
        alignment_scores.append(alignment_list)
        tau_values.append(tau_value)
    
    # Add the alignment scores and tau values to your results dataframe with model-specific names
    results_df[f'{model_name}_alignment_scores'] = alignment_scores
    results_df[f'{model_name}_tau_values'] = tau_values

In [67]:
# Calculate the mean of alignment scores for each row for all models
for model_name in models_config.keys():
    results_df[f'{model_name}_final_score'] = results_df[f'{model_name}_alignment_scores'].apply(lambda x: np.mean(x))

In [71]:
# Display results for all models
score_columns = [f'{model_name}_alignment_scores' for model_name in models_config.keys()] + \
                [f'{model_name}_final_score' for model_name in models_config.keys()]
results_df[['field', 'author_label', 'alt_human', 'llama3.2:latest', 'gpt-3.5-turbo', 'llama3.2:latest_final_score', 'gpt-3.5-turbo_final_score']].head()

Unnamed: 0,field,author_label,alt_human,llama3.2:latest,gpt-3.5-turbo,llama3.2:latest_final_score,gpt-3.5-turbo_final_score
0,sociology,"['Inspirational Language', 'Group Identity Deb...","['Motivational Rhetoric', 'Collective Identity...","['Social Movement', 'Activism', 'Resistance', ...","['movement and activism', 'environmental conse...",0.6,0.333333
1,medicine,"['Nervous disease', 'Gynecology', 'Mental illn...","['Fatigue & General Body Weakness', 'Pregnancy...","['Gastrointestinal issues', 'Pregnancy complic...","['Physical Symptoms', 'Pregnancy-related Issue...",0.222222,0.2
2,environmental science,"[""People's Knowledge"", 'Skeptical of IPCC Scie...","['General Discourse', 'Climate Science & IPCC ...","['Climate Change', 'Global Warming', 'Environm...","['general concepts', 'climate science research...",0.576923,0.461538
3,hci,"['Problem solving', 'Desperate effort to log i...","['Technical Support & DNS Issues', 'App Login ...","['technical issues', 'network problems', 'dns ...","['technical support', 'app usability', 'social...",0.4,0.3
4,nlp,"['Pragmatics', 'Prosody', 'Psycholinguistics',...","['Pragmatics & Communicative Inference', 'Pros...","['pragmatics', 'semantics', 'inference', 'comm...","['pragmatics', 'semantics', 'communication', '...",0.266667,0.4
