In [113]:
import pandas as pd

# Load the CSV file
df = pd.read_csv("C:/Users/Gebruiker/Downloads/DiffColNames2.csv")
# Show the first few rows

print(df.columns.tolist())

['Question ', 'Step1', 'Step2', 'Step3', 'Step4', 'Step5', 'Correct_Answer', 'Question_Type', 'Question_origin', 'Step_origin', '# of reasoning steps', 'Sal_NoCOT_Answer', 'Sal_NoCOT_Correctness', 'Sal_NoCOT_P', 'Sal_NoCOT_Tokens', 'Sal_COT_Answer', 'Sal_COT_Correctness', 'Sal_COT_P', 'Sal_COT_Tokens', 'Sal_NoMost_Answer', 'Sal_NoMost_Correctness', 'Sal_NoMost_P', 'Sal_NoMost_Tokens', 'Sal_MaxStep', 'Score', 'Sal_MinStep', 'Sal_NoLeast_Answer', 'Sal_NoLeast_Correctness', 'Sal_NoLeast_P', 'Sal_NoLeast_Tokens', 'Grad_NoCOT_Answer', 'Grad_NoCOT_Correctness', 'Grad_NoCOT_P', 'Grad_NoCOT_Tokens', 'Grad_COT_Answer', 'Grad_COT_Correctness', 'Grad_COT_P', ' Grad_COT_Tokens', 'Grad_MaxStep', 'Score.1', 'Grad_NoMost_Answer', 'Grad_NoMost_Correctness', 'Grad_NoMost_P', 'Grad_NoMost_Tokens']


In [115]:
# Define the relevant columns for correctness for both Saliency and Gradients
saliency_columns = [
    'Sal_NoCOT_Correctness', 'Sal_COT_Correctness', 'Sal_NoMost_Correctness', 'Sal_NoLeast_Correctness'
]
gradient_columns = [
    'Grad_NoCOT_Correctness', 'Grad_COT_Correctness', 'Grad_NoMost_Correctness'
]

# Initialize a dictionary to hold the results
proportions = {}

# Calculate the proportion of correct answers for saliency conditions
for col in saliency_columns:
    saliency_correct = df.groupby('Question_Type')[col].mean()  # Mean gives the proportion of correct answers
    proportions[f"Saliency - {col}"] = saliency_correct

# Calculate the proportion of correct answers for gradient conditions
for col in gradient_columns:
    gradient_correct = df.groupby('Question_Type')[col].mean()  # Mean gives the proportion of correct answers
    proportions[f"Gradient - {col}"] = gradient_correct

# Convert the dictionary to a DataFrame for easier inspection
proportion_df = pd.DataFrame(proportions)

# Display the result
print(proportion_df)


                 Saliency - Sal_NoCOT_Correctness  \
Question_Type                                       
Math                                         0.52   
Multiple Choice                              0.92   
Open Ended                                   0.72   
T/F                                          0.68   

                 Saliency - Sal_COT_Correctness  \
Question_Type                                     
Math                                       1.00   
Multiple Choice                            1.00   
Open Ended                                 0.84   
T/F                                        0.84   

                 Saliency - Sal_NoMost_Correctness  \
Question_Type                                        
Math                                          0.68   
Multiple Choice                               0.96   
Open Ended                                    0.76   
T/F                                           0.72   

                 Saliency - Sal_NoLeast_Correctne

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr


# List all configurations
configs = [
    'Sal_NoCOT', 'Sal_COT', 'Sal_NoMost', 'Sal_NoLeast', 
    'Grad_NoCOT', 'Grad_COT', 'Grad_NoMost'
]

# Get unique question types
question_types = df['Question_Type'].unique()
print(f"Analyzing {len(question_types)} question types: {question_types}")

# For each question type, create a separate table
for q_type in question_types:
    print(f"\n\n--- Analysis for Question Type: {q_type} ---")
    
    # Filter data for this question type
    df_q = df[df['Question_Type'] == q_type]
    
    # Create empty lists to store results
    results = []
    
    # For each configuration, calculate statistics
    for config in configs:
        # Skip if the column doesn't exist
        if f'{config}_Correctness' not in df_q.columns or f'{config}_P' not in df_q.columns:
            continue
        
        # Clean data by removing NaN and infinite values
        valid_data = df_q[[f'{config}_Correctness', f'{config}_P']].dropna()
        valid_data = valid_data[~np.isinf(valid_data[f'{config}_P'])]
        
        # Check if we have enough data and both correct and incorrect samples
        correct_samples = valid_data[valid_data[f'{config}_Correctness'] == 1]
        incorrect_samples = valid_data[valid_data[f'{config}_Correctness'] == 0]
        
        if len(correct_samples) > 0 and len(incorrect_samples) > 0:
            # Get mean probability for correct and incorrect answers
            mean_p_correct = correct_samples[f'{config}_P'].mean()
            mean_p_incorrect = incorrect_samples[f'{config}_P'].mean()
            
            # Calculate correlation between correctness and probability
            try:
                corr, p_val = pearsonr(valid_data[f'{config}_Correctness'], valid_data[f'{config}_P'])
            except (ValueError, TypeError) as e:
                print(f"Error calculating correlation for {config}: {e}")
                corr, p_val = np.nan, np.nan
            
            # Calculate accuracy
            accuracy = valid_data[f'{config}_Correctness'].mean()
            
            # Count of samples
            total_samples = len(valid_data)
            original_samples = len(df_q)
            dropped_samples = original_samples - total_samples
            
            # Add to results
            results.append({
                'Configuration': config,
                'Attribution': config.split('_')[0],
                'COT_Type': '_'.join(config.split('_')[1:]),
                'Accuracy': accuracy,
                'Mean_P_Correct': mean_p_correct,
                'Mean_P_Incorrect': mean_p_incorrect,
                'P_Diff': mean_p_correct - mean_p_incorrect,
                'Correlation': corr,
                'P_Value': p_val,
                'Valid_Samples': total_samples,
                'Dropped_Samples': dropped_samples
            })
        else:
            print(f"Warning: {config} for {q_type} has insufficient data for correlation analysis")
    
    # Create a DataFrame from results
    if results:
        results_df = pd.DataFrame(results)
        
        # Sort by attribution method and then by correlation strength (handle NaN values)
        results_df = results_df.sort_values(['Attribution', 'Correlation'], 
                                            ascending=[True, False], 
                                            na_position='last')
        
        # Format for readability
        formatted_df = results_df.copy()
        formatted_df['Accuracy'] = formatted_df['Accuracy'].map(lambda x: '{:.2%}'.format(x) if not pd.isna(x) else 'N/A')
        formatted_df['Mean_P_Correct'] = formatted_df['Mean_P_Correct'].map(lambda x: '{:.4f}'.format(x) if not pd.isna(x) else 'N/A')
        formatted_df['Mean_P_Incorrect'] = formatted_df['Mean_P_Incorrect'].map(lambda x: '{:.4f}'.format(x) if not pd.isna(x) else 'N/A')
        formatted_df['P_Diff'] = formatted_df['P_Diff'].map(lambda x: '{:.4f}'.format(x) if not pd.isna(x) else 'N/A')
        formatted_df['Correlation'] = formatted_df['Correlation'].map(lambda x: '{:.3f}'.format(x) if not pd.isna(x) else 'N/A')
        formatted_df['P_Value'] = formatted_df['P_Value'].map(lambda x: '{:.3f}'.format(x) if not pd.isna(x) else 'N/A')
        
        # Display the table
        print(formatted_df.to_string(index=False))
    else:
        print(f"No results for question type {q_type}")

Analyzing 5 question types: ['T/F' 'Multiple Choice' 'Open Ended' 'Math' nan]


--- Analysis for Question Type: T/F ---
Configuration Attribution COT_Type Accuracy Mean_P_Correct Mean_P_Incorrect P_Diff Correlation P_Value  Valid_Samples  Dropped_Samples
     Grad_COT        Grad      COT   68.00%         0.9103           0.6093 0.3010       0.543   0.005             25                0
  Grad_NoMost        Grad   NoMost   72.00%         0.9113           0.7045 0.2068       0.423   0.035             25                0
   Grad_NoCOT        Grad    NoCOT   56.00%         0.8242           0.6571 0.1671       0.335   0.102             25                0
   Sal_NoMost         Sal   NoMost   72.00%         0.9289           0.7845 0.1444       0.378   0.063             25                0
    Sal_NoCOT         Sal    NoCOT   68.00%         0.7704           0.7232 0.0472       0.090   0.667             25                0
      Sal_COT         Sal      COT   84.00%         0.8476           0

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency

#filter rows where we have both Sal_MaxStep and Grad_MaxStep
valid_df = df.dropna(subset=['Sal_MaxStep', 'Grad_MaxStep'])

# Check how many rows remain after filtering
print(f"Total rows with both attribution methods: {len(valid_df)} (out of {len(df)} total rows)")

# Calculate agreement rate
agreement_count = (valid_df['Sal_MaxStep'] == valid_df['Grad_MaxStep']).sum()
agreement_rate = agreement_count / len(valid_df)

print(f"\nOverall agreement rate: {agreement_rate:.2%} ({agreement_count}/{len(valid_df)})")

# break it down by question type
question_types = valid_df['Question_Type'].unique()
print("\nAgreement by Question Type:")
print("--------------------------")

for q_type in question_types:
    q_df = valid_df[valid_df['Question_Type'] == q_type]
    if len(q_df) > 0:
        q_agreement = (q_df['Sal_MaxStep'] == q_df['Grad_MaxStep']).sum()
        q_agreement_rate = q_agreement / len(q_df)
        print(f"{q_type}: {q_agreement_rate:.2%} ({q_agreement}/{len(q_df)})")

# Create a contingency table to see which steps were identified as most salient
print("\nContingency Table (Sal_MaxStep vs Grad_MaxStep):")
print("----------------------------------------------")
contingency_table = pd.crosstab(valid_df['Sal_MaxStep'], valid_df['Grad_MaxStep'])
print(contingency_table)

# Calculate chi-square test to see if there's a significant relationship
chi2, p, dof, expected = chi2_contingency(contingency_table)
print(f"\nChi-square test: chi2={chi2:.2f}, p={p:.4f}")

# Create a visualization of the agreement
plt.figure(figsize=(10, 6))
agreement_by_type = []
for q_type in question_types:
    q_df = valid_df[valid_df['Question_Type'] == q_type]
    if len(q_df) > 0:
        q_agreement_rate = (q_df['Sal_MaxStep'] == q_df['Grad_MaxStep']).mean()
        agreement_by_type.append({'Question Type': q_type, 'Agreement Rate': q_agreement_rate})

agreement_df = pd.DataFrame(agreement_by_type)
agreement_df = agreement_df.sort_values('Agreement Rate', ascending=False)

# Calculate percentage of questions where each step was identified as most important
print("\nPercentage of questions where each step was identified as most important:")
print("----------------------------------------------------------------------")
total_questions = len(valid_df)

print("Saliency method:")
sal_step_counts = valid_df['Sal_MaxStep'].value_counts().sort_index()
for step, count in sal_step_counts.items():
    print(f"Step {step}: {count/total_questions:.2%} ({count}/{total_questions})")

print("\nGradient method:")
grad_step_counts = valid_df['Grad_MaxStep'].value_counts().sort_index()
for step, count in grad_step_counts.items():
    print(f"Step {step}: {count/total_questions:.2%} ({count}/{total_questions})")

# Let's also include a more detailed analysis of step preference by question type
print("\nStep preference by question type:")
for q_type in question_types:
    q_df = valid_df[valid_df['Question_Type'] == q_type]
    if len(q_df) > 0:
        print(f"\n{q_type} (n={len(q_df)}):")
        
        print("Saliency method:")
        q_sal_counts = q_df['Sal_MaxStep'].value_counts().sort_index()
        for step, count in q_sal_counts.items():
            print(f"  Step {step}: {count/len(q_df):.2%} ({count}/{len(q_df)})")
        
        print("Gradient method:")
        q_grad_counts = q_df['Grad_MaxStep'].value_counts().sort_index()
        for step, count in q_grad_counts.items():
            print(f"  Step {step}: {count/len(q_df):.2%} ({count}/{len(q_df)})")

Total rows with both attribution methods: 78 (out of 101 total rows)

Overall agreement rate: 79.49% (62/78)

Agreement by Question Type:
--------------------------
T/F: 96.00% (24/25)
Multiple Choice: 100.00% (14/14)
Open Ended: 52.00% (13/25)
Math: 78.57% (11/14)

Contingency Table (Sal_MaxStep vs Grad_MaxStep):
----------------------------------------------
Grad_MaxStep  1.0  2.0  3.0  4.0  5.0
Sal_MaxStep                          
2.0             0    2    0    1    0
3.0             2    3   13    0    0
4.0             5    0    0   44    0
5.0             1    1    2    1    3

Chi-square test: chi2=104.26, p=0.0000

Percentage of questions where each step was identified as most important:
----------------------------------------------------------------------
Saliency method:
Step 2.0: 3.85% (3/78)
Step 3.0: 23.08% (18/78)
Step 4.0: 62.82% (49/78)
Step 5.0: 10.26% (8/78)

Gradient method:
Step 1.0: 10.26% (8/78)
Step 2.0: 7.69% (6/78)
Step 3.0: 19.23% (15/78)
Step 4.0: 58.97% (4

<Figure size 1000x600 with 0 Axes>

In [None]:
import pandas as pd
import numpy as np

results = []

# Get unique question types
question_types = df['Question_Type'].unique()

for q_type in question_types:
    # Filter for this question type
    q_df = df[df['Question_Type'] == q_type]
    
    # Drop rows with missing values in key columns
    clean_df = q_df.dropna(subset=[
        'Sal_COT_Correctness', 'Sal_NoMost_Correctness',
        'Grad_COT_Correctness', 'Grad_NoMost_Correctness'
    ])
    
    if len(clean_df) == 0:
        continue
    
    # Calculate performance drops when removing most important step
    sal_drop = clean_df['Sal_COT_Correctness'] - clean_df['Sal_NoMost_Correctness']
    grad_drop = clean_df['Grad_COT_Correctness'] - clean_df['Grad_NoMost_Correctness']
    
    # Calculate average drops
    avg_sal_drop = sal_drop.mean()
    avg_grad_drop = grad_drop.mean()
    
    # Count cases where removing the step hurts performance
    sal_faithful_count = (sal_drop > 0).sum()
    grad_faithful_count = (grad_drop > 0).sum()
    
    # Store results
    results.append({
        'Question Type': q_type,
        'Count': len(clean_df),
        'Sal Avg Drop': round(avg_sal_drop, 3),
        'Grad Avg Drop': round(avg_grad_drop, 3),
        'Sal Faithful %': round(sal_faithful_count / len(clean_df) * 100, 1),
        'Grad Faithful %': round(grad_faithful_count / len(clean_df) * 100, 1),
        'More Faithful Method': 'Saliency' if avg_sal_drop > avg_grad_drop else 'Gradient'
    })

# Create a DataFrame with the results
faithfulness_df = pd.DataFrame(results)

print("Faithfulness Comparison: Saliency vs Gradient Attribution")
print("--------------------------------------------------------")
print(faithfulness_df.to_string(index=False))

# Calculate overall results
overall = {
    'Question Type': 'OVERALL',
    'Count': faithfulness_df['Count'].sum(),
    'Sal Avg Drop': round(np.average(faithfulness_df['Sal Avg Drop'], weights=faithfulness_df['Count']), 3),
    'Grad Avg Drop': round(np.average(faithfulness_df['Grad Avg Drop'], weights=faithfulness_df['Count']), 3)
}

# Calculate overall faithful percentages
weighted_sal_faithful = sum(row['Sal Faithful %'] * row['Count'] for _, row in faithfulness_df.iterrows()) / overall['Count']
weighted_grad_faithful = sum(row['Grad Faithful %'] * row['Count'] for _, row in faithfulness_df.iterrows()) / overall['Count']

overall['Sal Faithful %'] = round(weighted_sal_faithful, 1)
overall['Grad Faithful %'] = round(weighted_grad_faithful, 1)
overall['More Faithful Method'] = 'Saliency' if overall['Sal Avg Drop'] > overall['Grad Avg Drop'] else 'Gradient'

# Add overall row to the bottom
faithfulness_df = pd.concat([faithfulness_df, pd.DataFrame([overall])], ignore_index=True)

print("\nFinal Results (including overall):")
print(faithfulness_df.to_string(index=False))

Faithfulness Comparison: Saliency vs Gradient Attribution
--------------------------------------------------------
  Question Type  Count  Sal Avg Drop  Grad Avg Drop  Sal Faithful %  Grad Faithful % More Faithful Method
            T/F     25         0.120         -0.040            12.0             12.0             Saliency
Multiple Choice     14         0.071          0.000             7.1              0.0             Saliency
     Open Ended     25         0.080          0.040             8.0              8.0             Saliency
           Math     14         0.429          0.071            42.9              7.1             Saliency

Final Results (including overall):
  Question Type  Count  Sal Avg Drop  Grad Avg Drop  Sal Faithful %  Grad Faithful % More Faithful Method
            T/F     25         0.120         -0.040            12.0             12.0             Saliency
Multiple Choice     14         0.071          0.000             7.1              0.0             Saliency
 