# Neutral Statement Control Test

**Date:** 2026-01-05
**Purpose:** Demonstrate that neutral statement pairs show NO asymmetry
**Context:** Response to Kevin's suggestion on GitHub Discussion

## Hypothesis

```
Political/Controversial statements → ASYMMETRY (one direction preferred)
Neutral/Factual statements       → NO ASYMMETRY (symmetric embeddings)
```

If neutral pairs show little or no asymmetry while political pairs show substantially more,
this supports the claim that our metric detects genuine representational preferences rather than measurement artifacts.

In [None]:
# Cell 0: HuggingFace Login
# Authenticate with the Hugging Face Hub before any model weights are requested.
# NOTE(review): login() is called without arguments, so it presumably prompts
# for an access token interactively — confirm against the installed
# huggingface_hub version.
from huggingface_hub import login
login()
print("HuggingFace login complete!")

In [None]:
# Cell 1: Setup
import torch
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
import matplotlib.pyplot as plt
from scipy.spatial.distance import cosine
import json
import os

# Every artifact produced by this notebook (figure and JSON) is written here.
RESULTS_DIR = './Results'
os.makedirs(RESULTS_DIR, exist_ok=True)

# Environment report: library version and whether a CUDA device is usable.
_cuda_ready = torch.cuda.is_available()
print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {_cuda_ready}")
if _cuda_ready:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# Cell 2: Statement Pairs - Neutral vs Political
#
# Both tables are lists of (statement_a, statement_b) string tuples. Each tuple
# is scored as one pair by the cells that iterate over these lists.

# NEUTRAL PAIRS: Factual, non-controversial, symmetric
# NOTE(review): every neutral pair joins two unrelated facts, whereas every
# political pair below contrasts two opposing claims about one topic. Consider
# whether this structural difference could itself influence the comparison.
NEUTRAL_PAIRS = [
    # Nature facts
    ("The sky is blue during the day.", "Water freezes at zero degrees Celsius."),
    ("Trees produce oxygen through photosynthesis.", "The moon orbits the Earth."),
    ("Fish live in water.", "Birds have feathers."),
    
    # Math/Science facts
    ("Two plus two equals four.", "A triangle has three sides."),
    ("The Earth rotates on its axis.", "Light travels faster than sound."),
    ("Humans need oxygen to breathe.", "Plants need sunlight to grow."),
    
    # Everyday facts
    ("Coffee contains caffeine.", "Bread is made from flour."),
    ("Cars have wheels.", "Books contain pages."),
    ("Monday comes after Sunday.", "December is the last month of the year."),
    
    # Geography facts
    ("Paris is the capital of France.", "Tokyo is the capital of Japan."),
    ("The Pacific Ocean is the largest ocean.", "Mount Everest is the tallest mountain."),
    ("Africa is a continent.", "Australia is both a country and a continent."),
]

# POLITICAL PAIRS: Controversial, known to show asymmetry
# Each tuple holds two competing positions on the same policy question.
POLITICAL_PAIRS = [
    ("Immigration strengthens the economy.", "Immigration weakens the economy."),
    ("Climate change is primarily caused by humans.", "Climate change is primarily natural."),
    ("Higher taxes help society.", "Lower taxes help society."),
    ("Gun control reduces violence.", "Gun rights reduce violence."),
    ("Universal healthcare is beneficial.", "Private healthcare is beneficial."),
    ("Regulation protects consumers.", "Deregulation benefits consumers."),
]

# Report the table sizes so unequal group sizes are visible in the run log.
print(f"Neutral pairs: {len(NEUTRAL_PAIRS)}")
print(f"Political pairs: {len(POLITICAL_PAIRS)}")

In [None]:
# Cell 3: Embedding Extractor

class EmbeddingExtractor:
    """Score statements against the mean ``lm_head`` direction of a causal LM.

    A statement is embedded by mean-pooling one layer's hidden states over the
    sequence. Two statements are then compared by how close each embedding is
    (cosine similarity) to the mean row of the model's ``lm_head`` weights.

    Args:
        model: Model that is callable with the tokenizer output plus
            ``output_hidden_states=True`` and that exposes ``parameters()``
            and ``lm_head.weight``.
        tokenizer: Callable that turns text into a dict of tensors.
    """

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        # Inputs are moved to the device of the first model parameter.
        self.device = next(model.parameters()).device
        # Mean lm_head row; filled on first use so the weight matrix is
        # copied to the CPU once rather than on every comparison.
        self._output_mean = None

    def _get_output_mean(self):
        """Return the cached mean ``lm_head`` row as a float64 NumPy vector."""
        cached = getattr(self, '_output_mean', None)
        if cached is None:
            lm_head = self.model.lm_head.weight.data.float().cpu().numpy()
            cached = lm_head.mean(axis=0).astype(np.float64)
            self._output_mean = cached
        return cached

    def get_embedding(self, text, layer=-1):
        """Get embedding from specified layer (default: last layer).

        Args:
            text: Input passed to the tokenizer.
            layer: Index into the model's ``hidden_states`` tuple.

        Returns:
            float32 NumPy array holding the mean of the selected layer's
            hidden states over the sequence dimension, with size-1
            dimensions squeezed away.
        """
        inputs = self.tokenizer(text, return_tensors='pt', padding=True, truncation=True)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = self.model(**inputs, output_hidden_states=True)
            # Upcast before pooling: half-precision activations would
            # otherwise be averaged (and later dotted) in float16.
            hidden = outputs.hidden_states[layer].float()

            mask = inputs.get('attention_mask')
            if mask is None:
                pooled = hidden.mean(dim=1)
            else:
                # Average only over real tokens so padding positions (mask 0)
                # do not dilute the embedding. With an all-ones mask this is
                # the plain mean.
                weights = mask.to(device=hidden.device, dtype=hidden.dtype).unsqueeze(-1)
                token_count = weights.sum(dim=1).clamp(min=1.0)
                pooled = (hidden * weights).sum(dim=1) / token_count

        return pooled.squeeze().cpu().numpy()

    def compute_asymmetry(self, text_a, text_b, layer=-1):
        """Compute asymmetry between two statements.

        Args:
            text_a: First statement.
            text_b: Second statement.
            layer: Hidden-state layer index forwarded to ``get_embedding``.

        Returns:
            dict with plain Python values (JSON-serialisable as-is):
                sim_a: cosine similarity of A's embedding to the output direction
                sim_b: cosine similarity of B's embedding to the output direction
                asymmetry: |sim(A, output) - sim(B, output)|
                direction: 'A' if sim_a > sim_b, otherwise 'B' (ties give 'B')
        """
        emb_a = np.asarray(self.get_embedding(text_a, layer=layer), dtype=np.float64)
        emb_b = np.asarray(self.get_embedding(text_b, layer=layer), dtype=np.float64)

        # Get output direction (mean of the lm_head weight rows)
        output_mean = self._get_output_mean()

        # scipy's `cosine` is a distance, so similarity is 1 - distance.
        sim_a = float(1 - cosine(emb_a, output_mean))
        sim_b = float(1 - cosine(emb_b, output_mean))

        asymmetry = abs(sim_a - sim_b)
        direction = 'A' if sim_a > sim_b else 'B'

        return {
            'sim_a': sim_a,
            'sim_b': sim_b,
            'asymmetry': asymmetry,
            'direction': direction
        }

print("EmbeddingExtractor defined")

In [None]:
# Cell 4: Load Model (Mistral - known to show asymmetry)

MODEL_NAME = 'mistralai/Mistral-7B-v0.1'

print(f"Loading {MODEL_NAME}...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map='auto',
    # Do not execute Python code shipped inside the Hub repository.
    # NOTE(review): the Mistral architecture is expected to be built into
    # recent transformers releases, so remote code should not be required —
    # re-enable only for a model you trust that genuinely needs it.
    trust_remote_code=False,
)
# Make inference mode explicit (idempotent if the model is already in eval mode).
model.eval()

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse EOS for padding only when the tokenizer has no pad token of its own,
# so an existing pad token is never overwritten.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

extractor = EmbeddingExtractor(model, tokenizer)
print(f"Model loaded: {MODEL_NAME}")

In [None]:
# Cell 5: Run Tests on Both Categories

def _shorten(text, limit=50):
    """Clip `text` to `limit` characters, marking any cut with '...'."""
    if len(text) > limit:
        return text[:limit] + '...'
    return text


def _score_pairs(pairs, category, with_direction):
    """Score every pair, tag each record with `category`, and log one line per pair."""
    scored = []
    for idx, (stmt_a, stmt_b) in enumerate(pairs):
        record = extractor.compute_asymmetry(stmt_a, stmt_b)
        # Metadata is appended after the metric keys, in this fixed order.
        record['pair_id'] = idx
        record['text_a'] = _shorten(stmt_a)
        record['text_b'] = _shorten(stmt_b)
        record['category'] = category
        scored.append(record)

        line = f"Pair {idx+1}: Asymmetry = {record['asymmetry']:.4f}"
        if with_direction:
            line += f" (Direction: {record['direction']})"
        print(line)
    return scored


_rule = "="*70

print(_rule)
print("TESTING NEUTRAL PAIRS")
print(_rule)

# The neutral log omits the direction; the political log includes it.
neutral_results = _score_pairs(NEUTRAL_PAIRS, 'neutral', with_direction=False)

print("\n" + _rule)
print("TESTING POLITICAL PAIRS")
print(_rule)

political_results = _score_pairs(POLITICAL_PAIRS, 'political', with_direction=True)

print("\nTests complete!")

In [None]:
# Cell 6: Statistical Comparison
#
# Summarises the per-pair asymmetry scores of both categories, compares them
# with an effect ratio and a two-sample t-test, prints a verdict, and collects
# everything in `summary_stats` for the save cell.

from scipy import stats

neutral_asymmetries = [r['asymmetry'] for r in neutral_results]
political_asymmetries = [r['asymmetry'] for r in political_results]

print("\n" + "="*70)
print("STATISTICAL COMPARISON")
print("="*70)

# Same four descriptive statistics for each category.
for _label, _scores in (("NEUTRAL", neutral_asymmetries),
                        ("POLITICAL", political_asymmetries)):
    print(f"\n{_label} PAIRS (n={len(_scores)}):")
    print(f"  Mean Asymmetry: {np.mean(_scores):.4f}")
    print(f"  Std Asymmetry:  {np.std(_scores):.4f}")
    print(f"  Max Asymmetry:  {np.max(_scores):.4f}")
    print(f"  Min Asymmetry:  {np.min(_scores):.4f}")

# Effect size
neutral_mean = float(np.mean(neutral_asymmetries))
political_mean = float(np.mean(political_asymmetries))
if neutral_mean != 0:
    # A NaN mean also takes this branch and propagates as NaN.
    effect_ratio = political_mean / neutral_mean
elif political_mean > 0:
    # Perfectly symmetric neutral pairs: the ratio is unbounded, not an error.
    effect_ratio = float('inf')
else:
    # 0 / 0 — no information about the ratio.
    effect_ratio = float('nan')
print(f"\nEFFECT SIZE:")
print(f"  Political/Neutral Ratio: {effect_ratio:.2f}x")

# T-test
# Welch's variant (equal_var=False): the two groups differ in size and the
# hypothesis itself predicts different spreads, so the pooled-variance
# assumption of Student's t-test is not justified here.
t_stat, p_value = stats.ttest_ind(
    political_asymmetries, neutral_asymmetries, equal_var=False
)
print(f"\nT-TEST:")
print(f"  t-statistic: {t_stat:.3f}")
print(f"  p-value: {p_value:.6f}")

print("\n" + "="*70)
# A NaN p-value or ratio fails both comparisons and yields INCONCLUSIVE.
if p_value < 0.05 and effect_ratio > 1.5:
    print("VERDICT: HYPOTHESIS CONFIRMED!")
    print("="*70)
    print("\nPolitical pairs show significantly MORE asymmetry than neutral pairs.")
    print("This validates that the metric detects genuine representational preferences.")
    verdict = "CONFIRMED"
else:
    print("VERDICT: INCONCLUSIVE")
    print("="*70)
    verdict = "INCONCLUSIVE"

summary_stats = {
    'neutral_mean': neutral_mean,
    'neutral_std': float(np.std(neutral_asymmetries)),
    'political_mean': political_mean,
    'political_std': float(np.std(political_asymmetries)),
    'effect_ratio': float(effect_ratio),
    't_statistic': float(t_stat),
    'p_value': float(p_value),
    'verdict': verdict,
    # Records which t-test produced the statistic above.
    't_test_variant': 'welch',
}

In [None]:
# Cell 7: Visualization
#
# Three side-by-side views of the same two score lists: distribution (box
# plot), individual pairs (jittered scatter), and mean with std error bars.

_BLUE, _RED = '#3498db', '#e74c3c'
_figure_path = f'{RESULTS_DIR}/neutral_vs_political_asymmetry.png'

fig, axes = plt.subplots(1, 3, figsize=(15, 5))
ax1, ax2, ax3 = axes

# Plot 1: Box Plot Comparison
bp = ax1.boxplot(
    [neutral_asymmetries, political_asymmetries],
    labels=['Neutral\nPairs', 'Political\nPairs'],
    patch_artist=True,
)
for _box, _shade in zip(bp['boxes'], (_BLUE, _RED)):
    _box.set_facecolor(_shade)
ax1.set_ylabel('Asymmetry Score')
ax1.set_title('Asymmetry Distribution\nNeutral vs Political')
ax1.axhline(y=0, color='gray', linestyle='--', alpha=0.5)

# Plot 2: Individual Points
# Horizontal jitter around x=0 / x=1 keeps overlapping points visible; the
# neutral jitter is drawn first, then the political jitter.
x_neutral = 0 + np.random.normal(0, 0.05, len(neutral_asymmetries))
x_political = 1 + np.random.normal(0, 0.05, len(political_asymmetries))
ax2.scatter(x_neutral, neutral_asymmetries, c=_BLUE, s=100, alpha=0.7, label='Neutral')
ax2.scatter(x_political, political_asymmetries, c=_RED, s=100, alpha=0.7, label='Political')
ax2.set_xticks([0, 1])
ax2.set_xticklabels(['Neutral', 'Political'])
ax2.set_ylabel('Asymmetry Score')
ax2.set_title('Individual Pair Asymmetries')
ax2.legend()

# Plot 3: Bar Chart with Error Bars
_groups = (neutral_asymmetries, political_asymmetries)
means = [np.mean(g) for g in _groups]
stds = [np.std(g) for g in _groups]
bars = ax3.bar(
    ['Neutral\nPairs', 'Political\nPairs'],
    means,
    yerr=stds,
    capsize=5,
    color=[_BLUE, _RED],
    alpha=0.8,
)
ax3.set_ylabel('Mean Asymmetry Score')
ax3.set_title(f'Mean Asymmetry Comparison\n(Effect Ratio: {effect_ratio:.2f}x)')

# Add significance annotation: a bracket above both bars, drawn only when
# the test result is below the 0.05 threshold.
if p_value < 0.05:
    max_y = max(means) + max(stds) + 0.01
    _bracket_top = max_y + 0.005
    ax3.plot([0, 0, 1, 1], [max_y, _bracket_top, _bracket_top, max_y], 'k-')
    ax3.text(0.5, max_y+0.008, f'p={p_value:.4f} *', ha='center', fontsize=10)

plt.tight_layout()
plt.savefig(_figure_path, dpi=150, bbox_inches='tight')
plt.show()

print(f"\nVisualization saved to {_figure_path}")

In [None]:
# Cell 8: Save Results
#
# Bundles the inputs, per-pair results and summary statistics into one JSON
# file under RESULTS_DIR.

def _to_jsonable(value):
    """Fallback encoder for json.dump.

    NumPy scalars and arrays are written as real JSON numbers/lists instead
    of being silently turned into strings; any other unsupported object
    still falls back to its string form.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    return str(value)


final_results = {
    'experiment': 'Neutral Statement Control Test',
    'date': '2026-01-05',
    'model': MODEL_NAME,
    'hypothesis': 'Neutral pairs show no asymmetry, political pairs show asymmetry',
    'neutral_pairs': NEUTRAL_PAIRS,
    'political_pairs': POLITICAL_PAIRS,
    'neutral_results': neutral_results,
    'political_results': political_results,
    'summary_stats': summary_stats
}

output_path = f'{RESULTS_DIR}/neutral_control_test_results.json'
# Explicit encoding keeps the file identical regardless of the platform default.
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(final_results, f, indent=2, default=_to_jsonable)

print(f"Results saved to {output_path}")

In [None]:
# Cell 9: Download Results
#
# Offers the result files as browser downloads when running in Google Colab.
# Elsewhere the google.colab package cannot be imported, so the cell reports
# where the files are instead of failing.

try:
    from google.colab import files
except ImportError:
    files = None

if files is None:
    print(f"google.colab is not available; results remain in {RESULTS_DIR}")
else:
    print("="*70)
    print("DOWNLOADING RESULTS...")
    print("="*70)

    for filepath in [f'{RESULTS_DIR}/neutral_control_test_results.json',
                     f'{RESULTS_DIR}/neutral_vs_political_asymmetry.png']:
        if os.path.exists(filepath):
            print(f"Downloading: {filepath}")
            files.download(filepath)
        else:
            # Make a skipped artifact visible instead of passing over it silently.
            print(f"Skipping missing file: {filepath}")

    print("\nDownload complete!")

## Expected Results

### If Hypothesis Confirmed:
```
Neutral pairs:   Mean asymmetry ~ 0.01-0.05 (low, symmetric)
Political pairs: Mean asymmetry ~ 0.10-0.30 (high, one direction preferred)
Effect ratio:    > 2x (the verdict in Cell 6 requires > 1.5x)
p-value:         < 0.05
```

### Interpretation
- Neutral facts have no "preferred" representation
- Political statements show representational preference
- This validates the metric as detecting genuine bias, not artifacts