# Diversity Analysis for EU Legal Recommender

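This notebook analyzes the diversity of recommendations across different weight configurations. It reads `diversity_results.json` (or, as a fallback, per-client `*_diversity_results.json` files) from the working directory; run `calculate_diversity.py` first to generate them.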

In [2]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Plot styling: apply matplotlib's ggplot sheet first, then seaborn's
# whitegrid theme with fonts scaled by 1.2 (the seaborn call runs last).
plt.style.use('ggplot')
sns.set(style="whitegrid", font_scale=1.2)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## 1. Load Diversity Results

In [3]:
# Load the diversity results.
# Preferred source is the combined file; if it is missing (or holds no
# clients) fall back to one "<client_id>_diversity_results.json" per client.
try:
    # JSON is UTF-8 by specification; do not depend on the platform's locale encoding.
    with open('diversity_results.json', 'r', encoding='utf-8') as f:
        all_results = json.load(f)
    print(f"Loaded diversity results for {len(all_results)} clients")
    print(f"Client IDs: {list(all_results.keys())}")
except FileNotFoundError:
    print("Diversity results file not found. Please run calculate_diversity.py first.")
    all_results = {}

# Try to load individual client results if combined file not found
if not all_results:
    # Reset so the container is a dict even if the combined file held another empty value.
    all_results = {}
    # Sorted so client order (and therefore legend order/colours) is the same on every run;
    # glob() itself yields files in filesystem order.
    for client_file in sorted(Path('.').glob('*_diversity_results.json')):
        client_id = client_file.stem.replace('_diversity_results', '')
        with open(client_file, 'r', encoding='utf-8') as f:
            all_results[client_id] = json.load(f)

    if all_results:
        print(f"Loaded individual diversity results for {len(all_results)} clients")
        print(f"Client IDs: {list(all_results.keys())}")
    else:
        # Every later cell is guarded on the data being present, so say why nothing will show.
        print("No diversity result files found in the current directory.")

Loaded diversity results for 2 clients
Client IDs: ['renewable_energy_client', 'bottling_company_client']


## 2. Process Results into DataFrames

In [6]:
# Weight keys recorded for each sweep type. Every key listed here is copied
# from result['weights'] into a DataFrame column of the same name.
_WEIGHT_KEYS_BY_CONFIG = {
    'text_vs_categorical': ('text_weight', 'categorical_weight'),
    'summary_vs_keyword': ('summary_weight', 'keyword_weight'),
    'personalization': ('expert_weight', 'historical_weight', 'categorical_weight'),
}


def _diversity_row(client_id, config_type, weight_keys, result):
    """Flatten one sweep result into a single flat record (dict) for a DataFrame row."""
    row = {'client_id': client_id, 'config_type': config_type}
    weights = result['weights']
    for key in weight_keys:
        row[key] = weights[key]
    row['content_diversity'] = result['content_diversity']

    # Categorical diversity is either a mapping (one column per entry,
    # prefixed with 'cat_diversity_') or a single value ('cat_diversity').
    cat_diversity = result['categorical_diversity']
    if isinstance(cat_diversity, dict):
        for cat, value in cat_diversity.items():
            row[f'cat_diversity_{cat}'] = value
    else:
        row['cat_diversity'] = cat_diversity
    return row


def process_client_results(client_id, results):
    """Process a client's diversity results into dataframes.

    Args:
        client_id: Identifier stored in the 'client_id' column of every row.
        results: Mapping from configuration type ('text_vs_categorical',
            'summary_vs_keyword', 'personalization') to a list of result
            dicts. Each result dict must provide 'weights',
            'content_diversity' and 'categorical_diversity'.

    Returns:
        dict mapping configuration type to a DataFrame with one row per
        result. Configuration types that are absent from ``results`` or have
        no entries are omitted.

    Raises:
        KeyError: If a result dict lacks one of the required keys, or its
            'weights' lacks a weight expected for that configuration type.
    """
    dfs = {}
    for config_type, weight_keys in _WEIGHT_KEYS_BY_CONFIG.items():
        if config_type not in results:
            continue
        rows = [
            _diversity_row(client_id, config_type, weight_keys, result)
            for result in results[config_type]
        ]
        # Skip empty sweeps so callers never receive a column-less DataFrame.
        if rows:
            dfs[config_type] = pd.DataFrame(rows)
    return dfs

# Per-client frames, keyed first by client and then by configuration type
all_dfs = {
    client_id: process_client_results(client_id, results)
    for client_id, results in all_results.items()
}

# Stack each configuration type across clients into a single frame;
# configuration types that no client provides are left out entirely
combined_dfs = {}
for config_type in ('text_vs_categorical', 'summary_vs_keyword', 'personalization'):
    frames = [
        per_client[config_type]
        for per_client in all_dfs.values()
        if config_type in per_client
    ]
    if frames:
        combined_dfs[config_type] = pd.concat(frames, ignore_index=True)

# Preview the shape and first rows of every combined frame
for config_type, df in combined_dfs.items():
    print(f"\n{config_type} data shape: {df.shape}")
    display(df.head(3))

## 3. Analyze Text vs. Categorical Diversity

In [7]:
if 'text_vs_categorical' in combined_dfs:
    df = combined_dfs['text_vs_categorical']
    
    # Plot content diversity by text weight, one line per client
    plt.figure(figsize=(12, 6))
    for client_id in df['client_id'].unique():
        client_data = df[df['client_id'] == client_id]
        plt.plot(client_data['text_weight'], client_data['content_diversity'], 
                 marker='o', linestyle='-', label=client_id)
    
    plt.xlabel('Text Weight')
    plt.ylabel('Content Diversity (Cosine Distance)')
    plt.title('Impact of Text vs. Categorical Weights on Content Diversity')
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()
    
    # Plot the per-category diversity metrics if available.
    # 'cat_diversity_overall' is filtered out *before* the subplot grid is
    # sized: skipping it inside the loop left an empty panel row and could
    # drop the title (first panel) or the x-label (last panel) whenever the
    # skipped column sat in that position. The overall metric is reported in
    # the ranking table at the end of this cell instead.
    cat_cols = [
        col for col in df.columns
        if col.startswith('cat_diversity_') and col != 'cat_diversity_overall'
    ]
    if cat_cols:
        plt.figure(figsize=(12, 8))
        for i, cat_col in enumerate(cat_cols):
            plt.subplot(len(cat_cols), 1, i + 1)
            for client_id in df['client_id'].unique():
                client_data = df[df['client_id'] == client_id]
                plt.plot(client_data['text_weight'], client_data[cat_col], 
                         marker='o', linestyle='-', label=client_id)
            
            plt.ylabel(cat_col.replace('cat_diversity_', ''))
            if i == 0:
                plt.title('Impact of Text vs. Categorical Weights on Categorical Diversity')
            if i == len(cat_cols) - 1:
                plt.xlabel('Text Weight')
            plt.grid(True)
            plt.legend()
        
        plt.tight_layout()
        plt.show()
    
    # Identify most diverse configurations
    print("\nMost diverse configurations (text vs. categorical):")
    print("\nHighest content diversity:")
    display(df.sort_values('content_diversity', ascending=False).head(5))
    
    if 'cat_diversity_overall' in df.columns:
        print("\nHighest overall categorical diversity:")
        display(df.sort_values('cat_diversity_overall', ascending=False).head(5))

## 4. Analyze Summary vs. Keyword Diversity

In [8]:
if 'summary_vs_keyword' in combined_dfs:
    df = combined_dfs['summary_vs_keyword']

    # Content diversity as a function of the summary weight, one line per client
    fig, ax = plt.subplots(figsize=(12, 6))
    for cid in df['client_id'].unique():
        per_client = df.loc[df['client_id'] == cid]
        ax.plot(
            per_client['summary_weight'],
            per_client['content_diversity'],
            marker='o',
            linestyle='-',
            label=cid,
        )

    ax.set_xlabel('Summary Weight')
    ax.set_ylabel('Content Diversity (Cosine Distance)')
    ax.set_title('Impact of Summary vs. Keyword Weights on Content Diversity')
    ax.grid(True)
    ax.legend()
    fig.tight_layout()
    plt.show()

    # Rank configurations: top five by content diversity, then (when the
    # column exists) top five by overall categorical diversity
    print("\nMost diverse configurations (summary vs. keyword):")
    print("\nHighest content diversity:")
    top_by_content = df.sort_values('content_diversity', ascending=False)
    display(top_by_content.head(5))

    if 'cat_diversity_overall' in df.columns:
        print("\nHighest overall categorical diversity:")
        top_by_category = df.sort_values('cat_diversity_overall', ascending=False)
        display(top_by_category.head(5))

## 5. Analyze Personalization Weights Impact on Diversity

In [9]:
if 'personalization' in combined_dfs:
    # The imported name is never referenced below; presumably kept for its
    # import side effect (3D projection support on older matplotlib) - TODO confirm.
    from mpl_toolkits.mplot3d import Axes3D

    df = combined_dfs['personalization']

    # One 3D scatter per client: the three weights are the axes and the
    # marker colour encodes content diversity
    for cid in df['client_id'].unique():
        per_client = df[df['client_id'] == cid]

        fig = plt.figure(figsize=(12, 10))
        ax = fig.add_subplot(1, 1, 1, projection='3d')

        points = ax.scatter(
            per_client['expert_weight'],
            per_client['historical_weight'],
            per_client['categorical_weight'],
            c=per_client['content_diversity'],
            cmap='viridis',
            s=50,
            alpha=0.7,
        )

        ax.set_xlabel('Expert Weight')
        ax.set_ylabel('Historical Weight')
        ax.set_zlabel('Categorical Weight')
        ax.set_title(f'Personalization Weights and Content Diversity for {cid}')

        colorbar = plt.colorbar(points)
        colorbar.set_label('Content Diversity')

        plt.tight_layout()
        plt.show()

    # Rank configurations: top five by content diversity, then (when the
    # column exists) top five by overall categorical diversity
    print("\nMost diverse configurations (personalization weights):")
    print("\nHighest content diversity:")
    top_by_content = df.sort_values('content_diversity', ascending=False)
    display(top_by_content.head(5))

    if 'cat_diversity_overall' in df.columns:
        print("\nHighest overall categorical diversity:")
        top_by_category = df.sort_values('cat_diversity_overall', ascending=False)
        display(top_by_category.head(5))

## 6. Compare Diversity Metrics Across Weight Configurations

In [10]:
# Keep only the columns every configuration type shares so the frames can be stacked
shared_cols = ['client_id', 'config_type', 'content_diversity']
all_diversity_data = []
for config_type, df in combined_dfs.items():
    all_diversity_data.append(df.loc[:, shared_cols])

if all_diversity_data:
    combined_diversity = pd.concat(all_diversity_data, ignore_index=True)

    # Distribution of content diversity per configuration type, split by client
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(
        data=combined_diversity,
        x='config_type',
        y='content_diversity',
        hue='client_id',
        ax=ax,
    )
    ax.set_title('Content Diversity Comparison Across Weight Configuration Types')
    ax.set_xlabel('Configuration Type')
    ax.set_ylabel('Content Diversity (Cosine Distance)')
    ax.grid(True)
    ax.legend(title='Client')
    fig.tight_layout()
    plt.show()

    # Mean content diversity for every (client, configuration type) pair
    avg_diversity = (
        combined_diversity
        .groupby(['client_id', 'config_type'])['content_diversity']
        .mean()
        .reset_index()
    )

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(
        data=avg_diversity,
        x='config_type',
        y='content_diversity',
        hue='client_id',
        ax=ax,
    )
    ax.set_title('Average Content Diversity by Configuration Type')
    ax.set_xlabel('Configuration Type')
    ax.set_ylabel('Average Content Diversity')
    ax.grid(True)
    ax.legend(title='Client')
    fig.tight_layout()
    plt.show()

## 7. Overall Diversity Analysis and Recommendations

Based on the analysis above, we can draw the following conclusions about diversity in our recommendations:

1. **Most Diverse Weight Configurations**: 
   - [To be filled in after running the notebook]

2. **Impact of Text vs. Categorical Weights**:
   - [To be filled in after running the notebook]

3. **Impact of Summary vs. Keyword Weights**:
   - [To be filled in after running the notebook]

4. **Impact of Personalization Component Weights**:
   - [To be filled in after running the notebook]

5. **Client Differences**:
   - [To be filled in after running the notebook]

6. **Diversity-Performance Tradeoff**:
   - [To be filled in after running the notebook]

7. **Recommendations**:
   - [To be filled in after running the notebook]

In [None]:
# You can add additional custom analysis here based on the results