In [None]:
# -*- coding: utf-8 -*-
"""
Complete Word Embedding Similarity Evaluation Framework - Supports Multiple Standard Datasets
Includes dataset loading and CPAE model evaluation functionality
"""

import numpy as np
import pandas as pd
import torch
import re
import nltk
from scipy.stats import pearsonr, spearmanr
from sklearn.utils import Bunch
import warnings
import os
import sys
import json
import logging
from collections import defaultdict
import traceback
from tqdm import tqdm
import glob
import urllib.request
import zipfile
from io import StringIO

# Configure logging
# Root logger emits INFO and above as "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger used by every class and function below.
logger = logging.getLogger(__name__)

# NOTE(review): this installs a blanket "ignore" filter for all warning
# categories, which also hides numerical warnings raised by numpy/scipy
# during evaluation -- confirm that this is intended.
warnings.filterwarnings('ignore')

class DatasetLoader:
    """Standard Word Similarity Dataset Loader.

    Every ``fetch_*`` method returns a ``Bunch`` with two fields:

    * ``X`` -- object array of ``(word1, word2)`` pairs
    * ``y`` -- array with the similarity score of each pair

    NOTE: the data returned here are small hard-coded samples that only
    mimic the structure of each benchmark. Nothing is downloaded or read
    from ``data_dir``; plug the real files in before reporting results.
    """

    # Rows of the hard-coded sample exposed for each named subset.
    _SUBSET_SLICES = {
        'all': slice(None),
        'dev': slice(0, 3),
        'test': slice(3, 6),
    }

    def __init__(self, data_dir="./datasets"):
        """Remember ``data_dir`` and create the directory if it is missing."""
        self.data_dir = data_dir
        os.makedirs(data_dir, exist_ok=True)

    @staticmethod
    def _empty_bunch():
        """Return the empty result handed back whenever loading fails."""
        return Bunch(X=np.array([], dtype=object), y=np.array([]))

    def _make_bunch(self, name, word_pairs, scores, subset=None):
        """Pack pairs and scores into a ``Bunch``, optionally restricted to a subset.

        Args:
            name: Dataset name, used only in log messages.
            word_pairs: List of ``(word1, word2)`` tuples.
            scores: List of scores aligned with ``word_pairs``.
            subset: ``None`` for datasets without subsets, otherwise one of
                the keys of ``_SUBSET_SLICES``.

        Returns:
            The populated ``Bunch``; an empty ``Bunch`` (after logging an
            error) when ``subset`` is unknown or the arrays cannot be built.
        """
        if subset is not None:
            try:
                rows = self._SUBSET_SLICES[subset]
            except (KeyError, TypeError):
                # Previously an unknown subset surfaced as an opaque
                # UnboundLocalError message; name the real problem instead.
                logger.error(
                    f"Error loading {name}: unknown subset {subset!r} "
                    f"(expected one of {sorted(self._SUBSET_SLICES)})"
                )
                return self._empty_bunch()
            word_pairs = word_pairs[rows]
            scores = scores[rows]

        try:
            return Bunch(X=np.array(word_pairs, dtype=object),
                         y=np.array(scores))
        except Exception as e:
            # Best-effort contract of the loaders: never raise, return empty.
            logger.error(f"Error loading {name}: {e}")
            return self._empty_bunch()

    def fetch_SimVerb3500(self, subset='all'):
        """Load the SimVerb-3500 sample.

        Args:
            subset: ``'all'`` (10 pairs), ``'dev'`` (first 3 pairs) or
                ``'test'`` (pairs 4-6).
        """
        # Example data structure for SimVerb-3500
        # In actual use, need to download from official source
        word_pairs = [
            ('run', 'walk'), ('eat', 'consume'), ('talk', 'speak'),
            ('think', 'ponder'), ('write', 'compose'), ('read', 'study'),
            ('sleep', 'rest'), ('work', 'labor'), ('play', 'game'),
            ('love', 'adore'),
        ]
        scores = [7.5, 8.2, 8.8, 7.1, 6.9, 6.3, 7.8, 8.1, 5.2, 8.9]
        return self._make_bunch("SimVerb3500", word_pairs, scores, subset)

    def fetch_MEN(self, subset='all'):
        """Load the MEN sample.

        Args:
            subset: ``'all'`` (12 pairs), ``'dev'`` (first 3 pairs) or
                ``'test'`` (pairs 4-6).
        """
        # Example data for MEN dataset
        word_pairs = [
            ('car', 'automobile'), ('gem', 'jewel'), ('journey', 'voyage'),
            ('boy', 'lad'), ('coast', 'shore'), ('asylum', 'madhouse'),
            ('magician', 'wizard'), ('midday', 'noon'), ('furnace', 'stove'),
            ('food', 'fruit'), ('bird', 'cock'), ('bird', 'crane'),
        ]
        scores = [8.79, 8.96, 9.29, 8.83, 9.10, 8.87, 9.02, 8.55, 8.05, 7.52, 7.68, 7.91]
        return self._make_bunch("MEN", word_pairs, scores, subset)

    def fetch_SimLex999(self):
        """Load the SimLex-999 sample (12 pairs)."""
        # Example data for SimLex-999
        word_pairs = [
            ('old', 'new'), ('smart', 'intelligent'), ('hard', 'difficult'),
            ('happy', 'cheerful'), ('large', 'big'), ('small', 'tiny'),
            ('hot', 'warm'), ('cold', 'cool'), ('fast', 'quick'),
            ('slow', 'sluggish'), ('good', 'excellent'), ('bad', 'terrible'),
        ]
        scores = [1.58, 9.06, 8.77, 9.55, 9.36, 9.25, 8.27, 7.27, 8.75, 7.38, 8.68, 7.59]
        return self._make_bunch("SimLex999", word_pairs, scores)

    def fetch_SCWS(self):
        """Load the Stanford Contextual Word Similarities (SCWS) sample (10 pairs).

        Only the word pairs and scores are returned; no context sentences
        are included in this sample.
        """
        # Example data for SCWS
        word_pairs = [
            ('bank', 'money'), ('bank', 'river'), ('star', 'celebrity'),
            ('star', 'astronomy'), ('mouse', 'computer'), ('mouse', 'animal'),
            ('bow', 'weapon'), ('bow', 'ribbon'), ('bark', 'dog'),
            ('bark', 'tree'),
        ]
        scores = [8.5, 1.2, 9.1, 8.7, 7.3, 2.1, 8.9, 1.8, 9.2, 2.5]
        return self._make_bunch("SCWS", word_pairs, scores)

    def fetch_WS353(self):
        """Load the WordSim-353 sample (12 pairs)."""
        # Example data for WS-353
        word_pairs = [
            ('love', 'sex'), ('tiger', 'cat'), ('tiger', 'tiger'),
            ('book', 'paper'), ('computer', 'keyboard'), ('computer', 'internet'),
            ('plane', 'car'), ('train', 'car'), ('telephone', 'communication'),
            ('television', 'radio'), ('media', 'radio'), ('drug', 'abuse'),
        ]
        scores = [6.77, 7.35, 10.00, 7.46, 7.62, 7.58, 5.77, 6.31, 7.50, 6.77, 7.42, 6.85]
        return self._make_bunch("WS353", word_pairs, scores)

    def fetch_RG65(self):
        """Load the Rubenstein-Goodenough 65 sample (12 pairs)."""
        # Example data for RG-65
        word_pairs = [
            ('cord', 'string'), ('smile', 'grin'), ('author', 'writer'),
            ('cushion', 'pillow'), ('jew', 'hebrew'), ('sunset', 'sunrise'),
            ('noon', 'string'), ('rooster', 'voyage'), ('coast', 'hill'),
            ('forest', 'graveyard'), ('shore', 'woodland'), ('monk', 'slave'),
        ]
        scores = [10.0, 7.38, 6.77, 7.92, 7.85, 7.27, 0.04, 0.04, 3.15, 1.85, 3.08, 0.92]
        return self._make_bunch("RG65", word_pairs, scores)

class CPAEHyperbolicEvaluator:
    """CPAE Dataset Hyperbolic Embedding Model Evaluator"""
    
    def __init__(self, project_dir, device=None):
        """Create an evaluator with empty model and result stores.

        Args:
            project_dir: Root directory of the project; used later to locate
                the model source code.
            device: Torch device to run on. Any falsy value selects "cuda"
                when CUDA is available and "cpu" otherwise.
        """
        if not device:
            device = "cuda" if torch.cuda.is_available() else "cpu"

        # model name -> model info dict, filled by load_models()
        self.models = {}
        # dataset name -> {model name -> metrics}, filled by evaluate_on_dataset()
        self.results = {}
        self.device = device
        self.project_dir = project_dir
        self.dataset_loader = DatasetLoader()

        logger.info(f"Using device: {self.device}")
        logger.info(f"Project directory: {self.project_dir}")

        # entity -> embedding vector of the most recently loaded model
        self.embeddings_cache = {}
    
    def normalize_word(self, word):
        """Return ``word`` lower-cased with surrounding whitespace removed.

        Non-string inputs are converted with ``str()`` first.
        """
        text = word if isinstance(word, str) else str(word)
        return text.lower().strip()
    
    def load_multirel_model(self, model_path, data_dir):
        """Load model trained by multirelational-poincare repository (for CPAE dataset)"""
        try:
            # Infer model type and dimension from filename
            filename = os.path.basename(model_path)
            if "poincare" in filename:
                model_type = "MuRP"
            elif "euclidean" in filename:
                model_type = "MuRE"
            else:
                model_type = "Unknown"
                
            # Extract dimension information
            match = re.search(r'_(\d+)\.pth$', filename)
            dimension = int(match.group(1)) if match else 0
            
            logger.info(f"Loading {model_type} model (dim={dimension}) from: {model_path}")
            
            # Add project source code directory to system path
            source_dir = os.path.join(self.project_dir, "多关系双曲嵌入/multirelational-poincare")
            if source_dir not in sys.path:
                sys.path.insert(0, source_dir)
            
            logger.info(f"Added to sys.path: {source_dir}")
            
            try:
                from load_data import Data
                if model_type == "MuRP":
                    from model import MuRP
                else:
                    from model import MuRE
                logger.info("Successfully imported model modules")
            except ImportError as e:
                logger.error(f"Error importing model modules: {e}")
                logger.error(f"Current sys.path: {sys.path}")
                return None
            
            # Load data to get entity mapping
            try:
                d = Data(data_dir)
                entity_list = d.entities
                entity_id_map = {entity: idx for idx, entity in enumerate(entity_list)}
                logger.info(f"Loaded {len(entity_list)} entities from CPAE dataset")
                logger.info(f"Sample entities: {entity_list[:10]}")
            except Exception as e:
                logger.error(f"Error loading data: {e}")
                return None
            
            # Load model checkpoint
            try:
                if torch.cuda.is_available():
                    checkpoint = torch.load(model_path)
                else:
                    checkpoint = torch.load(model_path, map_location=torch.device('cpu'))
                
                # Create model instance
                if model_type == "MuRP":
                    model = MuRP(d, dimension, self.device)
                else:
                    model = MuRE(d, dimension)
                
                # Load model state
                model.load_state_dict(checkpoint)
                model.eval()
                model.to(self.device)
                
                # Extract entity embeddings
                with torch.no_grad():
                    if model_type == "MuRP":
                        entity_embeddings = model.Eh.weight.data.cpu().numpy()
                    else:
                        entity_embeddings = model.E.weight.data.cpu().numpy()
                
                logger.info(f"Extracted entity embeddings of shape: {entity_embeddings.shape}")
            except Exception as e:
                logger.error(f"Error loading model state: {e}")
                traceback.print_exc()
                return None
            
            # Create embedding cache
            self.embeddings_cache = {
                entity: entity_embeddings[idx] 
                for entity, idx in entity_id_map.items()
            }
            
            # Create normalized entity mapping
            normalized_entity_map = {}
            for entity in entity_list:
                normalized_key = self.normalize_word(entity)
                if normalized_key not in normalized_entity_map:
                    normalized_entity_map[normalized_key] = entity
            
            return {
                "entity_embeddings": entity_embeddings,
                "entity_id_map": entity_id_map,
                "entity_list": entity_list,
                "normalized_entity_map": normalized_entity_map,
                "type": model_type,
                "dimension": dimension,
                "embeddings_cache": self.embeddings_cache
            }
        except Exception as e:
            logger.error(f"Error loading model: {e}")
            traceback.print_exc()
            return None
    
    def compute_poincare_distance(self, u, v, epsilon=1e-5):
        """Calculate Poincaré hyperbolic distance.

        Computes ``arccosh(1 + 2 * |u - v|^2 / ((1 - |u|^2) * (1 - |v|^2)))``
        in float32.

        Args:
            u, v: Array-likes of equal length.
            epsilon: Margin used both to pull points with norm >= 1 back
                inside the unit ball and as the lower bound of the
                denominator.

        Returns:
            The distance as a numpy scalar; exactly 0 for identical points.
            If the computation raises, a warning is logged and the
            Euclidean norm of ``u - v`` is returned instead (that fallback
            may itself raise for inputs that cannot be subtracted).
        """
        def _inside_ball(vec):
            # Rescale vectors on or outside the boundary to norm just below 1.
            norm = np.linalg.norm(vec)
            if norm >= 1.0:
                return vec / (norm + epsilon) * (1.0 - epsilon)
            return vec

        try:
            u = _inside_ball(np.asarray(u, dtype=np.float32))
            v = _inside_ball(np.asarray(v, dtype=np.float32))

            norm_u_sq = np.sum(u**2)
            norm_v_sq = np.sum(v**2)
            norm_diff_sq = np.sum((u - v)**2)

            # Guard against a vanishing denominator near the boundary.
            denominator = max((1 - norm_u_sq) * (1 - norm_v_sq), epsilon)
            inner_expr = 1 + 2 * norm_diff_sq / denominator

            # arccosh is only defined for arguments >= 1. Clamp rounding
            # undershoot to exactly 1 so that identical points get distance
            # 0; bumping to 1 + epsilon (as before) made a point farther
            # from itself than from very close neighbours.
            if inner_expr < 1:
                inner_expr = 1.0

            return np.arccosh(inner_expr)
        except Exception as e:
            logger.warning(f"Error computing Poincaré distance: {e}")
            return np.linalg.norm(u - v)
    
    def compute_euclidean_similarity(self, u, v):
        """Return the cosine similarity of ``u`` and ``v`` (computed in float32).

        The result lies in [-1, 1]; ``0.0`` is returned when either vector
        has zero norm, since the cosine is undefined there.
        """
        vec_a = np.asarray(u, dtype=np.float32)
        vec_b = np.asarray(v, dtype=np.float32)
        len_a = np.linalg.norm(vec_a)
        len_b = np.linalg.norm(vec_b)

        # A zero vector has no direction.
        if 0 in (len_a, len_b):
            return 0.0

        return np.dot(vec_a, vec_b) / (len_a * len_b)
    
    def compute_hyperbolic_similarity(self, u, v):
        """Calculate similarity in hyperbolic space (multiple methods)"""
        try:
            distance = self.compute_poincare_distance(u, v)
            
            # Method 1: Negative exponential transformation
            similarity_exp = np.exp(-distance)
            
            # Method 2: Inverse transformation
            similarity_inv = 1.0 / (1.0 + distance)
            
            # Method 3: Negative distance (smaller distance, higher similarity)
            similarity_neg = -distance
            
            # Method 4: Hyperbolic cosine similarity
            u = np.asarray(u, dtype=np.float32)
            v = np.asarray(v, dtype=np.float32)
            dot_product = np.dot(u, v)
            similarity_cosh = dot_product  # Inner product in hyperbolic space
            
            return {
                'exp': similarity_exp,
                'inv': similarity_inv, 
                'neg': similarity_neg,
                'cosh': similarity_cosh
            }
        except Exception as e:
            logger.warning(f"Error computing hyperbolic similarity: {e}")
            return {'exp': 0.0, 'inv': 0.0, 'neg': 0.0, 'cosh': 0.0}
    
    def find_word_embedding(self, word, model_info):
        """Find word embedding in CPAE dataset"""
        normalized_word = self.normalize_word(word)
        
        # Direct lookup in normalized entity mapping
        normalized_entity_map = model_info.get("normalized_entity_map", {})
        if normalized_word in normalized_entity_map:
            original_entity = normalized_entity_map[normalized_word]
            if original_entity in model_info["embeddings_cache"]:
                return model_info["embeddings_cache"][original_entity]
        
        # If normalized version not found, try original word
        if word in model_info["embeddings_cache"]:
            return model_info["embeddings_cache"][word]
        
        # Try direct lookup in entity list
        entity_list = model_info["entity_list"]
        entity_id_map = model_info["entity_id_map"]
        
        for entity in entity_list:
            if self.normalize_word(entity) == normalized_word:
                if entity in entity_id_map:
                    idx = entity_id_map[entity]
                    embedding = model_info["entity_embeddings"][idx]
                    model_info["embeddings_cache"][entity] = embedding
                    return embedding
        
        return None
    
    def compute_similarity_batch(self, model_info, words1, words2, similarity_method='auto'):
        """Batch compute similarity, supports multiple similarity calculation methods"""
        model_type = model_info["type"]
        similarities = np.full(len(words1), np.nan)
        valid_indices = []
        
        # Store multiple similarity calculation results for comparison
        all_similarities = {
            'exp': np.full(len(words1), np.nan),
            'inv': np.full(len(words1), np.nan), 
            'neg': np.full(len(words1), np.nan),
            'cosh': np.full(len(words1), np.nan),
            'cosine': np.full(len(words1), np.nan)
        }
        
        for i, (word1, word2) in enumerate(zip(words1, words2)):
            emb1 = self.find_word_embedding(word1, model_info)
            emb2 = self.find_word_embedding(word2, model_info)
            
            if emb1 is not None and emb2 is not None:
                if model_type == "MuRP":
                    # Hyperbolic models: try multiple similarity calculation methods
                    hyp_sims = self.compute_hyperbolic_similarity(emb1, emb2)
                    
                    # Also calculate cosine similarity as baseline
                    cosine_sim = self.compute_euclidean_similarity(emb1, emb2)
                    
                    # Store results from all methods
                    for method, sim_value in hyp_sims.items():
                        all_similarities[method][i] = sim_value
                    all_similarities['cosine'][i] = cosine_sim
                    
                    # Select primary similarity method
                    if similarity_method == 'auto':
                        # Default to negative exponential method
                        similarity = hyp_sims['exp']
                    else:
                        similarity = hyp_sims.get(similarity_method, hyp_sims['exp'])
                        
                else:
                    # Euclidean models: use cosine similarity
                    similarity = self.compute_euclidean_similarity(emb1, emb2)
                    all_similarities['cosine'][i] = similarity
                
                similarities[i] = similarity
                valid_indices.append(i)
        
        return similarities, valid_indices, all_similarities
    
    def load_models(self, model_dir, data_dir):
        """Load all models from specified directory"""
        logger.info(f"Loading models from: {model_dir}")
        
        model_files = glob.glob(os.path.join(model_dir, "*.pth"))
        logger.info(f"Found {len(model_files)} model files")
        
        if not model_files:
            logger.error("No model files found!")
            return
        
        for model_path in model_files:
            model_info = self.load_multirel_model(model_path, data_dir)
            if model_info:
                model_name = f"{model_info['type']}-{model_info['dimension']}D"
                self.models[model_name] = model_info
                logger.info(f"Loaded model: {model_name}")
            else:
                logger.error(f"Failed to load model: {model_path}")
    
    def evaluate_on_dataset(self, dataset_name, dataset_loader_func):
        """Evaluate models on specific dataset"""
        logger.info(f"\n{'='*50}\nEvaluating on {dataset_name}\n{'='*50}")
        
        try:
            data = dataset_loader_func()
            word_pairs = data.X
            human_scores = data.y
            
            if not isinstance(human_scores, np.ndarray):
                human_scores = np.array(human_scores)
            
            if human_scores.ndim > 1:
                human_scores = human_scores.flatten()
            
            logger.info(f"Loaded {len(word_pairs)} word pairs")
        except Exception as e:
            logger.error(f"Error loading dataset {dataset_name}: {e}")
            traceback.print_exc()
            return
        
        dataset_results = {}
        
        for model_name, model_info in self.models.items():
            logger.info(f"Evaluating {model_name} on {dataset_name}...")
            
            words1 = [str(pair[0]).strip() for pair in word_pairs]
            words2 = [str(pair[1]).strip() for pair in word_pairs]
            
            # Try different similarity calculation methods
            best_pearson = -float('inf')
            best_method = 'exp'
            best_results = None
            
            methods_to_try = ['exp', 'inv', 'neg', 'cosh'] if model_info["type"] == "MuRP" else ['cosine']
            
            for method in methods_to_try:
                model_scores, valid_indices, all_sims = self.compute_similarity_batch(
                    model_info, words1, words2, similarity_method=method
                )
                
                if not valid_indices:
                    continue
                    
                valid_model_scores = model_scores[valid_indices]
                valid_human_scores = human_scores[valid_indices]
                
                if not isinstance(valid_human_scores, np.ndarray) or valid_human_scores.dtype.kind not in 'iuf':
                    try:
                        valid_human_scores = np.array(valid_human_scores, dtype=np.float32)
                    except:
                        continue
                
                coverage = len(valid_indices) / len(word_pairs)
                
                if len(valid_model_scores) > 5:
                    try:
                        if valid_model_scores.ndim > 1:
                            valid_model_scores = valid_model_scores.flatten()
                        if valid_human_scores.ndim > 1:
                            valid_human_scores = valid_human_scores.flatten()
                        
                        pearson_corr, _ = pearsonr(valid_model_scores, valid_human_scores)
                        spearman_corr, _ = spearmanr(valid_model_scores, valid_human_scores)
                        
                        # If Pearson correlation is better, update best method
                        if not np.isnan(pearson_corr) and pearson_corr > best_pearson:
                            best_pearson = pearson_corr
                            best_method = method
                            best_results = {
                                "pearson": pearson_corr,
                                "spearman": spearman_corr,
                                "coverage": coverage,
                                "n_pairs": len(valid_model_scores),
                                "model_type": model_info["type"],
                                "dimension": model_info["dimension"],
                                "similarity_method": method
                            }
                        
                        logger.info(f"  Method {method}: Pearson: {pearson_corr:.4f}, Spearman: {spearman_corr:.4f}")
                        
                    except Exception as e:
                        logger.error(f"Error calculating correlations for method {method}: {e}")
                        continue
            
            # Use results from best method
            if best_results:
                dataset_results[model_name] = best_results
                logger.info(f"  Best method: {best_method} - Pearson: {best_results['pearson']:.4f}, Spearman: {best_results['spearman']:.4f}, Coverage: {best_results['coverage']:.2%}")
            else:
                logger.warning(f"No valid predictions for model {model_name} on dataset {dataset_name}")
                dataset_results[model_name] = {
                    "pearson": np.nan, "spearman": np.nan,
                    "coverage": 0.0, "n_pairs": 0,
                    "model_type": model_info["type"],
                    "dimension": model_info["dimension"],
                    "similarity_method": "none"
                }
        
        self.results[dataset_name] = dataset_results
    
    def run_evaluation(self, datasets):
        """Run evaluation on specified datasets"""
        logger.info("Starting CPAE embeddings evaluation on multiple datasets...")
        
        if len(self.models) == 0:
            logger.error("No models loaded. Aborting evaluation.")
            return
        
        for dataset_name, loader_func in datasets.items():
            try:
                logger.info(f"Processing dataset: {dataset_name}")
                self.evaluate_on_dataset(dataset_name, loader_func)
            except Exception as e:
                logger.error(f"Error evaluating {dataset_name}: {e}")
                traceback.print_exc()
                continue
    
    def print_results_summary(self):
        """Print results summary"""
        if not self.results:
            logger.info("No results to display!")
            return None
        
        print("\n" + "="*100)
        print("CPAE EMBEDDINGS EVALUATION RESULTS ON STANDARD DATASETS")
        print("="*100)
        
        all_results = []
        for dataset_name, dataset_results in self.results.items():
            for model_name, metrics in dataset_results.items():
                all_results.append({
                    "Dataset": dataset_name,
                    "Model": model_name,
                    "Type": metrics.get("model_type", "Unknown"),
                    "Dimension": metrics.get("dimension", 0),
                    "Method": metrics.get("similarity_method", "N/A"),
                    "Pearson": metrics.get("pearson", np.nan),
                    "Spearman": metrics.get("spearman", np.nan),
                    "Coverage": metrics.get("coverage", 0),
                    "N_Pairs": metrics.get("n_pairs", 0)
                })
        
        results_df = pd.DataFrame(all_results)
        
        # Display detailed results by dataset
        for dataset in results_df['Dataset'].unique():
            dataset_df = results_df[results_df['Dataset'] == dataset]
            print(f"\n{dataset}:")
            print("-" * 110)
            print(f"{'Model':<15} {'Type':<8} {'Dim':>4} {'Method':<8} {'Pearson':>8} {'Spearman':>8} {'Coverage':>10} {'N_Pairs':>8}")
            print("-" * 120)
            
            dataset_df = dataset_df.sort_values(by=['Type', 'Dimension'])
            
            for _, row in dataset_df.iterrows():
                pearson_str = f"{row['Pearson']:.4f}" if not pd.isna(row['Pearson']) else "N/A"
                spearman_str = f"{row['Spearman']:.4f}" if not pd.isna(row['Spearman']) else "N/A"
                coverage_str = f"{row['Coverage']:.2%}"
                method_str = row.get('Method', 'N/A')
                print(f"{row['Model']:<15} {row['Type']:<8} {row['Dimension']:>4} {method_str:<8} {pearson_str:>8} {spearman_str:>8} {coverage_str:>10} {row['N_Pairs']:>8}")
        
        # Display summary statistics
        print(f"\n{'='*50}")
        print("SUMMARY STATISTICS")
        print("="*50)
        
        summary = results_df.groupby(['Type', 'Dimension']).agg({
            'Pearson': ['mean', 'std', 'count'],
            'Spearman': ['mean', 'std'],
            'Coverage': 'mean'
        }).round(4)
        
        print(summary)
        
        return results_df
    
    def save_results(self, filepath="cpae_evaluation_results.csv"):
        """Save results to CSV"""
        if not self.results:
            logger.info("No results to save!")
            return None
        
        all_results = []
        for dataset_name, dataset_results in self.results.items():
            for model_name, metrics in dataset_results.items():
                all_results.append({
                    "Dataset": dataset_name,
                    "Model": model_name,
                    "Model_Type": metrics.get("model_type", "Unknown"),
                    "Dimension": metrics.get("dimension", 0),
                    "Similarity_Method": metrics.get("similarity_method", "N/A"),
                    "Pearson_Correlation": metrics.get("pearson", np.nan),
                    "Spearman_Correlation": metrics.get("spearman", np.nan),
                    "Coverage": metrics.get("coverage", 0),
                    "Valid_Pairs": metrics.get("n_pairs", 0)
                })
        
        results_df = pd.DataFrame(all_results)
        
        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        results_df.to_csv(filepath, index=False)
        logger.info(f"Detailed results saved to: {filepath}")
        
        # Save summary statistics
        summary = results_df.groupby(['Model_Type', 'Dimension']).agg({
            'Pearson_Correlation': ['mean', 'std', 'count'],
            'Spearman_Correlation': ['mean', 'std'],
            'Coverage': 'mean'
        }).round(4)
        
        summary_file = "cpae_evaluation_summary.csv"
        summary.to_csv(summary_file)
        logger.info(f"Summary statistics saved to: {summary_file}")
        
        return results_df

def main():
    """Main function: run complete evaluation process"""
    # Check dependencies
    missing_deps = []
    required = ["torch", "scipy", "sklearn", "pandas", "numpy", "nltk"]
    for dep in required:
        try:
            __import__(dep)
        except ImportError:
            missing_deps.append(dep)
    
    if missing_deps:
        print(f"Missing dependencies: {missing_deps}")
        print("Install with: pip install " + " ".join(missing_deps))
        return None, None
    
    # Set project root directory
    project_dir = "your_project_directory"

    # Initialize evaluator
    evaluator = CPAEHyperbolicEvaluator(project_dir)
    
    # Set paths
    model_dir = os.path.join(project_dir, "path_to_your_model_directory")

2025-07-27 16:53:06,909 - INFO - Using device: cpu
2025-07-27 16:53:06,910 - INFO - Project directory: /Users/chouyinghan/my_mathlib_project/Demo_in_Matrix
2025-07-27 16:53:06,910 - INFO - Loading models from: /Users/chouyinghan/my_mathlib_project/Demo_in_Matrix/训练模型cpae
2025-07-27 16:53:06,911 - INFO - Found 1 model files
2025-07-27 16:53:06,911 - INFO - Loading MuRP model (dim=80) from: /Users/chouyinghan/my_mathlib_project/Demo_in_Matrix/训练模型cpae/poincare_model_80.pth
2025-07-27 16:53:06,911 - INFO - Added to sys.path: /Users/chouyinghan/my_mathlib_project/Demo_in_Matrix/多关系双曲嵌入/multirelational-poincare
2025-07-27 16:53:06,914 - INFO - Successfully imported model modules
2025-07-27 16:53:09,272 - INFO - Loaded 152252 entities from CPAE dataset
2025-07-27 16:53:09,272 - INFO - Sample entities: ["'", "'hood", "'s", "'s_gravenhage", "'tween_decks", '(', ')', '+', ',', '.']
2025-07-27 16:53:09,532 - INFO - Extracted entity embeddings of shape: (152252, 80)
2025-07-27 16:53:09,744 - INFO

Loaded SimVerb3500 data sample:
  word1   word2 pos  score    relation
0  take  remove   V   6.81    SYNONYMS
1  walk   trail   V   4.81  COHYPONYMS
2  feed  starve   V   1.49    ANTONYMS
Total verb pairs: 3500
Loaded SimVerb3500 data sample:
     word1   word2 pos  score        relation
0     hurt  offend   V   6.81        SYNONYMS
1  clarify   worry   V   0.33            NONE
2   fasten  attach   V   8.47  HYPER/HYPONYMS
Total verb pairs: 500
Loaded SimVerb3500 data sample:
   word1   word2 pos  score    relation
0   walk   trail   V   4.81  COHYPONYMS
1   feed  starve   V   1.49    ANTONYMS
2  shine  polish   V   7.80    SYNONYMS
Total verb pairs: 3000
Loaded MEN data sample (form=natural):
  word1: 'sun', word2: 'sunlight', score: 50.0
Loaded MEN data sample (form=natural):
  word1: 'berry', word2: 'seed', score: 37.0
Loaded MEN data sample (form=natural):
  word1: 'display', word2: 'pond', score: 10.0


2025-07-27 16:53:09,958 - INFO -   Pearson: 0.0321, Spearman: 0.0347, Coverage: 100.00%
2025-07-27 16:53:09,959 - INFO - Processing dataset: SimLex999
2025-07-27 16:53:09,959 - INFO - 
Evaluating on SimLex999
2025-07-27 16:53:09,962 - INFO - Loaded 999 word pairs
2025-07-27 16:53:09,962 - INFO - Evaluating MuRP-80D on SimLex999...
2025-07-27 16:53:09,977 - INFO -   Pearson: -0.0168, Spearman: -0.0170, Coverage: 100.00%
2025-07-27 16:53:09,977 - INFO - Processing dataset: SCWS
2025-07-27 16:53:09,977 - INFO - 
Evaluating on SCWS
2025-07-27 16:53:09,988 - INFO - Loaded 2003 word pairs
2025-07-27 16:53:09,988 - INFO - Evaluating MuRP-80D on SCWS...


成功加载SCWS数据集，共 2003 个词对
数据样本:
word1                                                       Brazil
word2                                                          nut
word1_context    gap in income between blacks and other non-whi...
word2_context    of the neck , bridge , and pickups , there are...
sentence                                                          
avg_score                                                      1.1
scores           [0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 2.0, 6.0, 0.0, ...
Name: 0, dtype: object


2025-07-27 16:53:10,311 - INFO -   Missing 11 word pairs (99.45% coverage)
2025-07-27 16:53:10,313 - INFO -   Pearson: 0.4096, Spearman: 0.1801, Coverage: 99.45%
2025-07-27 16:53:10,314 - INFO - Processing dataset: WS353
2025-07-27 16:53:10,314 - INFO - 
Evaluating on WS353
2025-07-27 16:53:10,316 - INFO - Loaded 350 word pairs
2025-07-27 16:53:10,317 - INFO - Evaluating MuRP-80D on WS353...
2025-07-27 16:53:10,468 - INFO -   Missing 6 word pairs (98.29% coverage)
2025-07-27 16:53:10,470 - INFO -   Pearson: -0.0011, Spearman: -0.0556, Coverage: 98.29%
2025-07-27 16:53:10,471 - INFO - Processing dataset: RG65
2025-07-27 16:53:10,471 - INFO - 
Evaluating on RG65
2025-07-27 16:53:10,473 - INFO - Loaded 65 word pairs
2025-07-27 16:53:10,474 - INFO - Evaluating MuRP-80D on RG65...
2025-07-27 16:53:10,476 - INFO -   Pearson: -0.1073, Spearman: -0.0735, Coverage: 100.00%
2025-07-27 16:53:10,482 - INFO - Detailed results saved to: /Users/chouyinghan/my_mathlib_project/Demo_in_Matrix/训练模型cpae/c

Loaded RG65 data sample:
     word1   word2  score
0     cord   smile   0.02
1  rooster  voyage   0.04
2     noon  string   0.04
Score range: 0.02 to 3.94

CPAE EMBEDDINGS EVALUATION RESULTS (MuRP vs MuRE)

SimVerb3500:
------------------------------------------------------------------------------------------
Model           Type      Dim  Pearson Spearman   Coverage  N_Pairs
------------------------------------------------------------------------------------------
MuRP-80D        MuRP       80   0.0239   0.0227    100.00%     3500

SimVerb3500-dev:
------------------------------------------------------------------------------------------
Model           Type      Dim  Pearson Spearman   Coverage  N_Pairs
------------------------------------------------------------------------------------------
MuRP-80D        MuRP       80   0.0619   0.0540    100.00%      500

SimVerb3500-test:
------------------------------------------------------------------------------------------
Model           