In [None]:
# ==== Install required packages ====
!pip install fasttext
!pip install igraph

# ==== Imports ====
import os
import json
import pickle
import numpy as np
import pandas as pd
import networkx as nx
from tqdm.auto import tqdm
import unicodedata
from collections import defaultdict
import warnings
import concurrent.futures
import time
import traceback
import gc
import multiprocessing
from functools import partial
import igraph as ig
import fasttext

# ==== Silence warnings ====
# Suppress UserWarning / FutureWarning messages that originate from networkx
# modules only; warnings from other libraries are still shown.
warnings.filterwarnings("ignore", category=UserWarning, module="networkx")
warnings.filterwarnings("ignore", category=FutureWarning, module="networkx")

# =============================================================================
# Configuration and Settings
# =============================================================================

# File paths
# Input CSV with the cue/associate responses (columns Language, Participant,
# Cue and Associate are read by load_planchuelo_data).
PLANCHUELO_CSV_PATH = "/content/drive/MyDrive/cues_and_associates.csv"
# Binary FastText models, one per language processed in main().
FASTTEXT_EN_MODEL_PATH = "/content/drive/MyDrive/cc.en.300.bin"
FASTTEXT_ES_MODEL_PATH = "/content/drive/MyDrive/cc.es.300.bin"
# Relative directory that receives the pickled networks and CSV statistics.
OUTPUT_DIR = "fasttext_network_analysis_v5_with_associates"

# Network generation parameters
# How many leading entries of the model vocabulary are added to each network
# (in addition to every cue and associate word).
NUM_TOP_WORDS = 60000  # Number of top words to include in networks
# An edge is created only when the similarity is strictly greater than this.
SIMILARITY_THRESHOLD = 0.4  # Threshold for similarity to create an edge

# Worker count passed to the pipeline: a quarter of the detected CPU cores,
# but never fewer than one.
N_WORKERS = max(1, multiprocessing.cpu_count() // 4)

# =============================================================================
# Utility Functions
# =============================================================================
def convert_numpy_types(obj):
    """Recursively convert NumPy values into JSON-serializable Python objects.

    Conversions performed:
      * any NumPy floating scalar or Python float -> ``float``, or ``None``
        when the value is NaN or infinite;
      * any NumPy integer scalar -> ``int``;
      * NumPy complex scalars -> their ``str`` representation;
      * ``np.bool_`` -> ``bool``;
      * ``np.void`` -> ``None``;
      * ``np.ndarray`` -> (nested) ``list``; a 0-d array is unwrapped to its
        single converted element;
      * ``dict`` -> ``dict`` with both keys and values converted;
      * ``list`` / ``tuple`` -> ``list`` with converted elements.

    Anything else is returned unchanged.
    """
    if isinstance(obj, np.bool_):
        return bool(obj)
    # Abstract scalar base classes cover every width (e.g. np.longdouble,
    # np.longlong), not only the handful of explicitly sized aliases.
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, (float, np.floating)):
        value = float(obj)
        # JSON has no representation for NaN/Infinity, so map them to null.
        return value if np.isfinite(value) else None
    if isinstance(obj, np.complexfloating):
        return str(obj)
    if isinstance(obj, np.void):
        return None
    if isinstance(obj, np.ndarray):
        if obj.ndim == 0:
            # A 0-d array cannot be iterated; unwrap its single element instead.
            return convert_numpy_types(obj[()])
        return [convert_numpy_types(item) for item in obj]
    if isinstance(obj, dict):
        return {convert_numpy_types(k): convert_numpy_types(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [convert_numpy_types(item) for item in obj]
    return obj

def normalize_text(text):
    """Return a canonical form of *text* for word matching.

    Missing values (anything ``pd.isna`` reports as NA) yield ``np.nan``.
    Non-string input is converted with ``str``. The string is then stripped,
    lower-cased, has internal whitespace runs collapsed to single spaces and
    is finally Unicode-normalized to NFC.
    """
    if pd.isna(text):
        return np.nan

    raw = text if isinstance(text, str) else str(text)

    try:
        tokens = raw.strip().lower().split()
        collapsed = ' '.join(tokens)
        return unicodedata.normalize('NFC', collapsed)
    except TypeError:
        # Best effort: hand back the (stringified) input untouched.
        return raw

# =============================================================================
# Data Loading and Processing
# =============================================================================
def load_planchuelo_data(file_path, log_messages):
    """Load and clean the Planchuelo et al. (2024) cue-associate CSV.

    Parameters
    ----------
    file_path : str
        Path of a CSV file containing at least the columns ``Language``,
        ``Participant``, ``Cue`` and ``Associate``.
    log_messages : list
        Progress messages are appended to this list.

    Returns
    -------
    tuple
        ``(cue_words_by_lang, associate_words_by_lang, cue_associate_pairs)``:
        two dicts mapping each normalized language name to the set of its
        normalized cue / associate words, and a DataFrame with the columns
        ``Language``, ``Participant``, ``Cue`` and ``Associate`` for every
        retained response.

    Rows are dropped when any required column is missing, is one of the
    placeholder strings, or is empty after text normalization.
    """
    log_messages.append(f"Loading Planchuelo data from: {file_path}")
    start_time = time.time()

    # Load the CSV file
    df = pd.read_csv(file_path)

    # Clean and normalize data
    log_messages.append("Cleaning and normalizing Planchuelo data...")
    df.replace(["", "NA", "No more responses"], np.nan, inplace=True)
    df.dropna(subset=['Language', 'Cue', 'Associate', 'Participant'], inplace=True)

    # Normalize text fields
    text_columns = ['Language', 'Cue', 'Associate']
    for column in text_columns:
        df[column] = df[column].astype(str).apply(normalize_text)
    df['Participant'] = df['Participant'].astype(str)

    # Whitespace-only cells are not caught by the placeholder replacement
    # above; they only become "" after normalization. Drop them here so an
    # empty string never ends up as a word in the vocabulary.
    has_text = (df[text_columns] != "").all(axis=1)
    blank_rows = int((~has_text).sum())
    if blank_rows:
        df = df[has_text]
        log_messages.append(f"Dropped {blank_rows} rows with empty text after normalization.")

    # Extract all unique cue words and associate words by language
    cue_words_by_lang = df.groupby('Language')['Cue'].apply(lambda x: set(x.unique())).to_dict()
    associate_words_by_lang = df.groupby('Language')['Associate'].apply(lambda x: set(x.unique())).to_dict()

    # Create a dataframe of cue-associate pairs with participant info
    cue_associate_pairs = df[['Language', 'Participant', 'Cue', 'Associate']].copy()

    log_messages.append(f"Planchuelo data loaded successfully. (Time: {time.time() - start_time:.2f}s)")
    log_messages.append(f"Cue words per language: {', '.join([f'{lang}: {len(words)}' for lang, words in cue_words_by_lang.items()])}")
    log_messages.append(f"Associate words per language: {', '.join([f'{lang}: {len(words)}' for lang, words in associate_words_by_lang.items()])}")
    log_messages.append(f"Total cue-associate pairs: {len(cue_associate_pairs)}")

    return cue_words_by_lang, associate_words_by_lang, cue_associate_pairs

# =============================================================================
# FastText Model and Word Embedding Processing
# =============================================================================
def load_fasttext_model(model_path, log_messages):
    """Load the FastText model stored at *model_path* and return it.

    A message is appended to *log_messages* before loading, and another one
    afterwards reporting the model's vector dimension and the elapsed time.
    """
    log_messages.append(f"Loading FastText model from: {model_path}...")
    began = time.time()

    ft_model = fasttext.load_model(model_path)

    dimension = ft_model.get_dimension()
    elapsed = time.time() - began
    log_messages.append(
        f"FastText model loaded successfully. (Dimensions: {dimension}) (Time: {elapsed:.2f}s)"
    )
    return ft_model

def get_word_embeddings(ft_model, word_list, n_jobs=None, log_messages=None):
    """Fetch L2-normalized FastText vectors for a collection of words.

    Parameters
    ----------
    ft_model : FastText model
        Object exposing ``get_word_vector(word)`` and ``get_dimension()``.
    word_list : iterable of str
        Words to embed. Duplicates are removed, keeping the first occurrence.
    n_jobs : int, optional
        Number of worker threads; defaults to half of the CPU cores (min. 1).
    log_messages : list, optional
        Progress messages are appended to this list when given.

    Returns
    -------
    tuple
        ``(words, matrix)`` where ``matrix`` is a float32 array of shape
        ``(len(words), dimension)`` whose row ``i`` is the unit-length vector
        of ``words[i]``. Words keep their (deduplicated) input order, so the
        result is reproducible for a given input order. Words whose lookup
        raises are skipped and reported in the log.
    """
    if log_messages is None:
        log_messages = []
    if n_jobs is None:
        n_jobs = max(1, multiprocessing.cpu_count() // 2)

    # Deduplicate while preserving order: a repeated word would otherwise
    # yield two identical rows and, downstream, a self-similarity edge.
    word_list = list(dict.fromkeys(word_list))

    log_messages.append(f"Extracting embeddings for {len(word_list)} words using {n_jobs} workers...")
    start_time = time.time()

    # Process words in chunks for parallelization
    chunk_size = max(1000, len(word_list) // (n_jobs * 2))
    chunks = [word_list[i:i + chunk_size] for i in range(0, len(word_list), chunk_size)]

    log_messages.append(f"Processing {len(chunks)} chunks of words in parallel...")

    def process_chunk(chunk):
        """Return (embedded words, their vectors, words whose lookup failed)."""
        valid_words = []
        embeddings = []
        skipped = []
        for word in chunk:
            try:
                vec = ft_model.get_word_vector(word)
            except Exception:
                # Best effort: one bad word must not abort the whole run,
                # but remember it so the skip is visible in the log.
                skipped.append(word)
                continue
            embeddings.append(vec)
            valid_words.append(word)
        return valid_words, embeddings, skipped

    all_valid_words = []
    all_embeddings = []
    all_skipped = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=n_jobs) as executor:
        # executor.map yields results in submission order, which keeps the
        # output order independent of thread scheduling.
        ordered_results = executor.map(process_chunk, chunks)
        for valid_words, embeddings, skipped in tqdm(ordered_results,
                                                     total=len(chunks),
                                                     desc="Fetching embeddings",
                                                     leave=False):
            all_valid_words.extend(valid_words)
            all_embeddings.extend(embeddings)
            all_skipped.extend(skipped)

    if all_skipped:
        log_messages.append(
            f"Skipped {len(all_skipped)} words whose embedding lookup failed "
            f"(first few: {all_skipped[:5]})"
        )

    if all_embeddings:
        embedding_matrix = np.asarray(all_embeddings, dtype=np.float32)
        # L2-normalize rows so a plain dot product equals cosine similarity.
        # All-zero rows are left as zeros instead of dividing by zero.
        norms = np.linalg.norm(embedding_matrix, axis=1, keepdims=True)
        norms[norms == 0.0] = 1.0
        embedding_matrix /= norms
    else:
        # Keep the result two-dimensional even when nothing was embedded.
        embedding_matrix = np.empty((0, ft_model.get_dimension()), dtype=np.float32)

    log_messages.append(f"Extracted embeddings for {len(all_valid_words)} words. (Time: {time.time() - start_time:.2f}s)")
    return all_valid_words, embedding_matrix

def get_combined_word_embeddings(ft_model, top_n_count, custom_words=None, n_jobs=None, log_messages=None):
    """Embed the leading model vocabulary plus extra custom words in one pass.

    Parameters
    ----------
    ft_model : FastText model
        Object exposing ``get_words()`` (and whatever ``get_word_embeddings``
        requires).
    top_n_count : int
        Number of leading entries of ``ft_model.get_words()`` to include.
    custom_words : iterable of str, optional
        Additional words (e.g. cues and associates) to include.
    n_jobs : int, optional
        Worker count forwarded to ``get_word_embeddings``; defaults to half
        of the CPU cores (min. 1).
    log_messages : list, optional
        Progress messages are appended to this list when given.

    Returns
    -------
    tuple
        ``(valid_words, embeddings)`` exactly as returned by
        ``get_word_embeddings`` for the combined, duplicate-free word list.
    """
    if log_messages is None:
        log_messages = []
    if n_jobs is None:
        n_jobs = max(1, multiprocessing.cpu_count() // 2)

    if custom_words is None:
        custom_words = set()

    log_messages.append(f"Getting combined embeddings for top {top_n_count} words and {len(custom_words)} custom words...")

    # Leading entries of the model vocabulary
    top_words = ft_model.get_words()[:top_n_count]

    # Build the union in a deterministic order (vocabulary order first, then
    # the remaining custom words sorted). Iterating a set of strings follows
    # hash order, which changes between interpreter runs and would make the
    # node order of the resulting network irreproducible.
    combined_words = list(dict.fromkeys(top_words))
    already_included = set(combined_words)
    extra_words = {word for word in custom_words if word not in already_included}
    combined_words.extend(sorted(extra_words, key=str))
    log_messages.append(f"Combined word list has {len(combined_words)} unique words")

    # Get all embeddings at once
    valid_words, embeddings = get_word_embeddings(ft_model, combined_words, n_jobs, log_messages)

    return valid_words, embeddings

# =============================================================================
# Similarity Matrix Calculation
# =============================================================================
def calculate_similarity_chunk(embeddings_matrix, threshold, start_idx, end_idx):
    """Collect above-threshold edges for rows ``start_idx`` .. ``end_idx - 1``.

    Each row is compared (dot product) only with the rows that follow it, so
    every unordered pair is evaluated once. Returns a list of
    ``(row, other_row, similarity)`` tuples for every pair whose similarity
    is strictly greater than *threshold*.
    """
    edges = []

    for row in range(start_idx, end_idx):
        first_neighbor = row + 1

        # Similarity of this row against all later rows (upper triangle only).
        sims = np.dot(
            embeddings_matrix[row:first_neighbor],
            embeddings_matrix[first_neighbor:].T,
        ).flatten()

        hits = np.where(sims > threshold)[0]
        if hits.size == 0:
            continue

        # Offsets are relative to the first later row; shift back to absolute.
        edges.extend(
            (row, first_neighbor + offset, float(sims[offset]))
            for offset in hits
        )

    return edges

# Embedding matrix made available to similarity worker processes. It is set
# once per worker by _init_similarity_worker and stays None in the parent.
_SIMILARITY_WORKER_EMBEDDINGS = None


def _init_similarity_worker(embeddings_matrix):
    """Pool initializer: store the embedding matrix in this worker process."""
    global _SIMILARITY_WORKER_EMBEDDINGS
    _SIMILARITY_WORKER_EMBEDDINGS = embeddings_matrix


def _similarity_chunk_task(threshold, start_idx, end_idx):
    """Worker task: run calculate_similarity_chunk on the worker's stored matrix."""
    return calculate_similarity_chunk(_SIMILARITY_WORKER_EMBEDDINGS, threshold, start_idx, end_idx)


def calculate_similarity_matrix(words, embeddings_matrix, threshold, n_jobs=None, log_messages=None):
    """Compute all above-threshold similarity edges using a process pool.

    Parameters
    ----------
    words : sequence of str
        Word for each row of *embeddings_matrix* (``words[i]`` <-> row ``i``).
    embeddings_matrix : numpy.ndarray
        Row vectors to compare by dot product.
    threshold : float
        Only pairs with similarity strictly greater than this become edges.
    n_jobs : int, optional
        Number of worker processes; defaults to half of the CPU cores (min. 1).
    log_messages : list, optional
        Progress messages are appended to this list when given.

    Returns
    -------
    list of tuple
        ``(word_a, word_b, similarity)`` for every retained pair. The order
        of the list depends on which batches finish first.
    """
    if log_messages is None:
        log_messages = []
    if n_jobs is None:
        n_jobs = max(1, multiprocessing.cpu_count() // 2)

    n_vectors = embeddings_matrix.shape[0]
    log_messages.append(f"Calculating similarity matrix for {n_vectors} vectors using {n_jobs} workers...")
    start_time = time.time()

    # Create optimal chunk sizes based on vector count and available workers
    chunk_size = max(100, min(1000, n_vectors // (n_jobs * 2)))
    batches = [(i, min(i + chunk_size, n_vectors)) for i in range(0, n_vectors, chunk_size)]

    log_messages.append(f"Created {len(batches)} batches for similarity calculation")

    edges_indices = []
    if batches:  # nothing to compare -> do not start worker processes at all
        # Hand the matrix to each worker once through the pool initializer.
        # Passing it as an argument of every submitted task would serialize
        # the whole matrix again for each batch.
        with concurrent.futures.ProcessPoolExecutor(
            max_workers=n_jobs,
            initializer=_init_similarity_worker,
            initargs=(embeddings_matrix,),
        ) as executor:
            futures = [
                executor.submit(_similarity_chunk_task, threshold, start_idx, end_idx)
                for start_idx, end_idx in batches
            ]

            # Collect results
            for future in tqdm(concurrent.futures.as_completed(futures),
                               total=len(batches),
                               desc="Calculating similarities",
                               leave=False):
                edges_indices.extend(future.result())

    # Convert indices to actual words with weights
    log_messages.append(f"Converting {len(edges_indices)} edge indices to word pairs...")

    edges_with_weights = [(words[s_idx], words[t_idx], weight)
                          for s_idx, t_idx, weight in edges_indices]

    log_messages.append(f"Similarity calculation complete. Found {len(edges_with_weights)} edges. (Time: {time.time() - start_time:.2f}s)")
    return edges_with_weights

# =============================================================================
# Unified Network Construction
# =============================================================================
def build_semantic_network(words, embeddings_matrix, threshold, log_messages, n_jobs=None):
    """Create an undirected weighted graph connecting similar words.

    Every entry of *words* becomes a node. Edges come from
    ``calculate_similarity_matrix``; each edge stores the similarity as
    ``weight`` and ``1 - weight`` as ``distance``. Progress is appended to
    *log_messages*. Returns the resulting ``networkx.Graph``.
    """
    log_messages.append(f"Building semantic network (Nodes: {len(words)}, Threshold: {threshold})...")
    began = time.time()

    # Start with all words as (possibly isolated) nodes.
    graph = nx.Graph()
    graph.add_nodes_from(words)

    # Edge discovery is delegated to the parallel similarity routine.
    weighted_edges = calculate_similarity_matrix(
        words, embeddings_matrix, threshold,
        n_jobs=n_jobs, log_messages=log_messages,
    )

    log_messages.append(f"Adding {len(weighted_edges)} edges to the graph...")
    graph.add_weighted_edges_from(weighted_edges)

    # Shortest-path based metrics need a cost, so derive one from similarity.
    log_messages.append("Adding 'distance' attribute to edges...")
    for _, _, attrs in graph.edges(data=True):
        attrs['distance'] = 1.0 - attrs.get('weight', 0.0)

    node_count = graph.number_of_nodes()
    edge_count = graph.number_of_edges()
    log_messages.append(f"Built semantic network: {node_count} nodes, {edge_count} edges. (Time: {time.time() - began:.2f}s)")
    return graph

# =============================================================================
# Unified Network Statistics Calculation
# =============================================================================
def calculate_centrality_metrics(G, log_messages, use_igraph=True):
    """Compute clustering and centrality scores for every node of *G*.

    Parameters
    ----------
    G : networkx.Graph
        Graph whose edges carry ``weight`` (similarity) and ``distance``.
    log_messages : list
        Progress and error messages are appended to this list.
    use_igraph : bool, default True
        Try igraph first; on any igraph error fall back to NetworkX.

    Returns
    -------
    dict
        Keys ``'clustering'``, ``'eigenvector'``, ``'closeness'`` and
        ``'betweenness'``, each mapping node -> score. A metric whose
        NetworkX computation fails stays an empty ``defaultdict(float)``.

    Notes
    -----
    Both code paths report clustering as 0 for nodes with fewer than two
    neighbours and report normalized betweenness, so scores do not change
    meaning depending on which backend ran.
    NOTE(review): eigenvector centrality is still scaled by each library's
    own default convention - confirm whether downstream analysis needs the
    two backends to agree on that scale as well.
    """
    log_messages.append(f"Calculating centrality metrics for graph with {G.number_of_nodes()} nodes...")
    start_time = time.time()

    # Default dictionaries for metrics
    metrics = {
        'clustering': defaultdict(float),
        'eigenvector': defaultdict(float),
        'closeness': defaultdict(float),
        'betweenness': defaultdict(float)
    }

    # Use igraph for computationally intensive calculations
    if use_igraph:
        log_messages.append("Using igraph for faster centrality calculations...")

        try:
            # Convert the NetworkX graph to igraph (single pass over edges)
            node_names = list(G.nodes())
            name_to_idx = {name: idx for idx, name in enumerate(node_names)}

            edges = []
            weights = []
            distances = []
            for u, v, data in G.edges(data=True):
                edges.append((name_to_idx[u], name_to_idx[v]))
                weights.append(data.get('weight', 1.0))
                distances.append(data.get('distance', 1.0))

            g_ig = ig.Graph()
            g_ig.add_vertices(node_names)
            g_ig.add_edges(edges)
            g_ig.es['weight'] = weights
            g_ig.es['distance'] = distances

            # mode="zero": igraph's default yields NaN for vertices with
            # fewer than two neighbours, whereas nx.clustering yields 0.
            clustering = g_ig.transitivity_local_undirected(mode="zero", weights='weight')
            eigenvector = g_ig.eigenvector_centrality(weights='weight')
            closeness = g_ig.closeness(weights='distance')
            betweenness = g_ig.betweenness(weights='distance')

            # igraph returns raw pair counts; rescale to match the
            # normalized=True convention used by the NetworkX fallback
            # (undirected graph: divide by (n-1)(n-2)/2).
            n_nodes = len(node_names)
            if n_nodes > 2:
                scale = 2.0 / ((n_nodes - 1) * (n_nodes - 2))
                betweenness = [score * scale for score in betweenness]

            # Convert to dictionaries
            metrics['clustering'] = dict(zip(node_names, clustering))
            metrics['eigenvector'] = dict(zip(node_names, eigenvector))
            metrics['closeness'] = dict(zip(node_names, closeness))
            metrics['betweenness'] = dict(zip(node_names, betweenness))

            log_messages.append(f"igraph centrality calculations complete. (Time: {time.time() - start_time:.2f}s)")
            return metrics

        except Exception as e:
            log_messages.append(f"Error using igraph: {e}. Falling back to NetworkX.")

    # If igraph not available or failed, use NetworkX
    log_messages.append("Calculating centrality metrics with NetworkX...")

    # For very large graphs, sample betweenness
    k_betweenness = min(1000, G.number_of_nodes() // 10) if G.number_of_nodes() > 30000 else None

    # Each metric is computed independently so one failure does not discard
    # the others.
    try:
        metrics['clustering'] = nx.clustering(G, weight='weight')
    except Exception as e:
        log_messages.append(f"Clustering calculation failed: {e}")

    try:
        metrics['eigenvector'] = nx.eigenvector_centrality_numpy(G, weight='weight', max_iter=1000, tol=1e-6)
    except Exception as e:
        log_messages.append(f"Eigenvector centrality calculation failed: {e}")

    try:
        metrics['closeness'] = nx.closeness_centrality(G, distance='distance')
    except Exception as e:
        log_messages.append(f"Closeness centrality calculation failed: {e}")

    try:
        metrics['betweenness'] = nx.betweenness_centrality(G, weight='distance', k=k_betweenness, normalized=True)
    except Exception as e:
        log_messages.append(f"Betweenness centrality calculation failed: {e}")

    log_messages.append(f"NetworkX centrality calculations complete. (Time: {time.time() - start_time:.2f}s)")
    return metrics

def calculate_word_statistics(G, word_sets, language, log_messages):
    """
    Compute per-word network statistics for several groups of words.

    Centrality metrics are computed on the largest connected component (LCC)
    of the graph; words outside the LCC receive 0.0 for those metrics. The
    weighted degree is taken from the full graph (0 for absent words).

    Parameters:
    -----------
    G : NetworkX graph
        The semantic network
    word_sets : dict
        Maps a word-type label (e.g. 'Cue', 'Associate') to a collection of words
    language : str
        Language code stored in every result row
    log_messages : list
        Progress messages are appended to this list

    Returns:
    --------
    dict
        Maps each word-type label to a list of per-word statistic dicts;
        empty when the graph has no nodes or no components
    """
    if not G or G.number_of_nodes() == 0:
        log_messages.append(f"Graph for {language} is empty. Cannot calculate statistics.")
        return {}

    total_words = sum(len(words) for words in word_sets.values())
    log_messages.append(f"Calculating statistics for {total_words} words in {language} network...")
    began = time.time()

    log_messages.append("Finding connected components...")
    components = list(nx.connected_components(G))
    if not components:
        log_messages.append("No connected components found in graph.")
        return {}

    lcc_nodes = max(components, key=len)
    lcc_graph = G.subgraph(lcc_nodes).copy()
    log_messages.append(f"Largest connected component has {lcc_graph.number_of_nodes()} nodes and {lcc_graph.number_of_edges()} edges")

    centrality = calculate_centrality_metrics(lcc_graph, log_messages)

    # Output column -> key in the centrality result, in output order.
    centrality_columns = (
        ('Clustering_Coefficient', 'clustering'),
        ('Eigenvector_Centrality', 'eigenvector'),
        ('Closeness_Centrality', 'closeness'),
        ('Betweenness_Centrality', 'betweenness'),
    )

    results = {}
    for word_type, words in word_sets.items():
        rows = []

        for word in tqdm(words, desc=f"Processing {word_type} words", leave=False):
            key = normalize_text(str(word))
            if pd.isna(key):
                continue

            in_graph = key in G
            in_lcc = key in lcc_nodes

            row = {
                'Word': word,
                'Word_Type': word_type,
                'Language': language,
            }
            for column, metric_name in centrality_columns:
                # Centralities exist only for LCC members.
                row[column] = centrality[metric_name].get(key, 0.0) if in_lcc else 0.0
            row['Is_In_Graph'] = in_graph
            row['Is_In_LCC'] = in_lcc
            row['Degree'] = G.degree(key, weight='weight') if in_graph else 0

            rows.append(row)

        results[word_type] = rows

    log_messages.append(f"Statistics calculation complete. (Time: {time.time() - began:.2f}s)")
    return results

# =============================================================================
# Network Saving Functions
# =============================================================================
def save_network(G, language_code, output_dir):
    """Persist a network and a small metadata summary for one language.

    Files are written below ``<output_dir>/network_<language_code>``:
    a pickle of the full graph, a pickle of the largest connected component
    (only when it has more than 100 nodes) and a JSON file with node/edge
    counts, density and component information.

    Returns a dict with the paths written so far under the keys ``'pickle'``,
    ``'lcc_pickle'`` and ``'meta'``. Errors are printed with a traceback and
    do not propagate.
    """
    lang_dir = os.path.join(output_dir, f"network_{language_code}")
    os.makedirs(lang_dir, exist_ok=True)
    print(f"Saving network for {language_code} with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges...")

    paths = {}

    try:
        # Full graph
        full_pickle = os.path.join(lang_dir, f"{language_code}_network.pickle")
        with open(full_pickle, 'wb') as handle:
            pickle.dump(G, handle)
        paths['pickle'] = full_pickle

        # Largest connected component, if it is big enough to be worth keeping
        components = list(nx.connected_components(G))
        largest = max(components, key=len) if components else None
        if largest is not None and len(largest) > 100:
            lcc_pickle = os.path.join(lang_dir, f"{language_code}_lcc_network.pickle")
            with open(lcc_pickle, 'wb') as handle:
                pickle.dump(G.subgraph(largest).copy(), handle)
            paths['lcc_pickle'] = lcc_pickle

        # Summary metadata
        meta = {
            'nodes': G.number_of_nodes(),
            'edges': G.number_of_edges(),
            'density': nx.density(G),
            'components': len(components),
            'largest_component_size': 0 if largest is None else len(largest),
        }
        meta_file = os.path.join(lang_dir, f"{language_code}_network_meta.json")
        with open(meta_file, 'w') as handle:
            json.dump(meta, handle, indent=2)
        paths['meta'] = meta_file

    except Exception as e:
        print(f"Error saving network: {e}")
        traceback.print_exc()

    return paths

def save_word_statistics(word_stats, language_code, output_dir):
    """Write per-type and combined word statistics as CSV files.

    *word_stats* maps a word-type label to a list of row dicts. One CSV per
    type plus one combined CSV are written to ``<output_dir>/statistics``.

    Returns ``{'combined': path, 'by_type': {word_type: path, ...}}``.
    """
    stats_dir = os.path.join(output_dir, "statistics")
    os.makedirs(stats_dir, exist_ok=True)

    by_type_paths = {}
    combined_rows = []

    for word_type, rows in word_stats.items():
        combined_rows.extend(rows)

        # One file per word type, named after the lower-cased label.
        csv_path = os.path.join(stats_dir, f"{language_code}_{word_type.lower()}_statistics.csv")
        pd.DataFrame(rows).to_csv(csv_path, index=False)
        print(f"Saved {word_type} statistics to {csv_path}")
        by_type_paths[word_type] = csv_path

    # All word types together
    combined_path = os.path.join(stats_dir, f"{language_code}_all_word_statistics.csv")
    pd.DataFrame(combined_rows).to_csv(combined_path, index=False)
    print(f"Saved combined statistics to {combined_path}")

    return {'combined': combined_path, 'by_type': by_type_paths}

# =============================================================================
# Language Processing Pipeline
# =============================================================================
def process_language(language_code, ft_model_path, cue_words, associate_words, num_top_words,
                    sim_threshold, output_dir, n_workers=None):
    """
    Process a language end-to-end in a unified pipeline:
    1. Load FastText model
    2. Get embeddings for all words (top frequency + cues + associates)
    3. Build a single semantic network with all words
    4. Calculate network statistics for cue and associate words
    5. Save network and statistics

    Parameters
    ----------
    language_code : str
        Short code used in log messages and output file names.
    ft_model_path : str
        Path of the FastText model for this language.
    cue_words, associate_words : iterable of str
        Words whose statistics are reported; all of them are added to the network.
    num_top_words : int
        Number of leading model-vocabulary words to include.
    sim_threshold : float
        Similarity above which two words are connected.
    output_dir : str
        Directory for saved files; nothing is saved when it is falsy.
    n_workers : int, optional
        Worker count; defaults to half of the CPU cores (min. 1).

    Returns
    -------
    tuple
        ``(results, log_messages)``. ``results`` has the keys
        ``'network_paths'``, ``'statistics_paths'``, ``'word_stats'`` and
        ``'success'``. Any exception is caught, logged with its traceback,
        and leaves ``'success'`` False.
    """
    if n_workers is None:
        n_workers = max(1, multiprocessing.cpu_count() // 2)

    log_messages = []
    log_messages.append(f"\n--- Starting Unified Pipeline for {language_code} ---")

    results = {
        'network_paths': {},
        'statistics_paths': {},
        'word_stats': {},
        'success': False
    }

    try:
        # 1. Load FastText model
        ft_model = load_fasttext_model(ft_model_path, log_messages)
        if ft_model is None:
            log_messages.append(f"Failed to load FastText model for {language_code}. Aborting pipeline.")
            return results, log_messages

        # 2. Get embeddings for all words at once
        all_custom_words = set(cue_words) | set(associate_words)
        all_words, embeddings = get_combined_word_embeddings(
            ft_model,
            num_top_words,
            custom_words=all_custom_words,
            n_jobs=n_workers,
            log_messages=log_messages
        )

        # The model is only needed to produce the embeddings. Release it now
        # so it does not occupy memory during network construction and the
        # centrality calculations, which are the memory-heavy steps.
        del ft_model
        gc.collect()

        # 3. Build semantic network
        G = build_semantic_network(
            all_words,
            embeddings,
            sim_threshold,
            log_messages,
            n_jobs=n_workers
        )

        # Free up memory
        del embeddings
        gc.collect()

        # 4. Calculate statistics for different word types
        word_sets = {
            'Cue': set(cue_words),
            'Associate': set(associate_words)
        }

        word_stats = calculate_word_statistics(
            G,
            word_sets,
            language_code,
            log_messages
        )
        results['word_stats'] = word_stats

        # 5. Save network and statistics
        if output_dir:
            results['network_paths'] = save_network(G, language_code, output_dir)

            if word_stats:
                results['statistics_paths'] = save_word_statistics(word_stats, language_code, output_dir)

        results['success'] = True
        log_messages.append(f"--- Completed Unified Pipeline for {language_code} Successfully ---")

    except Exception as e:
        # Report the failure through the log instead of raising so the caller
        # can continue with the remaining languages.
        log_messages.append(f"Error in {language_code} pipeline: {e}")
        log_messages.append(traceback.format_exc())

    return results, log_messages

# =============================================================================
# Main Execution Block
# =============================================================================
# Per-word metrics copied onto each cue-associate pair, in output column order.
_PAIR_METRICS = (
    'Clustering_Coefficient', 'Eigenvector_Centrality',
    'Closeness_Centrality', 'Betweenness_Centrality',
    'Is_In_Graph', 'Is_In_LCC', 'Degree',
)
# Metrics holding booleans; their columns must not be float-typed.
_PAIR_FLAG_METRICS = ('Is_In_Graph', 'Is_In_LCC')


def _build_metric_lookup(stats):
    """Index per-word statistic rows by normalized word (later duplicates win)."""
    lookup = {}
    for stat in stats:
        word_norm = normalize_text(str(stat['Word']))
        if pd.isna(word_norm):
            continue
        lookup[word_norm] = stat
    return lookup


def _apply_role_metrics(final_df, row_mask, role, lookup):
    """Fill the ``<role>_<metric>`` columns of the masked rows from *lookup*."""
    keys = final_df.loc[row_mask, role].map(lambda word: normalize_text(str(word)))
    for metric in _PAIR_METRICS:
        per_word = {word: stat[metric] for word, stat in lookup.items()}
        # Words absent from the lookup map to NaN, i.e. stay "no metrics".
        final_df.loc[row_mask, f"{role}_{metric}"] = keys.map(per_word)


def main():
    """Run the full pipeline for every language whose model file exists.

    Steps: load the cue-associate data, build and analyse one semantic
    network per language, then write ``cue_associate_network_metrics.csv``
    to ``OUTPUT_DIR`` with the cue and associate metrics attached to every
    cue-associate pair. Progress is printed to stdout.
    """
    print(f"CPU Count: {multiprocessing.cpu_count()}")
    print(f"Using {N_WORKERS} worker processes")

    # Create output directory
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print(f"Output directory: {OUTPUT_DIR}")

    # Load Planchuelo data
    main_log = []
    cue_words_by_lang, associate_words_by_lang, cue_associate_pairs = load_planchuelo_data(
        PLANCHUELO_CSV_PATH, main_log
    )

    # Print setup info
    for msg in main_log:
        print(msg)

    if not cue_words_by_lang:
        print("Error: No cue words loaded. Cannot continue.")
        return

    # (language code, language name as it appears in the data, model path)
    language_specs = (
        ('en', 'english', FASTTEXT_EN_MODEL_PATH),
        ('es', 'spanish', FASTTEXT_ES_MODEL_PATH),
    )
    language_code_map = {full: code for code, full, _ in language_specs}

    # Define language tasks (only for models that are actually present)
    tasks = []
    for code, full, model_path in language_specs:
        if not os.path.exists(model_path):
            print(f"WARNING: {full.capitalize()} FastText model not found at {model_path}")
            continue
        tasks.append({
            'language_code': code,
            'language_full': full,
            'model_path': model_path,
            'cue_words': cue_words_by_lang.get(full, set()),
            'associate_words': associate_words_by_lang.get(full, set())
        })

    if not tasks:
        print("No language tasks to process. Check FastText model paths.")
        return

    # Results container
    all_results = {}

    # Process each language
    for task in tasks:
        print(f"\nProcessing {task['language_full']} language...")

        results, logs = process_language(
            language_code=task['language_code'],
            ft_model_path=task['model_path'],
            cue_words=task['cue_words'],
            associate_words=task['associate_words'],
            num_top_words=NUM_TOP_WORDS,
            sim_threshold=SIMILARITY_THRESHOLD,
            output_dir=OUTPUT_DIR,
            n_workers=N_WORKERS
        )

        # Print logs
        for log in logs:
            print(log)

        all_results[task['language_code']] = results

    # Create final merged dataset with cue-associate pairs and network metrics
    print("\nCreating final cue-associate dataset with network metrics...")

    try:
        # Start with the original cue-associate pairs
        final_df = cue_associate_pairs.copy()

        # Add language code mapping
        final_df['Language_Code'] = final_df['Language'].map(language_code_map)

        # Create empty columns for all metrics. Flag columns are created with
        # object dtype: writing booleans into a float64 NaN column is an
        # incompatible-dtype assignment that newer pandas versions reject.
        for role in ('Cue', 'Associate'):
            for metric in _PAIR_METRICS:
                column = f"{role}_{metric}"
                if metric in _PAIR_FLAG_METRICS:
                    final_df[column] = pd.Series(np.nan, index=final_df.index, dtype=object)
                else:
                    final_df[column] = np.nan

        code_to_language = {code: full for full, code in language_code_map.items()}

        # Process languages one by one
        for lang_code, lang_results in all_results.items():
            if not lang_results['success'] or not lang_results['word_stats']:
                print(f"Skipping language {lang_code} - missing results")
                continue

            lang_full = code_to_language.get(lang_code)
            if not lang_full:
                print(f"Skipping language code {lang_code} - no matching language name")
                continue

            print(f"Processing metrics for language: {lang_full}")

            lang_mask = final_df['Language'] == lang_full
            word_stats = lang_results['word_stats']

            for role in ('Cue', 'Associate'):
                if role not in word_stats:
                    continue
                print(f"Processing {role.lower()} metrics for {lang_full}")
                lookup = _build_metric_lookup(word_stats[role])
                # Column-wise fill for all rows of this language at once
                # instead of assigning cell by cell.
                if lookup and lang_mask.any():
                    _apply_role_metrics(final_df, lang_mask, role, lookup)

            print(f"Applied metrics for {lang_full} language")

        # Save final dataset
        final_path = os.path.join(OUTPUT_DIR, "cue_associate_network_metrics.csv")
        final_df.to_csv(final_path, index=False)
        print(f"Saved final cue-associate network metrics to: {final_path}")

        # Print summary statistics
        print("\nFinal Dataset Summary:")
        print(f"Total cue-associate pairs: {len(final_df)}")
        print(f"Pairs with cue metrics: {final_df['Cue_Clustering_Coefficient'].notna().sum()}")
        print(f"Pairs with associate metrics: {final_df['Associate_Clustering_Coefficient'].notna().sum()}")

    except Exception as e:
        print(f"Error creating final dataset: {e}")
        traceback.print_exc()

    print("\nNetwork analysis complete!")

# Run the pipeline only when this file is executed directly (as a script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()