In [2]:
# autoreload
%reload_ext autoreload
%autoreload 2

import pickle

import pandas as pd
from pathlib import Path
from tqdm import tqdm

# 0. load data

In [3]:
DATA_DIR = Path('../data')

In [4]:
# 2024/9/1 - 2025/2/14 jobs.ps
# NOTE: pickle executes arbitrary code on load; only open files you produced yourself.
# The context manager closes the file handle as soon as the data is read.
with open(DATA_DIR / 'palestine_data.pkl', 'rb') as f:
    data = pickle.load(f)

In [None]:
len(data)

In [None]:
# Turn the dictionary into a DataFrame (one row per key of `data`, key as the index)
# and keep only the columns we need: job_id, title, job_description, job_requirements.
# .copy() makes `df` an independent frame, so the column assignments made in later
# cells do not operate on a slice of the full frame (SettingWithCopyWarning).
df = pd.DataFrame.from_dict(data, orient='index')[
    ['job_id', 'title', 'job_description', 'job_requirements']
].copy()
df.head()


In [None]:
# stats length of job_description as word count
df['job_description'].apply(lambda x: len(x.split())).describe()

In [None]:
df['job_requirements'].apply(lambda x: len(x.split())).describe()

In [None]:
# combine job_description and job_requirements
df['job_description_full'] = df['job_description'] + '\n' + df['job_requirements']
df['job_description_full'].apply(lambda x: len(x.split())).describe()
# 0.1 clean text

In [9]:
from text_preprocess import TextCleaner

cleaner = TextCleaner()
df['job_description_full_cleaned'] = df['job_description_full'].apply(cleaner.clean_text)


In [None]:
df['job_description_full_cleaned'].apply(lambda x: len(x.split())).describe()

# 1. get embedding

In [5]:
DEVICE = 'cuda:1'
MODEL_NAME = 'Qwen/Qwen3-Embedding-8B'
MAX_LENGTH = 4096

In [12]:
def get_embeddings(text, model, tokenizer, max_length=None, device=None):
    """Embed ``text`` by mean-pooling the model's last hidden state.

    Padding positions are excluded from the mean (via the attention mask), so a
    text gets the same embedding whether it is embedded alone or in a padded batch.

    Args:
        text: A string or a list of strings.
        model: Pre-loaded model whose output exposes ``last_hidden_state``.
        tokenizer: Pre-loaded tokenizer matching ``model``.
        max_length: Truncation length in tokens; defaults to the notebook-level
            ``MAX_LENGTH``.
        device: Device the inputs are moved to; defaults to the notebook-level
            ``DEVICE``.

    Returns:
        numpy array with one pooled embedding row per input text.
    """
    import torch  # local import: this cell runs before the notebook imports torch

    if max_length is None:
        max_length = MAX_LENGTH
    if device is None:
        device = DEVICE

    inputs = tokenizer(text, padding=True, truncation=True, return_tensors='pt', max_length=max_length)
    # Move inputs to the same device as the model
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Inference only: skip autograd bookkeeping to save memory.
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    attention_mask = inputs.get('attention_mask')
    if attention_mask is None:
        pooled = hidden.mean(dim=1)
    else:
        # Accumulate in float32 so half-precision sums cannot overflow, then
        # cast back so the returned dtype matches the model output.
        mask = attention_mask.unsqueeze(-1).to(torch.float32)
        summed = (hidden.to(torch.float32) * mask).sum(dim=1)
        pooled = (summed / mask.sum(dim=1).clamp(min=1.0)).to(hidden.dtype)

    # Move output back to CPU for numpy conversion
    return pooled.detach().cpu().numpy()

In [13]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from torch.nn.utils.rnn import pad_sequence
import warnings

def get_embeddings_flash_attention(texts, model, tokenizer, max_length=4096, batch_size=1):
    """
    Get mean-pooled embeddings, with padding handled correctly for both attention paths.

    When the model was loaded with FlashAttention2, every text is embedded on its
    own so no padding is introduced. Otherwise texts are embedded in padded
    batches and padding positions are excluded from the mean through the
    attention mask.

    Args:
        texts: List of text strings or single text string
        model: Pre-loaded model
        tokenizer: Pre-loaded tokenizer
        max_length: Maximum sequence length
        batch_size: Batch size for processing

    Returns:
        numpy array of embeddings, one row per text. Empty input yields an array
        with zero rows instead of raising inside ``np.vstack``.
    """
    if isinstance(texts, str):
        texts = [texts]

    if len(texts) == 0:
        return np.empty((0, getattr(model.config, 'hidden_size', 0)), dtype=np.float32)

    device = next(model.parameters()).device
    use_flash = getattr(model.config, '_attn_implementation', None) == 'flash_attention_2'
    embeddings = []

    # Inference only: one no_grad scope covers every forward pass below.
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]

            if use_flash:
                # Process each text individually to avoid padding issues.
                pooled = []
                for text in batch_texts:
                    inputs = tokenizer(text, truncation=True, return_tensors='pt', max_length=max_length)
                    inputs = {k: v.to(device) for k, v in inputs.items()}
                    # No padding here, so a plain mean over the sequence is exact.
                    pooled.append(model(**inputs).last_hidden_state.mean(dim=1))
                batch_embeddings = torch.cat(pooled, dim=0)
            else:
                # Standard batched processing with padding
                inputs = tokenizer(batch_texts, padding=True, truncation=True,
                                   return_tensors='pt', max_length=max_length)
                inputs = {k: v.to(device) for k, v in inputs.items()}
                hidden = model(**inputs).last_hidden_state

                attention_mask = inputs.get('attention_mask')
                if attention_mask is None:
                    batch_embeddings = hidden.mean(dim=1)
                else:
                    # Masked mean: padded positions must not dilute the embedding.
                    # Accumulate in float32 to avoid half-precision overflow.
                    mask = attention_mask.unsqueeze(-1).to(torch.float32)
                    summed = (hidden.to(torch.float32) * mask).sum(dim=1)
                    batch_embeddings = (summed / mask.sum(dim=1).clamp(min=1.0)).to(hidden.dtype)

            embeddings.append(batch_embeddings.cpu().numpy())

    return np.vstack(embeddings)


def load_model_with_fallback(model_name, device='cuda', try_flash_attention=True):
    """
    Load model with FlashAttention2 if available, otherwise fallback to standard attention.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.
        device: Device the loaded model is moved to.
        try_flash_attention: Attempt the FlashAttention2 load first when True.

    Returns:
        Tuple ``(model, tokenizer)``; the model is loaded in float16 and already
        moved to ``device``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    model = None
    if try_flash_attention:
        try:
            model = AutoModel.from_pretrained(
                model_name,
                attn_implementation="flash_attention_2",
                torch_dtype=torch.float16
            )
            print("✅ Successfully loaded model with FlashAttention2")
        except (ImportError, ValueError) as e:
            # ImportError: flash-attn is missing; ValueError: transformers rejects
            # FlashAttention2 for this model/setup. Both are recoverable.
            print(f"⚠️  FlashAttention2 not available: {e}")
            print("📦 Install with: pip install flash-attn --no-build-isolation")
            print("🔄 Falling back to standard attention...")

    if model is None:
        # Single standard-attention path, shared by the fallback and by
        # try_flash_attention=False.
        model = AutoModel.from_pretrained(model_name, torch_dtype=torch.float16)

    model = model.to(device)
    return model, tokenizer

In [6]:
output_path = DATA_DIR / "palestine_train_test_data_embedding.pkl"

# The embeddings are cached on disk: they are only computed when the cache file
# is missing (delete the file to force a recompute).
if not output_path.exists():
    # load_model_with_fallback already moves the model to DEVICE.
    model, tokenizer = load_model_with_fallback(MODEL_NAME, DEVICE, try_flash_attention=True)

    def _embed_each(texts, desc):
        """Embed every text on its own; each call returns one row, so each entry is a (1, hidden) array."""
        return [
            get_embeddings_flash_attention(text, model, tokenizer, max_length=MAX_LENGTH)
            for text in tqdm(texts, desc=desc)
        ]

    titles = df['title'].tolist()
    descriptions = df['job_description_full_cleaned'].tolist()
    full_texts = [title + '\n' + description for title, description in zip(titles, descriptions)]

    df['title_qwen3_8b_emb'] = _embed_each(titles, 'title')
    df['description_qwen3_8b_emb'] = _embed_each(descriptions, 'description')
    df['full_text_qwen3_8b_emb'] = _embed_each(full_texts, 'title + description')
    print(df.head(1))
    df.to_pickle(output_path)

# Always continue from the cached frame so fresh and cached runs behave the same.
# pd.read_pickle opens and closes the file itself.
df = pd.read_pickle(output_path)

In [7]:
# split into train and test
from sklearn.model_selection import train_test_split

# 90/10 split with a fixed seed; both parts get a fresh 0..n-1 index before saving.
df_train, df_test = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=0.1, random_state=42)
)

for part, file_name in (
    (df_train, "palestine_train_data_embedding.pkl"),
    (df_test, "palestine_test_data_embedding.pkl"),
):
    part.to_pickle(DATA_DIR / file_name)


# 2. check embedding similarity

In [None]:

# Sanity check: nearest neighbours of one training title using title embeddings only.
# Rows are L2-normalized so the dot product below is the cosine similarity.
title_embeddings = np.vstack(df_train['title_qwen3_8b_emb'].values)
title_embeddings_norm = np.linalg.norm(title_embeddings, axis=1, keepdims=True)
title_embeddings = title_embeddings / title_embeddings_norm

# Row of df_train used as the query.
idx = 4

target_title_embedding = title_embeddings[idx]

print(df_train.iloc[idx]['title'])

scores = target_title_embedding @ title_embeddings.T

# Indices of the top_k highest scores, best first. The query row is part of the
# candidate set, so it is expected to appear in the result.
top_k = 10
top_k_indices = np.argsort(scores)[-top_k:][::-1]
top_k_scores = scores[top_k_indices]

top_k_titles = df_train.iloc[top_k_indices]['title'].tolist()

top_k_titles

In [None]:

# Same nearest-neighbour check, using title and full-text embeddings concatenated.
full_text_embeddings = np.vstack(df_train['full_text_qwen3_8b_emb'].values)
full_text_embeddings_norm = np.linalg.norm(full_text_embeddings, axis=1, keepdims=True)
full_text_embeddings = full_text_embeddings / full_text_embeddings_norm


# NOTE: `title_embeddings` is not defined in this cell; it is the normalized matrix
# left behind by the title-only similarity cell, which must have been run first.
embeddings = np.concatenate([title_embeddings, full_text_embeddings], axis=1)
embeddings_norm = np.linalg.norm(embeddings, axis=1, keepdims=True)
embeddings = embeddings / embeddings_norm
# Row of df_train used as the query.
idx = 4

target_embedding = embeddings[idx]

print(df_train.iloc[idx]['title'])

# Rows are unit length, so this dot product is the cosine similarity.
scores = target_embedding @ embeddings.T

# Indices of the top_k highest scores, best first (the query row is included).
top_k = 10
top_k_indices = np.argsort(scores)[-top_k:][::-1]
top_k_scores = scores[top_k_indices]

top_k_titles = df_train.iloc[top_k_indices]['title'].tolist()

top_k_titles

In [None]:
# Same nearest-neighbour check, using a weighted title/description concatenation.
text_embeddings = np.vstack(df_train['description_qwen3_8b_emb'].values)
text_embeddings_norm = np.linalg.norm(text_embeddings, axis=1, keepdims=True)
text_embeddings = text_embeddings / text_embeddings_norm


# Title weighted 0.8, description 0.2.
# NOTE: `title_embeddings` is not defined in this cell; it is the normalized matrix
# left behind by the title-only similarity cell, which must have been run first.
embeddings = np.concatenate([title_embeddings*0.8, text_embeddings*0.2], axis=1)
embeddings_norm = np.linalg.norm(embeddings, axis=1, keepdims=True)
embeddings = embeddings / embeddings_norm
# Row of df_train used as the query.
idx = 4

target_embedding = embeddings[idx]

print(df_train.iloc[idx]['title'])

# Rows are unit length, so this dot product is the cosine similarity.
scores = target_embedding @ embeddings.T

# Indices of the top_k highest scores, best first (the query row is included).
top_k = 10
top_k_indices = np.argsort(scores)[-top_k:][::-1]
top_k_scores = scores[top_k_indices]

top_k_titles = df_train.iloc[top_k_indices]['title'].tolist()

top_k_titles

In [None]:


from sklearn.cluster import AffinityPropagation
from sklearn.metrics import silhouette_score

# Despite the name, no de-duplication happens here: only rows missing any of the
# three embedding columns are dropped.
df_train_deduplicated = df_train.dropna(subset=['title_qwen3_8b_emb', 'full_text_qwen3_8b_emb', 'description_qwen3_8b_emb']).reset_index(drop=True)

title_embeddings = np.vstack(df_train_deduplicated['title_qwen3_8b_emb'].values)
full_text_embeddings = np.vstack(df_train_deduplicated['full_text_qwen3_8b_emb'].values)
text_embeddings = np.vstack(df_train_deduplicated['description_qwen3_8b_emb'].values)


# Weighted concatenation (title 0.8, description 0.2) of the raw embeddings; only
# the concatenated vector is normalized here, not the two parts individually.
embeddings = np.concatenate([title_embeddings*0.8, text_embeddings*0.2], axis=1)
embeddings_norm = np.linalg.norm(embeddings, axis=1, keepdims=True)
embeddings = embeddings / embeddings_norm

# Rows are unit length, so this is the pairwise cosine-similarity matrix.
symmetrized_similarities = embeddings @ embeddings.T

af = AffinityPropagation(affinity='precomputed', random_state=0) # the input is already a similarity matrix
af.fit(symmetrized_similarities)
labels = af.labels_
n_clusters_ = len(af.cluster_centers_indices_)
print(f"Estimated number of clusters: {n_clusters_}")


# silhouette_score with metric='precomputed' expects distances, so convert the
# similarities; the 1.01 offset presumably keeps off-diagonal distances positive.
# The diagonal (self-distance) is then reset to 0.
D = 1.01 - symmetrized_similarities
np.fill_diagonal(D, 0)
score = silhouette_score(D, labels, metric='precomputed')
print(score)
# print out the center exemplars of the AP model
titles = []
for i in range(n_clusters_):
    titles.append(df_train_deduplicated.iloc[af.cluster_centers_indices_[i]]['title'])
titles.sort()
titles


# 3. Affinity propagation clustering with xgboost features
- get the xgboost model:
    - first generate the training data using: same_occupation_job_pair_sampling.py
    - then to train the xgboost model, run: same_occupation_classification.py
- to get the clustering results, run clustering.py

In [None]:
import numpy as np
import pickle
from pathlib import Path


TMP_DIR = Path("../tmp")
CLUSTER_DIR = TMP_DIR / "xgbt_clustering"
ALPHA = 0.8

"""
Estimated number of clusters: 117
silhouette score: 0.335116
"""
#cleaned
"""
Estimated number of clusters: 118
silhouette score: 0.360974
"""

## 3.1 get training cluster id mapping

In [30]:
# qwen 8b
model_path = CLUSTER_DIR / "palestine_ap_model_qwen3_8b.pkl"
# NOTE: pickle executes arbitrary code on load; only open files you produced yourself.
with open(model_path, 'rb') as f:
    af = pickle.load(f)

# Weighted concatenation of the title (ALPHA) and description (1 - ALPHA) embeddings.
title_part = np.vstack(df_train['title_qwen3_8b_emb'].values) * ALPHA
description_part = np.vstack(df_train['description_qwen3_8b_emb'].values) * (1 - ALPHA)
embeddings = np.concatenate([title_part, description_part], axis=1)

# Rows of `embeddings` that the fitted model selected as cluster exemplars.
cluster_centers_indices = af.cluster_centers_indices_
exemplars = embeddings[cluster_centers_indices]


In [None]:
from collections import defaultdict

# Position of each fitted sample -> cluster label assigned by the AP model.
id2label = dict(enumerate(af.labels_))
print(len(id2label))

# Invert the mapping: cluster label -> list of sample positions in that cluster.
members_by_cluster = defaultdict(list)
for sample_position, cluster_label in id2label.items():
    members_by_cluster[cluster_label].append(sample_position)

# Plain dict so a missing key raises instead of silently creating an empty cluster.
cluster2ids = dict(members_by_cluster)

# Print the number of clusters and an example
print(f"Number of clusters: {len(cluster2ids)}")
print(f"Example cluster contents (first cluster): {list(cluster2ids.values())[0][:5]}")  # Show first 5 IDs of first cluster

In [34]:
# save cluster2ids
# The context manager guarantees the file is flushed and closed after writing.
with open(CLUSTER_DIR / 'palestine_cluster2id_mapping.pkl', 'wb') as f:
    pickle.dump(cluster2ids, f)

# 4. Annotation with LLM (gemini)

In [None]:
import os
import sys
sys.path.append('../src')

from google import genai
from google.genai import types
from dotenv import load_dotenv
load_dotenv()

from LLM_annotation import annotate_occupations
from get_prompts import get_truncated_prompts
from prompts import instruction_v4 # arabic aware

client = genai.Client(api_key=os.getenv('GOOGLE_API_KEY'))

In [7]:
df_train_embed = pd.read_pickle('../data/palestine_train_data_embedding.pkl')

# rename title to job_title
# Reassigning (instead of inplace=True) keeps this cell safe to re-run.
df_train_embed = df_train_embed.rename(columns={'title': 'job_title'})

# get prompts
prompts = get_truncated_prompts(df_train_embed, root=None, k=15, use_mmr=True, lambda_param=0.5, instruction=instruction_v4, column_description='job_description_full', use_mapping='../tmp/xgbt_clustering/palestine_cluster2id_mapping.pkl')

# save prompts
# The context manager guarantees the file is flushed and closed after writing.
with open('../tmp/palestine_prompts_118.pkl', 'wb') as f:
    pickle.dump(prompts, f)

In [None]:
results = annotate_occupations(prompts, 'palestine_gemini25_occupations_118.pkl', client, model="gemini-2.5-flash-preview-05-20", temperature=0., max_tokens=4000, provider="google")

In [None]:
import json
import re

def robust_json_parser(text: str):
    """
    Extracts a JSON value from a string that might be wrapped in markdown code fences
    or other text, and parses it.

    Parsing is attempted in this order:
      1. the whole (stripped) string,
      2. the contents of the first ``` fenced block, if any,
      3. the JSON object that starts at the first '{', ignoring any text after it.

    Steps 1 and 2 keep top-level arrays intact; step 3 tolerates trailing prose
    that itself contains braces.

    Args:
        text: The input string containing a JSON object.

    Returns:
        The parsed Python dictionary or list if successful, otherwise None.
        Non-string input also returns None.
    """
    if not isinstance(text, str):
        print("Error: The string does not contain a valid JSON object.")
        return None

    candidates = [text.strip()]
    fenced = re.search(r'```(?:json)?\s*(.*?)```', text, re.DOTALL | re.IGNORECASE)
    if fenced:
        candidates.append(fenced.group(1).strip())

    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            continue

    start = text.find('{')
    if start == -1:
        print("Error: The string does not contain a valid JSON object.")
        return None

    try:
        # raw_decode parses exactly one JSON value starting at `start` and stops
        # there, so trailing text after the object cannot break the parse.
        value, _end = json.JSONDecoder().raw_decode(text, start)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        return None
    return value



In [36]:
# parse results
# `results` is used as a dict elsewhere in this notebook (`results.items()`), so
# iterate its items instead of assuming the keys are exactly 0..len(results)-1.
for key, record in results.items():
    # Already parsed on a previous run of this cell: leave it alone so the cell
    # stays re-runnable.
    if not isinstance(record['annotation'], str):
        continue
    parsed_data = robust_json_parser(record['annotation'])
    if parsed_data:
        record['annotation'] = parsed_data
    else:
        print(f"Failed to parse annotation for job {record['job_id']}")

In [38]:
# save results
# The context manager guarantees the file is flushed and closed after writing.
with open('palestine_gemini25_occupations_118.pkl', 'wb') as f:
    pickle.dump(results, f)

# 5. normalize the annotation

In [3]:
import json
import os
import re
import pickle
import nltk # For lemmatization
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet # For POS tagging

In [None]:
# --- NLTK Setup (run once if not already downloaded) ---
# Maps the path nltk.data.find() looks up to the package nltk.download() fetches.
_NLTK_RESOURCES = {
    'corpora/wordnet.zip': 'wordnet',
    'taggers/averaged_perceptron_tagger.zip': 'averaged_perceptron_tagger',
    'tokenizers/punkt.zip': 'punkt',
}
for resource_path, package in _NLTK_RESOURCES.items():
    try:
        nltk.data.find(resource_path)
    except LookupError:
        # nltk.data.find raises LookupError for a missing resource; catching only
        # that keeps KeyboardInterrupt and genuine errors visible.
        nltk.download(package)

In [None]:
# Collect annotated titles that combine several occupations ('+') or are marked
# as miscellaneous ('MISC'); `gids` records the keys of the '+' clusters.
conjunctions = []
gids = []
misc = []
for k, v in results.items():
    annotation = v['annotation']
    if isinstance(annotation, list):
        title = annotation[0]['occupation_title']
    elif isinstance(annotation, dict):
        title = annotation['occupation_title']
    else:
        # e.g. an annotation that failed to parse and is still a raw string.
        # Without this branch `title` would be stale from the previous iteration
        # (or undefined on the first one).
        print(f"Skipping cluster {k}: unexpected annotation type {type(annotation)}")
        continue
    if '+' in title:
        conjunctions.append(title)
        gids.append(k)
    if 'MISC' in title:
        misc.append(title)
print(len(conjunctions))
for c in conjunctions:
    print(c)
print('-'*100)
print(len(misc))
for m in misc:
    print(m)

# The context manager guarantees the file is flushed and closed after writing.
with open('palestine_conjunction_cluster_ids.pkl', 'wb') as f:
    pickle.dump(gids, f)

In [43]:
with open('../tmp/xgbt_clustering/palestine_cluster2id_mapping.pkl', 'rb') as f:
    idmapping = pickle.load(f)

# Training-row ids belonging to the conjunction clusters.
# (`gid` instead of `id`, so the builtin id() is not shadowed in the kernel.)
conjunction_data = []
for gid in gids:
    conjunction_data.extend(idmapping[gid])

# save the conjunction data
with open('palestine_conjunction_data_ids.pkl', 'wb') as f:
    pickle.dump(conjunction_data, f)

# remove df train data
df_train_embed = df_train_embed[~df_train_embed.index.isin(conjunction_data)]
print(len(df_train_embed))
df_train_embed.to_pickle('palestine_train_data_no_conjunctions.pkl')

# remove results
excluded_gids = set(gids)  # set membership instead of scanning the list per key
results_keep = {k: v for k, v in results.items() if k not in excluded_gids}
print(len(results_keep))

# Count distinct vs. total annotated titles among the kept clusters.
all_titles = []
for k, v in results_keep.items():
    annotation = v['annotation']
    if isinstance(annotation, list):
        all_titles.append(annotation[0]['occupation_title'])
    elif isinstance(annotation, dict):
        all_titles.append(annotation['occupation_title'])
print(len(set(all_titles)), len(all_titles))

In [12]:
class ExaminerAgent:
    """Normalizes occupation titles: CamelCase splitting, lowercasing,
    punctuation removal and WordNet lemmatization."""

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        # Compiled once; normalize_title() is called for every title.
        self.camel_case_splitter = re.compile(r'(?<=[a-z0-9])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])')
        # Drops everything except word characters, whitespace, '/' and '-'.
        self.punctuation_remover = re.compile(r'[^\w\s/-]')
        self.multiple_space_reducer = re.compile(r'\s+')

    def _get_wordnet_pos(self, word):
        """Return the WordNet POS constant for ``word`` (noun when the tag is not J/N/V/R)."""
        tagged = nltk.pos_tag([word])
        first_letter = tagged[0][1][0].upper()
        if first_letter == "J":
            return wordnet.ADJ
        if first_letter == "V":
            return wordnet.VERB
        if first_letter == "R":
            return wordnet.ADV
        # "N" and every other tag fall back to noun.
        return wordnet.NOUN

    def normalize_title(self, title):
        """Return the normalized form of ``title``; empty or non-string input gives ""."""
        if not (title and isinstance(title, str)):
            return ""

        # Split CamelCase/PascalCase boundaries ("PDFReader" -> "PDF Reader"),
        # lowercase, then drop possessive 's.
        cleaned = self.camel_case_splitter.sub(r' ', title).lower().replace("'s", "")

        # Strip punctuation; hyphens and slashes survive.
        cleaned = self.punctuation_remover.sub('', cleaned)

        # Lemmatize token by token, each with its own POS tag.
        lemmas = []
        for token in word_tokenize(cleaned):
            if not token.strip():
                continue
            lemmas.append(self.lemmatizer.lemmatize(token, self._get_wordnet_pos(token)))

        # Re-join and collapse any whitespace runs to single spaces.
        return self.multiple_space_reducer.sub(' ', ' '.join(lemmas)).strip()

In [13]:
def capitalize_title(title):
    """Title-case ``title``: the first word is always capitalized, later words are
    capitalized unless they are one of the listed articles/conjunctions/prepositions.

    Words are re-joined with single spaces; a title without any words is returned
    unchanged.
    """
    minor_words = frozenset((
        'a', 'an', 'the', 'and', 'but', 'or', 'for', 'nor', 'on', 'at',
        'to', 'from', 'by', 'with', 'in', 'of',
    ))
    tokens = title.split()
    if len(tokens) == 0:
        return title
    return ' '.join(
        token if position > 0 and token in minor_words else token.capitalize()
        for position, token in enumerate(tokens)
    )

In [14]:
examiner = ExaminerAgent()

In [None]:
# Normalize every kept cluster's annotated title and collect one
# (title, description) record per distinct normalized title.
init_input = []
init_input_with_gid = []
seen_titles = set()
cluster2normalized_map = {}
original2normalized_map = {}
for k, v in results_keep.items():
    # An annotation is either a list (its first entry is used) or a single dict.
    if isinstance(v['annotation'], list):
        entry = v['annotation'][0]
    elif isinstance(v['annotation'], dict):
        entry = v['annotation']
    else:
        raise ValueError(f"Unexpected annotation type: {type(v['annotation'])}")

    original_title = entry['occupation_title']
    title = capitalize_title(examiner.normalize_title(original_title))
    original2normalized_map[original_title] = title
    description = entry['occupation_description']

    cluster2normalized_map[k] = title
    # Only the first cluster seen for a normalized title contributes a record.
    if title not in seen_titles:
        init_input.append({'title':title, 'description':description})
        init_input_with_gid.append({'title':title, 'description':description, 'gid':k})
        seen_titles.add(title)
len(seen_titles), len(original2normalized_map), len(cluster2normalized_map), len(init_input)

In [None]:
all_titles = []
for i, d in enumerate(init_input):
    title = d['title']
    all_titles.append(title)
for i in sorted(all_titles):
    print(i)


In [39]:
# Context managers guarantee each file is flushed and closed after writing.
with open('palestine_original2normalized_map.pkl', 'wb') as f:
    pickle.dump(original2normalized_map, f)
with open('palestine_cluster2normalized_map.pkl', 'wb') as f:
    pickle.dump(cluster2normalized_map, f)

In [None]:
df_original_items = pd.DataFrame(init_input)
df_original_items.head()

## Group the cleaned nodes

In [5]:
import pickle
import numpy as np
from glob import glob
from pathlib import Path

import pandas as pd
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

from sklearn.cluster import AffinityPropagation
from sklearn.metrics.pairwise import cosine_similarity as sk_cosine_similarity
import spacy

In [21]:
# embed the description
def get_embeddings(text, model, tokenizer, device='cuda:1', max_length=4096):
    """Embed ``text`` by mean-pooling the model's last hidden state.

    Padding positions are excluded from the mean (via the attention mask), so a
    text gets the same embedding whether it is embedded alone or in a padded batch.

    Args:
        text: A string or a list of strings.
        model: Pre-loaded model whose output exposes ``last_hidden_state``.
        tokenizer: Pre-loaded tokenizer matching ``model``.
        device: Device the inputs are moved to; must match the model's device.
        max_length: Truncation length in tokens.

    Returns:
        numpy array with one pooled embedding row per input text.
    """
    import torch  # local import: this section of the notebook does not import torch

    inputs = tokenizer(text, padding=True, truncation=True, return_tensors='pt', max_length=max_length)
    # Move inputs to the same device as the model
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Inference only: skip autograd bookkeeping to save memory.
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    attention_mask = inputs.get('attention_mask')
    if attention_mask is None:
        pooled = hidden.mean(dim=1)
    else:
        # Accumulate in float32 so half-precision sums cannot overflow, then
        # cast back so the returned dtype matches the model output.
        mask = attention_mask.unsqueeze(-1).to(torch.float32)
        summed = (hidden.to(torch.float32) * mask).sum(dim=1)
        pooled = (summed / mask.sum(dim=1).clamp(min=1.0)).to(hidden.dtype)

    # Move output back to CPU for numpy conversion
    return pooled.detach().cpu().numpy()

In [None]:
# Embed the normalized occupation titles, their descriptions, and "title\ndescription".
model_name = 'Qwen/Qwen3-Embedding-8B'

# NOTE(review): no torch_dtype is passed here, unlike load_model_with_fallback
# earlier in this notebook (float16) — confirm the default precision is intended.
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = model.to('cuda:1')

# batch_size only chunks the DataFrame for the progress bar; get_embeddings is
# still called once per text inside each chunk.
batch_size = 32
title_embeddings = []
description_embeddings = []
full_text_embeddings = []
for i in tqdm(range(0, len(df_original_items), batch_size)):
    batch = df_original_items.iloc[i:i+batch_size]
                            
    batch_titles = batch['title'].tolist()
    batch_embeddings_title = [get_embeddings(text, model, tokenizer) for text in batch_titles]
    title_embeddings.extend(batch_embeddings_title)

    batch_descriptions = batch['description'].tolist()
    batch_embeddings_description = [get_embeddings(text, model, tokenizer) for text in batch_descriptions]
    description_embeddings.extend(batch_embeddings_description)

    batch_full_texts = [title + '\n' + description for title, description in zip(batch['title'], batch['description'])]
    batch_embeddings_full_texts = [get_embeddings(text, model, tokenizer) for text in batch_full_texts]
    full_text_embeddings.extend(batch_embeddings_full_texts)
# One embedding array per row, stored as object columns.
df_original_items['title_embeddings'] = title_embeddings
df_original_items['description_embeddings'] = description_embeddings
df_original_items['full_text_embeddings'] = full_text_embeddings
df_original_items.head()

In [None]:
original_titles = df_original_items['title'].tolist()
description_embeddings = np.vstack(df_original_items['description_embeddings'].values)
title_embeddings = np.vstack(df_original_items['title_embeddings'].values)

# Each item is represented by its description and title embeddings side by side.
embeddings = np.concatenate([description_embeddings, title_embeddings], axis=1)
print(f"Calculating similarity matrix for Affinity Propagation...")
similarity_matrix = sk_cosine_similarity(embeddings)

# --- Clustering with Affinity Propagation ---
PREFERENCE = 0.95
DAMPING = 0.7  # kept for reference only; it is not passed to the model below
RANDOM_STATE = 42 # For reproducibility

ap_model = AffinityPropagation(
    preference=PREFERENCE,
    affinity='precomputed',
    random_state=RANDOM_STATE
)
ap_model.fit(similarity_matrix)

cluster_labels_for_valid = ap_model.labels_
exemplar_indices_in_valid = ap_model.cluster_centers_indices_ # row index of each cluster's exemplar

n_clusters_ = len(exemplar_indices_in_valid)
print(f'Estimated number of clusters from Affinity Propagation: {n_clusters_}')

# cluster label -> canonical title, i.e. the title of that cluster's exemplar.
# The label of an exemplar's cluster is read off ap_model.labels_ at the exemplar's row.
canonical_titles_map = {
    ap_model.labels_[exemplar_row]: original_titles[exemplar_row]
    for exemplar_row in exemplar_indices_in_valid
}

# Map cluster labels back to all original items
all_cluster_labels = np.array(cluster_labels_for_valid, dtype=int)
final_canonical_titles = []
for own_title, original_item_cluster_label in zip(original_titles, cluster_labels_for_valid):
    if original_item_cluster_label not in canonical_titles_map:
        # Fallback for a label without an exemplar: the item keeps its own title.
        print(f"Cluster label {original_item_cluster_label} not found in canonical_titles_map")
        final_canonical_titles.append(own_title)
    else:
        final_canonical_titles.append(canonical_titles_map[original_item_cluster_label])






In [None]:
from collections import defaultdict

# Group every normalized title under the canonical title chosen for its cluster.
canonical2original_map = defaultdict(list)
for canonical_title, member_title in zip(final_canonical_titles, original_titles):
    canonical2original_map[canonical_title].append(member_title)

# Show only the groups that actually merged more than one title.
separator = "-"*100
for canonical_title, members in canonical2original_map.items():
    if len(members) <= 1:
        continue
    print(canonical_title)
    print(members)
    print(separator)

In [31]:
# The context manager guarantees the file is flushed and closed after writing.
with open('palestine_canonical2normalized_map.pkl', 'wb') as f:
    pickle.dump(canonical2original_map, f)

In [33]:
# original2canonical_map: invert canonical2original_map (normalized title -> canonical title)
original2canonical_map = {
    member: canonical
    for canonical, members in canonical2original_map.items()
    for member in members
}
# The context manager guarantees the file is flushed and closed after writing.
with open('palestine_normalized2canonical_map.pkl', 'wb') as f:
    pickle.dump(original2canonical_map, f)



In [36]:
# map the original titles to the canonical titles
df_original_items['canonical_title'] = final_canonical_titles
# save the dataframe
df_original_items.to_pickle('palestine_init_input_test_full_with_canonical_title_df.pkl')


In [35]:
# Cluster key -> canonical title, going through the cluster's normalized title.
cluster2canonical_map = {
    gid: original2canonical_map[normalized_title]
    for gid, normalized_title in cluster2normalized_map.items()
}
# The context manager guarantees the file is flushed and closed after writing.
with open('palestine_cluster2canonical_map.pkl', 'wb') as f:
    pickle.dump(cluster2canonical_map, f)


In [None]:
# input format
# One {'title', 'description'} record per canonical title; the description is the
# one recorded for that title in init_input.
title2description = {item['title']: item['description'] for item in init_input}
new_input = [
    {'title': canonical_title, 'description': title2description[canonical_title]}
    for canonical_title in canonical2original_map
]
len(new_input)

In [38]:
# The context manager guarantees the file is flushed and closed after writing.
with open('palestine_init_input_test_full_cannonical.json', 'w') as f:
    json.dump(new_input, f)

# 6. Build taxonomy

`input: palestine_init_input_test_full_cannonical.json -> tree_multiagent.py -> output: full_generated_taxonomy_with_desc.json, level*_cannonical_parents_with_desc.json`

# 7. convert to tree

In [None]:
# Read the generated taxonomy and parse it into Python objects.
taxonomy_path = Path('../tmp/palestine_taxonomy_output/full_generated_taxonomy_with_desc.json')
json_string = taxonomy_path.read_text()
taxonomy_data_loaded = json.loads(json_string)

In [42]:
def convert_taxonomy_to_tree_format(taxonomy_data_loaded):
    """
    Convert taxonomy data from the level-keyed format to a nested tree.
    Uses the title as the node name.

    Each node is attached to the first item, searching upward from the nearest
    level, whose 'kids' list contains the node's title. Nodes on the top level,
    and nodes for which no parent is found, are attached to the root.

    Args:
        taxonomy_data_loaded: Dictionary with level numbers (as strings) as keys.
            Levels >= 1 hold lists of items with a 'title' and optionally 'kids';
            level '0' holds a dict whose 'Leaf Items' entry lists the leaf items.

    Returns:
        Dictionary representing the root node of the tree. Every node has the
        keys 'parent', 'name', 'edge_name' and 'children'. Empty input yields a
        root without children.
    """
    # Nodes are keyed by (level, title). Keying by title alone lets an item that
    # shares its name with an item on another level overwrite it, after which
    # children would be attached to the wrong node.
    nodes_by_level_title = {}

    # default=0 keeps an empty taxonomy from raising inside max().
    max_level = max((int(k) for k in taxonomy_data_loaded.keys()), default=0)

    root_node = {
        "parent": "null",
        "name": "Root",  # You can customize this root name
        "edge_name": "null",
        "children": []
    }

    def _find_parent_node(title, candidate_levels):
        """Return the node of the first item in candidate_levels listing title as a kid, else None."""
        for candidate_level in candidate_levels:
            for parent_item in taxonomy_data_loaded.get(str(candidate_level), []):
                if title in (parent_item.get('kids') or ()):
                    parent_node = nodes_by_level_title.get((candidate_level, parent_item['title']))
                    if parent_node is not None:
                        return parent_node
        return None

    def _attach(node, parent_node):
        """Link node under parent_node, or under the root when parent_node is None."""
        if parent_node is None:
            parent_node = root_node
        node["parent"] = parent_node["name"]
        parent_node["children"].append(node)

    # Inner levels, top-down, so every potential parent exists before its children.
    for level in range(max_level, 0, -1):
        for item in taxonomy_data_loaded.get(str(level), []):
            title = item['title']
            node = {
                "parent": None,  # set by _attach
                "name": title,
                "edge_name": title,
                "children": []
            }
            parent_node = None
            if level != max_level:
                parent_node = _find_parent_node(title, range(level + 1, max_level + 1))
            _attach(node, parent_node)
            nodes_by_level_title[(level, title)] = node

    # Level 0 (leaf nodes), if available; parents are searched from level 1 upward.
    if '0' in taxonomy_data_loaded:
        for item in taxonomy_data_loaded['0']['Leaf Items']:
            leaf_node = {
                "parent": None,
                "name": item['title'],
                "edge_name": "",
                "children": []
            }
            _attach(leaf_node, _find_parent_node(item['title'], range(1, max_level + 1)))

    return root_node


In [43]:
# Convert the level-keyed taxonomy into a nested tree rooted at "Root".
result = convert_taxonomy_to_tree_format(taxonomy_data_loaded)
# Write the output tree to a JSON file (the with-block closes the file afterwards).
with open("../docs/palestine_Gemini_generated_tree_91.json", "w") as outfile:
    json.dump(result, outfile)