In this preliminary section, we cluster the papers with HDBSCAN and then reassign the noise points to clusters using a kNN classifier.

# Section 1: HDBSCAN-KNN for clustering

In [None]:
import pandas as pd
import numpy as np
import hdbscan, re, gc
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from google.colab import drive

In [None]:
# --- CONFIGURATION ---
# Input paths live on the mounted Google Drive; both inputs are joined on 'EID'.
COORD_FILE = '/content/drive/MyDrive/100-5D_UMAP_Coordinates.pkl'  # read with pd.read_pickle
MAPPING_FILE = '/content/drive/MyDrive/research_map_cooordinates_for_mapping_africa.csv' # File containing umap_2d_x, umap_2d_y
TARGET_DIM = 'coords_30d'  # column whose per-paper vectors are stacked into the clustering matrix
MIN_CLUSTER_SIZE = 10  # minimum cluster size intended for HDBSCAN
SEED = 42  # NOTE(review): no visible code in this notebook reads SEED — confirm it is still needed

In [None]:
# Mount Google Drive (Colab) so the input files under /content/drive can be read.
drive.mount('/content/drive')

In [None]:
# Load both inputs and keep only the papers present in each (inner join on EID).
df_map = pd.read_csv(MAPPING_FILE)
df_coords = pd.read_pickle(COORD_FILE)
df = df_coords.merge(df_map, on='EID')

# Stack the per-paper coordinate vectors into one float32 matrix for HDBSCAN/kNN.
X_30d = np.vstack(df[TARGET_DIM].to_numpy()).astype(np.float32)

# Expose the 2D map coordinates under the shorter column names.
df = df.assign(umapX=df['umap_2d_x'], umapY=df['umap_2d_y'])

In [None]:
# Run HDBSCAN on the high-dimensional coordinates; label -1 marks noise.
# The minimum cluster size comes from the configuration cell instead of a
# duplicated literal, so changing MIN_CLUSTER_SIZE actually takes effect.
clusterer = hdbscan.HDBSCAN(min_cluster_size=MIN_CLUSTER_SIZE, gen_min_span_tree=True)
labels = clusterer.fit_predict(X_30d)
df['original_label'] = labels

print("\nInitial Cluster Summary (Raw HDBSCAN Output):")
cluster_counts = df['original_label'].value_counts().sort_index()

print(f"{'Cluster ID':<15} | {'Member Count':<15} | {'Sample EIDs'}")
print("-" * 65)

# Group once rather than re-filtering the whole frame for every cluster.
sample_eids_by_label = {
    label: eids.head(5).tolist()
    for label, eids in df.groupby('original_label')['EID']
}

for cid, count in cluster_counts.items():
    # str() keeps the join working even if EIDs were parsed as numbers.
    sample_str = ", ".join(map(str, sample_eids_by_label[cid]))
    label_name = "Noise (-1)" if cid == -1 else f"Cluster {cid}"
    print(f"{label_name:<15} | {count:<15} | {sample_str}...")

noise_count = cluster_counts.get(-1, 0)
core_cluster_count = int((cluster_counts.index != -1).sum())

print("-" * 65)
print(f"Total Papers: {len(df)}")
print(f"Initial Noise (-1) Count: {noise_count} ({noise_count/len(df):.1%})")
print(f"Number of Core Clusters: {core_cluster_count}")

# State shared with the refinement cell: masks over the papers, a working copy
# of the labels, and a per-paper "spectrum" string (empty for noise for now).
core_mask = (labels != -1)
noise_mask = (labels == -1)
final_labels = labels.copy()
spectrum = [f"Clust{l}(100%)" if l != -1 else "" for l in labels]

In [None]:
# Reassign HDBSCAN noise points to the nearest core clusters and record, for
# each of them, the (up to) three most likely clusters as a spectrum string.
if noise_mask.any() and core_mask.any():
    print("\n -- Refining Noise points using kNN -- ")

    # A kNN query cannot ask for more neighbours than there are labelled
    # (core) points, so cap the neighbourhood size for very small data sets.
    n_neighbors = min(15, int(core_mask.sum()))
    knn = KNeighborsClassifier(n_neighbors=n_neighbors, weights='distance')
    knn.fit(X_30d[core_mask], labels[core_mask])

    probs = knn.predict_proba(X_30d[noise_mask])
    classes = knn.classes_

    # Take the label from the probabilities already computed instead of running
    # a second neighbour search through knn.predict(); this also keeps the
    # assigned label tied to the same numbers shown in the spectrum.
    preds = classes[probs.argmax(axis=1)]
    final_labels[noise_mask] = preds

    noise_indices = np.where(noise_mask)[0]
    for row_idx, p in zip(noise_indices, probs):
        top = p.argsort()[-3:][::-1] # select top 3
        # Drop candidates below 5% so the spectrum only lists meaningful clusters.
        txt = [f"Clust{classes[idx]}({p[idx]*100:.0f}%)" for idx in top if p[idx] >= 0.05]
        spectrum[row_idx] = ", ".join(txt)

df['cluster_label'], df['cluster_spectrum'] = final_labels, spectrum

print(" ---> Spectrum Analysis Complete")

In [None]:
# Density Calculation & Elbow Detection

# Density of a paper = sum of inverse distances to its neighbours.
# 16 neighbours are requested and the first column is discarded; since the
# query set equals the fitted set, that column is presumably the paper itself.
nn = NearestNeighbors(n_neighbors=16)
nn.fit(X_30d)
dists = nn.kneighbors(X_30d)[0]
inverse_dists = 1.0 / (dists[:, 1:] + 1e-5)  # epsilon avoids division by zero
density = inverse_dists.sum(axis=1)
df['density_score'] = density

def find_knee_y(values):
    """Return the index of the knee (elbow) of a 1-D curve.

    The curve is the sequence of points ``(i, values[i])``. The knee is the
    point with the largest perpendicular distance from the straight line
    joining the first and the last point.

    Args:
        values: 1-D sequence of numbers (list or NumPy array).

    Returns:
        Integer index into ``values``. Curves with fewer than three points
        have no interior point, so 0 is returned for them.

    Raises:
        ValueError: If ``values`` is empty.
    """
    y = np.asarray(values, dtype=float)
    n = y.size
    if n == 0:
        raise ValueError("find_knee_y() requires at least one value")
    if n < 3:
        # With one point the chord has zero length (division by zero below);
        # with two points every point lies on the chord. Neither has a knee.
        return 0

    x = np.arange(n, dtype=float)
    chord = np.array([x[-1] - x[0], y[-1] - y[0]])
    chord /= np.linalg.norm(chord)  # non-zero: the x-extent is n - 1 >= 2

    # Express every point relative to the first one, remove its component
    # along the chord, and measure what is left (the perpendicular offset).
    pts = np.column_stack((x - x[0], y - y[0]))
    proj = np.outer(pts @ chord, chord)
    dist = np.linalg.norm(pts - proj, axis=1)
    return int(np.argmax(dist))

# Every paper starts as 'core'. The default is assigned outside the branch so
# the column exists even when HDBSCAN produced no noise points; code that reads
# df['density_class'] afterwards would otherwise fail with a KeyError.
df['density_class'] = 'core'

if noise_mask.any():
    noise_vals = df.loc[noise_mask, 'density_score'].values
    sorted_den = np.sort(noise_vals)[::-1]  # densest noise point first
    log_den = np.log(sorted_den + 1e-9)  # epsilon guards against log(0)

    # Detect elbows for classification: the knee of the whole curve, then the
    # knee of the tail that starts at that first knee.
    idx1 = find_knee_y(log_den)
    idx2 = idx1 + find_knee_y(log_den[idx1:])
    t1, t2 = np.exp(log_den[idx1]), np.exp(log_den[idx2])

    plt.figure(figsize=(8, 4))
    plt.plot(np.arange(len(log_den)), log_den, label='log(density)')
    plt.axhline(np.log(t1), color='orange', ls='--', label=f'T1: {t1:.2f}')
    plt.axhline(np.log(t2), color='red', ls='--', label=f'T2: {t2:.2f}')
    plt.title("Elbow Cutoff (30D Density)")
    plt.legend(); plt.show()

    # Only noise points are downgraded. The curve is sorted in descending
    # order and idx2 >= idx1, so t2 <= t1: the 'outlier' assignment must run
    # second to override 'intermediate' for the sparsest points.
    df.loc[noise_mask & (df['density_score'] <= t1), 'density_class'] = 'intermediate'
    df.loc[noise_mask & (df['density_score'] <= t2), 'density_class'] = 'outlier'

In [None]:
def classify_spectrum(txt):
    """Turn a cluster-spectrum string into a categorical label.

    The input is text such as ``"Clust3(60%), Clust7(30%)"``; anything that
    contains no ``ClustN(P%)`` entry yields ``"UNKNOWN"``.

    Rules, applied to the entries ranked by percentage (highest first):
      * one entry, or top share >= 85  -> ``SINGLE_<top>``
      * top share >= 70                -> ``<top>_DOM_<second>``
      * top and second within 15 pts   -> ``<top>_EQ_<second>``
      * otherwise                      -> ``<top>_GT_<second>``
    """
    ranked = [
        (match.group(1), int(match.group(2)))
        for match in re.finditer(r'(Clust\d+)\((\d+)%\)', str(txt))
    ]
    if not ranked:
        return "UNKNOWN"

    # Stable sort: entries with equal percentages keep their textual order.
    ranked.sort(key=lambda entry: entry[1], reverse=True)

    top_name, top_pct = ranked[0]
    if len(ranked) == 1 or top_pct >= 85:
        return f"SINGLE_{top_name}"

    second_name, second_pct = ranked[1]
    if top_pct >= 70:
        relation = "DOM"
    elif abs(top_pct - second_pct) <= 15:
        relation = "EQ"
    else:
        relation = "GT"
    return f"{top_name}_{relation}_{second_name}"

In [None]:
# Label every paper from its spectrum string, then let the density-based
# outlier flag take precedence over that label.
df['spectrum_class'] = df['cluster_spectrum'].map(classify_spectrum)
is_outlier = df['density_class'].eq('outlier')
df.loc[is_outlier, 'spectrum_class'] = 'OUTLIER'

# Export (the index is not written)
df.to_csv("Final_Research_Map_Data.csv", index=False)

# Section 2: Cluster Name Assignment

## 0_Lib install

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Clustering result, one row per paper. Later cells read 'EID', '5L_Layer_5_Final',
# the 'umap_3d_*' coordinates and 'density_score' from this table.
df_import = pd.read_csv('/content/MUSearch/Clustering/research_map_inheritance_logic.csv') #result from clustering

## 1_Merge Title Abstract Year with EID


In [None]:
files = [
    "/content/MUSearch/Cleaned Data/2021_cleaned_data.csv", #paper database files from paper filtering process
    "/content/MUSearch/Cleaned Data/2022_cleaned_data.csv",
    "/content/MUSearch/Cleaned Data/2023_cleaned_data.csv",
    "/content/MUSearch/Cleaned Data/2024_cleaned_data.csv",
    "/content/MUSearch/Cleaned Data/2025_cleaned_data.csv"
]

# Read each yearly file. A comprehension keeps the loop variable out of the
# notebook namespace, so it no longer overwrites the clustering DataFrame `df`
# built in Section 1.
df_list = [pd.read_csv(path) for path in files]

# Stack the yearly tables into one frame with a fresh 0..n-1 index.
merged_cleaned_df = pd.concat(df_list, ignore_index=True)

In [None]:
# (rows, columns) of the combined yearly table.
merged_cleaned_df.shape

(21840, 46)

In [None]:
# EID is the join key of the merge that follows; a non-zero count here would
# mean that merge can multiply rows.
merged_cleaned_df['EID'].duplicated().sum()

np.int64(0)

In [None]:
# Attach Title, Year, Abstract and Authors to df_import by EID.
# how='left' keeps every df_import row; EIDs that are missing from
# merged_cleaned_df get NaN in the four new columns.
df_import = df_import.merge(merged_cleaned_df[['EID', 'Title', 'Year', 'Abstract','Authors']], on='EID', how='left')

## 2_Centroid calculation and LLM Implementation

The previous Euclidean-based clustering (not shown in this notebook) failed for groups with hollow centroids. This cluster-naming method introduces **Density-Weighted Centroid Ranking** to resolve these structural inconsistencies.

----------
Process:

1. In layer 5, summarize the 20 most representative papers of each cluster (ranked by density and distance to the centroid) to generate a cluster title and a brief cluster description.

2. For each higher-level layer, generate the cluster name and description from the names and descriptions of its child clusters in the layer below.

In [None]:
import openai
import json
import time
import pandas as pd
from tqdm import tqdm

import os

# Work on a copy so df_import keeps the original cluster IDs.
df_named_m2 = df_import.copy()

# Prefer the OPENAI_API_KEY environment variable so the secret does not have to
# be pasted into (and saved with) the notebook; the placeholder is only used
# when the variable is not set.
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "**input your api key"))


In [None]:
#centroid paper weighting and listing
from scipy.spatial.distance import cdist

def _min_max_scale(values):
    """Scale a float array to [0, 1]; NaNs are ignored when finding the range.

    Returns all zeros when the range is empty (constant or all-NaN input).
    """
    finite = values[~np.isnan(values)]
    if finite.size == 0:
        return np.zeros_like(values)
    lo, hi = finite.min(), finite.max()
    if hi > lo:
        return (values - lo) / (hi - lo)
    return np.zeros_like(values)


def get_best_representative_papers(group, top_n=20):
    """Return the EIDs of the most representative papers of one cluster.

    Each paper gets a score that adds two min-max normalised terms with equal
    weight: closeness to the cluster centroid in the 3D UMAP space and the
    paper's ``density_score``. Higher is better.

    Args:
        group: DataFrame with the columns 'EID', 'umap_3d_x', 'umap_3d_y',
            'umap_3d_z' and 'density_score'. It is not modified.
        top_n: Maximum number of EIDs to return.

    Returns:
        List of EIDs, best score first, at most ``top_n`` long.
    """
    if len(group) == 0:
        return []

    coords = group[['umap_3d_x', 'umap_3d_y', 'umap_3d_z']].to_numpy(dtype=float)
    centroid = coords.mean(axis=0).reshape(1, -1)
    distances = cdist(coords, centroid, metric='euclidean').ravel()

    # Small distance is good, so the distance term is (1 - normalised distance).
    closeness = 1.0 - _min_max_scale(distances)

    # High density is good.
    density = _min_max_scale(group['density_score'].to_numpy(dtype=float))

    # Equal weight for distance and density (additive model). The score goes
    # into a copy so the caller's frame does not gain a 'rep_score' column.
    scored = group.assign(rep_score=closeness + density)

    return scored.sort_values('rep_score', ascending=False).head(top_n)['EID'].tolist()



# One row per Layer-5 cluster with the EIDs of its most representative papers.
# Iterating the groups directly yields the same two-column table as
# groupby(...).apply(...) with a Series-returning lambda, without depending on
# how apply() treats the grouping column (presumably the source of the warning
# that was captured together with this cell's output).
cluster_summary_m2 = pd.DataFrame(
    [
        {
            '5L_Layer_5_Final': cluster_id,
            'top_20_EIDs': get_best_representative_papers(members),
        }
        for cluster_id, members in df_named_m2.groupby('5L_Layer_5_Final')
    ],
    columns=['5L_Layer_5_Final', 'top_20_EIDs'],
)

print(f"Calculated centroids for {len(cluster_summary_m2)} clusters.")

Calculated centroids for 655 clusters.


  cluster_summary_m2 = df_named_m2.groupby('5L_Layer_5_Final').apply(


In [None]:
# run layer 5 topic name and description

def get_cluster_details(abstracts):
    """Ask the LLM for a name and a one-sentence description of a cluster.

    Args:
        abstracts: List of abstract strings; only the first 20 are sent.

    Returns:
        A dict that always contains the keys 'name' and 'description'. When
        the request or the JSON parsing fails, a placeholder dict is returned
        and the reason is printed, so the run continues but failures are
        visible.
    """
    fallback = {"name": "Unknown Cluster", "description": "Description could not be generated."}

    context = "\n---\n".join(abstracts[:20])
    prompt = f"""
    Analyze the following research paper abstracts from a single cluster:
    {context}

    Task:
    1. Provide a technical,descriptive, academic name for this cluster. (not more than 9 words) that represent the scope of the cluster.
    2. Provide a 1-sentence short description summarizing the core niche or methodology.

    Return ONLY a JSON object: {{"name": "...", "description": "..."}}
    """
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            response_format={ "type": "json_object" },
            temperature=0.3
        )
        parsed = json.loads(response.choices[0].message.content)
    except Exception as e:
        # Deliberately best-effort: one failed request must not abort a long
        # naming run, but it should not go unnoticed either.
        print(f"[get_cluster_details] using placeholder name: {type(e).__name__}: {e}")
        return dict(fallback)

    if not isinstance(parsed, dict):
        print("[get_cluster_details] using placeholder name: response is not a JSON object")
        return dict(fallback)

    # Callers index the result by these two keys, so make sure both exist.
    parsed.setdefault("name", fallback["name"])
    parsed.setdefault("description", fallback["description"])
    return parsed

naming_results = []

layer5_clusters = zip(cluster_summary_m2['5L_Layer_5_Final'], cluster_summary_m2['top_20_EIDs'])
for c_id, eids in tqdm(layer5_clusters, total=len(cluster_summary_m2), desc="Generating Names & Descriptions"):
    abstract_list = df_import.loc[df_import['EID'].isin(eids), 'Abstract'].dropna().tolist()

    # Nothing to summarise: skip the cluster. No request was sent, so there is
    # no need to wait for the rate limit either.
    if not abstract_list:
        continue

    meta = get_cluster_details(abstract_list)
    naming_results.append({
        'ID': c_id,
        # .get() keeps the loop alive if the model's JSON lacks a key.
        'New_Name': meta.get('name', 'Unknown Cluster'),
        'Desc': meta.get('description', 'Description could not be generated.'),
    })
    time.sleep(1.0)  # throttle consecutive API calls


# Explicit columns keep the lookups below valid even when no cluster was named.
results_df = pd.DataFrame(naming_results, columns=['ID', 'New_Name', 'Desc'])
name_map = results_df.set_index('ID')['New_Name'].to_dict()
desc_map = results_df.set_index('ID')['Desc'].to_dict()

# Map the description first: it is looked up by cluster ID, and the next line
# replaces the IDs in '5L_Layer_5_Final' with names (unnamed clusters keep
# their ID through fillna).
df_named_m2['5L_Layer_5_Description'] = df_named_m2['5L_Layer_5_Final'].map(desc_map)
df_named_m2['5L_Layer_5_Final'] = df_named_m2['5L_Layer_5_Final'].map(name_map).fillna(df_named_m2['5L_Layer_5_Final'])

print(f"\n ---- Successfully updated df_named_m2 with names and descriptions for {len(results_df)} clusters. ------- ")

In [None]:
# Give the description column the same '5L_Layer_5_Final' prefix as the name column it describes.
df_named_m2.rename(columns={'5L_Layer_5_Description': '5L_Layer_5_Final_Description'}, inplace=True)

In [None]:
# Persist the Layer-5 names and descriptions to CSV (the index is not written).
df_named_m2.to_csv('/content/MUSearch/Cluster_named_MU_research_m2.csv', index=False)

## 3_Assign name for layer 1-4 using title and description from previous layer

In [None]:
# Start again from df_import (not df_named_m2), whose layer columns still hold
# the cluster IDs; the following cells rename them to '*_id'.
df_named_m3 = df_import.copy()

In [None]:
# Re-load the Layer-5 names and descriptions from the location the previous
# step saves them to (under /content/MUSearch/). The path used before,
# '/content/Cluster_named_MU_research_m2.csv', is not written anywhere in this
# notebook, so a fresh top-to-bottom run could not find the file.
df_model2_results = pd.read_csv('/content/MUSearch/Cluster_named_MU_research_m2.csv')

In [None]:
# The original layer columns hold cluster IDs. Suffix them with '_id' in one
# pass so the plain layer names are free for the generated titles.
df_named_m3.rename(
    columns={
        '5L_Layer_5_Final': '5L_Layer_5_Final_id',
        '5L_Layer_4': '5L_Layer_4_id',
        '5L_Layer_3': '5L_Layer_3_id',
        '5L_Layer_2': '5L_Layer_2_id',
        '5L_Layer_1': '5L_Layer_1_id',
    },
    inplace=True,
)

In [None]:
# Bring the generated Layer-5 name and description back in, matched per paper.
# A left join keeps every row of df_named_m3.
layer5_named = df_model2_results[['EID', '5L_Layer_5_Final', '5L_Layer_5_Final_Description']]
df_named_m3 = pd.merge(df_named_m3, layer5_named, how='left', on='EID')

In [None]:
#now, we already have summary from layer 5 --> process 4-3-2-1 in order.

def get_hybrid_hierarchical_metadata(child_info_text, layer_level):
    """Ask the LLM for an umbrella name and description of a parent cluster.

    Args:
        child_info_text: Text listing the child clusters, one
            "- name: description" entry per line.
        layer_level: Label of the layer being named; it is inserted into the
            prompt and into the placeholder name.

    Returns:
        A dict that always contains the keys 'name' and 'description'. When
        the request or the JSON parsing fails, a generic placeholder is
        returned and the reason is printed, so the run continues but failures
        are visible.
    """
    import json

    fallback = {"name": f"General {layer_level} Category", "description": "Combined research area."}

    prompt = f"""
    You are a Senior Technical Editor and Taxonomy Specialist. Your task is to organize a research
    hierarchy at the {layer_level} level.

    SUB-TOPIC DATA (CHILDREN):
    {child_info_text}

    TASK:
    1. Provide a high-precision academic umbrella name (Max 7 words).
       - Use '&' to join primary themes (e.g., 'Vision & 3D').
       - Use '( )' for high-signal sub-niches (e.g., 'LLM Evals (RAG, Agents)').
       - Use ':' for deep sub-specializations.
    2. Provide a 1-2 sentence description summarizing the collective scope,
       focusing on methodology or innovative discovery keywords.

    NAMING RULES:
    - PATTERN: Use 'Technique: Application A & Application B'.
    - NO FILLER: Absolutely no "Advances in", "Research on", "Methods", "Study of".
    - BE SPECIFIC: Use technical anchors (e.g., 'Preference Alignment', 'Inverse Problems').
    - DEDUPLICATION: Ensure the name is unique to this specific group of children.

    Return ONLY a JSON object: {{"name": "...", "description": "..."}}
    """
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            response_format={ "type": "json_object" },
            temperature=0.3
        )
        parsed = json.loads(response.choices[0].message.content)
    except Exception as e:
        # Deliberately best-effort: one failed request must not abort the
        # whole hierarchy pass, but it should not go unnoticed either.
        print(f"[get_hybrid_hierarchical_metadata] using placeholder for {layer_level}: {type(e).__name__}: {e}")
        return dict(fallback)

    if not isinstance(parsed, dict):
        print(f"[get_hybrid_hierarchical_metadata] using placeholder for {layer_level}: response is not a JSON object")
        return dict(fallback)

    # Callers index the result by these two keys, so make sure both exist.
    parsed.setdefault("name", fallback["name"])
    parsed.setdefault("description", fallback["description"])
    return parsed

# Format: (Target_ID_Col, Target_Name_Col, Source_Name_Col, Source_Desc_Col)
# Ordered bottom-up: each step reads the names and descriptions that the step
# before it has just written.
hierarchy_steps = [
    ('5L_Layer_4_id', '5L_Layer_4', '5L_Layer_5_Final', '5L_Layer_5_Final_Description'),
    ('5L_Layer_3_id', '5L_Layer_3', '5L_Layer_4', '5L_Layer_4_Description'),
    ('5L_Layer_2_id', '5L_Layer_2', '5L_Layer_3', '5L_Layer_3_Description'),
    ('5L_Layer_1_id', '5L_Layer_1', '5L_Layer_2', '5L_Layer_2_Description')
]

for target_id_col, target_name_col, source_name_col, source_desc_col in hierarchy_steps:
    print("\n" + "="*60)
    print(f"STEP: Naming & Describing {target_name_col} using {source_name_col}")
    print("="*60)

    parent_groups = df_named_m3.groupby(target_id_col)
    name_map = {}
    desc_map = {}

    for parent_id, group in tqdm(parent_groups, desc=f"Processing {target_name_col}"):
        if pd.isna(parent_id): continue

        # Rows without a child name would otherwise reach the prompt as
        # "- nan: nan"; drop them before collecting the distinct children.
        unique_children = (
            group.dropna(subset=[source_name_col])
                 .drop_duplicates(subset=[source_name_col])
        )
        if unique_children.empty:
            # No named child to summarise: leave this parent unnamed (NaN
            # after the map below) and do not spend an API call on it.
            continue

        child_context = [
            # A child may have a name but no description; list the name alone then.
            f"- {child_name}: {child_desc}" if pd.notna(child_desc) else f"- {child_name}"
            for child_name, child_desc in zip(
                unique_children[source_name_col], unique_children[source_desc_col]
            )
        ]
        child_info_text = "\n".join(child_context)

        meta = get_hybrid_hierarchical_metadata(child_info_text, target_name_col)
        # .get() keeps the loop alive if the model's JSON lacks a key.
        name_map[parent_id] = meta.get('name', f"General {target_name_col} Category")
        desc_map[parent_id] = meta.get('description', "Combined research area.")

        time.sleep(0.8)  # throttle consecutive API calls

    # Save name and description
    df_named_m3[f"{target_name_col}_Description"] = df_named_m3[target_id_col].map(desc_map)
    df_named_m3[target_name_col] = df_named_m3[target_id_col].map(name_map)

print("\n layer 1-4 Naming and Description complete")

In [None]:
# If a paper has no Layer-5 name, blank the generated name and description of
# every higher layer for that paper as well.

nan_mask = df_named_m3['5L_Layer_5_Final'].isna()

# Descriptions of all layers plus the generated names of layers 1-4.
# '5L_Layer_4' was missing from this list although layers 1-3 were present,
# which left a Layer-4 name on rows whose Layer-4 description had been blanked.
description_cols = [
    '5L_Layer_5_Final_Description',
    '5L_Layer_4_Description',
    '5L_Layer_3_Description',
    '5L_Layer_2_Description',
    '5L_Layer_1_Description',
    '5L_Layer_1',
    '5L_Layer_2',
    '5L_Layer_3',
    '5L_Layer_4',
]
df_named_m3.loc[nan_mask, description_cols] = np.nan

print("Data consistency check and NaN propagation for description columns complete.")

In [None]:
# NOTE(review): df_named_m3 was copied from df_import after 'Authors' had been
# merged into it, so this looks redundant. The assignment aligns on the row
# index — confirm both frames still share the same index before relying on it.
df_named_m3['Authors'] = df_import['Authors']

In [None]:
# Write the table with names and descriptions for all layers to CSV (the index is not written).
df_named_m3.to_csv('/content/MUSearch/Cluster_named_MU_research_m3.csv', index=False)