In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import OneHotEncoder
from hdbscan import HDBSCAN 
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import warnings


In [None]:

# Sentence-embedding models, instantiated once at module level so that every
# call to process_students reuses the same objects.
#   bert_base -> used for the 'domains' and 'projects' columns
#   all_mpnet -> used for the 'availability' and 'work_dist' columns
# NOTE(review): 'bert-base-nli-mean-tokens' is, as far as I recall, flagged as
# deprecated on its model card — confirm and consider using all_mpnet everywhere.
bert_base = SentenceTransformer('bert-base-nli-mean-tokens')
all_mpnet = SentenceTransformer('all-mpnet-base-v2')

# --------------------
# Feature Engineering
# --------------------
def process_anti_preferences(pref_str, all_ids):
    """Parse a raw anti-preference cell into a list of known student IDs.

    Every run of digits found in ``pref_str`` is read as a candidate ID;
    candidates that do not occur in ``all_ids`` are dropped.

    Args:
        pref_str: Raw cell value. May be a string, a number, or a missing
            value (``None`` / ``NaN``).
        all_ids: Iterable of valid IDs (list, ndarray, Series, ...).
            Membership is tested against the iterated *values*.

    Returns:
        list[int]: Matching IDs in order of appearance (duplicates are kept).
        An empty list when the cell is missing.
    """
    if pd.isna(pref_str):
        return []

    # A numeric cell read as a float (e.g. 12.0) would stringify to "12.0"
    # and be parsed as the two IDs 12 and 0; collapse integral floats first.
    if isinstance(pref_str, float) and float(pref_str).is_integer():
        pref_str = int(pref_str)

    # Build the lookup once: O(1) membership, and unlike ``x in series`` it
    # compares against values rather than index labels.
    valid_ids = set(all_ids)

    matches = []
    for token in re.findall(r'\d+', str(pref_str)):
        candidate = int(token)
        if candidate in valid_ids:
            matches.append(candidate)
    return matches

def create_anti_matrix(df, anti_col=7):
    """Build a symmetric 0/1 conflict matrix from the anti-preference column.

    Entry ``[i, j]`` is 1 when the student at row position ``i`` lists the
    student at row position ``j`` (or vice versa) as an anti-preference.

    Args:
        df: Student table. Must contain an ``'ID'`` column.
        anti_col: Column holding the raw anti-preference text, given either
            as an integer position or as a column name. Defaults to
            position 7, which is what this function has always read.
            NOTE(review): process_students displays column position 6 as
            "Anti-Preferences" — confirm which position is correct.

    Returns:
        numpy.ndarray: ``(len(df), len(df))`` float array of zeros and ones,
        symmetric, with a zero diagonal.
    """
    n = len(df)
    anti_matrix = np.zeros((n, n))

    ids = df['ID'].tolist()
    id_to_pos = {student_id: pos for pos, student_id in enumerate(ids)}

    # Explicit positional/label access instead of ``row[7]``, which relies on
    # the deprecated positional fallback of Series.__getitem__.
    if isinstance(anti_col, str):
        raw_prefs = df[anti_col]
    else:
        raw_prefs = df.iloc[:, anti_col]

    # Enumerate row *positions*: the DataFrame index labels yielded by
    # iterrows() are not valid matrix indices unless the index is 0..n-1.
    for pos, cell in enumerate(raw_prefs.tolist()):
        for anti_id in process_anti_preferences(cell, ids):
            other = id_to_pos.get(anti_id)
            if other is None or other == pos:
                # Unknown ID, or a student listing themselves: no conflict.
                continue
            anti_matrix[pos, other] = 1
            anti_matrix[other, pos] = 1  # conflicts are bidirectional
    return anti_matrix

In [None]:

# ---------------------
# Weighted Embedding
# ---------------------
def get_weighted_embeddings(texts, weight, model):
    """Embed a column of free text and scale the vectors by ``weight``.

    Args:
        texts: Iterable of cell values (pandas Series, list, ...). Missing
            values become the empty string; everything else is converted
            with ``str``.
        weight: Scalar multiplied into every embedding component.
        model: Object exposing ``encode(list_of_str)`` and returning one
            vector per input string.

    Returns:
        numpy.ndarray: The encoded vectors multiplied by ``weight``.
    """
    # Hand the encoder a plain list of str: a Series would be indexed by
    # label inside the encoder (wrong for a non 0..n-1 index), and numeric
    # cells are not valid tokenizer input.
    sentences = ['' if pd.isna(text) else str(text) for text in texts]
    embeddings = np.asarray(model.encode(sentences))
    return embeddings * weight

In [None]:
def _evict_conflicting_members(labels, ids, anti_cells):
    """Move students out of a cluster that contains someone who excluded them.

    Args:
        labels: 1-D integer array of cluster labels (-1 means noise).
        ids: 1-D array of student IDs, aligned with ``labels``.
        anti_cells: List of raw anti-preference cells, aligned with ``labels``.

    Returns:
        numpy.ndarray: Copy of ``labels`` where every evicted student is -1.
    """
    labels = np.array(labels, copy=True)
    for cluster in np.unique(labels):
        if cluster == -1:
            continue  # noise points are not grouped together
        member_pos = np.flatnonzero(labels == cluster)
        for pos in member_pos:
            if labels[pos] != cluster:
                continue  # already evicted earlier in this pass
            anti_ids = set(process_anti_preferences(anti_cells[pos], ids))
            if not anti_ids:
                continue
            # Only look at students who are *still* in the cluster, so a
            # mutual conflict evicts one side instead of both.
            conflict_pos = [
                other for other in member_pos
                if other != pos and labels[other] == cluster and ids[other] in anti_ids
            ]
            if conflict_pos:
                print(f"Adjusting cluster {cluster} due to conflicts between {ids[pos]} and {ids[conflict_pos].tolist()}")
                labels[conflict_pos] = -1
    return labels


def process_students(file_path, sheet_name='Sec 1', min_cluster_size=3, conflict_penalty=1000.0):
    """Cluster students into groups from a survey spreadsheet.

    The sheet must provide the columns ``'ID'``, ``'availability'``,
    ``'work_dist'``, ``'mentoring'``, ``'domains'`` and ``'projects'``.
    Text columns are embedded, scaled by a per-feature weight and stacked;
    HDBSCAN then clusters on a precomputed cosine-distance matrix to which a
    large penalty is added for every anti-preference pair.

    Args:
        file_path: Path to the Excel workbook.
        sheet_name: Sheet to read (default ``'Sec 1'``).
        min_cluster_size: Passed to HDBSCAN (default 3).
        conflict_penalty: Distance added between two students who are in
            conflict according to ``create_anti_matrix`` (default 1000).

    Returns:
        pandas.DataFrame: Columns ``ID``, ``Cluster``, ``Domains``,
        ``Availability`` and ``Anti-Preferences``, sorted by ``Cluster``.
        Cluster -1 holds noise points and students evicted by the conflict
        check.
    """
    df = pd.read_excel(file_path, sheet_name=sheet_name)

    # Feature weights (adjust as needed)
    weights = {
        'anti_pref': 100.0,   # Very high weight to enforce anti-preferences
        'availability': 5.0,  # High importance
        'work_dist': 3.0,     # Medium importance
        'domains': 1.0,       # Equal and lower importance
        'projects': 1.0,      # Equal and lower importance
        'mentoring': 0.5,     # Lowest importance
    }

    features = {}

    # 1. Availability
    features['availability'] = get_weighted_embeddings(df['availability'], weights['availability'], all_mpnet)

    # 2. Work distribution
    features['work_dist'] = get_weighted_embeddings(df['work_dist'], weights['work_dist'], all_mpnet)

    # 3. Mentoring: 1 when the cell is filled in, 0 otherwise
    mentoring = df['mentoring'].notna().astype(int).values.reshape(-1, 1)
    features['mentoring'] = mentoring * weights['mentoring']

    # 4. Anti-preferences
    # NOTE(review): as features these rows make students with shared
    # exclusions look similar, and at weight 100 they dominate the cosine
    # similarity — confirm this is intended.
    anti_matrix = create_anti_matrix(df)
    features['anti_pref'] = anti_matrix * weights['anti_pref']

    # 5. Domains & projects
    features['domains'] = get_weighted_embeddings(df['domains'], weights['domains'], bert_base)
    features['projects'] = get_weighted_embeddings(df['projects'], weights['projects'], bert_base)

    X = np.hstack(list(features.values()))

    # Turn similarity into a real distance (1 - cos), then push conflicting
    # pairs far apart. Clipping removes tiny negative values from rounding.
    distance_matrix = 1.0 - cosine_similarity(X)
    np.clip(distance_matrix, 0.0, 2.0, out=distance_matrix)
    distance_matrix += anti_matrix * conflict_penalty
    np.fill_diagonal(distance_matrix, 0.0)  # a point is at distance 0 from itself

    # metric='precomputed' makes HDBSCAN read the input as pairwise
    # distances instead of treating each matrix row as a feature vector.
    clusterer = HDBSCAN(
        min_cluster_size=min_cluster_size,
        metric='precomputed',
        cluster_selection_method='leaf',
    )
    clusters = clusterer.fit_predict(distance_matrix.astype(np.float64))

    # NOTE(review): the display columns are taken by position (1, 3, 6) while
    # create_anti_matrix reads position 7 by default — confirm the sheet layout.
    result_df = pd.DataFrame({
        'ID': df['ID'],
        'Cluster': clusters,
        'Domains': df.iloc[:, 1],
        'Availability': df.iloc[:, 3],
        'Anti-Preferences': df.iloc[:, 6]
    })

    # Ensure anti-preferences are respected inside every real cluster
    result_df['Cluster'] = _evict_conflicting_members(
        result_df['Cluster'].to_numpy(),
        result_df['ID'].to_numpy(),
        result_df['Anti-Preferences'].tolist(),
    )

    return result_df.sort_values('Cluster')

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def visualize_clusters(result_df):
    """Scatter-plot cluster membership against a bucketed domain value.

    Args:
        result_df: DataFrame with at least the columns ``'Domains'`` and
            ``'Cluster'``.

    The x coordinate is a CRC32 of the domain text modulo 100. Unlike the
    built-in ``hash``, which is salted per interpreter process for strings,
    this gives the same picture on every kernel restart.
    """
    import zlib  # local import: only this function needs it

    def _domain_bucket(value):
        # Deterministic 0..99 bucket for any cell value (including NaN).
        return zlib.crc32(str(value).encode('utf-8')) % 100

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(
        x=result_df['Domains'].apply(_domain_bucket),
        y=result_df['Cluster'],
        hue=result_df['Cluster'],
        palette='viridis',
        s=100,
        ax=ax,
    )
    ax.set_title("Student Clusters")
    ax.set_xlabel("Domains of Interest (Hashed)")
    ax.set_ylabel("Cluster")
    plt.show()

In [None]:
if __name__ == "__main__":
    # Run the full pipeline on the survey workbook.
    result = process_students('new_data.xlsx')

    # Save results
    result.to_csv('student_clusters.csv', index=False)

    def _most_common(values):
        """Return the most frequent non-missing value, or None if there is none."""
        counts = values.value_counts()
        # value_counts() drops missing values, so a cluster whose domains are
        # all missing yields an empty result; index[0] would raise IndexError.
        return counts.index[0] if len(counts) else None

    # Print cluster summary
    print("\nCluster Summary:")
    summary = result.groupby('Cluster').agg(
        Students=('ID', lambda x: x.tolist()),
        Common_Domains=('Domains', _most_common)
    ).reset_index()
    print(summary)

    # Visualize clusters
    visualize_clusters(result)