In [None]:
from sentence_transformers import SentenceTransformer
# Load the 'bert-base-nli-mean-tokens' sentence-transformers model; `model.encode`
# is what turns each student's free-text answers into an embedding vector.
model = SentenceTransformer('bert-base-nli-mean-tokens')


In [None]:
import pandas as pd
# Read the survey sheet and embed each student's interests.
df = pd.read_excel('section_1.xlsx')

# Missing answers become empty strings so the concatenation never yields NaN.
domain_text = df['domains'].fillna('')
project_text = df['projects'].fillna('')
combined_text = domain_text + ' ' + project_text

# One sentence per student, in row order; `embeddings` keeps that same order.
sentences = combined_text.tolist()
embeddings = model.encode(sentences, show_progress_bar=True)

In [None]:

from sklearn.preprocessing import OneHotEncoder

# 1. Availability: parse into time-slot features
def parse_availability(av_text):
    """Turn a free-text availability answer into six binary time-slot flags.

    Parameters
    ----------
    av_text : str or missing value
        The raw answer, e.g. ``"Weekday evenings"``. Missing values (anything
        ``pd.isna`` reports as missing) yield all-zero flags. Non-string values
        are converted with ``str`` before matching.

    Returns
    -------
    dict
        Keys ``weekday_morning``, ``weekday_afternoon``, ``weekday_evening``,
        ``weekend_morning``, ``weekend_afternoon``, ``weekend_evening`` (in that
        order), each mapped to 0 or 1.

    Notes
    -----
    Matching is by substring on the whole answer, so a time of day mentioned
    anywhere applies to every day type mentioned anywhere
    ("weekday evening, weekend morning" sets morning and evening for both).
    """
    day_types = ('weekday', 'weekend')
    day_parts = ('morning', 'afternoon', 'evening')
    slots = {f'{day_type}_{part}': 0 for day_type in day_types for part in day_parts}
    if pd.isna(av_text):
        return slots

    text = str(av_text).lower()
    mentioned = {
        'morning': 'morning' in text,
        'afternoon': 'afternoon' in text,
        # "night" is treated as a synonym for the evening slot.
        'evening': 'evening' in text or 'night' in text,
    }
    # With no time of day given (or an explicit "all day") the whole day is free.
    # "night" counts as a time of day here, so "weekday nights" no longer
    # falls through to "all slots available".
    whole_day = 'all day' in text or not any(mentioned.values())

    for day_type in day_types:
        if day_type not in text:
            continue
        for part in day_parts:
            if whole_day or mentioned[part]:
                slots[f'{day_type}_{part}'] = 1
    return slots

# One slot dictionary per student (row order preserved), then a frame with one
# column per time slot.
availability_features = [parse_availability(answer) for answer in df['availability']]
avail_df = pd.DataFrame(availability_features)

In [None]:


# 3. Work Distribution: one indicator column per answer, missing answers
#    grouped under 'No preference'.
work_source = df['work_dist'].fillna('No preference')
work_dist_ohe = pd.get_dummies(work_source, prefix='work')


# 4. Mentor attribute: 1 when the answer is "yes" (ignoring case and
#    surrounding whitespace), otherwise 0; shaped as a single column.
mentor_answers = df['mentoring'].fillna('No')
mentor_flag = mentor_answers.map(lambda answer: int(str(answer).strip().lower() == 'yes'))
mentor_flag = mentor_flag.to_numpy().reshape(-1, 1)


In [None]:
import numpy as np

avail_array = avail_df.to_numpy().astype(float)
work_array = work_dist_ohe.to_numpy().astype(float)

# Relative influence of each feature family on the clustering distance.
avail_weight = 3.0
interest_weight = 1.0
work_weight = 0.5
mentor_weight = 0.2

# Scale every family by its weight and place the results side by side, so each
# row is one student's full feature vector.
weighted_blocks = [
    (avail_array, avail_weight),
    (embeddings, interest_weight),
    (work_array, work_weight),
    (mentor_flag, mentor_weight),
]
features = np.concatenate([block * weight for block, weight in weighted_blocks], axis=1)


In [None]:
from sklearn.cluster import KMeans
max_group_size = 4
# Enough clusters that groups of at most `max_group_size` could hold everyone.
initial_k = int(np.ceil(len(df) / max_group_size))

kmeans = KMeans(n_clusters=initial_k, random_state=42)
labels = kmeans.fit_predict(features)

# `labels[i]` belongs to the i-th ROW of `df` (features were built in row
# order), so pair labels with IDs positionally. Looking IDs up with
# `df.loc[i, 'ID']` would treat i as an index label and pick the wrong row (or
# raise KeyError) whenever `df` does not carry the default 0..n-1 index.
cluster_dict = {}
for label, student_id in zip(labels, df['ID'].tolist()):
    cluster_dict.setdefault(label, []).append(int(student_id))
initial_clusters = list(cluster_dict.values())
print(f"Initial clusters (count={len(initial_clusters)}):", [len(c) for c in initial_clusters])


In [None]:
from collections import defaultdict

def recluster_large_clusters(clusters, features, max_size=4, id_to_index=None):
    """Split every cluster larger than ``max_size`` into smaller ones with KMeans.

    Parameters
    ----------
    clusters : list of list
        Groups of student IDs. Groups already within ``max_size`` are passed
        through unchanged (same list objects).
    features : indexable of feature rows
        ``features[i]`` is the feature vector of the student in row ``i``.
    max_size : int, default 4
        Largest group size allowed in the result.
    id_to_index : dict, optional
        Maps a student ID to its row position in ``features``. When omitted it
        is built from the notebook-level ``df['ID']`` the first time an
        oversized cluster is met (first occurrence wins for duplicate IDs).

    Returns
    -------
    list of list
        Groups of student IDs, none larger than ``max_size``.

    Raises
    ------
    KeyError
        If a cluster contains an ID that is not in ``id_to_index``.
    """
    new_clusters = []
    for cluster in clusters:
        if len(cluster) <= max_size:
            new_clusters.append(cluster)
            continue

        if id_to_index is None:
            # Built once and reused (also by recursive calls) instead of
            # scanning list(df['ID']) for every single member.
            id_to_index = {}
            for position, known_id in enumerate(df['ID'].tolist()):
                id_to_index.setdefault(known_id, position)

        # Same sub-cluster count as before, capped so KMeans is never asked
        # for more clusters than there are members.
        n_sub = min((len(cluster) // max_size) + 1, len(cluster))
        sub_feats = [features[id_to_index[stu_id]] for stu_id in cluster]
        sub_kmeans = KMeans(n_clusters=n_sub, random_state=42)
        sub_labels = sub_kmeans.fit_predict(sub_feats)

        sub_cluster_map = defaultdict(list)
        for member, sub_label in zip(cluster, sub_labels):
            sub_cluster_map[sub_label].append(member)

        for sub_cluster in sub_cluster_map.values():
            if len(sub_cluster) <= max_size:
                new_clusters.append(sub_cluster)
            elif len(sub_cluster) < len(cluster):
                new_clusters.extend(
                    recluster_large_clusters([sub_cluster], features, max_size, id_to_index)
                )
            else:
                # KMeans kept everyone together (e.g. identical feature
                # vectors). Recursing would repeat forever, so cut the group
                # into consecutive chunks of at most max_size instead.
                new_clusters.extend(
                    sub_cluster[start:start + max_size]
                    for start in range(0, len(sub_cluster), max_size)
                )
    return new_clusters

def _cluster_centroid(members, features, id_to_index):
    """Return the mean feature vector of the students listed in ``members``."""
    rows = [id_to_index[stu_id] for stu_id in members]
    return np.mean(features[rows], axis=0)


def combine_small_clusters(clusters, features, min_size=3, max_size=4, id_to_index=None):
    """Merge clusters smaller than ``min_size`` into the nearest cluster with room.

    A small cluster is merged into the candidate whose centroid is closest
    (Euclidean distance) among clusters that would not exceed ``max_size``
    afterwards. Small clusters with no such candidate are kept as they are and
    become candidates for the small clusters processed after them. The pass is
    repeated until a full pass performs no merge. Empty clusters are dropped.

    Parameters
    ----------
    clusters : list of list
        Groups of student IDs. The input lists are not modified.
    features : numpy.ndarray
        Row ``i`` is the feature vector of the student in row ``i``.
    min_size : int, default 3
        Clusters with fewer members are considered "small".
    max_size : int, default 4
        Upper bound on the size of a merged cluster.
    id_to_index : dict, optional
        Maps a student ID to its row in ``features``. When omitted it is built
        from the notebook-level ``df['ID']`` the first time it is needed (first
        occurrence wins for duplicate IDs).

    Returns
    -------
    list of list
        The clusters that were already large enough (in their original order),
        followed by any small clusters that could not be merged.
    """
    # Work on copies so the caller's lists are never extended behind its back.
    current = [list(c) for c in clusters]

    # Iterative form of "repeat while something merged"; avoids recursion depth
    # growing with the number of merge rounds.
    while True:
        small_clusters = [c for c in current if len(c) < min_size]
        other_clusters = [c for c in current if len(c) >= min_size]
        merged = False

        for small in small_clusters:
            if not small:
                continue

            if id_to_index is None:
                # One dictionary instead of a list scan per student per round.
                id_to_index = {}
                for position, known_id in enumerate(df['ID'].tolist()):
                    id_to_index.setdefault(known_id, position)

            small_centroid = _cluster_centroid(small, features, id_to_index)
            best_match = None
            best_distance = float('inf')
            for c in other_clusters:
                if len(c) + len(small) > max_size:
                    continue
                dist = np.linalg.norm(small_centroid - _cluster_centroid(c, features, id_to_index))
                if dist < best_distance:  # strict: the first of equally close candidates wins
                    best_distance = dist
                    best_match = c

            if best_match is not None:
                best_match.extend(small)
                merged = True
            else:
                other_clusters.append(small)

        if not merged:
            return other_clusters
        current = other_clusters


# Two-pass size balancing: first split the oversized groups, then merge the
# undersized ones into the nearest group that still has room.
clusters = recluster_large_clusters(clusters=initial_clusters, features=features, max_size=4)
clusters = combine_small_clusters(clusters=clusters, features=features, min_size=3, max_size=4)
print(f"Clusters after size balancing: {[len(c) for c in clusters]}")


In [None]:

def _parse_forbidden_ids(raw):
    """Extract non-negative integer IDs from a comma-separated anti-preference cell.

    Tokens that are not numbers are ignored. Whole-number floats such as
    "12.0" are accepted: a spreadsheet cell holding a single number is read
    back as a float, and a plain ``isdigit`` check would silently drop it.
    """
    ids = set()
    for token in str(raw).split(','):
        token = token.strip()
        if not token:
            continue
        try:
            number = float(token)
        except ValueError:
            continue
        if number.is_integer() and number >= 0:
            ids.add(int(number))
    return ids


# anti_dict[student] is the set of students who must not share a group with
# that student. The relation is stored in both directions.
anti_dict = {}
for raw_id, raw_pref in zip(df['ID'], df['anti_pref']):
    if pd.isna(raw_pref):
        continue
    student_id = int(raw_id)
    # A student naming themselves would otherwise look like a permanent,
    # unresolvable conflict to the enforcement step.
    for forbidden_id in _parse_forbidden_ids(raw_pref) - {student_id}:
        anti_dict.setdefault(student_id, set()).add(forbidden_id)
        anti_dict.setdefault(forbidden_id, set()).add(student_id)


def _antipref_fits(student, members, anti_map):
    """True when ``student`` has no anti-preference (either direction) with any of ``members``."""
    return not any(
        other in anti_map.get(student, ()) or student in anti_map.get(other, ())
        for other in members
    )


def _first_antipref_pair(cluster, anti_map):
    """Return the first ``(a, b)`` in ``cluster`` (list order) that conflict, or None."""
    for sid in cluster:
        for other in cluster:
            if other != sid and not _antipref_fits(sid, [other], anti_map):
                return sid, other
    return None


def _relocate_student(mover, ci, clusters, anti_map, max_size):
    """Take ``mover`` out of ``clusters[ci]`` without creating a new conflict.

    First tries to move it into another group that has room; failing that,
    tries to swap it with a member of another group. A swap is accepted only
    if the incoming student is compatible with everyone staying in the source
    group AND ``mover`` is compatible with everyone staying in the other group.
    Returns True when a move or swap was carried out.
    """
    source = clusters[ci]

    for cj, target in enumerate(clusters):
        if cj == ci:
            continue
        if len(target) < max_size and _antipref_fits(mover, target, anti_map):
            source.remove(mover)
            target.append(mover)
            return True

    staying = [member for member in source if member != mover]
    for cj, target in enumerate(clusters):
        if cj == ci:
            continue
        for candidate in target:
            remaining = [member for member in target if member != candidate]
            if _antipref_fits(candidate, staying, anti_map) and _antipref_fits(mover, remaining, anti_map):
                source.remove(mover)
                target.remove(candidate)
                source.append(candidate)
                target.append(mover)
                return True
    return False


def enforce_antiprefs(clusters, max_size=4, anti_map=None):
    """Rearrange students so that no group contains an anti-preference pair.

    For the first conflicting pair found in a group, one of the two students
    (the second of the pair is tried first) is moved to, or swapped with a
    member of, another group. Every accepted move or swap is checked so it
    introduces no new conflict, so each change lowers the total number of
    conflicting pairs and the loop always terminates. Conflicts for which no
    valid move or swap exists are left in place.

    Parameters
    ----------
    clusters : list of list
        Groups of student IDs. Modified in place.
    max_size : int, default 4
        A plain move is only allowed into a group with fewer members than this.
    anti_map : dict, optional
        Maps a student ID to the set of IDs it must not be grouped with.
        Defaults to the notebook-level ``anti_dict``.

    Returns
    -------
    list of list
        The same ``clusters`` object, after rearrangement.
    """
    if anti_map is None:
        anti_map = anti_dict

    changed = True
    while changed:
        changed = False
        for ci, cluster in enumerate(clusters):
            pair = _first_antipref_pair(cluster, anti_map)
            if pair is None:
                continue
            sid_a, sid_b = pair
            if (_relocate_student(sid_b, ci, clusters, anti_map, max_size)
                    or _relocate_student(sid_a, ci, clusters, anti_map, max_size)):
                # Group contents changed; rescan from the first group.
                changed = True
                break
    return clusters

# Separate students who listed each other as anti-preferences. The groups are
# rearranged in place and the same list object is returned.
clusters = enforce_antiprefs(clusters)


In [None]:
# IDs of students who answered "yes" (ignoring case/whitespace) to mentoring.
mentors = {
    int(student_id)
    for student_id, answer in zip(df['ID'], df['mentoring'])
    if str(answer).strip().lower() == 'yes'
}


def _mentor_move_ok(student, members, anti_map):
    """True when ``student`` has no anti-preference (either direction) with any of ``members``."""
    return not any(
        other in anti_map.get(student, ()) or student in anti_map.get(other, ())
        for other in members
    )


def spread_mentors(clusters, mentors, anti_map, max_size=4):
    """Redistribute surplus mentors so groups have at most one where possible.

    For every group holding more than one mentor, each mentor after the first
    is (1) moved into a mentor-less group that has room, or failing that
    (2) swapped with a member of a mentor-less group. Only mentor-less groups
    are targeted, so a swap never just shifts the surplus elsewhere, and every
    move or swap is rejected if it would put an anti-preference pair in the
    same group. Mentors that cannot be placed stay where they are.

    Parameters
    ----------
    clusters : list of list
        Groups of student IDs. Modified in place.
    mentors : set
        IDs of students who are mentors.
    anti_map : dict
        Maps a student ID to the set of IDs it must not be grouped with.
    max_size : int, default 4
        A plain move is only allowed into a group with fewer members than this.

    Returns
    -------
    list of list
        The same ``clusters`` object.
    """
    for i, cluster in enumerate(clusters):
        surplus = [sid for sid in cluster if sid in mentors][1:]
        for sid in surplus:
            # Recomputed for every mentor because earlier placements change it.
            mentorless = [
                group for j, group in enumerate(clusters)
                if j != i and not any(member in mentors for member in group)
            ]

            target = next(
                (group for group in mentorless
                 if len(group) < max_size and _mentor_move_ok(sid, group, anti_map)),
                None,
            )
            if target is not None:
                cluster.remove(sid)
                target.append(sid)
                continue

            staying = [member for member in cluster if member != sid]
            swap = next(
                ((group, k) for group in mentorless for k, member in enumerate(group)
                 if _mentor_move_ok(member, staying, anti_map)
                 and _mentor_move_ok(sid, group[:k] + group[k + 1:], anti_map)),
                None,
            )
            if swap is not None:
                group, k = swap
                incoming = group[k]
                group[k] = sid
                cluster.remove(sid)
                cluster.append(incoming)
    return clusters


clusters = spread_mentors(clusters, mentors, anti_dict, max_size=4)


In [None]:
# Report every group with a 1-based number, its size and its member IDs.
print("Final Clusters:")
for idx in range(1, len(clusters) + 1):
    cluster = clusters[idx - 1]
    print(f"Group {idx} (size {len(cluster)}): {cluster}")


In [None]:
# One record per student: the student's original row (first row matching the
# ID) plus the 1-based number of the group the student ended up in. Records
# are ordered group by group, in each group's member order.
final_data = [
    {**df[df["ID"] == student_id].iloc[0].to_dict(), "cluster_id": cluster_id}
    for cluster_id, cluster in enumerate(clusters, start=1)
    for student_id in cluster
]

df_final = pd.DataFrame(final_data)

output_file = "student_clusters.csv"
df_final.to_csv(output_file, index=False)

print(f"Clustered data saved to {output_file}")
