In [None]:
!pip install pyspark spark-nlp matplotlib numpy tqdm transformers scikit-learn

In [None]:
import json
import torch
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter

# 🔧 Load the PhoBERT tokenizer and encoder from the same checkpoint
PHOBERT_CHECKPOINT = "vinai/phobert-base"
tokenizer = AutoTokenizer.from_pretrained(PHOBERT_CHECKPOINT)
model = AutoModel.from_pretrained(PHOBERT_CHECKPOINT)
# The model is only used for inference here, so put it in evaluation mode.
model.eval()

# 📂 Load the dataset: one JSON document per line
with open("/opt/workspace/data.jsonl", "r", encoding="utf-8") as f:
    # Skip blank lines (typically a trailing newline at the end of the file);
    # json.loads would otherwise raise JSONDecodeError on them.
    data = [json.loads(line) for line in f if line.strip()]

# 🧠 Collect the non-empty content of every message whose role is "user"
questions = []
for item in data:
    for msg in item.get("messages", []):
        # Read "role" with .get() as well, so a message that lacks the key
        # is skipped instead of aborting the whole run with a KeyError.
        if msg.get("role") == "user" and msg.get("content"):
            questions.append(msg["content"])

# ✨ Extract an embedding for one piece of text
def get_embedding(text):
    """Return a mean-pooled embedding of ``text`` as a NumPy array.

    The text is encoded with the module-level ``tokenizer`` (truncated to at
    most 256 tokens), passed through the module-level ``model`` without
    gradient tracking, and the model's first output is averaged over
    dimension 1.

    NOTE(review): assumes the first model output is shaped
    (batch, tokens, hidden), so the mean runs over tokens -- confirm.
    """
    token_ids = tokenizer.encode(
        text,
        return_tensors="pt",
        max_length=256,
        truncation=True,
    )
    with torch.no_grad():
        first_output = model(token_ids)[0]
        pooled = first_output.mean(dim=1)
        # squeeze() drops the size-1 dimensions left after pooling.
        return pooled.squeeze().numpy()

# Embed every question, with tqdm showing progress over the list.
print("🔍 Đang tạo embedding...")
embeddings = list(map(get_embedding, tqdm(questions)))

# 🔍 Cluster the embeddings with KMeans
if not embeddings:
    # Fail early with a clear message instead of an opaque error from KMeans.
    raise ValueError("No user questions found; nothing to cluster.")
# KMeans cannot fit more clusters than there are samples, so cap the default
# of 10 clusters at the number of questions. Later cells read this same `k`.
k = min(10, len(embeddings))
kmeans = KMeans(n_clusters=k, random_state=42)
labels = kmeans.fit_predict(embeddings)

# 📊 Count how many questions were assigned to each cluster
counts = Counter(labels)

# 🖼️ Draw a bar chart of the cluster sizes
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(counts.keys(), counts.values(), color="teal")
ax.set_xlabel("Cụm câu hỏi")
ax.set_ylabel("Số lượng")
ax.set_title("📈 Tần suất các cụm câu hỏi theo PhoBERT")
# One tick per cluster id, including clusters that received no questions.
ax.set_xticks(range(k))
ax.grid(axis="y", linestyle="--", alpha=0.5)
plt.show()

# 🧾 Print a few sample questions from each cluster
SAMPLES_PER_CLUSTER = 3
for cluster_id in range(k):
    print(f"\n🔹 Cụm {cluster_id} ({counts[cluster_id]} câu hỏi):")
    # Count the samples printed for THIS cluster. The global question index
    # says nothing about how many members of the cluster have been shown.
    shown = 0
    for idx, label in enumerate(labels):
        if label != cluster_id:
            continue
        print(f"  - {questions[idx]}")
        shown += 1
        if shown >= SAMPLES_PER_CLUSTER:
            break  # only the first few questions of the cluster
