In [None]:
!pip install pyspark sparknlp numpy scikit-learn tqdm --upgrade transformers torch --upgrade accelerate

In [None]:
import json
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import DBSCAN

# Load the pretrained PhoBERT tokenizer and encoder (same checkpoint for both).
PHOBERT_CHECKPOINT = "vinai/phobert-base"
tokenizer = AutoTokenizer.from_pretrained(PHOBERT_CHECKPOINT)
model = AutoModel.from_pretrained(PHOBERT_CHECKPOINT)
# The model is only used to produce embeddings, so put it in evaluation mode.
model.eval()

# Read the JSONL dataset: one JSON object per line.
file_path = "/opt/workspace/data.jsonl"
with open(file_path, "r", encoding="utf-8") as f:
    # Skip blank / whitespace-only lines (for example a trailing empty line at
    # the end of the file); json.loads raises JSONDecodeError on them.
    data = [json.loads(line) for line in f if line.strip()]

# Filter the records and reduce each one to a single user/assistant pair.
#
# Rules applied to every record's "messages" list:
#   * messages whose role is neither "user" nor "assistant" are ignored;
#   * if any user or assistant message has no content (missing or None), the
#     whole record is dropped;
#   * an assistant reply is only paired with a user message that came BEFORE
#     it, so a trailing unanswered user turn (or an assistant message that
#     precedes the first user turn) can never produce a mismatched pair;
#   * when a record contains several exchanges, the last answered one is kept.
conversations = []
for item in data:
    # "messages" may be absent or explicitly null; treat both as empty.
    messages = item.get("messages") or []
    pair = None
    last_user = None
    usable = True
    for m in messages:
        role = m.get("role")
        if role not in ("user", "assistant"):
            continue
        content = m.get("content")
        if content is None:
            usable = False
            break
        if role == "user":
            last_user = content
        elif last_user is not None:
            # Assistant reply to the most recent preceding user message.
            pair = {"user": last_user, "assistant": content}
    if usable and pair is not None:
        conversations.append(pair)

# Extract an embedding for a piece of text with PhoBERT (used on user questions).
def get_embedding(text):
    """Return a mean-pooled PhoBERT embedding of ``text`` as a NumPy array.

    The text is encoded with the module-level ``tokenizer`` (truncated to at
    most 256 tokens) and passed through the module-level ``model`` with
    gradient tracking disabled. The first element of the model output is
    averaged over dimension 1, squeezed, and converted to NumPy.

    NOTE(review): dimension 1 is presumably the token axis of the last hidden
    state, which would give one fixed-size vector per input -- confirm.
    """
    token_ids = tokenizer.encode(
        text,
        return_tensors="pt",
        max_length=256,
        truncation=True,
    )
    with torch.no_grad():
        hidden_states = model(token_ids)[0]
        pooled = hidden_states.mean(dim=1)
        vector = pooled.squeeze().numpy()
    return vector

# Split the pairs into parallel lists and embed every user question.
questions = [conv["user"] for conv in conversations]
answers = [conv["assistant"] for conv in conversations]
embeddings = [get_embedding(q) for q in tqdm(questions, desc="Embedding")]

# Group questions by semantic similarity so near-duplicates can be removed.
if embeddings:
    similarity_matrix = cosine_similarity(embeddings)
    # Floating-point error can make 1 - similarity slightly negative; clip so
    # the precomputed distances are never below zero.
    distance_matrix = np.clip(1 - similarity_matrix, 0, None)
    # With min_samples=1 a question with no neighbour within eps still forms
    # its own cluster, so every question receives a cluster label.
    dbscan = DBSCAN(metric="precomputed", eps=0.1, min_samples=1).fit(distance_matrix)
    labels = dbscan.labels_
else:
    # Nothing to cluster: cosine_similarity raises ValueError on empty input,
    # so skip clustering and fall through with no labels (empty output).
    similarity_matrix = np.empty((0, 0))
    distance_matrix = np.empty((0, 0))
    dbscan = None
    labels = []

# Keep one representative question per cluster. Later indices overwrite
# earlier ones in the dict, so the LAST question of each cluster is retained.
selected_indices = {label: idx for idx, label in enumerate(labels)}.values()
cleaned_data = [
    {
        "messages": [
            {"role": "user", "content": questions[i]},
            {"role": "assistant", "content": answers[i]},
        ]
    }
    for i in selected_indices
]

# Write the deduplicated conversations to a JSONL file, one record per line.
output_path = "/opt/workspace/clean_conversations.jsonl"
with open(output_path, "w", encoding="utf-8") as out_file:
    for record in cleaned_data:
        # ensure_ascii=False writes non-ASCII text as-is rather than as \u escapes.
        serialized = json.dumps(record, ensure_ascii=False)
        out_file.write(serialized + "\n")

print(f"‚úÖ ƒê√£ l∆∞u file t·∫°i: {output_path}")