In [None]:
!pip install pyspark sparknlp numpy scikit-learn tqdm --upgrade transformers torch accelerate

import json
import re
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import DBSCAN

# Load the PhoBERT tokenizer and encoder once; get_embedding() reuses both.
_PHOBERT_CHECKPOINT = "vinai/phobert-base"
tokenizer = AutoTokenizer.from_pretrained(_PHOBERT_CHECKPOINT)
# eval() returns the module itself, so loading and switching to evaluation
# mode can be chained into a single statement.
model = AutoModel.from_pretrained(_PHOBERT_CHECKPOINT).eval()

# Load the JSONL dataset: one JSON object per line.
file_path = "/opt/workspace/data.jsonl"
with open(file_path, "r", encoding="utf-8") as f:
    # Skip blank / whitespace-only lines; json.loads would raise
    # JSONDecodeError on them and abort the whole load.
    data = [json.loads(line) for line in f if line.strip()]

# Extract one user/assistant pair per conversation and drop unusable ones.
#
# For each record the LAST user message and the LAST assistant message are
# kept. A conversation is discarded entirely when any assistant message has
# no usable text or contains the marker "can not solve".
conversations = []
for item in data:
    # `or []` also covers an explicit `"messages": null` in the record.
    messages = item.get("messages") or []
    pair = {}
    for m in messages:
        # .get() tolerates messages that lack a "role" or "content" key.
        role = m.get("role")
        content = m.get("content")
        if role == "assistant":
            # Non-string content (None, list, ...) cannot be lower-cased or
            # written out as an answer, so it invalidates the conversation.
            if not isinstance(content, str) or "can not solve" in content.lower():
                pair = None
                break
            pair["assistant"] = content
        elif role == "user":
            pair["user"] = content
    # Require a string question: a None question would crash the later
    # normalization and tokenization steps.
    if pair and isinstance(pair.get("user"), str) and "assistant" in pair:
        conversations.append(pair)

# Build an embedding for a user question with PhoBERT
def get_embedding(text):
    """Return a mean-pooled PhoBERT embedding of *text* as a NumPy array.

    The text is tokenized with the module-level ``tokenizer`` (truncated to
    at most 256 tokens) and passed through the module-level ``model`` with
    gradient tracking disabled. The first model output is averaged over
    dimension 1 and squeezed before being converted to NumPy.
    """
    token_ids = tokenizer.encode(
        text,
        return_tensors="pt",
        max_length=256,
        truncation=True,
    )
    with torch.no_grad():
        # Index 0 is the first element of the model output
        # (presumably the last hidden state for a Hugging Face base model).
        hidden_states = model(token_ids)[0]
        pooled = hidden_states.mean(dim=1)
        vector = pooled.squeeze().numpy()
    return vector

# Split the pairs into two parallel lists: questions[i] goes with answers[i].
questions = []
answers = []
for conv in conversations:
    questions.append(conv["user"])
    answers.append(conv["assistant"])

# Normalize a question so that questions built from the same template
# collapse to the same key
def normalize_question(text):
    """Return a canonical template key for the Vietnamese question *text*.

    Steps:
      1. Lower-case and collapse whitespace runs to a single space.
      2. Replace the variable part of known question templates with "...".
      3. Remove filler phrases ("cho tôi biết thông tin", "thống kê", ...).
      4. Collapse whitespace again and strip, so removals never leave
         stray double spaces in the key.

    Args:
        text: The raw question string.

    Returns:
        The normalized string used as a de-duplication key.
    """
    text = re.sub(r"\s+", " ", text.lower())

    # Known templates: keep the fixed wording, blank out the variable slot.
    text = re.sub(r"cho tôi thông tin của [\w\s\.\-]+", "cho tôi thông tin của ...", text)
    text = re.sub(r"có bao nhiêu thiết bị [\w\s]+ tại [\w\s]+", "có bao nhiêu thiết bị ... tại ...", text)
    text = re.sub(r"có bao nhiêu thiết bị [\w\s]+", "có bao nhiêu thiết bị ...", text)
    text = re.sub(r"thiết bị [\w\s]+ có những thuộc tính gì", "thiết bị ... có những thuộc tính gì", text)

    # Filler phrases are plain literals, so str.replace is sufficient.
    # The longest phrase must come first: "cho tôi biết thông tin" is a
    # prefix of it and would otherwise make the longer one unreachable.
    # Text is already lower-cased, so no case-insensitive matching is needed.
    for phrase in (
        "cho tôi biết thông tin những thiết bị",
        "cho tôi biết thông tin",
        "cho tôi biết số lượng",
        "thống kê",
    ):
        text = text.replace(phrase, " ")

    # Removals insert spaces; collapse again so equal templates get equal keys.
    return re.sub(r"\s+", " ", text).strip()

normalized_questions = list(map(normalize_question, questions))

# Keep only one representative question per template: the first index at
# which each normalized form appears.
unique_indices = {}
for position, template_key in enumerate(normalized_questions):
    # setdefault only stores the index when the key is new, so later
    # duplicates never overwrite the first occurrence.
    unique_indices.setdefault(template_key, position)

filtered_questions = []
filtered_answers = []
for kept in unique_indices.values():
    filtered_questions.append(questions[kept])
    filtered_answers.append(answers[kept])

# Embed the remaining questions and cluster them by cosine distance to find
# questions that are duplicates in meaning.
embeddings = []
for question in tqdm(filtered_questions, desc="Embedding"):
    embeddings.append(get_embedding(question))

similarity_matrix = cosine_similarity(embeddings)
# Cosine distance = 1 - similarity; clip at 0 so rounding error cannot
# produce slightly negative distances.
distance_matrix = np.clip(1 - similarity_matrix, 0, None)
# metric="precomputed": the input is already a pairwise distance matrix.
dbscan = DBSCAN(metric="precomputed", eps=0.1, min_samples=1)
dbscan.fit(distance_matrix)

# Keep one representative question per semantic cluster.
# Each label maps to the index seen most recently, so a cluster ends up
# represented by its LAST member.
_last_index_by_label = {}
for member_index, cluster_label in enumerate(dbscan.labels_):
    _last_index_by_label[cluster_label] = member_index
selected_indices = _last_index_by_label.values()

# Rebuild each kept pair in the chat "messages" format.
cleaned_data = [
    {
        "messages": [
            {"role": "user", "content": filtered_questions[i]},
            {"role": "assistant", "content": filtered_answers[i]},
        ]
    }
    for i in selected_indices
]

# Write the cleaned conversations as JSONL (one JSON object per line).
output_path = "/opt/workspace/clean_conversations.jsonl"
with open(output_path, "w", encoding="utf-8") as f:
    # ensure_ascii=False writes non-ASCII text as-is instead of \uXXXX escapes.
    f.writelines(
        json.dumps(item, ensure_ascii=False) + "\n" for item in cleaned_data
    )

# Status message (Vietnamese): "Saved the clean file at: <path>".
print(f"✅ Đã lưu file sạch tại: {output_path}")
