In [None]:
pip install pyspark spark-nlp pandas matplotlib scipy

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, udf, lower, trim, regexp_replace
from pyspark.ml import Pipeline
from pyspark.ml.linalg import Vectors, VectorUDT
from scipy.spatial.distance import cosine

import sparknlp
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Tokenizer, DeBertaEmbeddings, SentenceEmbeddings

# 1. Initialize Spark NLP; the returned session is used for all reads below
spark = sparknlp.start()

# 2. Read the JSONL data (multiLine disabled: one JSON record per line)
input_file_path = "/opt/workspace/gen_1604_formated.jsonl"
df = spark.read.option("multiLine", False).json(input_file_path)

# Flatten each record's "messages" array into one row per message, keep only
# the messages whose role is "user", and expose their non-null content as "text".
user_questions = (
    df.select(explode("messages").alias("msg"))
    .filter(col("msg.role") == "user")
    .select(col("msg.content").alias("text"))
    .filter(col("text").isNotNull())
)

# 3. NLP pipeline: text -> document -> token -> word_embeddings -> sentence_embeddings
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Pretrained DeBERTa embeddings (model "deberta_embeddings_spm_vie", language code "vie")
embeddings = (
    DeBertaEmbeddings.pretrained("deberta_embeddings_spm_vie", "vie")
    .setInputCols(["document", "token"])
    .setOutputCol("word_embeddings")
)

# Pool the word embeddings of each document into a single sentence-level embedding
sentence_embeddings = (
    SentenceEmbeddings()
    .setInputCols(["document", "word_embeddings"])
    .setOutputCol("sentence_embeddings")
    .setPoolingStrategy("AVERAGE")  # or "CLS" if available
)

# Stage order matters: every stage consumes the output column(s) of the previous ones
pipeline = Pipeline(stages=[document_assembler, tokenizer, embeddings, sentence_embeddings])

# Fit on the user questions, then annotate the same DataFrame
model = pipeline.fit(user_questions)
embedded_data = model.transform(user_questions)

# 4. Extract the embedding vector into a "features" column
def extract_vector(annot):
    """Return the embedding of the first annotation as a dense vector.

    Args:
        annot: The value of a "sentence_embeddings" cell; expected to be a
            non-empty list whose first element exposes an 'embeddings' field.

    Returns:
        A dense vector built from ``annot[0]['embeddings']``, or a
        768-dimensional zero vector when ``annot`` is empty, is not a list,
        or its first element has no 'embeddings' field.
    """
    if not annot or not isinstance(annot, list):
        return Vectors.dense([0.0] * 768)

    first_annotation = annot[0]
    if 'embeddings' not in first_annotation:
        return Vectors.dense([0.0] * 768)

    return Vectors.dense(first_annotation['embeddings'])

# Register the extractor as a UDF returning a vector and attach it as "features"
extract_vector_udf = udf(extract_vector, VectorUDT())
features_column = extract_vector_udf(col("sentence_embeddings"))
vectorized_data = embedded_data.withColumn("features", features_column)

# 5. Collect to pandas to compute cosine similarity locally
text_and_features = vectorized_data.select("text", "features")
pd_data = text_and_features.toPandas()
# Convert each vector object into a plain array via its toArray() method
pd_data["features"] = pd_data["features"].apply(lambda vector: vector.toArray())

# 6. Greedily group questions by meaning using cosine distance.
#
# Each not-yet-grouped question (in input order) seeds a new group and absorbs
# every later, not-yet-grouped question whose cosine distance to the seed is
# below the threshold. Distances are measured against the seed only, never
# against other members of the group.
semantic_groups = []
visited = set()
threshold = 0.15  # a cosine distance below this threshold means "same meaning"

texts = pd_data["text"].tolist()
if texts:  # np.vstack rejects an empty sequence, so skip when there is no data
    # Stack the per-row arrays once into a 2-D matrix (one row per question)
    # and L2-normalise the rows, so that the cosine distance from a seed to
    # many rows is a single matrix-vector product: 1 - rows @ seed.
    feature_matrix = np.vstack(pd_data["features"].tolist())
    row_norms = np.linalg.norm(feature_matrix, axis=1, keepdims=True)
    # All-zero rows would divide by zero. Dividing by 1 instead keeps them as
    # zero vectors whose distance to everything is 1.0 (>= threshold), so they
    # never join another group -- same outcome as a NaN distance, minus the
    # runtime warning.
    row_norms[row_norms == 0.0] = 1.0
    unit_vectors = feature_matrix / row_norms

    for idx, text_i in enumerate(texts):
        if idx in visited:
            continue
        visited.add(idx)

        # Distances from the seed to all later questions, computed at once.
        distances = 1.0 - unit_vectors[idx + 1:] @ unit_vectors[idx]
        # Shift back to absolute positions; flatnonzero keeps ascending order,
        # so members are appended in input order.
        close_positions = (np.flatnonzero(distances < threshold) + idx + 1).tolist()
        members = [jdx for jdx in close_positions if jdx not in visited]

        visited.update(members)
        semantic_groups.append([text_i] + [texts[jdx] for jdx in members])

# 7. Sort the semantic groups by size (largest first) and print them
semantic_groups.sort(key=len, reverse=True)

print("Các nhóm câu hỏi trùng ngữ nghĩa (sắp xếp theo số lượng):")
for group_number, members in enumerate(semantic_groups, start=1):
    # Blank line + header with the 1-based group number and its size
    print(f"\nNhóm {group_number} (số lượng: {len(members)}):")
    for question in members:
        print(f"- {question}")

