In [None]:
pip install pyspark spark-nlp pandas matplotlib

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, udf, lower, trim, regexp_replace, count, split
from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import StandardScaler, CountVectorizer
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import collect_list
import numpy as np
from sklearn.metrics.pairwise import cosine_distances
import sparknlp
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Tokenizer, DeBertaEmbeddings

# Start a Spark session through Spark NLP.
spark = sparknlp.start()

# Load the input as JSON Lines (multiLine disabled: one JSON record per line).
input_file_path = "/opt/workspace/gen_1604_formated.jsonl"
df = spark.read.option("multiLine", False).json(input_file_path)

# Flatten the "messages" array and keep the non-null content of every
# message whose role is "user"; the content is exposed as column "text".
user_questions = (
    df.select(explode("messages").alias("msg"))
    .where(col("msg.role") == "user")
    .select(col("msg.content").alias("text"))
    .where(col("text").isNotNull())
)

# NLP pipeline stages: raw text -> document -> tokens -> token embeddings.
document_assembler = DocumentAssembler().setInputCol("text").setOutputCol("document")

tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")

# Pretrained DeBERTa model requested by name for language code "vie";
# case-sensitive matching is switched on explicitly.
embeddings = (
    DeBertaEmbeddings.pretrained("deberta_embeddings_spm_vie", "vie")
    .setInputCols(["document", "token"])
    .setOutputCol("embeddings")
    .setCaseSensitive(True)
)

pipeline = Pipeline(stages=[document_assembler, tokenizer, embeddings])

# Fit the pipeline on the questions and annotate them with embeddings.
model = pipeline.fit(user_questions)
embedded_data = model.transform(user_questions)

# Helper (wrapped as a UDF below) that averages token embeddings.
def avg_embeddings(embeddings):
    """Return the element-wise mean of the token embeddings as a dense vector.

    Args:
        embeddings: Sequence of annotation rows, each exposing its vector
            under the ``'embeddings'`` key. May be empty or ``None``.

    Returns:
        A dense ML vector holding the per-dimension mean, or a zero-length
        dense vector when ``embeddings`` is empty/``None``.
    """
    if not embeddings:
        return Vectors.dense([])
    token_vectors = [annotation['embeddings'] for annotation in embeddings]
    mean_vector = np.mean(token_vectors, axis=0)
    return Vectors.dense(mean_vector.tolist())

from pyspark.sql.functions import size

# Expose the averaging helper as a UDF returning an ML vector column.
avg_embeddings_udf = udf(avg_embeddings, VectorUDT())

# Build one feature vector per question.
# Rows without any token embedding (e.g. blank or whitespace-only text) are
# dropped first: avg_embeddings would map them to a zero-length vector, whose
# size differs from every other row and makes StandardScaler/KMeans fail.
# The result is cached because it is reused by both fits and by every later
# action; without caching the lazy embedding pipeline is re-evaluated each time.
vectorized_data = (
    embedded_data
    .filter(size(col("embeddings")) > 0)
    .withColumn("features", avg_embeddings_udf(col("embeddings")))
    .cache()
)

# Standardise the feature vectors (StandardScaler defaults).
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")
scaler_model = scaler.fit(vectorized_data)
scaled_data = scaler_model.transform(vectorized_data)

# KMeans clustering into k=3 groups; the assignment lands in column "cluster".
kmeans = KMeans(featuresCol="scaled_features", predictionCol="cluster", k=3)
kmeans_model = kmeans.fit(scaled_data)
clustered_data = kmeans_model.transform(scaled_data)

# Normalise the text: remove punctuation only (Vietnamese characters are
# left untouched), then lowercase and trim surrounding whitespace.
text_without_punctuation = regexp_replace(col("text"), "[\\p{Punct}]", "")
normalized_data = clustered_data.withColumn(
    "normalized_text", trim(lower(text_without_punctuation))
)

# Count how often each normalised question occurs within each cluster,
# most frequent first.
grouped = (
    normalized_data.groupBy("normalized_text", "cluster")
    .agg(count("*").alias("frequency"))
    .orderBy(col("frequency").desc())
)

# Show the 10 most frequent questions.
top_10 = grouped.limit(10)
print("Top 10 c√¢u h·ªèi xu·∫•t hi·ªán nhi·ªÅu nh·∫•t:")
top_10.show(truncate=False)

# Plot the number of questions per cluster.
# Per-cluster totals are obtained by summing the per-question frequencies
# and are brought to the driver as a pandas DataFrame ordered by cluster id.
cluster_counts = (
    grouped.groupBy("cluster")
    .sum("frequency")
    .withColumnRenamed("sum(frequency)", "count")
    .orderBy("cluster")
    .toPandas()
)

fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(cluster_counts["cluster"], cluster_counts["count"], color="teal")
ax.set_xlabel("Cluster ID")
ax.set_ylabel("Number of Questions")
ax.set_title("Semantic Question Clustering Frequency")
# One tick per cluster id.
ax.set_xticks(cluster_counts["cluster"])
fig.tight_layout()
plt.show()

# ================================
# Extract representative keywords for each cluster
# ================================
print("\nT·ª™ KH√ìA ƒê·∫†I DI·ªÜN CHO M·ªñI CLUSTER:\n")

from pyspark.ml.feature import CountVectorizer
from pyspark.sql.functions import array_remove, size

# Tokenise every normalised question.
# Splitting on runs of whitespace and removing "" prevents the empty string
# (produced by consecutive spaces or by empty text) from being counted as a
# keyword.
tokenized_data = normalized_data.withColumn(
    "tokens", array_remove(split(col("normalized_text"), "\\s+"), "")
)

# The estimator only carries configuration, so it is built once and re-fitted
# per cluster. vocabSize=10 caps the vocabulary at the 10 most frequent terms.
cv = CountVectorizer(inputCol="tokens", outputCol="keyword_features", vocabSize=10)

# Extract the keywords of each cluster.
for i in range(kmeans.getK()):
    print(f"=== Cluster {i} ===")
    cluster_df = tokenized_data.filter(col("cluster") == i)

    # CountVectorizer.fit raises when it cannot build any vocabulary, which
    # happens for a cluster with no rows or with only empty token lists, so
    # such clusters are reported with an empty keyword list instead.
    keyword_rows = cluster_df.filter(size(col("tokens")) > 0)
    if keyword_rows.head(1):
        cv_model = cv.fit(keyword_rows)
        top_keywords = cv_model.vocabulary
    else:
        top_keywords = []
    print("Top keywords:", top_keywords)
    print()
# Gather the questions and their feature vectors per cluster on the driver.
# collect_list gives no ordering guarantee, so the text and its vector are
# collected together as one struct and only then split into two parallel
# lists; this guarantees questions[i] and features_list[i] belong to the same
# row. The collected rows keep the fields "cluster", "questions" and
# "features_list".
from pyspark.sql.functions import struct

clusters = (
    clustered_data
    .groupBy("cluster")
    .agg(collect_list(struct("text", "features")).alias("members"))
    .select(
        "cluster",
        col("members.text").alias("questions"),
        col("members.features").alias("features_list"),
    )
    .collect()
)

# Pick the representative (medoid) question of a cluster.
def get_medoid_question(questions, features):
    """Return the question whose vector has the smallest total cosine distance
    to all other vectors of the cluster (the medoid).

    The totals are computed in closed form instead of materialising the full
    n x n distance matrix: with ``u_i`` the L2-normalised vectors,
    ``sum_{j != i} (1 - u_i . u_j) = (n - 1) - u_i . sum_j(u_j) + u_i . u_i``.
    This needs O(n * d) time and memory rather than O(n^2). All-zero vectors
    are kept as zeros, so their distance to every other vector is 1.

    Args:
        questions: Sequence of question texts.
        features: Sequence of vectors exposing ``toArray()``, aligned
            index-by-index with ``questions``.

    Returns:
        The element of ``questions`` at the medoid index; the first one wins
        on ties.

    Raises:
        ValueError: If no feature vectors are given or they do not form a
            2-D array.
    """
    vecs = np.asarray([v.toArray() for v in features], dtype=float)
    if vecs.ndim != 2 or vecs.shape[0] == 0:
        raise ValueError("cannot pick a medoid from an empty cluster")

    norms = np.linalg.norm(vecs, axis=1, keepdims=True)
    # Safe normalisation: rows with zero norm stay all-zero.
    unit = np.divide(vecs, norms, out=np.zeros_like(vecs), where=norms > 0)

    sim_to_all = unit @ unit.sum(axis=0)            # includes self-similarity
    self_sim = np.einsum("ij,ij->i", unit, unit)    # 1 for unit rows, 0 for zero rows
    total_dists = (vecs.shape[0] - 1) - sim_to_all + self_sim

    medoid_idx = int(np.argmin(total_dists))
    return questions[medoid_idx]

# Build the table of representative topics: for every collected cluster row,
# record its id, its medoid question and the number of questions it holds.
topic_table = [
    (
        row["cluster"],
        get_medoid_question(row["questions"], row["features_list"]),
        len(row["questions"]),
    )
    for row in clusters
]

# Convert to pandas so the result can be printed as a table, ordered by
# cluster id.
import pandas as pd
topic_df = pd.DataFrame(
    topic_table, columns=["Cluster", "Topic", "Frequency"]
).sort_values("Cluster")
print("\nüìå Ch·ªß ƒë·ªÅ ƒë·∫°i di·ªán cho t·ª´ng c·ª•m:")
print(topic_df.to_string(index=False))