In [None]:
pip install --upgrade pyspark spark-nlp pandas matplotlib scipy google-generativeai pandas openpyxl

In [None]:
import json
import math
import os
import re
import time
from collections import defaultdict

import google.generativeai as genai  # Gemini API
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import sparknlp
from pyspark.ml import Pipeline
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, udf
from scipy.spatial.distance import cosine
from sparknlp.annotator import Tokenizer, DeBertaEmbeddings, SentenceEmbeddings
from sparknlp.base import DocumentAssembler

# --- Start the overall wall-clock timer ---
total_start = time.time()

# --- 1. Start Spark NLP ---
# The object returned by sparknlp.start() is used as the Spark session below.
t1 = time.time()
spark = sparknlp.start()
print(f"‚úÖ Kh·ªüi t·∫°o Spark NLP: {time.time() - t1:.2f} gi√¢y")

# --- 2. Read the JSONL data ---
# The file is read with multiLine disabled. Each record's "messages" array is
# exploded into one row per message; only messages whose role is "user" are
# kept, and their non-null content is exposed as the "text" column.
t2 = time.time()
input_file_path = "/opt/workspace/gen_1604_formated.jsonl"
df = (
    spark.read
    .option("multiLine", False)
    .json(input_file_path)
)
user_questions = (
    df.select(explode("messages").alias("msg"))
    .where(col("msg.role") == "user")
    .select(col("msg.content").alias("text"))
    .where(col("text").isNotNull())
)
print(f"‚úÖ ƒê·ªçc d·ªØ li·ªáu: {time.time() - t2:.2f} gi√¢y")

# --- 3. NLP pipeline ---
# Stages, in order: "text" -> document -> tokens -> DeBERTa word embeddings
# -> sentence embeddings pooled with the "AVERAGE" strategy.
t3 = time.time()
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)
embeddings = (
    DeBertaEmbeddings.pretrained("deberta_embeddings_spm_vie", "vie")
    .setInputCols(["document", "token"])
    .setOutputCol("word_embeddings")
)
sentence_embeddings = (
    SentenceEmbeddings()
    .setInputCols(["document", "word_embeddings"])
    .setOutputCol("sentence_embeddings")
    .setPoolingStrategy("AVERAGE")
)
pipeline = Pipeline(
    stages=[document_assembler, tokenizer, embeddings, sentence_embeddings]
)
model = pipeline.fit(user_questions)
embedded_data = model.transform(user_questions)
# NOTE(review): Spark transformations are normally lazy, so this timing
# probably does not include the actual embedding computation - confirm.
print(f"‚úÖ NLP Embedding: {time.time() - t3:.2f} gi√¢y")

# --- 4. Vector extraction ---
t4 = time.time()


def extract_vector(annot):
    """Turn a sentence-embedding annotation list into a dense vector.

    Returns the ``embeddings`` entry of the first annotation wrapped in a
    dense vector. When ``annot`` is empty, is not a list, or its first element
    has no ``embeddings`` entry, a 768-dimensional zero vector is returned
    instead so that every row still carries a vector.
    """
    has_embedding = annot and isinstance(annot, list) and 'embeddings' in annot[0]
    if not has_embedding:
        return Vectors.dense([0.0] * 768)
    return Vectors.dense(annot[0]['embeddings'])


# Wrap the extractor as a Spark UDF and store its result in a "features" column.
extract_vector_udf = udf(extract_vector, returnType=VectorUDT())
vectorized_data = embedded_data.withColumn(
    "features",
    extract_vector_udf(col("sentence_embeddings")),
)
print(f"‚úÖ Tr√≠ch xu·∫•t vector: {time.time() - t4:.2f} gi√¢y")

# --- 5. Convert to pandas ---
# Only the question text and its vector are brought over; each vector object
# is then replaced by the array returned from its toArray() method.
t5 = time.time()
pd_data = (
    vectorized_data
    .select("text", "features")
    .toPandas()
)
pd_data["features"] = pd_data["features"].map(lambda vec: vec.toArray())
print(f"‚úÖ Chuy·ªÉn sang Pandas: {time.time() - t5:.2f} gi√¢y")

# --- 6. Semantic grouping ---
# Greedy single-pass grouping: every question that has not been assigned yet
# becomes the seed of a new group and absorbs each later unassigned question
# whose cosine distance to the seed is below `threshold`. Groups are finally
# ordered from largest to smallest.
t6 = time.time()
threshold = 0.15

# Pull both columns out of pandas once: looking up individual Series elements
# inside the nested loop is far slower than indexing plain Python lists, and
# the resulting groups are exactly the same.
texts = pd_data["text"].tolist()
vectors = pd_data["features"].tolist()

semantic_groups = []
visited = set()
for idx, (text_i, vec_i) in enumerate(zip(texts, vectors)):
    if idx in visited:
        continue
    group = [text_i]
    visited.add(idx)
    # Only later rows need checking; earlier rows were either seeds themselves
    # or already absorbed by an earlier seed.
    for jdx in range(idx + 1, len(texts)):
        if jdx in visited:
            continue
        dist = cosine(vec_i, vectors[jdx])
        if dist < threshold:
            group.append(texts[jdx])
            visited.add(jdx)
    semantic_groups.append(group)
semantic_groups = sorted(semantic_groups, key=len, reverse=True)
print(f"‚úÖ Nh√≥m ng·ªØ nghƒ©a: {time.time() - t6:.2f} gi√¢y")

# --- 7. Gemini configuration ---
# The API key is a secret and must not live in source code, so it is read from
# the GEMINI_API_KEY environment variable. Any key that was previously
# committed in this file should be treated as leaked and revoked.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise RuntimeError(
        "GEMINI_API_KEY environment variable is not set; "
        "export a valid Gemini API key before running this script."
    )
genai.configure(api_key=GEMINI_API_KEY)
gemini_model = genai.GenerativeModel("gemini-2.5-pro-preview-03-25")

# --- 8. Split the groups into batches ---
def split_groups_into_batches(groups, batch_size=10):
    """Yield consecutive slices of ``groups`` holding at most ``batch_size`` items.

    The last slice may be shorter; nothing is yielded for an empty ``groups``.
    """
    starts = range(0, len(groups), batch_size)
    yield from (groups[start:start + batch_size] for start in starts)

# --- 9. Call the Gemini API ---
def classify_multiple_groups_with_gemini(groups_batch):
    """Classify every question group of a batch with a single Gemini call.

    The prompt lists the five categories, then each group with its numbered
    questions, and asks for one integer from 1 to 5 per group, in group order.

    Args:
        groups_batch: Sequence of groups; each group is a sequence of
            question texts.

    Returns:
        A list holding exactly one integer label (1-5) per group, in the same
        order as ``groups_batch``; an empty list when ``groups_batch`` is
        empty (no API call is made); or ``None`` when the API call fails or
        the answer does not contain exactly one label per group.
    """
    if not groups_batch:
        return []

    # Collect the prompt pieces and join once instead of repeated `+=`.
    prompt_parts = [
        "B·∫°n h√£y ph√¢n lo·∫°i t·ª´ng nh√≥m c√°c c√¢u h·ªèi d∆∞·ªõi ƒë√¢y v·ªÅ 5 lo·∫°i:\n"
        "1. B√°o h·ªèng thi·∫øt b·ªã, s·ª± c·ªë, tr·∫°ng th√°i b√°o h·ªèng thi·∫øt b·ªã.\n"
        "2. B·∫£o d∆∞·ª°ng thi·∫øt b·ªã, tr·∫°ng th√°i b·∫£o d∆∞·ª°ng, l·ªãch b·∫£o d∆∞·ª°ng.\n"
        "3. ƒêi·ªÅu chuy·ªÉn thi·∫øt b·ªã, thi·∫øt b·ªã ƒë∆∞·ª£c ƒëi·ªÅu chuy·ªÉn ƒëi ƒë√¢u.\n"
        "4. V·∫•n ƒë·ªÅ nh√¢n s·ª±, khu v·ª±c, th√¥ng tin c√° nh√¢n, ch·ª©c v·ª•, khu v·ª±c qu·∫£n l√Ω, ng∆∞·ªùi qu·∫£n l√Ω, t√™n ri√™ng.\n"
        "5. T√†i s·∫£n, thi·∫øt b·ªã, lo·∫°i t√†i s·∫£n, khu v·ª±c ch·ª©a t√†i s·∫£n.\n\n"
        "Danh s√°ch c√°c nh√≥m c√¢u h·ªèi:\n"
    ]
    for group_idx, group_texts in enumerate(groups_batch, 1):
        prompt_parts.append(f"Nh√≥m {group_idx}:\n")
        prompt_parts.extend(
            f"  {idx}. {text}\n" for idx, text in enumerate(group_texts, 1)
        )
        prompt_parts.append("\n")
    prompt_parts.append(
        "H√£y tr·∫£ v·ªÅ danh s√°ch c√°c s·ªë nguy√™n t·ª´ 1 ƒë·∫øn 5, m·ªói s·ªë l√† ph√¢n lo·∫°i cho nh√≥m t∆∞∆°ng ·ª©ng theo th·ª© t·ª± nh√≥m ƒë√£ cho, v√≠ d·ª•: [1, 2, 1, 5, 3,...]"
    )
    prompt = "".join(prompt_parts)

    # Boundary to a third-party service whose exception types are not visible
    # here: any failure is reported and turned into a None result.
    try:
        response = gemini_model.generate_content(prompt)
        answer = response.text.strip()
    except Exception as e:
        print(f"L·ªói g·ªçi Gemini: {e}")
        return None

    # Candidate spans, most specific first: bracketed lists (last one first,
    # since any explanation usually precedes the final answer), then the whole
    # answer as a fallback for replies without brackets.
    expected_count = len(groups_batch)
    candidates = re.findall(r"\[([^\[\]]*)\]", answer)[::-1] + [answer]
    for candidate in candidates:
        # Only stand-alone digits count, so "10" or "25" never become labels.
        labels = [int(tok) for tok in re.findall(r"(?<!\d)[1-5](?!\d)", candidate)]
        if len(labels) == expected_count:
            return labels

    # A wrong number of labels cannot be matched to the groups reliably.
    print(
        f"Gemini answer does not contain exactly {expected_count} labels: {answer!r}"
    )
    return None

# --- 10. Run the classification batch by batch ---
t7 = time.time()
batch_size = 10
# Constant for the whole run, so compute it once instead of on every iteration.
total_batches = math.ceil(len(semantic_groups) / batch_size)
all_group_labels = []
for batch_idx, batch_groups in enumerate(split_groups_into_batches(semantic_groups, batch_size=batch_size)):
    labels = classify_multiple_groups_with_gemini(batch_groups)
    # Accept an answer only when it carries exactly one label per group.
    # all_group_labels is later zipped with semantic_groups position by
    # position, so a shorter or longer answer would shift the labels of every
    # following batch onto the wrong groups.
    if labels and len(labels) == len(batch_groups):
        all_group_labels.extend(labels)
    else:
        all_group_labels.extend([None] * len(batch_groups))
    print(f"ƒê√£ x·ª≠ l√Ω batch {batch_idx + 1} / {total_batches}")
print(f"‚úÖ G·ªçi Gemini & ph√¢n lo·∫°i: {time.time() - t7:.2f} gi√¢y")

# --- 11. Detailed results ---
# Print every group with its label and size, followed by its questions.
for i, (group, label) in enumerate(zip(semantic_groups, all_group_labels)):
    print(f"\nNh√≥m {i+1} (Lo·∫°i {label}, s·ªë l∆∞·ª£ng: {len(group)}):")
    for q in group:
        print(f"- {q}")

# --- 12. Tabulate the number of questions per category ---
# Display name for each of the five category numbers.
category_labels = {
    1: "B√°o h·ªèng thi·∫øt b·ªã",
    2: "B·∫£o d∆∞·ª°ng thi·∫øt b·ªã",
    3: "ƒêi·ªÅu chuy·ªÉn thi·∫øt b·ªã",
    4: "V·∫•n ƒë·ªÅ nh√¢n s·ª±",
    5: "T√†i s·∫£n / thi·∫øt b·ªã"
}

# Each group contributes its number of questions to its category; groups
# without a label are left out of the totals.
category_stats = defaultdict(int)
for group, label in zip(semantic_groups, all_group_labels):
    if label is None:
        continue
    category_stats[label] += len(group)

# One row per category 1..5; categories that never occurred show a count of 0.
category_ids = list(range(1, 6))
stats_df = pd.DataFrame({
    "Lo·∫°i": category_ids,
    "T√™n ph√¢n lo·∫°i": [category_labels[i] for i in category_ids],
    "S·ªë l∆∞·ª£ng c√¢u h·ªèi": [category_stats[i] for i in category_ids],
})
print("\n--- Th·ªëng k√™ s·ªë l∆∞·ª£ng c√¢u h·ªèi theo ph√¢n lo·∫°i ---")
print(stats_df.to_string(index=False))

# --- 13. Draw the bar chart ---
sns.set(style="whitegrid")
plt.figure(figsize=(10, 6))
barplot = sns.barplot(
    data=stats_df,
    x="T√™n ph√¢n lo·∫°i",
    y="S·ªë l∆∞·ª£ng c√¢u h·ªèi",
    palette="Set2",
)
# Label each bar with its height (no decimals), centred horizontally on the
# bar and shifted 10 points above its top.
for p in barplot.patches:
    bar_height = p.get_height()
    barplot.annotate(
        f"{bar_height:.0f}",
        (p.get_x() + p.get_width() / 2., bar_height),
        xytext=(0, 10),
        textcoords='offset points',
        ha='center', va='center',
        fontsize=11, color='black',
    )
plt.title("Bi·ªÉu ƒë·ªì ph√¢n lo·∫°i c√°c nh√≥m c√¢u h·ªèi", fontsize=16)
plt.xlabel("Ph√¢n lo·∫°i", fontsize=12)
plt.ylabel("S·ªë l∆∞·ª£ng c√¢u h·ªèi", fontsize=12)
plt.xticks(rotation=15)
plt.tight_layout()
plt.show()

# --- Overall timing ---
total_end = time.time()
print(f"\n‚è±Ô∏è T·ªïng th·ªùi gian th·ª±c thi to√†n b·ªô script: {total_end - total_start:.2f} gi√¢y")
