In [1]:
import json
import os
from pathlib import Path
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 定义相对路径
themes_dir = Path("../../Data/Process/Part1/themes")
# 检查路径是否存在
if not themes_dir.exists():
    raise FileNotFoundError(f"无法找到主题文件夹: {themes_dir}")

# 读取所有JSON文件并构建题库
ielts_questions = []
for json_file in themes_dir.glob("*.json"):
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
        ielts_questions.append({
            "theme": data["theme"],
            "questions": data["questions"]
        })

In [3]:
# 获取主题列表
themes = [item["theme"] for item in ielts_questions]

# 加载预训练 BERT 模型用于语义嵌入
model = SentenceTransformer("all-MiniLM-L6-v2")

# 计算主题的向量表示
theme_embeddings = model.encode(themes)

In [4]:
# 进行聚类
num_clusters = 5  # 设定核心主题数目
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
clusters = kmeans.fit_predict(theme_embeddings)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [5]:
# 归类核心主题
core_themes = {}
for idx, label in enumerate(clusters):
    if label not in core_themes:
        core_themes[label] = []
    core_themes[label].append(themes[idx])

In [6]:
# 输出核心主题归类结果
for cluster, grouped_themes in core_themes.items():
    print(f"\n核心主题 {cluster + 1}: {', '.join(grouped_themes)}")


核心主题 1: Old buildings, Borrowing and lending, Accommodation, Sitting down, Area you live in, Hometown, Street markets

核心主题 3: Friends, Lost and found, Feeling bored, Stories, Work, Swimming, Sports, Art

核心主题 2: Dreams, Mirrors

核心主题 5: Study, Advertisements, Websites, Reading, Cars, Hats/Caps, Emails, Collecting things, Computers, Talents, Colors

核心主题 4: Evening time, Watches, Time management


In [7]:
from openai import OpenAI

In [10]:
# 构造 Prompt
PROMPT_TEMPLATE = """
You are an experienced IELTS speaking examiner. Below is a collection of common Part 1 IELTS speaking topics:  

{text}  

### Your Task: 
1. Summarize 5-7 core themes, each covering multiple original topics.  
2. Explain the reasoning behind your categorization to ensure alignment with IELTS exam logic.  
3. Ensure the theme names are concise and clear.  

### Output Format: 
Core Theme 1: [Theme Name]  
- [Subtopic]  
- [Subtopic]  
- [Subtopic]  
(Reasoning: …)  

Core Theme 2: [Theme Name]  
- [Subtopic]  
- [Subtopic]  
- [Subtopic]  
(Reasoning: …) 

...  
"""

In [24]:
def extract_theme_with_llm(text, client):
    prompt = PROMPT_TEMPLATE.format(text=text)
    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {"role": "system", "content": "You are an experienced IELTS speaking examiner"},
            {"role": "user", "content": prompt}
        ],
        temperature=1.0,
        stream=False
    )
    
    result = response.choices[0].message.content
    return result;

In [25]:
# 从环境变量获取API密钥
api_key = os.getenv('DEEPSEEK_API_KEY')
if not api_key:
    raise ValueError("请在.env文件中设置DEEPSEEK_API_KEY环境变量")

client = OpenAI(api_key=api_key, base_url="https://api.deepseek.com")
result = extract_theme_with_llm(ielts_questions, client)
print(result)

### Core Theme 1: Daily Life and Personal Preferences  
- Old buildings  
- Accommodation  
- Evening time  
- Sitting down  
- Colors  
(Reasoning: These topics revolve around personal habits, living environments, and preferences, which are common in IELTS Part 1 to assess the candidate's ability to describe their daily life and express opinions about familiar settings.)  

---

### Core Theme 2: Social Interactions and Relationships  
- Friends  
- Borrowing and lending  
- Stories  
- Hometown  
(Reasoning: These topics focus on interpersonal relationships, social activities, and community connections, which are essential for evaluating the candidate's ability to discuss social dynamics and personal experiences.)  

---

### Core Theme 3: Hobbies and Leisure Activities  
- Reading  
- Sports  
- Swimming  
- Art  
- Collecting things  
(Reasoning: These topics explore the candidate's interests, hobbies, and leisure activities, allowing them to demonstrate vocabulary related to recre

In [17]:
import re

In [18]:
def parse_theme_text_to_json(text):
    # 定义正则表达式模式
    theme_pattern = r"### Core Theme (\d+): \*\*(.*?)\*\*\s*(.*?)\s*\(Reasoning: (.*?)\)"
    summary_pattern = r"### Summary of Reasoning:\s*(.*)"
    
    # 查找所有主题
    themes = re.findall(theme_pattern, text, re.DOTALL)
    
    # 查找总结
    summary_match = re.search(summary_pattern, text, re.DOTALL)
    summary = summary_match.group(1).strip() if summary_match else ""
    
    # 构建JSON结构
    result = {
        "core_themes": [],
        "summary": summary
    }
    
    for theme in themes:
        theme_number, theme_name, subtopics_text, reasoning = theme
        subtopics = [s.strip() for s in subtopics_text.split("-") if s.strip()]
        
        theme_data = {
            "theme_number": int(theme_number),
            "theme_name": theme_name.strip(),
            "subtopics": subtopics,
            "reasoning": reasoning.strip()
        }
        result["core_themes"].append(theme_data)
    
    return result

In [26]:
chunk_res = parse_theme_text_to_json(result)
print(json.dumps(chunk_res, indent=2, ensure_ascii=False))

{
  "core_themes": [
    {
      "theme_number": 1,
      "theme_name": "Daily Life and Personal Preferences",
      "subtopics": [
        "Old buildings",
        "Accommodation",
        "Evening time",
        "Sitting down",
        "Colors"
      ],
      "reasoning": "These topics revolve around personal habits, living environments, and preferences, which are common in IELTS Part 1 to assess the candidate's ability to describe their daily life and express opinions about familiar settings."
    },
    {
      "theme_number": 2,
      "theme_name": "Social Interactions and Relationships",
      "subtopics": [
        "Friends",
        "Borrowing and lending",
        "Stories",
        "Hometown"
      ],
      "reasoning": "These topics focus on interpersonal relationships, social activities, and community connections, which are essential for evaluating the candidate's ability to discuss social dynamics and personal experiences."
    },
    {
      "theme_number": 3,
      "them

In [20]:
def save_json_to_file(data, file_path):
    # 确保目录存在
    file_path = Path(file_path)
    file_path.parent.mkdir(parents=True, exist_ok=True)
    
    # 保存JSON文件
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    print(f"文件已成功保存到: {file_path.absolute()}")

In [27]:
# 保存到文件
output_path = "Part1/summary/summary.json"
save_json_to_file(chunk_res, output_path)

文件已成功保存到: /Users/yokumi/Documents/IELTS_Learner/Process/Extraction/Part1/summary/summary.json
