In [1]:
import os
import json
import re
from openai import OpenAI
from pathlib import Path

In [2]:
# Prompt sent as the user message to the chat model. `{text}` is the only
# str.format placeholder; the doubled braces around the example output are
# escaped so that they come out as literal `{` / `}` after formatting.
PROMPT_TEMPLATE = """
You are a professional IELTS speaking question bank processing assistant. Extract structured information from the following text based on the given requirements:  

### Text Content:
{text}  

### Processing Requirements:
1. Extract the theme:  
   - Identify the first meaningful heading marked with `#` as the theme.  
   - If the theme is in Chinese, translate it into English.  
   - If no valid theme is found, return `"theme": null`.  

2. Extract all complete questions:  
   - Questions must end with a `?`.  
   - Remove any numbering or additional characters before the question.  
   - If no questions are found, return `"questions": []`.  

3. Ignore irrelevant content, including:  
   - Unrelated text such as "雅思哥", "ELTS BR", or any OCR artifacts.  
   - Watermarks, special characters, and image links.  

4. Return the result in strict JSON format, with two fields: `theme` and `questions`, without any formatting such as Markdown or code blocks.   

### Example Output:
{{
    "theme": "App",
    "questions": [
        "What kind of apps have you downloaded on your phone?",
        "What apps do you still use?"
    ]
}}
"""

In [3]:
def process_text_with_llm(text, client):
    """Ask the chat model to extract a theme and its questions from ``text``.

    The text is substituted into ``PROMPT_TEMPLATE`` and sent as a single,
    non-streaming chat completion. The reply is decoded as JSON; when the
    whole reply is not valid JSON, the span from the first ``{`` to the last
    ``}`` is decoded instead.

    Args:
        text: Raw text to analyse.
        client: Object exposing ``chat.completions.create(...)``; the notebook
            passes an ``OpenAI`` client pointed at the DeepSeek endpoint.

    Returns:
        dict: The decoded JSON object. ``theme`` is always present (it may be
        ``None``) and ``questions`` is always a list.

    Raises:
        json.JSONDecodeError: The reply contains no decodable JSON.
        ValueError: The reply decodes to something other than a JSON object,
            or its ``questions`` field is not a list.
    """
    prompt = PROMPT_TEMPLATE.format(text=text)
    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {"role": "system", "content": "You are a professional IELTS speaking question bank processing assistant"},
            {"role": "user", "content": prompt}
        ],
        temperature=0.3,
        stream=False
    )

    # The content field can be None; normalise it so that an empty reply goes
    # down the same JSONDecodeError path as any other malformed reply.
    raw_content = response.choices[0].message.content or ""

    try:
        result = json.loads(raw_content)
    except json.JSONDecodeError:
        print("API返回格式异常，尝试提取JSON部分...")
        # Greedy match: first "{" to last "}", e.g. to drop a Markdown fence.
        json_str = re.search(r'\{.*\}', raw_content, re.DOTALL)
        if not json_str:
            raise
        result = json.loads(json_str.group())

    # Validate once, for both decoding paths, so callers always get the same
    # shape back instead of a KeyError/TypeError further down the pipeline.
    if not isinstance(result, dict):
        raise ValueError(f"API返回的JSON不是对象：{type(result).__name__}")

    result.setdefault('theme', None)
    questions = result.get('questions')
    if questions is None:
        result['questions'] = []
    elif not isinstance(questions, list):
        raise ValueError(f"API返回的questions字段不是列表：{type(questions).__name__}")

    print(f"成功提取主题：{result['theme']}")
    print(f"提取到{len(result['questions'])}个问题")
    return result

In [4]:
def save_theme_data(theme_data, output_dir):
    """Write one theme record to ``<output_dir>/themes/<theme>.json``.

    Args:
        theme_data: Mapping with at least a non-empty string ``theme``; the
            whole mapping is serialised as the file content.
        output_dir: Base output directory. It and the ``themes`` sub-directory
            are created when missing.

    Returns:
        bool: ``True`` when a new file was written, ``False`` when a file for
        this theme already exists and was left untouched.

    Raises:
        ValueError: ``theme`` is missing, not a string, or blank.
    """
    theme = theme_data.get('theme')
    # A null theme is a legal model reply, so reject it with a clear message
    # instead of letting re.sub fail on a non-string.
    if not isinstance(theme, str) or not theme.strip():
        raise ValueError(f"主题无效，无法保存：{theme!r}")

    # parents=True so the function also works when output_dir does not exist.
    theme_dir = Path(output_dir) / "themes"
    theme_dir.mkdir(parents=True, exist_ok=True)

    # Replace every character that is not a word character, CJK ideograph or
    # hyphen to obtain a filesystem-safe name.
    safe_theme_name = re.sub(r'[^\w\u4e00-\u9fff-]', '_', theme)
    output_path = theme_dir / f"{safe_theme_name}.json"

    # Existing themes are never overwritten.
    if output_path.exists():
        print(f"主题 '{theme}' 已存在，跳过保存")
        return False

    # Serialise before opening the file: a serialisation error must not leave
    # a truncated file behind that later runs would skip as "already exists".
    payload = json.dumps(theme_data, ensure_ascii=False, indent=2)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(payload)

    print(f"成功保存主题 '{theme}' 到 {output_path}")
    return True

In [5]:
def process_md_file(file_path, client, output_dir):
    """Run one markdown file through the model and persist the result.

    Args:
        file_path: Path of the UTF-8 markdown file to read.
        client: Chat client forwarded to ``process_text_with_llm``.
        output_dir: Base output directory forwarded to ``save_theme_data``.

    Returns:
        Whatever ``save_theme_data`` returns for the extracted record.
    """
    source = Path(file_path)
    markdown_text = source.read_text(encoding='utf-8')

    # Extract the theme/questions and remember which file they came from.
    extracted = process_text_with_llm(markdown_text, client)
    extracted['source_file'] = source.name

    return save_theme_data(extracted, output_dir)

In [6]:
def process_folder(folder_path, output_dir = None):
    """Process every ``.md`` file directly inside ``folder_path``.

    Each file is handed to ``process_md_file``; a failure in one file is
    reported and counted but does not stop the run. A summary is printed at
    the end.

    Args:
        folder_path: Directory whose ``.md`` files are processed
            (non-recursive).
        output_dir: Base output directory. Defaults to
            ``<folder_path>/output`` when omitted. Created when missing.

    Raises:
        ValueError: The ``DEEPSEEK_API_KEY`` environment variable is not set.
    """
    # The API key is read from the environment, never hard-coded.
    api_key = os.getenv('DEEPSEEK_API_KEY')
    if not api_key:
        raise ValueError("请在.env文件中设置DEEPSEEK_API_KEY环境变量")

    client = OpenAI(api_key=api_key, base_url="https://api.deepseek.com")

    # Path(None) raises TypeError, so the declared default needs a real value.
    if output_dir is None:
        output_dir = Path(folder_path) / "output"
    else:
        output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    processed_count = 0
    skipped_count = 0
    failed_count = 0

    # os.listdir order is arbitrary; sort so that runs are reproducible.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.md'):
            continue
        file_path = os.path.join(folder_path, filename)
        try:
            if process_md_file(file_path, client, output_dir):
                processed_count += 1
            else:
                skipped_count += 1
        except Exception as e:
            # Best effort: report the file, count it, and carry on.
            failed_count += 1
            print(f'处理文件 {filename} 时出错: {str(e)}')

    print(f"\n处理完成：")
    print(f"- 成功处理 {processed_count} 个文件")
    print(f"- 跳过 {skipped_count} 个已存在的主题")
    print(f"- 失败 {failed_count} 个文件")
    print(f"- 结果保存在 {output_dir / 'themes'} 目录中")

In [11]:
# Batch-process the markdown files under ocr_res/Part1 and write the results
# beneath Process/Part1. Both paths are absolute and machine-specific.
process_folder(
    '/Users/yokumi/Documents/IELTS_Learner/Data/Spider/ocr_res/Part1',
    output_dir="/Users/yokumi/Documents/IELTS_Learner/Data/Process/Part1",
)

成功提取主题：Old buildings
提取到4个问题
主题 'Old buildings' 已存在，跳过保存
成功提取主题：Lost and found
提取到4个问题
主题 'Lost and found' 已存在，跳过保存
成功提取主题：Art
提取到4个问题
成功保存主题 'Art' 到 /Users/yokumi/Documents/IELTS_Learner/Data/Process/Part1/themes/Art.json
成功提取主题：Work
提取到4个问题
成功保存主题 'Work' 到 /Users/yokumi/Documents/IELTS_Learner/Data/Process/Part1/themes/Work.json
成功提取主题：Computers
提取到4个问题
成功保存主题 'Computers' 到 /Users/yokumi/Documents/IELTS_Learner/Data/Process/Part1/themes/Computers.json
成功提取主题：Collecting things
提取到4个问题
成功保存主题 'Collecting things' 到 /Users/yokumi/Documents/IELTS_Learner/Data/Process/Part1/themes/Collecting_things.json
成功提取主题：Accommodation
提取到3个问题
成功保存主题 'Accommodation' 到 /Users/yokumi/Documents/IELTS_Learner/Data/Process/Part1/themes/Accommodation.json
成功提取主题：Cars
提取到4个问题
成功保存主题 'Cars' 到 /Users/yokumi/Documents/IELTS_Learner/Data/Process/Part1/themes/Cars.json
成功提取主题：Street markets
提取到4个问题
成功保存主题 'Street markets' 到 /Users/yokumi/Documents/IELTS_Learner/Data/Process/Part1/themes/Street_markets.json
成功提取主题