解析字幕文件生成json

In [48]:
import re
import json
import os
from collections import defaultdict

# Accumulates the instruction/input/output records of every file parsed so
# far. parse_ass() appends to this list and returns it.
bilingual_subtitles_without_time = []

# Matches ASS override blocks such as {\an8}; compiled once instead of on
# every dialogue line.
_ASS_OVERRIDE_TAG = re.compile(r'\{.*?\}')


def _extract_dialogues(lines):
    """Return the Dialogue entries that follow the [Events] header.

    Args:
        lines: Raw lines of an ASS file.

    Returns:
        A list of dicts with the keys 'start', 'style' and 'text', in file
        order. Override tags are removed from the text and the ASS hard
        line break (backslash + N) is turned into a real newline.
    """
    dialogues = []
    in_events = False

    for raw_line in lines:
        line = raw_line.strip()
        if line.startswith('[Events]'):
            in_events = True
            continue
        if not (in_events and line.startswith('Dialogue:')):
            continue

        # Split on the first 9 commas only: the 10th field is the subtitle
        # text, which may itself contain commas.
        parts = line.split(',', 9)
        if len(parts) < 10:
            continue

        text = _ASS_OVERRIDE_TAG.sub('', parts[9].strip())
        text = text.replace('\\N', '\n')

        dialogues.append({
            'start': parts[1].strip(),
            'style': parts[3].strip(),
            'text': text
        })

    return dialogues


def parse_ass(ass_file_path):
    """Parse one bilingual ASS file into translation records.

    Every 'Dial_JP' line becomes one record whose instruction is the
    Japanese text followed by '翻译成中文'. A 'Dial_CH' line fills the
    'output' of the first Japanese record with the same start time; if
    several Chinese lines share that start time, the last one wins. Records
    without a Chinese counterpart keep an empty 'output'.

    Pairing does not depend on whether the Chinese or the Japanese line
    comes first in the file.

    Args:
        ass_file_path: Path of the .ass file to read (UTF-8, with or
            without a BOM).

    Returns:
        The module-level list ``bilingual_subtitles_without_time`` after the
        records of this file have been appended to it, i.e. the records of
        all files parsed so far, not only this one.
    """
    # 'utf-8-sig' reads plain UTF-8 too, but also drops a leading BOM that
    # would otherwise stay glued to the first line.
    with open(ass_file_path, 'r', encoding='utf-8-sig') as f:
        lines = f.readlines()

    print(f"找到 {len(lines)} 行")
    dialogues = _extract_dialogues(lines)
    print(f"找到 {len(dialogues)} 条对话")

    # First pass: one record per Japanese line, plus an index from start
    # time to the first record with that start time (O(1) lookup instead of
    # rescanning the whole list for each Chinese line).
    records = []
    first_record_by_start = {}
    for dialogue in dialogues:
        if dialogue['style'] != 'Dial_JP':
            continue
        record = {
            'instruction': dialogue['text'] + '翻译成中文',
            'input': "",
            'output': ""
        }
        records.append(record)
        first_record_by_start.setdefault(dialogue['start'], record)

    # Second pass: attach the Chinese text. Running this after all Japanese
    # lines are known keeps the pairing independent of line order.
    for dialogue in dialogues:
        if dialogue['style'] != 'Dial_CH':
            continue
        record = first_record_by_start.get(dialogue['start'])
        if record is not None:
            record['output'] = dialogue['text']

    bilingual_subtitles_without_time.extend(records)

    print(len(bilingual_subtitles_without_time))
    return bilingual_subtitles_without_time


# Save the collected records as JSON
def to_json(output_json_path, data=None):
    """Write translation records to a UTF-8 JSON file.

    Args:
        output_json_path: Destination file path. Missing parent directories
            are created.
        data: JSON-serializable records to write. When omitted, the
            module-level list ``bilingual_subtitles_without_time`` is
            written.

    The number of records is printed before writing. Non-ASCII characters
    are written as-is (not as \\uXXXX escapes) and the file is indented by
    two spaces.
    """
    if data is None:
        data = bilingual_subtitles_without_time

    print(len(data))

    # open() does not create directories, so a fresh output folder would
    # otherwise raise FileNotFoundError.
    out_dir = os.path.dirname(output_json_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(output_json_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

# Walk a whole folder tree and collect every .ass file
def find_ass_files(folder_path):
    """Return the paths of all .ass files under ``folder_path``.

    The folder and all of its sub-folders are searched; the extension is
    matched case-insensitively. Each returned path is the walked directory
    joined with the file name. The result is sorted, because the order in
    which os.walk yields entries depends on the file system and would make
    the output order differ between machines.

    Args:
        folder_path: Root folder to search.

    Returns:
        A sorted list of file paths; empty when nothing matches.
    """
    ass_files = [
        os.path.join(root, file_name)
        for root, _dirs, files in os.walk(folder_path)
        for file_name in files
        if file_name.lower().endswith('.ass')
    ]
    ass_files.sort()
    return ass_files
    

# Folder that is searched for .ass subtitle files, and the JSON file the
# combined dataset is written to.
ass_file_path = 'D:/myproject/aristr_whisper/dataset/ass/'
json_output = 'D:/myproject/aristr_whisper/dataset/json/dataset.json'

# Parse every subtitle file that was found. The return value of parse_ass is
# not used here; the records are collected in the module-level list that
# to_json writes out below.
for ass_file in find_ass_files(ass_file_path):
    print(f"ass文件路径: {ass_file}")
    parse_ass(ass_file)

# To process a single file instead:
# parse_ass('D:/myproject/aristr_whisper/dataset/ass/01.ass')
to_json(json_output)



ass文件路径: D:/myproject/aristr_whisper/dataset/ass/01.ass
找到 897 行
找到 849 条对话
383
ass文件路径: D:/myproject/aristr_whisper/dataset/ass/02.ass
找到 925 行
找到 877 条对话
782
ass文件路径: D:/myproject/aristr_whisper/dataset/ass/03.ass
找到 824 行
找到 776 条对话
1133
ass文件路径: D:/myproject/aristr_whisper/dataset/ass/04.ass
找到 901 行
找到 853 条对话
1516
ass文件路径: D:/myproject/aristr_whisper/dataset/ass/05.ass
找到 893 行
找到 845 条对话
1901
ass文件路径: D:/myproject/aristr_whisper/dataset/ass/06.ass
找到 921 行
找到 873 条对话
2295
ass文件路径: D:/myproject/aristr_whisper/dataset/ass/07.ass
找到 894 行
找到 846 条对话
2670
ass文件路径: D:/myproject/aristr_whisper/dataset/ass/08.ass
找到 814 行
找到 766 条对话
2996
ass文件路径: D:/myproject/aristr_whisper/dataset/ass/09.ass
找到 842 行
找到 794 条对话
3350
ass文件路径: D:/myproject/aristr_whisper/dataset/ass/10.ass
找到 672 行
找到 623 条对话
3624
ass文件路径: D:/myproject/aristr_whisper/dataset/ass/11.ass
找到 684 行
找到 636 条对话
3898
ass文件路径: D:/myproject/aristr_whisper/dataset/ass/12.ass
找到 772 行
找到 721 条对话
4183
ass文件路径: D:/myproject/aristr_w