In [12]:
import re
import pandas as pd
import torch
import numpy as np
from glob import glob
import os
import json

In [7]:
class TransformerToJson:
    """Convert the ``.srt`` subtitle files of a folder into JSON files.

    Input files must be named ``<series_id>_<season>_<episode>.srt``. For each
    of them a file with the same stem and a ``.json`` extension is written to
    the ``processed_json`` sub-folder of the input folder.
    """

    def __init__(self, folder_path):
        """Store the input folder and make sure the output folder exists.

        Args:
            folder_path: Directory that holds the ``.srt`` files.
        """
        self.folder_path = folder_path
        self.out_dir = os.path.join(folder_path, "processed_json")
        os.makedirs(self.out_dir, exist_ok=True)

    @staticmethod
    def parse_filename(filename):
        """Extract the three numeric ids encoded in a subtitle file name.

        Args:
            filename: Path or bare name; only the base name is inspected.

        Returns:
            Tuple ``(series_id, season, episode)`` of ints.

        Raises:
            ValueError: If the base name is not exactly
                ``<digits>_<digits>_<digits>.srt``.
        """
        basename = os.path.basename(filename)
        # fullmatch instead of match: match() only anchors the start and would
        # also accept names such as "1_2_3.srt.bak".
        m = re.fullmatch(r"(\d+)_(\d+)_(\d+)\.srt", basename)
        if not m:
            raise ValueError(f"Неправильное имя файла: {filename}")
        series_id, season, episode = (int(part) for part in m.groups())
        return series_id, season, episode

    @staticmethod
    def parse_srt(filepath):
        """Read an ``.srt`` file and split it into subtitle entries.

        The file is decoded as ``utf-8-sig`` and split on blank lines. Blocks
        with fewer than three lines are ignored.

        Args:
            filepath: Path of the ``.srt`` file.

        Returns:
            List of dicts with the keys ``index`` (int, ``-1`` when the first
            line is not a number), ``start`` and ``end`` (timestamp strings, or
            ``None`` when the second line is not a timing line) and ``text``
            (the remaining lines joined with single spaces).
        """
        with open(filepath, encoding='utf-8-sig') as f:
            content = f.read()

        subtitles = []
        for block in re.split(r'\n\s*\n', content):
            lines = block.strip().split('\n')
            if len(lines) < 3:
                continue

            # The value is always a str here, so ValueError is the only
            # failure int() can produce.
            try:
                index = int(lines[0].strip())
            except ValueError:
                index = -1

            m = re.match(
                r'(\d+:\d+:\d+,\d+)\s*-->\s*(\d+:\d+:\d+,\d+)',
                lines[1].strip(),
            )
            start, end = m.groups() if m else (None, None)

            subtitles.append({
                "index": index,
                "start": start,
                "end": end,
                "text": " ".join(lines[2:]).strip()
            })
        return subtitles

    @staticmethod
    def clean_subtitle_text(text):
        """Remove parenthesised fragments from a subtitle line.

        Args:
            text: Raw subtitle text.

        Returns:
            The text without ``(...)`` fragments, with every run of whitespace
            collapsed to one space and without leading/trailing whitespace.
            May be an empty string.
        """
        without_parens = re.sub(r'\([^)]+\)', '', text)
        # Dropping "(...)" from the middle of a sentence leaves two adjacent
        # spaces behind; collapse them so the cleaned text stays well-formed.
        return re.sub(r'\s+', ' ', without_parens).strip()

    def process_srt_file(self, filepath):
        """Convert one ``.srt`` file and write the result as JSON.

        Entries whose cleaned text is empty are not written. The output path
        is printed once the file is saved.

        Args:
            filepath: Path of the ``.srt`` file.

        Raises:
            ValueError: If the file name does not follow the expected pattern.
        """
        series_id, season, episode = self.parse_filename(filepath)
        basename = os.path.basename(filepath)
        data = {
            "series_id": series_id,
            "season": season,
            "episode": episode,
            "filename": basename,
            "subtitles": []
        }
        for sub in self.parse_srt(filepath):
            clean_text = self.clean_subtitle_text(sub['text'])
            if clean_text:  # keep only non-empty lines
                data["subtitles"].append({
                    "index": sub["index"],
                    "start": sub["start"],
                    "end": sub["end"],
                    "text": sub["text"],
                    "clean_text": clean_text
                })

        # splitext swaps only the extension; str.replace('.srt', ...) would
        # rewrite every occurrence of ".srt" in the name.
        stem, _ = os.path.splitext(basename)
        out_path = os.path.join(self.out_dir, stem + '.json')
        with open(out_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        print(f"Saved: {out_path}")

    def process_folder(self):
        """Convert every ``*.srt`` file directly inside ``folder_path``.

        Files are handled in lexicographic order of their paths.
        """
        files = sorted(glob(os.path.join(self.folder_path, "*.srt")))
        for file in files:
            self.process_srt_file(file)


In [8]:
# Convert every .srt file found directly in the relative "subtitles" folder.
# The JSON results go to "subtitles/processed_json", and one "Saved: ..." line
# is printed per converted file.
t2j = TransformerToJson("subtitles")
t2j.process_folder()

Saved: subtitles/processed_json/14194_1_1.json
Saved: subtitles/processed_json/14194_1_10.json
Saved: subtitles/processed_json/14194_1_11.json
Saved: subtitles/processed_json/14194_1_2.json
Saved: subtitles/processed_json/14194_1_3.json
Saved: subtitles/processed_json/14194_1_4.json
Saved: subtitles/processed_json/14194_1_5.json
Saved: subtitles/processed_json/14194_1_6.json
Saved: subtitles/processed_json/14194_1_7.json
Saved: subtitles/processed_json/14194_1_8.json
Saved: subtitles/processed_json/14194_1_9.json
Saved: subtitles/processed_json/15716_1_1.json
Saved: subtitles/processed_json/15716_1_10.json
Saved: subtitles/processed_json/15716_1_11.json
Saved: subtitles/processed_json/15716_1_2.json
Saved: subtitles/processed_json/15716_1_3.json
Saved: subtitles/processed_json/15716_1_4.json
Saved: subtitles/processed_json/15716_1_5.json
Saved: subtitles/processed_json/15716_1_6.json
Saved: subtitles/processed_json/15716_1_7.json
Saved: subtitles/processed_json/15716_1_8.json
Saved: su

## Разметка предложений

**Данный код используется исключительно для разметки обучающей выборки**

In [4]:

def mark_key_phrases(json_path, key_indexes):
    """Flag the key subtitles of a processed JSON file, rewriting it in place.

    Every entry of ``data['subtitles']`` receives an ``is_key`` field: ``1``
    when its ``index`` is among ``key_indexes`` and ``0`` otherwise. An index
    that cannot be converted to ``int`` is treated as ``-1``.

    Args:
        json_path: Path of the JSON file to read and overwrite.
        key_indexes: A single int or an iterable of subtitle indexes.
    """
    wanted = {key_indexes} if isinstance(key_indexes, int) else set(key_indexes)

    with open(json_path, encoding='utf-8') as src:
        payload = json.load(src)

    def _as_index(raw):
        # Anything int() rejects (missing value, junk text, ...) becomes -1.
        try:
            return int(raw)
        except Exception:
            return -1

    for entry in payload['subtitles']:
        entry['is_key'] = int(_as_index(entry.get('index')) in wanted)

    with open(json_path, 'w', encoding='utf-8') as dst:
        json.dump(payload, dst, ensure_ascii=False, indent=2)

    print(f"Файл размечен: {json_path}")


In [50]:
#mark_key_phrases("subtitles/processed_json/16784_1_2.json",[7, 10, 11, 14, 15, 22, 23, 61, 62, 66, 67, 81, 84, 86, 91, 92, 110, 113, 118, 121, 129, 130, 139, 142, 144, 145, 148, 171, 173, 176, 177, 191, 192, 193, 194, 196, 197, 209, 210, 213, 220])


Файл размечен: subtitles/processed_json/16784_1_2.json
