In [1]:
import re
import json
from pathlib import Path
from datetime import timedelta

from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from keybert import KeyBERT

# === Config ===
# Raw transcript that Step 1 reads line by line.
INPUT_FILE = "transcript.txt"
# Copy of the transcript with blank lines dropped and surrounding whitespace stripped.
FORMATTED_FILE = "formatted_transcript.txt"
# Destination of the chapter list (start_time / title / summary) written in Step 3.
FINAL_JSON = "final/4nMwZhF_D-g_chapters.json"
# Cap on the summed (end - start) durations of the segments in one chunk.
# Gaps between segments are not counted, so chunk start times need not be
# CHUNK_SIZE apart on the clock.
CHUNK_SIZE = 300  # seconds

# === Load mBART (for summarization + translation) ===
# One tokenizer/model pair is shared by both generation passes in
# summarize_and_translate().
# NOTE(review): the checkpoint name suggests a many-to-many translation model
# rather than a dedicated summarizer -- confirm it is the intended choice.
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

# === Load KeyBERT (for title generation) ===
# Constructed with no arguments; get_title() uses it to pick one key phrase.
kw_model = KeyBERT()

# === Utility Functions ===
def parse_timestamp(ts):
    """Convert an ``H:M:S`` string into a total number of seconds.

    Args:
        ts: Text made of exactly three colon-separated integer fields
            (hours, minutes, seconds).

    Returns:
        The equivalent duration in seconds as an ``int``.

    Raises:
        ValueError: If a field is not an integer or there are not exactly
            three fields.
    """
    hours, minutes, secs = (int(field) for field in ts.split(':'))
    return (hours * 60 + minutes) * 60 + secs

def seconds_to_timestamp(seconds):
    """Render a duration in seconds as an ``H:MM:SS`` string.

    The hour field is not wrapped at 24, so the result always consists of
    exactly three colon-separated numeric fields (for example ``"25:01:01"``
    for 90061 seconds). Minutes and seconds are zero-padded to two digits;
    hours are not padded.

    Args:
        seconds: Duration in seconds. Any fractional part is discarded.

    Returns:
        The formatted timestamp; negative durations are prefixed with ``-``.
    """
    total = int(seconds)
    sign = "-" if total < 0 else ""
    hours, remainder = divmod(abs(total), 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{sign}{hours}:{minutes:02d}:{secs:02d}"

def summarize_and_translate(text):
    """Run two mBART generation passes over ``text`` and return the final string.

    Pass 1 tokenizes ``text`` with the source language set to ``hi_IN``
    (truncated to 1024 tokens) and generates up to 128 tokens with 4 beams.
    Pass 2 tokenizes the decoded result, again as ``hi_IN``, and generates with
    ``en_XX`` forced as the first output token.

    NOTE(review): pass 1 does not force a target language, so the language of
    the intermediate text is left to the model -- confirm that it really is
    Hindi as pass 2 assumes.

    Args:
        text: Transcript text to condense.

    Returns:
        The first decoded sequence of the second pass, special tokens removed.
    """
    # Pass 1: shorten the source text.
    tokenizer.src_lang = "hi_IN"
    source_batch = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
    condensed_ids = model.generate(
        source_batch["input_ids"],
        max_length=128,
        num_beams=4,
        early_stopping=True,
    )
    condensed_text = tokenizer.batch_decode(condensed_ids, skip_special_tokens=True)[0]

    # Pass 2: translate the shortened text into English.
    tokenizer.src_lang = "hi_IN"
    condensed_batch = tokenizer(condensed_text, return_tensors="pt")
    english_token = tokenizer.lang_code_to_id["en_XX"]
    english_ids = model.generate(condensed_batch.input_ids, forced_bos_token_id=english_token)
    decoded = tokenizer.batch_decode(english_ids, skip_special_tokens=True)
    return decoded[0]

def get_title(text):
    """Pick a short title for ``text`` from its top-ranked key phrase.

    Asks KeyBERT for the single best one- or two-word phrase, using English
    stop words.

    Args:
        text: The text to label.

    Returns:
        The phrase of the first extracted keyword entry, or ``"Untitled"``
        when nothing was extracted.
    """
    ranked = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 2),
        stop_words='english',
        top_n=1,
    )
    if not ranked:
        return "Untitled"
    best_entry = ranked[0]
    return best_entry[0]

# === Step 1: Format the Transcript ===
# Keep the raw lines (newlines included) in `lines`; Step 2 parses them.
with open(INPUT_FILE, "r", encoding="utf-8") as src:
    lines = src.readlines()

# Write a tidied copy: every non-blank line, stripped, one per row
# (optional step if the transcript is already clean).
with open(FORMATTED_FILE, "w", encoding="utf-8") as dst:
    dst.writelines(
        f"{cleaned}\n"
        for cleaned in map(str.strip, lines)
        if cleaned
    )

# === Step 2: Chunk Transcript into N-second Blocks ===
# Expected line shape: "[H:M:S - H:M:S]: spoken text". Compiled once, outside the loop.
SEGMENT_PATTERN = re.compile(r"\[(\d+:\d+:\d+) - (\d+:\d+:\d+)\]: (.+)")

chunks = []          # finished chunks: (chunk_start_seconds, [(segment_start_seconds, text), ...])
current_chunk = []   # segments collected for the chunk being built
current_duration = 0 # summed (end - start) of the segments in current_chunk
chunk_start = None   # start time of the first segment in current_chunk

for line in lines:
    # Match the stripped line so that leading whitespace does not silently
    # drop an otherwise valid segment; lines of any other shape are skipped.
    match = SEGMENT_PATTERN.match(line.strip())
    if not match:
        continue

    start, end, content = match.groups()
    start_sec = parse_timestamp(start)
    end_sec = parse_timestamp(end)
    duration = end_sec - start_sec

    if chunk_start is None:
        chunk_start = start_sec

    # Flush only when the running chunk already holds a segment. Without this
    # guard, a segment that is longer than CHUNK_SIZE on its own would push an
    # empty chunk into `chunks` (and later be summarized from an empty string).
    if current_chunk and current_duration + duration > CHUNK_SIZE:
        chunks.append((chunk_start, current_chunk))
        current_chunk = [(start_sec, content)]
        current_duration = duration
        chunk_start = start_sec
    else:
        current_chunk.append((start_sec, content))
        current_duration += duration

# Emit whatever is left after the last line.
if current_chunk:
    chunks.append((chunk_start, current_chunk))

# === Step 3: Summarize, Translate, Title, and Save ===
# Create the directory that FINAL_JSON actually points into, so that changing
# the configured path cannot leave open() below without a parent folder.
Path(FINAL_JSON).parent.mkdir(parents=True, exist_ok=True)

# One chapter record per chunk: start time, generated title, generated summary.
results = []
for chunk_start, segment in chunks:
    # Join the segment texts of the chunk into a single passage.
    full_text = " ".join(text for _, text in segment)
    translated_summary = summarize_and_translate(full_text)
    # The title is extracted from the summary, not from the raw transcript.
    title = get_title(translated_summary)

    results.append({
        "start_time": seconds_to_timestamp(chunk_start),
        "title": title,
        "summary": translated_summary
    })

# ensure_ascii=False keeps non-ASCII characters readable in the file.
with open(FINAL_JSON, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=4)

print(f"✅ Saved chapters to: {FINAL_JSON}")


✅ Saved chapters to: final/4nMwZhF_D-g_chapters.json


In [2]:
def format_timestamp_properly(ts):
    """Zero-pad the first three colon-separated fields of ``ts`` to ``hh:mm:ss``.

    Args:
        ts: Timestamp text whose first three ``:``-separated fields are integers.

    Returns:
        The three fields, each rendered with at least two digits and joined by ``:``.

    Raises:
        ValueError: If one of the fields is not an integer.
        IndexError: If ``ts`` has fewer than three fields.
    """
    fields = ts.split(":")
    return ":".join(format(int(fields[position]), "02d") for position in range(3))

# Build the required JSON structure: same records as `results`, with the
# start time zero-padded to hh:mm:ss.
final_output = [
    {
        "start_time": format_timestamp_properly(chapter["start_time"]),
        "title": chapter["title"],
        "summary": chapter["summary"],
    }
    for chapter in results
]

# Written next to the notebook rather than into the "final" folder.
with open("4nMwZhF_D-g_chapters.json", "w", encoding="utf-8") as f:
    json.dump(final_output, f, ensure_ascii=False, indent=4)

print("🎉 Final chapter file saved: 4nMwZhF_D-g_chapters.json")


🎉 Final chapter file saved: 4nMwZhF_D-g_chapters.json


In [4]:
import os
import json

# Path to the folder containing final chapter JSON files
final_folder = "final"  # Change if your folder is named differently

# List all JSON files in the folder. os.listdir() returns entries in
# arbitrary order, so sort them to make the printed output reproducible.
chapter_files = sorted(f for f in os.listdir(final_folder) if f.endswith(".json"))

# Print the content of each file: one header per file, then one
# "start | title / summary" entry per chapter.
for file in chapter_files:
    filepath = os.path.join(final_folder, file)
    print(f"\n📄 Contents of: {file}")
    with open(filepath, "r", encoding="utf-8") as f:
        data = json.load(f)
        for entry in data:
            print(f"⏱ {entry['start_time']} | 🏷 {entry['title']}\n📝 {entry['summary']}\n")



📄 Contents of: 4nMwZhF_D-g_chapters.json
⏱ 0:00:00 | 🏷 class maker
📝 Newcomers! Tea - maker, Pancher - maker, Cuddler - maker, Bar - dancer and waiter These are some professions, some jobs that some people do, but in our country these words are often used like jokes. In this short video today, I would like to talk a little bit about social evil, which is class - maker. Class - maker correctly heard its own class - maker. Class - maker on class - maker, that is, on the one hand, make a difference People of númer p removets Straße

⏱ 0:07:03 | 🏷 people inferiority
📝 People ignore them because they think they are of a lower level. They think they are of a lower section in society. Do you know the most special thing about a classist society? An induction who lives in such a society. A person who lives in such a society will always look down on the people below him. Mo and the people above him will always join hands. Will always bow down to the people above him. G sir, yes madam. There wil