In [6]:
import re

from unidecode import unidecode  # for handling diacritics


def sanitize_filename(title: str, max_length: int = 80) -> str:
    """Sanitize YouTube title for use as filename."""
    # Remove anything in square brackets
    clean = re.sub(r'\[.*?\]', '', title)
    
    # Remove anything in parentheses
    clean = re.sub(r'\(.*?\)', '', clean)
    
    # Replace problematic characters
    clean = re.sub(r'[\\/*?:"<>|]', '', clean)
    
    # Remove extra whitespace
    clean = ' '.join(clean.split())
    
    # Transliterate Vietnamese characters to ASCII
    clean = unidecode(clean)
    
    # Truncate and clean up
    clean = clean[:max_length].strip()
    
    # Replace spaces with underscores
    clean = clean.replace(' ', '_')
    
    return clean

# Test it
title = "Kinh Văn Căn Bản của Thiền Tập 01 [TTSĐCTTĐB 03] ｜ TS Thích Nhất Hạnh(27-11-1994, Làng Mai)"
print(sanitize_filename(title))

Kinh_Van_Can_Ban_cua_Thien_Tap_01_|_TS_Thich_Nhat_Hanh
