In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json
import re
from typing import Any, Dict, Optional, Tuple

import yaml

from tnh_scholar.metadata import Frontmatter, Metadata

In [3]:
# Sample two-key mapping reused by the following cells as test metadata.
my_dict = {"key1": "test1", "key2": "test2"}

In [4]:
# Wrap the sample mapping in the project's Metadata type.
meta = Metadata(my_dict)


In [7]:
# Display the Metadata object converted back to a plain dict
# (the recorded output matches the original sample mapping).
meta.to_dict()

{'key1': 'test1', 'key2': 'test2'}

In [7]:
# Embed the sample mapping ahead of a two-line document and print the result;
# the recorded output shows the metadata rendered between two `---` lines,
# followed by a blank line and the document text.
out = Frontmatter.embed(my_dict, "This is a short\ndocument with two lines.")
print(out)

---
key1: test1
key2: test2
---

This is a short
document with two lines.


In [8]:
# Split the embedded document back apart; the recorded output is a
# (metadata dict, document text) tuple equal to the inputs given to embed().
Frontmatter.extract(out)

({'key1': 'test1', 'key2': 'test2'},
 'This is a short\ndocument with two lines.')

In [None]:
# JSON-LD implementation:

class MetadataHandler:
    """Embed and extract JSON-LD metadata blocks in plain-text documents.

    A metadata block is a JSON document placed between ``JSONLD_MARKER`` and
    ``MARKER_END``. ``embed`` writes such a block at the top of the text and
    ``extract`` reads the first such block back out.
    """

    JSONLD_MARKER = "---begin:json-ld---"
    MARKER_END = "---end:json-ld---"

    @staticmethod
    def extract(content: str) -> Tuple[Optional[Dict[str, Any]], str]:
        """Extract the first JSON-LD metadata block from text.

        Only the block that was actually parsed is removed from the text, so
        any further marker-delimited blocks in the body are preserved rather
        than silently discarded.

        Args:
            content: Text content potentially containing JSON-LD metadata.

        Returns:
            Tuple of ``(metadata, remaining_content)``. When a block is found
            and holds a JSON object, ``metadata`` is that object and
            ``remaining_content`` is the text with the block removed and
            surrounding whitespace stripped. When no block is found, the
            block is not valid JSON, or the JSON is not an object, the result
            is ``(None, content)`` with ``content`` returned unchanged.
        """
        # Escape the markers so they are always matched literally, even if
        # they are later changed to contain regex metacharacters.
        pattern = (
            f"{re.escape(MetadataHandler.JSONLD_MARKER)}(.*?)"
            f"{re.escape(MetadataHandler.MARKER_END)}"
        )
        match = re.search(pattern, content, re.DOTALL)
        if match is None:
            return None, content

        try:
            metadata = json.loads(match[1].strip())
        except json.JSONDecodeError:
            # Best effort: a malformed block leaves the text untouched.
            return None, content

        # The declared contract is a dict; treat any other JSON value
        # (list, string, number, ...) the same as an unparseable block.
        if not isinstance(metadata, dict):
            return None, content

        # Cut out exactly the matched span instead of re-running the regex
        # over the whole text, which would also delete unparsed later blocks.
        remainder = content[: match.start()] + content[match.end():]
        return metadata, remainder.strip()

    @staticmethod
    def embed(metadata: Dict[str, Any], content: str) -> str:
        """Embed metadata as a JSON-LD block at the top of text content.

        Args:
            metadata: Dictionary of metadata to embed; it is serialized with
                ``json.dumps`` using a two-space indent.
            content: Text content to place after the metadata block.

        Returns:
            The start marker, the JSON, and the end marker on separate lines,
            followed by one blank line and then ``content``.
        """
        json_str = json.dumps(metadata, indent=2)
        metadata_block = (
            f"{MetadataHandler.JSONLD_MARKER}\n"
            f"{json_str}\n{MetadataHandler.MARKER_END}\n\n"
        )
        return f"{metadata_block}{content}"

def create_youtube_metadata(video_info: Dict[str, Any]) -> Dict[str, Any]:
    """Create JSON-LD metadata for YouTube video content.

    Based on the schema.org VideoObject schema:
    https://schema.org/VideoObject

    Every field is read with ``dict.get``, so a key missing from
    ``video_info`` yields ``None`` for the corresponding entry instead of
    raising. Values are passed through unchanged.

    NOTE(review): schema.org expects ISO 8601 values for ``uploadDate`` and
    ``duration``; the format yt-dlp supplies for ``upload_date`` and
    ``duration`` is not visible here — confirm whether conversion is needed.

    Args:
        video_info: Video information from yt-dlp.

    Returns:
        JSON-LD metadata dictionary. ``contentUrl`` is the YouTube watch URL
        built from the video id, or ``None`` when the id is missing or empty.
    """
    video_id = video_info.get("id")
    channel_url = video_info.get("channel_url")

    # Do not fabricate a ".../watch?v=None" link when there is no video id.
    content_url = (
        f"https://www.youtube.com/watch?v={video_id}" if video_id else None
    )

    return {
        "@context": "https://schema.org",
        "@type": "VideoObject",
        "@id": video_id,
        "name": video_info.get("title"),
        "description": video_info.get("description"),
        "uploadDate": video_info.get("upload_date"),
        "duration": video_info.get("duration"),
        "author": {
            "@type": "Person",
            "name": video_info.get("uploader"),
            "url": channel_url,
        },
        "publisher": {
            "@type": "Organization",
            "name": video_info.get("channel"),
            "url": channel_url,
        },
        "inLanguage": video_info.get("language"),
        "genre": video_info.get("categories"),
        "keywords": video_info.get("tags"),
        "url": video_info.get("webpage_url"),
        "originalUrl": video_info.get("original_url"),
        "videoId": video_id,
        "contentUrl": content_url,
        # Misspelled legacy key, kept so existing readers keep working.
        "chanelUrl": channel_url,
        # Correctly spelled key carrying the same value.
        "channelUrl": channel_url,
    }