# Parse document text content and metadata with Tikara

In [1]:
# 1. Instantiate tika
#
# One Tika instance is created here and reused by the cells below.
# NOTE(review): lazy_load=True presumably defers expensive start-up work until
# the instance is first used — confirm against the Tikara documentation.

from tikara import Tika

tika = Tika(lazy_load=True)

In [2]:
# 2. Pick an input file
#
# Exactly one `input_doc` assignment should be active at a time: to try a
# different sample, comment out the active line and uncomment another one.
# The paths are relative, so they resolve against the kernel's current
# working directory.

from pathlib import Path


# input_doc = Path("../test/data/demo.docx")
# input_doc = Path("../test/data/bad_xml.xml")
# input_doc = Path("../test/data/CantinaBand3.wav")
# input_doc = Path("../test/data/docx-shapes.docx")
# input_doc = Path("../test/data/category-level.docx")
# input_doc = Path("../test/data/fake-email.eml")
# input_doc = Path("../test/data/fake-email-multiple-attachments.msg")
# input_doc = Path("../test/data/docx-tables.docx")
# input_doc = Path("../test/data/emoji.xlsx")
# input_doc = Path("../test/data/testPDF_childAttachments.pdf")
# input_doc = Path("../test/data/science-exploration-369p.pptx")
# input_doc = Path("../test/data/simple.epub")
# input_doc = Path("../test/data/failure-after-repair.pdf")
# input_doc = Path("../test/data/korean-text-with-tables.pdf")
input_doc = Path("../test/data/BigBuckBunny_320x180.mp4")
# input_doc = Path("../test/data/Aimer-les-fourmis_fma-140809_001_00-01-39.wav")
# input_doc = Path("../test/data/Arthur.mp3")

In [3]:
# 3. Extract text from the input file to a byte stream
#  This is an advanced use case; remove the output_stream parameter to get a string output

output_stream, metadata = tika.parse(
    obj=input_doc,
    output_stream=True,
    output_format="xhtml"
)

# Read the whole stream into memory, then always release it, even if reading
# or decoding raises, so the underlying resource is not left open in the kernel.
try:
    content = output_stream.read().decode("utf-8")
finally:
    output_stream.close()

# Last expression of the cell: display the parsed metadata object.
metadata

TikaMetadata(encoding=None, compression='avc1', paragraph_count=None, revision=None, word_count=None, line_count=None, character_count=None, character_count_with_spaces=None, page_count=None, chars_per_page=None, table_count=None, component_count=None, image_count=None, hidden_slides=None, resource_name='../test/data/BigBuckBunny_320x180.mp4', resource_path=None, embedded_resource_type=None, embedded_relationship_id=None, embedded_depth=None, created=None, modified='2025:01:28 10:15:00-05:00', content_type='video/mp4', content_type_override=None, content_length=64657027, title='Big Buck Bunny', description=None, type=None, keywords=None, company=None, creator='Blender Foundation', publisher=None, contributor=None, language=None, identifier=None, application=None, application_version=None, producer=None, version=None, template=None, security=None, is_encrypted=None, height=180, width=320, duration='0:09:56', sample_rate=None, stream_count=None, image_pixel_aspect_ratio=None, image_color

In [4]:
# Compact view of the metadata: skip fields that are unset or None, and leave
# out the raw_metadata dict, which is displayed on its own in the next cell.
metadata.model_dump(exclude_unset=True, exclude_none=True, exclude={"raw_metadata"})

{'compression': 'avc1',
 'resource_name': '../test/data/BigBuckBunny_320x180.mp4',
 'modified': '2025:01:28 10:15:00-05:00',
 'content_type': 'video/mp4',
 'content_length': 64657027,
 'title': 'Big Buck Bunny',
 'creator': 'Blender Foundation',
 'height': 180,
 'width': 320,
 'duration': '0:09:56'}

In [5]:
# Display the raw_metadata mapping that the previous cell excluded.
# NOTE(review): this appears to hold the parser's original key/value pairs
# before they are mapped onto the typed fields — confirm in the Tikara docs.
metadata.raw_metadata

{'Artist': 'Blender Foundation',
 'Minor Version': '0.2.0',
 'Next Track ID': '3',
 'Modify Date': '0000:00:00 00:00:00',
 'Media Modify Date': '0000:00:00 00:00:00',
 'Current Time': '0 s',
 'X-TIKA:Parsed-By-Full-Set': 'org.apache.tika.parser.DefaultParser',
 'Track Layer': '0',
 'File Type Extension': 'mp4',
 'Media Duration': '0:09:56',
 'Graphics Mode': 'srcCopy',
 'Time Scale': '1000',
 'Track Header Version': '0',
 'X Resolution': '72',
 'Handler Description': 'SoundHandler',
 'Media Language Code': 'und',
 'Audio Bits Per Sample': '16',
 'Audio Channels': '2',
 'Encoder': 'Lavf52.14.0',
 'Media Data Offset': '36',
 'Poster Time': '0 s',
 'Media Header Version': '0',
 'Create Date': '0000:00:00 00:00:00',
 'ExifTool Version Number': '12.76',
 'Track ID': '1',
 'Movie Header Version': '0',
 'Preferred Rate': '1',
 'File Inode Change Date/Time': '2025:01:28 10:15:00-05:00',
 'File Type': 'MP4',
 'resourceName': '../test/data/BigBuckBunny_320x180.mp4',
 'Duration': '0:09:56',
 'Ima

In [6]:
# Pretty-print the XHTML text held in `content` (produced by the parse cell).
import xml.etree.ElementTree as ET

# ET.fromstring returns the root Element of the parsed document.
tree = ET.fromstring(content)
# ET.indent re-indents the element in place and returns nothing.
ET.indent(tree, space="  ")  # Available in Python 3.9+
# encoding='unicode' makes tostring return a str instead of encoded bytes.
pretty_xml = ET.tostring(tree, encoding='unicode')
# print() rather than a bare expression so newlines render instead of showing as "\n".
print(pretty_xml)


<html:html xmlns:html="http://www.w3.org/1999/xhtml">
  <html:head>
    <html:meta name="Artist" content="Blender Foundation" />
    <html:meta name="Minor Version" content="0.2.0" />
    <html:meta name="Next Track ID" content="3" />
    <html:meta name="Modify Date" content="0000:00:00 00:00:00" />
    <html:meta name="Media Modify Date" content="0000:00:00 00:00:00" />
    <html:meta name="Current Time" content="0 s" />
    <html:meta name="Track Layer" content="0" />
    <html:meta name="File Type Extension" content="mp4" />
    <html:meta name="Media Duration" content="0:09:56" />
    <html:meta name="Graphics Mode" content="srcCopy" />
    <html:meta name="Time Scale" content="1000" />
    <html:meta name="Track Header Version" content="0" />
    <html:meta name="X Resolution" content="72" />
    <html:meta name="Handler Description" content="SoundHandler" />
    <html:meta name="Media Language Code" content="und" />
    <html:meta name="Audio Bits Per Sample" content="16" />
   