# Parse document text content and metadata with Tikara

In [1]:
# 1. Instantiate Tika
#    The resulting `tika` object is what the parsing cell further down calls.

from tikara import Tika

# NOTE(review): lazy_load=True presumably defers starting the Tika backend
# until the first parse call — confirm against the Tikara documentation.
tika = Tika(lazy_load=True)

In [2]:
# 2. Pick an input file
#    The commented-out lines are alternative sample documents; to try one,
#    uncomment it and comment out the currently active assignment (if several
#    are left active, the last assignment wins). All paths are relative.

from pathlib import Path


# input_doc = Path("../test/data/demo.docx")
# input_doc = Path("../test/data/bad_xml.xml")
# input_doc = Path("../test/data/CantinaBand3.wav")
# input_doc = Path("../test/data/docx-shapes.docx")
# input_doc = Path("../test/data/category-level.docx")
# input_doc = Path("../test/data/fake-email.eml")
# input_doc = Path("../test/data/fake-email-multiple-attachments.msg")
# input_doc = Path("../test/data/docx-tables.docx")
# input_doc = Path("../test/data/emoji.xlsx")
# input_doc = Path("../test/data/testPDF_childAttachments.pdf")
# input_doc = Path("../test/data/science-exploration-369p.pptx")
# input_doc = Path("../test/data/simple.epub")
# input_doc = Path("../test/data/failure-after-repair.pdf")
# input_doc = Path("../test/data/korean-text-with-tables.pdf")
# input_doc = Path("../test/data/big_buck_bunny_480p_h264.mov")
# input_doc = Path("../test/data/big_buck_bunny_480p_stereo.avi")
# input_doc = Path("../test/data/big_buck_bunny_480p_stereo.ogg")
# input_doc = Path("../test/data/Aimer-les-fourmis_fma-140809_001_00-01-39.wav")
input_doc = Path("../test/data/Arthur.mp3")

In [3]:
# 3. Extract text from the input file to a byte stream
#    This is an advanced use case; remove the output_stream parameter to get
#    the content back as a string instead.

parse_result = tika.parse(obj=input_doc, output_stream=True, output_format="xhtml")
output_stream, metadata = parse_result

# Drain the stream, then decode the bytes as UTF-8 text for the cells below.
raw_xhtml = output_stream.read()
content = raw_xhtml.decode("utf-8")

# Last expression of the cell, so the notebook displays the metadata object.
metadata

TikaMetadata(encoding=None, compression='MP3', paragraph_count=None, revision=None, word_count=None, line_count=None, character_count=None, character_count_with_spaces=None, page_count=None, chars_per_page=None, table_count=None, component_count=None, image_count=None, hidden_slides=None, resource_name='../test/data/Arthur.mp3', resource_path=None, embedded_resource_type=None, embedded_relationship_id=None, embedded_depth=None, created=None, modified=None, content_type='audio/mpeg', content_type_override=None, content_length=3197074, title=None, description=None, type=None, keywords=None, company=None, creator=None, publisher=None, contributor=None, language=None, identifier=None, application=None, application_version='MPEG 3 Layer III Version 1', producer=None, version=None, template=None, security=None, is_encrypted=None, height=None, width=None, duration=228.37692260742188, sample_rate=None, stream_count=None, image_pixel_aspect_ratio=None, image_color_space=None, audio_channels='St

In [4]:
# Show the typed metadata as a plain dict, keeping only fields that carry a
# value (unset and None fields are excluded) and leaving out raw_metadata,
# which is displayed on its own in the next cell.
metadata.model_dump(exclude_unset=True, exclude_none=True, exclude={"raw_metadata"})

{'compression': 'MP3',
 'resource_name': '../test/data/Arthur.mp3',
 'content_type': 'audio/mpeg',
 'content_length': 3197074,
 'application_version': 'MPEG 3 Layer III Version 1',
 'duration': 228.37692260742188,
 'audio_channels': 'Stereo'}

In [5]:
# Inspect the unprocessed metadata mapping: original key names
# (e.g. 'xmpDM:duration', 'Content-Type') with string values.
metadata.raw_metadata

{'xmpDM:audioSampleRate': '44100',
 'channels': '2',
 'X-TIKA:Parsed-By': 'org.apache.tika.parser.DefaultParser',
 'X-TIKA:Parsed-By-Full-Set': 'org.apache.tika.parser.DefaultParser',
 'xmpDM:audioCompressor': 'MP3',
 'resourceName': '../test/data/Arthur.mp3',
 'Content-Length': '3197074',
 'xmpDM:audioChannelType': 'Stereo',
 'version': 'MPEG 3 Layer III Version 1',
 'xmpDM:duration': '228.37692260742188',
 'Content-Type': 'audio/mpeg',
 'samplerate': '44100'}

In [6]:
# Pretty-print the XHTML content decoded in the parsing cell above.
import xml.etree.ElementTree as ET

# fromstring() returns the root Element of the parsed document.
tree = ET.fromstring(content)
# indent() rewrites the whitespace in place (it returns None).
ET.indent(tree, space="  ")  # Available in Python 3.9+
# encoding='unicode' makes tostring() return a str instead of bytes.
pretty_xml = ET.tostring(tree, encoding='unicode')
print(pretty_xml)


<html:html xmlns:html="http://www.w3.org/1999/xhtml">
  <html:head>
    <html:meta name="xmpDM:audioSampleRate" content="44100" />
    <html:meta name="channels" content="2" />
    <html:meta name="X-TIKA:Parsed-By" content="org.apache.tika.parser.DefaultParser" />
    <html:meta name="X-TIKA:Parsed-By" content="org.apache.tika.parser.mp3.Mp3Parser" />
    <html:meta name="xmpDM:audioCompressor" content="MP3" />
    <html:meta name="resourceName" content="../test/data/Arthur.mp3" />
    <html:meta name="Content-Length" content="3197074" />
    <html:meta name="xmpDM:audioChannelType" content="Stereo" />
    <html:meta name="version" content="MPEG 3 Layer III Version 1" />
    <html:meta name="xmpDM:duration" content="228.37692260742188" />
    <html:meta name="Content-Type" content="audio/mpeg" />
    <html:meta name="samplerate" content="44100" />
    <html:title />
  </html:head>
  <html:body>
    <html:p>228.37692</html:p>
  </html:body>
</html:html>
