# Parse document text content and metadata with Tikara

In [1]:
# 1. Instantiate tika
#
# One Tika instance is created here and reused by the later cells, both for
# parsing the input file and for language detection.

from tikara import Tika

# NOTE(review): lazy_load=True presumably defers the expensive Tika start-up work
# until the instance is first used — confirm against the Tikara documentation.
tika = Tika(lazy_load=True)

In [2]:
# 2. Pick an input file

from pathlib import Path

# Directory containing the sample images, relative to the notebook's working directory.
SAMPLE_DIR = Path("../test/data")

# Name of the sample to process. Other samples in the same directory:
#   "numbers_gs150.jpg", "stock_gs200.jpg", "captcha1.jpg"
SAMPLE_NAME = "plaid_c150.jpg"

# Full path handed to tika.parse() in the next step.
input_doc = SAMPLE_DIR / SAMPLE_NAME

In [3]:
# 3. Extract text from the input file to a byte stream
#    This is an advanced use case: remove the output_stream parameter to get
#    the extracted text back as a string instead of a stream.

output_stream, metadata = tika.parse(obj=input_doc, output_format="txt", output_stream=True)

# Read the whole stream and decode the bytes as UTF-8 text.
content = output_stream.read().decode(encoding="utf-8")

# Bare last expression so the notebook displays the extracted text.
content

INFO  [main] 11:08:50,294 org.apache.tika.parser.ocr.TesseractOCRParser Tesseract is installed and is being invoked. This can add greatly to processing time.  If you do not want tesseract to be applied to your files see: https://cwiki.apache.org/confluence/display/TIKA/TikaOCR#TikaOCR-disable-ocr


'CENTER\nFOR THE i hisericdomiom Malioue\n\nJanuary 6-29\n\nFri & Saturdays at 8\nSandes Matinees at 2 pa\n\n_ Tickets: 723-8698\n\n| www.henegar.org pres rama tard\n\n\n\n'

In [4]:
# Show the metadata model as a plain dict. Unset and None-valued fields are
# dropped, and raw_metadata is excluded here because it is displayed on its own
# in the following cell.
metadata.model_dump(
    exclude={"raw_metadata"},
    exclude_none=True,
    exclude_unset=True,
)

{'table_count': '4 Huffman tables',
 'component_count': 3,
 'resource_name': '../test/data/plaid_c150.jpg',
 'content_type': 'image/jpeg',
 'content_length': 114086,
 'height': 940,
 'width': 640}

In [5]:
# Display the raw metadata mapping that was excluded from the previous cell.
# NOTE(review): presumably these are the unprocessed key/value pairs reported by
# Tika, from which the typed fields above are derived — confirm against Tikara docs.
metadata.raw_metadata

{'Number of Tables': '4 Huffman tables',
 'Compression Type': 'Baseline',
 'Data Precision': '8 bits',
 'X-TIKA:Parsed-By-Full-Set': 'org.apache.tika.parser.DefaultParser',
 'Number of Components': '3',
 'tiff:ImageLength': '940',
 'Component 2': 'Cb component: Quantization table 1, Sampling factors 1 horiz/1 vert',
 'Thumbnail Height Pixels': '0',
 'Component 1': 'Y component: Quantization table 0, Sampling factors 2 horiz/2 vert',
 'X Resolution': '1 dot',
 'Extension Code': 'Thumbnail stored using 3 bytes/pixel',
 'File Size': '114086 bytes',
 'Component 3': 'Cr component: Quantization table 1, Sampling factors 1 horiz/1 vert',
 'Version': '1.2',
 'File Name': 'apache-tika-2447546473670837897.jpg',
 'Content-Length': '114086',
 'tiff:BitsPerSample': '8',
 'Content-Type': 'image/jpeg',
 'Resolution Units': 'none',
 'File Modified Date': 'Tue Jan 28 11:08:50 -05:00 2025',
 'resourceName': '../test/data/plaid_c150.jpg',
 'Image Height': '940 pixels',
 'Thumbnail Width Pixels': '0',
 'I

In [7]:
# Detect the language of the text extracted from the input file in step 3.
lang = tika.detect_language(content)

# Bare last expression so the notebook displays the detection result.
lang

TikaDetectLanguageResult(language='en', confidence=<TikaLanguageConfidence.HIGH: 'HIGH'>, raw_score=0.9999939799308777)