# Build a custom [parser](https://github.com/apache/tika/blob/main/tika-core/src/main/java/org/apache/tika/parser/Parser.java) and use it with Tikara


You can create your own parser and use it with Tika, right from Python. This is extremely powerful because it allows you to combine Python's extensive content analysis libraries with Tika's huge list of existing parsers.


In [1]:
# 1. Initialize the JVM so that Java packages can be imported from Python

from tikara.util.java import initialize_jvm

# Run this first: the later cells import Java packages (java.*, org.apache.tika.*).
initialize_jvm()

In [2]:
# 2. Create a custom parser class. Its method names and signatures must exactly match the Java interface:
#  https://github.com/apache/tika/blob/main/tika-core/src/main/java/org/apache/tika/parser/Parser.java


from typing import TYPE_CHECKING
from jpype import JImplements, JOverride, JString

if TYPE_CHECKING:
    from org.apache.tika.parser import Parser

def get_parsers() -> list["Parser"]:
    """Build the custom Tika parsers to hand to Tikara.

    The Java imports are kept inside the function so they are resolved only when it is
    called, which is expected to happen after the JVM has been initialized.

    Returns:
        A list containing a single ``MarkdownParser`` instance.
    """
    from java.io import InputStream
    from java.util import HashSet
    from org.apache.tika.metadata import Metadata
    from org.apache.tika.mime import MediaType
    from org.apache.tika.parser import ParseContext, Parser

    # The handler Tika passes to parse() is the Java SAX interface, not Python's xml.sax class.
    from org.xml.sax import ContentHandler

    @JImplements(Parser)
    class MarkdownParser:
        """Python implementation of Tika's ``Parser`` interface that emits Markdown files verbatim."""

        def __init__(self) -> None:
            # Java set of the media types this parser claims to handle.
            self.supported_types = HashSet()
            self.supported_types.add(MediaType.parse("text/x-web-markdown"))
            # "text/markdown": tika doesn't use this by default. It will only ever be found if you add
            #  a custom detector and mime type. See the custom detector example for more info:
            #  https://github.com/baughmann/tikara/blob/master/examples/custom_detector.ipynb
            self.supported_types.add(MediaType.parse("text/markdown"))

        @JOverride
        def getSupportedTypes(self, context: ParseContext) -> HashSet:
            """Return the media types handled by this parser; ``context`` is not consulted."""
            return self.supported_types

        @JOverride
        def parse(
            self,
            stream: InputStream,
            handler: ContentHandler,
            metadata: Metadata,
            context: ParseContext,
        ) -> None:
            """Read the whole stream as UTF-8 text and forward it to ``handler`` unchanged.

            Args:
                stream: Java input stream holding the document bytes.
                handler: SAX content handler that receives the document text.
                metadata: Tika metadata object (not modified here).
                context: Tika parse context (not used here).

            Raises:
                UnicodeDecodeError: If the stream content is not valid UTF-8.
            """
            print("Parsing markdown with custom parser")

            # Drain the stream with one bulk Java call instead of crossing the
            # Python/Java bridge once per byte.
            raw = bytes(stream.readAllBytes())
            content = raw.decode("utf-8")

            # Convert to a Java char[] (via JString), as required by ContentHandler.characters
            chars = JString(content).toCharArray()

            handler.startDocument()
            handler.characters(chars, 0, len(chars))  # type: ignore  # noqa: PGH003
            handler.endDocument()

    return [MarkdownParser()]

In [3]:
# 3. Initialize Tikara with the custom parser

from tikara import Tika


# Note: the factory function itself is passed here, not the list it returns.
tika = Tika(custom_parsers=get_parsers)

In [4]:
# 4. Parse the file

from pathlib import Path


# Relative path, so it is resolved against the notebook's working directory.
input_file = Path("../README.md")

# parse() returns a pair, unpacked here into the content and its metadata.
content, metadata = tika.parse(input_file)

# Bare expression so the notebook displays both values as the cell output.
content, metadata

Parsing markdown with custom parser


('<?xml version="1.0" encoding="UTF-8"?>\n&lt;img src="https://raw.githubusercontent.com/baughmann/tikara/refs/heads/master/tikara_logo.svg" alt="Tikara Logo" style="width:100px;"/&gt;\n\n# Tikara\n\n![Coverage](https://img.shields.io/badge/dynamic/xml?url=https://raw.githubusercontent.com/baughmann/tikara/refs/heads/master/coverage.xml&amp;query=/coverage/@line-rate%20*%20100&amp;suffix=%25&amp;color=brightgreen&amp;label=coverage) ![Tests](https://img.shields.io/badge/dynamic/xml?url=https://raw.githubusercontent.com/baughmann/tikara/refs/heads/master/junit.xml&amp;query=/testsuites/testsuite/@tests&amp;label=tests&amp;color=green) ![PyPI](https://img.shields.io/pypi/v/tikara) ![GitHub License](https://img.shields.io/github/license/baughmann/tikara) ![PyPI - Downloads](https://img.shields.io/pypi/dm/tikara) ![GitHub issues](https://img.shields.io/github/issues/baughmann/tikara) ![GitHub pull requests](https://img.shields.io/github/issues-pr/baughmann/tikara) ![GitHub stars](https://i