In [None]:
# | default_exp _components.encoder.avro

In [None]:
# | export


import io
import json
from typing import *

import fastavro
from pydantic import BaseModel, create_model
from pydantic.main import ModelMetaclass

from fastkafka._components.logger import get_logger
from fastkafka._components.meta import export

In [None]:
import tempfile

import pytest
from pydantic import Field

from fastkafka._components.logger import suppress_timestamps

In [None]:
# | export

# Module-level logger for the exported module, named after the module itself.
logger = get_logger(__name__)

In [None]:
# Notebook-only logging setup (this cell is not exported).
# suppress_timestamps() presumably strips timestamps from log lines; the
# recorded output below shows none.
suppress_timestamps()
# Rebind `logger` with an explicit numeric level. 20 is the value of the
# standard library's logging.INFO; assumes get_logger forwards it unchanged.
logger = get_logger(__name__, level=20)
# Smoke test: one INFO record should be printed.
logger.info("ok")

[INFO] __main__: ok


In [None]:
# | export


@export("fastkafka.encoder")
class AvroBase(BaseModel):
    """Pydantic base class that adds Avro schema generation helpers.

    The helpers translate the JSON schema produced by pydantic's ``schema()``
    into an Avro record schema expressed as a plain dictionary.
    """

    @classmethod
    def avro_schema_for_pydantic(
        cls,
        pydantic_model: Union[BaseModel, ModelMetaclass],
        by_alias: bool = True,
        namespace: Optional[str] = None,
    ) -> Dict[str, Any]:
        """
        Return the avro schema for the given pydantic model class or instance

        :param pydantic_model: a pydantic model class, or an instance of one
        :param by_alias: generate the schemas using the aliases defined, if any
        :param namespace: Provide an optional namespace string to use in schema generation
        :return: dict with the Avro Schema for the model
        :raises ValueError: if pydantic_model is neither a pydantic model class nor an instance
        """
        if isinstance(pydantic_model, BaseModel):
            # An instance was given: the schema lives on its class.
            schema = pydantic_model.__class__.schema(by_alias=by_alias)
        elif isinstance(pydantic_model, ModelMetaclass):
            schema = pydantic_model.schema(by_alias=by_alias)
        else:
            raise ValueError(
                f"Unknown type {type(pydantic_model)} given for pydantic_model parameter"
            )

        if namespace is None:
            # default namespace will be based on title
            namespace = schema["title"]

        return cls._avro_schema(schema, namespace)

    @classmethod
    def avro_schema(
        cls, by_alias: bool = True, namespace: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Return the avro schema for the pydantic class

        :param by_alias: generate the schemas using the aliases defined, if any
        :param namespace: Provide an optional namespace string to use in schema generation
        :return: dict with the Avro Schema for the model
        """
        # Same conversion as avro_schema_for_pydantic, applied to this class.
        return cls.avro_schema_for_pydantic(
            cls, by_alias=by_alias, namespace=namespace
        )

    @staticmethod
    def _avro_schema(schema: Dict[str, Any], namespace: str) -> Dict[str, Any]:
        """Return the avro schema for the given pydantic schema

        :param schema: JSON schema dictionary as produced by pydantic's ``schema()``
        :param namespace: value stored under the ``namespace`` key of the result
        :return: dict describing an Avro record named after ``schema["title"]``
        :raises RuntimeError: if a ``$ref`` points to a definition missing from the schema
        :raises NotImplementedError: if a field uses a JSON schema type with no mapping here
        """

        # Names of nested records/enums already emitted (or being emitted);
        # further occurrences are written as a reference to the name only.
        classes_seen: Set[str] = set()

        def get_definition(ref: str, schema: Dict[str, Any]) -> Dict[str, Any]:
            """Reading definition of base schema for nested structs"""
            definition_name = ref.replace("#/definitions/", "")
            d = schema.get("definitions", {}).get(definition_name)
            if d is None:
                raise RuntimeError(f"Definition {definition_name} does not exist")
            return d  # type: ignore

        def get_type(value: Dict[str, Any]) -> Dict[str, Any]:
            """Returns a type of a single field"""
            t = value.get("type")
            f = value.get("format")
            r = value.get("$ref")
            a = value.get("additionalProperties")
            avro_type_dict: Dict[str, Any] = {}
            if "default" in value:
                avro_type_dict["default"] = value.get("default")
            if "description" in value:
                avro_type_dict["doc"] = value.get("description")
            if "allOf" in value and len(value["allOf"]) == 1:
                # A single-entry allOf wraps a reference to a nested definition.
                r = value["allOf"][0]["$ref"]
            if r is not None:
                class_name = r.replace("#/definitions/", "")
                if class_name in classes_seen:
                    avro_type_dict["type"] = class_name
                else:
                    # Register the name BEFORE descending into the definition:
                    # a model that (directly or indirectly) refers to itself
                    # then resolves the inner occurrence to a name reference
                    # instead of recursing without end.
                    classes_seen.add(class_name)
                    d = get_definition(r, schema)
                    if "enum" in d:
                        avro_type_dict["type"] = {
                            "type": "enum",
                            "symbols": [str(v) for v in d["enum"]],
                            "name": d["title"],
                        }
                    else:
                        avro_type_dict["type"] = {
                            "type": "record",
                            "fields": get_fields(d),
                            # Name of the struct should be unique through the complete schema
                            "name": class_name,
                        }
            elif t == "array":
                items = value.get("items")
                tn = get_type(items)  # type: ignore
                # If items in array are a object:
                if "$ref" in items:  # type: ignore
                    tn = tn["type"]
                # If items in array are a logicalType
                if (
                    isinstance(tn, dict)
                    and isinstance(tn.get("type", {}), dict)
                    and tn.get("type", {}).get("logicalType") is not None
                ):
                    tn = tn["type"]
                avro_type_dict["type"] = {"type": "array", "items": tn}
            elif t == "string" and f == "date-time":
                avro_type_dict["type"] = {
                    "type": "long",
                    "logicalType": "timestamp-micros",
                }
            elif t == "string" and f == "date":
                avro_type_dict["type"] = {
                    "type": "int",
                    "logicalType": "date",
                }
            elif t == "string" and f == "time":
                avro_type_dict["type"] = {
                    "type": "long",
                    "logicalType": "time-micros",
                }
            elif t == "string" and f == "uuid":
                avro_type_dict["type"] = {
                    "type": "string",
                    "logicalType": "uuid",
                }
            elif t == "string":
                avro_type_dict["type"] = "string"
            elif t == "number":
                avro_type_dict["type"] = "double"
            elif t == "integer":
                # integer in python can be a long
                avro_type_dict["type"] = "long"
            elif t == "boolean":
                avro_type_dict["type"] = "boolean"
            elif t == "object":
                # Untyped objects become maps of strings; typed ones use the
                # converted additionalProperties schema as the value type.
                if a is None:
                    value_type = "string"
                else:
                    value_type = get_type(a)  # type: ignore
                if isinstance(value_type, dict) and len(value_type) == 1:
                    # Only a "type" entry is present: unwrap it.
                    value_type = value_type.get("type")  # type: ignore
                avro_type_dict["type"] = {"type": "map", "values": value_type}
            else:
                raise NotImplementedError(
                    f"Type '{t}' not support yet, "
                    f"please report this at https://github.com/godatadriven/pydantic-avro/issues"
                )
            return avro_type_dict

        def get_fields(s: Dict[str, Any]) -> List[Dict[str, Any]]:
            """Return a list of fields of a struct"""
            fields = []

            required = s.get("required", [])
            for key, value in s.get("properties", {}).items():
                avro_type_dict = get_type(value)
                avro_type_dict["name"] = key

                if key not in required:
                    if avro_type_dict.get("default") is None:
                        # Optional field without a concrete default: make it
                        # nullable and default it to null.
                        avro_type_dict["type"] = ["null", avro_type_dict["type"]]
                        avro_type_dict["default"] = None

                fields.append(avro_type_dict)
            return fields

        fields = get_fields(schema)

        return {
            "type": "record",
            "namespace": namespace,
            "name": schema["title"],
            "fields": fields,
        }

In [None]:
# Expected Avro schema for the `User` model defined below: the required
# `name` becomes a plain string, while the two optional fields become
# nullable unions that default to null.
test_user_schema = {
    "type": "record",
    "namespace": "User",
    "name": "User",
    "fields": [
        {"type": "string", "name": "name"},
        {"type": ["null", "long"], "name": "favorite_number", "default": None},
        {"type": ["null", "string"], "name": "favorite_color", "default": None},
    ],
}


# Sample pydantic model shared by the test cells in this notebook.
class User(BaseModel):
    name: str  # required
    favorite_number: Optional[int] = None  # optional, defaults to None
    favorite_color: Optional[str] = None  # optional, defaults to None

In [None]:
# Generate the Avro schema from the model class (not an instance) and check
# that it matches the hand-written expectation.
actual = AvroBase.avro_schema_for_pydantic(User)
display(actual)
assert actual == test_user_schema

{'type': 'record',
 'namespace': 'User',
 'name': 'User',
 'fields': [{'type': 'string', 'name': 'name'},
  {'type': ['null', 'long'], 'name': 'favorite_number', 'default': None},
  {'type': ['null', 'string'], 'name': 'favorite_color', 'default': None}]}

In [None]:
# ToDo
# 1. Rewrite with fastavro - Done
# 2. Generate schema from pydantic itself - Done
#        - Pydantic to avro schema conversion methods - Done
# 3. Generate pydantic class from avro schema

In [None]:
# | export


@export("fastkafka.encoder")
def avro_encoder(msg: BaseModel) -> bytes:
    """
    Encoder to encode pydantic instances to avro message

    The Avro schema is derived from the instance's model class and the message
    is written schemaless, i.e. without an embedded schema header.

    Args:
        msg: An instance of pydantic basemodel

    Returns:
        A bytes message which is encoded from pydantic basemodel
    """
    schema = fastavro.schema.parse_schema(AvroBase.avro_schema_for_pydantic(msg))
    buffer = io.BytesIO()
    # The schema above is generated with by_alias=True (the default), so its
    # field names are the aliases; the payload must be keyed the same way or
    # aliased fields would not be found in the record.
    fastavro.schemaless_writer(buffer, schema, msg.dict(by_alias=True))
    return buffer.getvalue()

In [None]:
# Encode a fully populated User and compare against the exact Avro bytes.
msg = User(name="Kumaran", favorite_number=9, favorite_color="black")


actual = avro_encoder(msg)
display(actual)

assert isinstance(actual, bytes)
# Expected layout, per Avro binary encoding (zig-zag varints):
#   \x0e + "Kumaran" -> string length 7, then its bytes
#   \x02 \x12        -> union branch 1 (long), value 9
#   \x02 \n + "black" -> union branch 1 (string), length 5 (\n == 0x0a), bytes
assert actual == b"\x0eKumaran\x02\x12\x02\nblack"

b'\x0eKumaran\x02\x12\x02\nblack'

In [None]:
# | export


@export("fastkafka.encoder")
def avro_decoder(raw_msg: bytes, cls: ModelMetaclass) -> Any:
    """
    Decoder to decode avro encoded messages to pydantic model instance

    The Avro schema used for reading is generated from the given pydantic class.

    Args:
        raw_msg: Avro encoded bytes message received from Kafka topic
        cls: Pydantic class; This pydantic class will be used to construct instance of same class

    Returns:
        An instance of given pydantic class
    """
    avro_schema = AvroBase.avro_schema_for_pydantic(cls)
    parsed_schema = fastavro.schema.parse_schema(avro_schema)

    # Read one schemaless record from the raw bytes into a plain dict.
    decoded_fields = fastavro.schemaless_reader(io.BytesIO(raw_msg), parsed_schema)

    return cls(**decoded_fields)

In [None]:
# Avro-encoded User, per Avro binary encoding (zig-zag varints):
#   \x06 + "123"  -> name, string of length 3
#   \x02 \x00     -> favorite_number, union branch 1 (long), value 0
#   \x02 \x06 + "111" -> favorite_color, union branch 1 (string), length 3
raw_msg = b"\x06123\x02\x00\x02\x06111"


actual = avro_decoder(raw_msg, cls=User)
display(actual)

# The decoded object must be a User carrying exactly the encoded values.
assert isinstance(actual, User)
assert actual.name == "123"
assert actual.favorite_number == 0
assert actual.favorite_color == "111"

User(name='123', favorite_number=0, favorite_color='111')

In [None]:
# | export


@export("fastkafka.encoder")
def avsc_to_pydantic(schema: Dict[str, Any]) -> ModelMetaclass:
    """
    Generate pydantic model from given Avro Schema

    Every Avro type is translated to a real Python type (not to the textual
    name of one), so logical types, enums and nested records yield usable
    fields on the created model.

    Args:
        schema: Avro schema in dictionary format

    Returns:
        Pydantic model class built from given avro schema

    Raises:
        AttributeError: If the schema is not a record, or lacks "name" or "fields"
        ValueError: If a record declares an empty list of fields
        NotImplementedError: If the schema uses an unsupported type, or a union
            other than a single type optionally combined with "null"
    """
    # Function-scope imports: these names are only needed by this converter.
    from datetime import date, datetime, time
    from decimal import Decimal
    from enum import Enum
    from uuid import UUID

    if "type" not in schema or schema["type"] != "record":
        raise AttributeError("Type not supported")
    if "name" not in schema:
        raise AttributeError("Name is required")
    if "fields" not in schema:
        raise AttributeError("fields are required")

    # Avro primitive name -> Python type.
    primitive_types: Dict[str, Any] = {
        "string": str,
        "long": int,
        "int": int,
        "boolean": bool,
        "double": float,
        "float": float,
    }
    # Avro logicalType -> Python type.
    logical_types: Dict[str, Any] = {
        "uuid": UUID,
        "decimal": Decimal,
        "timestamp-millis": datetime,
        "timestamp-micros": datetime,
        "time-millis": time,
        "time-micros": time,
        "date": date,
    }
    # Named types (records and enums) created so far, so that later fields can
    # refer to them by name only.
    classes: Dict[str, Any] = {}

    def get_python_type(t: Union[str, List[Any], Dict[str, Any]]) -> Any:
        """Returns python type for given avro type"""
        if isinstance(t, str):
            if t in primitive_types:
                return primitive_types[t]
            if t in classes:
                return classes[t]
            raise NotImplementedError(f"Type {t} not supported yet")

        if isinstance(t, list):
            # Union: only `X` or `null + X` is supported.
            non_null = [branch for branch in t if branch != "null"]
            if len(t) > 2 or len(non_null) != 1:
                raise NotImplementedError("Only a single type ia supported yet")
            inner = get_python_type(non_null[0])
            # "null" was present iff filtering removed something.
            return Optional[inner] if len(non_null) != len(t) else inner

        logical_type = t.get("logicalType")
        if isinstance(logical_type, str) and logical_type in logical_types:
            return logical_types[logical_type]

        kind = t.get("type")
        if kind == "enum":
            enum_name = t.get("name")
            if enum_name not in classes:
                # Equivalent to `class <enum_name>(str, Enum)` with one member
                # per symbol, each valued by its own name.
                symbols = {s: s for s in t.get("symbols")}  # type: ignore
                classes[enum_name] = Enum(enum_name, symbols, type=str)  # type: ignore
            return classes[enum_name]
        if kind == "array":
            return List[get_python_type(t.get("items"))]  # type: ignore
        if kind == "record":
            return record_type_to_pydantic(t)
        if kind == "map":
            return Dict[str, get_python_type(t.get("values"))]  # type: ignore
        if isinstance(kind, str) and kind in primitive_types:
            # `{"type": "long"}` is the long form of the primitive `"long"`.
            return primitive_types[kind]

        raise NotImplementedError(
            f"Type {t} not supported yet, "
            f"please report this at https://github.com/godatadriven/pydantic-avro/issues"
        )

    def record_type_to_pydantic(schema: Dict[str, Any]) -> ModelMetaclass:
        """Convert a single avro record type to a pydantic class"""
        full_name = schema["name"]
        # Drop any dotted namespace prefix: the model is named by the last part.
        name = full_name.split(".")[-1]

        if len(schema["fields"]) == 0:
            raise ValueError("Avro schema has no fields")

        kwargs: Dict[str, Tuple[Any, Any]] = {}
        for field in schema["fields"]:
            # Ellipsis marks the field as required for create_model.
            default = field["default"] if "default" in field else ...
            kwargs[field["name"]] = (get_python_type(field["type"]), default)

        pydantic_model = create_model(name, **kwargs)  # type: ignore
        # Register under both spellings so later references resolve either way.
        classes[name] = pydantic_model
        classes[full_name] = pydantic_model
        return pydantic_model  # type: ignore

    return record_type_to_pydantic(schema)

In [None]:
# Round trip: pydantic model -> Avro schema -> pydantic model.
user_schema = AvroBase.avro_schema_for_pydantic(User)
display(user_schema)

A = avsc_to_pydantic(user_schema)
display(A)
display(A.__fields__)
# The result must be a pydantic model class with the same fields, in order.
assert isinstance(A, ModelMetaclass)
assert list(A.__fields__.keys()) == ["name", "favorite_number", "favorite_color"]

# Instances of the regenerated model and of the original model built from the
# same inputs must compare equal.
assert A(name="Kumaran", favorite_number="9", favorite_color="black") == User(
    name="Kumaran", favorite_number="9", favorite_color="black"
)

{'type': 'record',
 'namespace': 'User',
 'name': 'User',
 'fields': [{'type': 'string', 'name': 'name'},
  {'type': ['null', 'long'], 'name': 'favorite_number', 'default': None},
  {'type': ['null', 'string'], 'name': 'favorite_color', 'default': None}]}

pydantic.main.User

{'name': ModelField(name='name', type=str, required=True),
 'favorite_number': ModelField(name='favorite_number', type=Optional[int], required=False, default=None),
 'favorite_color': ModelField(name='favorite_color', type=Optional[str], required=False, default=None)}

In [None]:
# Negative case: a record schema with an empty field list must be rejected.
user_schema = AvroBase.avro_schema_for_pydantic(User)
user_schema["fields"] = []

display(user_schema)

# avsc_to_pydantic raises ValueError("Avro schema has no fields") here.
with pytest.raises(ValueError) as e:
    A = avsc_to_pydantic(user_schema)
display(e)

{'type': 'record', 'namespace': 'User', 'name': 'User', 'fields': []}

<ExceptionInfo ValueError('Avro schema has no fields') tblen=3>