diff --git a/docusaurus.config.ts b/docusaurus.config.ts index 8df13af2b37..d09764d4c8e 100644 --- a/docusaurus.config.ts +++ b/docusaurus.config.ts @@ -98,44 +98,44 @@ const config: Config = { { type: 'docSidebar', sidebarId: 'startSidebar', - position: 'right', + position: 'left', label: 'Start', }, { type: 'docSidebar', sidebarId: 'introductionSidebar', - position: 'right', + position: 'left', label: 'Introduction', }, { type: 'docSidebar', sidebarId: 'guideSidebar', - position: 'right', + position: 'left', label: 'Guide', }, { type: 'docSidebar', sidebarId: 'specificationSidebar', - position: 'right', + position: 'left', label: 'Specification', }, { type: 'docSidebar', sidebarId: 'communitySidebar', - position: 'right', + position: 'left', label: 'Community', }, { to: '/user', label: 'Users', - position: "right", + position: "left", }, { - position: 'right', + position: 'left', to: '/download', label: 'Download', }, - {to: '/blog', label: 'Blog', position: 'right'}, + {to: '/blog', label: 'Blog', position: 'left'}, { type: 'dropdown', label: 'ASF', @@ -175,6 +175,10 @@ const config: Config = { } ] }, + { + type: 'docsVersionDropdown', + position: 'right', + }, { href: 'https://github.com/apache/fury', position: 'right', diff --git a/i18n/en-us/docusaurus-plugin-content-blog/authors.yml b/i18n/en-US/docusaurus-plugin-content-blog/authors.yml similarity index 100% rename from i18n/en-us/docusaurus-plugin-content-blog/authors.yml rename to i18n/en-US/docusaurus-plugin-content-blog/authors.yml diff --git a/i18n/en-us/docusaurus-plugin-content-blog/options.json b/i18n/en-US/docusaurus-plugin-content-blog/options.json similarity index 100% rename from i18n/en-us/docusaurus-plugin-content-blog/options.json rename to i18n/en-US/docusaurus-plugin-content-blog/options.json diff --git a/i18n/en-us/docusaurus-plugin-content-docs/current.json b/i18n/en-US/docusaurus-plugin-content-docs/current.json similarity index 100% rename from 
i18n/en-us/docusaurus-plugin-content-docs/current.json rename to i18n/en-US/docusaurus-plugin-content-docs/current.json diff --git a/i18n/en-us/docusaurus-plugin-content-docs/current/.keep b/i18n/en-US/docusaurus-plugin-content-docs/current/.keep similarity index 100% rename from i18n/en-us/docusaurus-plugin-content-docs/current/.keep rename to i18n/en-US/docusaurus-plugin-content-docs/current/.keep diff --git a/i18n/en-us/docusaurus-theme-classic/footer.json b/i18n/en-US/docusaurus-theme-classic/footer.json similarity index 100% rename from i18n/en-us/docusaurus-theme-classic/footer.json rename to i18n/en-US/docusaurus-theme-classic/footer.json diff --git a/i18n/en-us/docusaurus-theme-classic/navbar.json b/i18n/en-US/docusaurus-theme-classic/navbar.json similarity index 100% rename from i18n/en-us/docusaurus-theme-classic/navbar.json rename to i18n/en-US/docusaurus-theme-classic/navbar.json diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/specification/java_serialization_spec.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/specification/java_serialization_spec.md new file mode 100644 index 00000000000..82709374608 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/specification/java_serialization_spec.md @@ -0,0 +1,557 @@ +--- +title: Fury Java Serialization Format +sidebar_position: 1 +id: fury_java_serialization_spec +license: | + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--- + +## Spec overview + +Fury Java Serialization is an automatic object serialization framework that supports reference and polymorphism. Fury +will +convert an object from/to fury java serialization binary format. Fury has two core concepts for java serialization: + +- **Fury Java Binary format** +- **Framework to convert object to/from Fury Java Binary format** + +The serialization format is a dynamic binary format. The dynamics and reference/polymorphism support make Fury flexible, +much easier to use, but +also introduce more complexities compared to static serialization frameworks. So the format will be more complex. + +Here is the overall format: + +``` +| fury header | object ref meta | object class meta | object value data | +``` + +The data are serialized using little endian byte order overall. If byte swapping is costly for some object, +Fury will write the byte order for that object into the data instead of converting it to little endian. + +## Fury header + +Fury header starts with one byte: + +``` +| 4 bits | 1 bit | 1 bit | 1 bit | 1 bit | optional 4 bytes | ++---------------+-------+-------+--------+-------+------------------------------------+ +| reserved bits | oob | xlang | endian | null | unsigned int for meta start offset | +``` + +- null flag: 1 when object is null, 0 otherwise. If an object is null, other bits won't be set. +- endian flag: 1 when data is encoded by little endian, 0 for big endian. +- xlang flag: 1 when serialization uses xlang format, 0 when serialization uses Fury java format. 
+- oob flag: 1 when passed `BufferCallback` is not null, 0 otherwise. + +If meta share mode is enabled, an uncompressed unsigned int is appended to indicate the start offset of metadata. + +## Reference Meta + +Reference tracking handles whether the object is null, and whether to track reference for the object by writing +corresponding flags and maintaining internal state. + +Reference flags: + +| Flag | Byte Value | Description | +|---------------------|------------|---------------------------------------------------------------------------------------------------------------------------------------------------------| +| NULL FLAG | `-3` | This flag indicates the object is a null value. We don't use another byte to indicate REF, so that we can save one byte. | +| REF FLAG | `-2` | This flag indicates the object is already serialized previously, and fury will write a ref id with unsigned varint format instead of serialize it again | +| NOT_NULL VALUE FLAG | `-1` | This flag indicates the object is a non-null value and fury doesn't track ref for this type of object. | +| REF VALUE FLAG | `0` | This flag indicates the object is referencable and the first time to serialize. | + +When reference tracking is disabled globally or for specific types, or for certain types within a particular +context(e.g., a field of a class), only the `NULL` and `NOT_NULL VALUE` flags will be used for reference meta. + +## Class Meta + +Fury supports to register class by an optional id, the registration can be used for security check and class +identification. +If a class is registered, it will have a user-provided or an auto-growing unsigned int i.e. `class_id`. + +Depending on whether meta share mode and registration is enabled for current class, Fury will write class meta +differently. 
+ +### Schema consistent + +If schema consistent mode is enabled globally or enabled for current class, class meta will be written as follows: + +- If class is registered, it will be written as a fury unsigned varint: `class_id << 1`. +- If class is not registered: + - If class is not an array, fury will write one byte `0bxxxxxxx1` first, then write class name. + - The first little bit is `1`, which is different from first bit `0` of + encoded class id. Fury can use this information to determine whether to read class by class id for + deserialization. + - If class is not registered and class is an array, fury will write one byte `dimensions << 1 | 1` first, then write + component + class subsequently. This can reduce array class name cost if component class is or will be serialized. + - Class will be written as two enumerated fury unsigned by default: `package name` and `class name`. If meta share + mode is + enabled, + class will be written as an unsigned varint which points to index in `MetaContext`. + +### Schema evolution + +If schema evolution mode is enabled globally or enabled for current class, class meta will be written as follows: + +- If meta share mode is not enabled, class meta will be written as schema consistent mode. Additionally, field meta such + as field type + and name will be written with the field value using a key-value like layout. +- If meta share mode is enabled, class meta will be written as a meta-share encoded binary if class hasn't been written + before, otherwise an unsigned varint id which references to previous written class meta will be written. + +## Meta share + +> This mode will forbid streaming writing since it needs to look back for update the start offset after the whole object +> graph +> writing and meta collecting is finished. Only in this way we can ensure deserialization failure doesn't lost shared +> meta. 
+> Meta streamline will be supported in the future for enclosed meta sharing which doesn't cross multiple serializations +> of different objects. + +For Schema consistent mode, class will be encoded as an enumerated string by full class name. Here we mainly describe +the meta layout for schema evolution mode: + +``` +| 8 bytes meta header | meta size | variable bytes | variable bytes | variable bytes | ++-------------------------------+-----------|--------------------+-------------------+----------------+ +| 7 bytes hash + 1 bytes header | 1~2 bytes | current class meta | parent class meta | ... | +``` + +Class meta are encoded from parent class to leaf class, only class with serializable fields will be encoded. + +### Meta header + +Meta header is a 64 bits number value encoded in little endian order. + +- The lowest 4 bits `0b0000~0b1110` are used to record num classes. `0b1111` is reserved to indicate that Fury needs to + read more bytes for length using Fury unsigned int encoding. If current class doesn't have a parent class, or parent + class doesn't have fields to serialize, or we're in a context which serializes fields of current class + only( `ObjectStreamSerializer#SlotInfo` is an example), num classes will be 1. +- 5th bit is used to indicate whether this class needs schema evolution. +- 6th bit is used to indicate whether the size sum of all layers meta is less than 256. +- The other 56 bits are used to store the unique hash of `flags + all layers class meta`. + +### Meta size + +- If the size sum of all layers meta is less than 256, then one byte is written next to indicate the length of meta. +- Otherwise, write size as two bytes in little endian. + +### Single layer class meta + +``` +| unsigned varint | meta string | meta string | field info: variable bytes | variable bytes | ... 
| ++----------------------------+-----------------------+---------------------+-------------------------------+-----------------+-----+ +| num fields + register flag | header + package name | header + class name | header + type id + field name | next field info | ... | +``` + +- num fields: encode `num fields << 1 | register flag(1 when class registered)` as unsigned varint. + - If class is registered, then an unsigned varint class id will be written next, package and class name will be + omitted. + - If current class is schema consistent, then num field will be `0` to flag it. + - If current class isn't schema consistent, then num field will be the number of compatible fields. For example, + users + can use tag id to mark some field as compatible field in schema consistent context. In such cases, schema + consistent + fields will be serialized first, then compatible fields will be serialized next. At deserialization, Fury will use + fields info of those fields which aren't annotated by tag id for deserializing schema consistent fields, then use + fields info in meta for deserializing compatible fields. +- Package name encoding(omitted when class is registered): + - encoding algorithm: `UTF8/ALL_TO_LOWER_SPECIAL/LOWER_UPPER_DIGIT_SPECIAL` + - Header: `6 bits size | 2 bits encoding flags`. The `6 bits size: 0~63` will be used to indicate size `0~63`, + the value `63` the size need more byte to read, the encoding will encode `size - 63` as a varint next. +- Class name encoding(omitted when class is registered): + - encoding algorithm: `UTF8/LOWER_UPPER_DIGIT_SPECIAL/FIRST_TO_LOWER_SPECIAL/ALL_TO_LOWER_SPECIAL` + - header: `6 bits size | 2 bits encoding flags`. The `6 bits size: 0~63` will be used to indicate size `0~63`, + the value `63` the size need more byte to read, the encoding will encode `size - 63` as a varint next. +- Field info: + - header(8 + bits): `3 bits size + 2 bits field name encoding + polymorphism flag + nullability flag + ref tracking flag`. 
+ Users can use annotation to provide those info. + - 2 bits field name encoding: + - encoding: `UTF8/ALL_TO_LOWER_SPECIAL/LOWER_UPPER_DIGIT_SPECIAL/TAG_ID` + - If tag id is used, i.e. field name is written by an unsigned varint tag id. 2 bits encoding will be `11`. + - size of field name: + - The `3 bits size: 0~7` will be used to indicate length `1~7`, the value `6` the size read more bytes, + the encoding will encode `size - 7` as a varint next. + - If encoding is `TAG_ID`, then num_bytes of field name will be used to store tag id. + - ref tracking: when set to 1, ref tracking will be enabled for this field. + - nullability: when set to 1, this field can be null. + - polymorphism: when set to 1, the actual type of field will be the declared field type even the type if + not `final`. + - type id: + - For registered type-consistent classes, it will be the registered class id. + - Otherwise it will be encoded as `OBJECT_ID` if it isn't `final` and `FINAL_OBJECT_ID` if it's `final`. The + meta for such types is written separately instead of inlining here is to reduce meta space cost if object of + this type is serialized in current object graph multiple times, and the field value may be null too. + - Field name: If type id is set, type id will be used instead. Otherwise meta string encoding length and data will + be written instead. + +Field order are left as implementation details, which is not exposed to specification, the deserialization need to +resort fields based on Fury field comparator. In this way, fury can compute statistics for field names or types and +using a more compact encoding. 
+ +### Other layers class meta + +Same encoding algorithm as the previous layer except: + +- header + package name: + - Header: + - If package name has been written before: `varint index + sharing flag(set)` will be written + - If package name hasn't been written before: + - If meta string encoding is `LOWER_SPECIAL` and the length of encoded string `<=` 64, then header will be + `6 bits size + encoding flag(set) + sharing flag(unset)`. + - Otherwise, header will + be `3 bits unset + 3 bits encoding flags + encoding flag(unset) + sharing flag(unset)` + +## Meta String + +Meta string is mainly used to encode meta strings such as class name and field names. + +### Encoding Algorithms + +String binary encoding algorithm: + +| Algorithm | Pattern | Description | +|---------------------------|---------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| LOWER_SPECIAL | `a-z._$\|` | every char is written using 5 bits, `a-z`: `0b00000~0b11001`, `._$\|`: `0b11010~0b11101`, prepend one bit at the start to indicate whether strip last char since last byte may have 7 redundant bits(1 indicates strip last char) | +| LOWER_UPPER_DIGIT_SPECIAL | `a-zA-Z0~9._` | every char is written using 6 bits, `a-z`: `0b00000~0b11001`, `A-Z`: `0b11010~0b110011`, `0~9`: `0b110100~0b111101`, `._`: `0b111110~0b111111`, prepend one bit at the start to indicate whether strip last char since last byte may have 7 redundant bits(1 indicates strip last char) | +| UTF-8 | any chars | UTF-8 encoding | + +Encoding flags: + +| Encoding Flag | Pattern | Encoding Algorithm | 
+|---------------------------|---------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------| +| LOWER_SPECIAL | every char is in `a-z._$\|` | `LOWER_SPECIAL` | +| FIRST_TO_LOWER_SPECIAL | every char is in `a-z[c1,c2]` except first char is upper case | replace first upper case char to lower case, then use `LOWER_SPECIAL` | +| ALL_TO_LOWER_SPECIAL | every char is in `a-zA-Z[c1,c2]` | replace every upper case char by `\|` + `lower case`, then use `LOWER_SPECIAL`, use this encoding if it's smaller than Encoding `LOWER_UPPER_DIGIT_SPECIAL` | +| LOWER_UPPER_DIGIT_SPECIAL | every char is in `a-zA-Z[c1,c2]` | use `LOWER_UPPER_DIGIT_SPECIAL` encoding if it's smaller than Encoding `FIRST_TO_LOWER_SPECIAL` | +| UTF8 | any utf-8 char | use `UTF-8` encoding | +| Compression | any utf-8 char | lossless compression | + +Notes: + +- For package name encoding, `c1,c2` should be `._`; For field/type name encoding, `c1,c2` should be `_$`; +- Depending on cases, one can choose encoding `flags + data` jointly, uses 3 bits of first byte for flags and other + bytes + for data. + +### Shared meta string + +The shared meta string format consists of header and encoded string binary. Header of encoded string binary will be +inlined +in shared meta header. + +Header is written using little endian order, Fury can read this flag first to determine how to deserialize the data. + +#### Write by data + +If string hasn't been written before, the data will be written as follows: + +``` +| unsigned varint: string binary size + 1 bit: not written before | 56 bits: unique hash | 3 bits encoding flags + string binary | +``` + +If string binary size is less than `16` bytes, the hash will be omitted to save spaces. Unique hash can be omitted too +if caller pass a flag to disable it. 
In such cases, the format will be: + +``` +| unsigned varint: string binary size + 1 bit: not written before | 3 bits encoding flags + string binary | +``` + +#### Write by ref + +If string has been written before, the data will be written as follows: + +``` +| unsigned varint: written string id + 1 bit: written before | +``` + +## Value Format + +### Basic types + +#### Bool + +- size: 1 byte +- format: 0 for `false`, 1 for `true` + +#### Byte + +- size: 1 byte +- format: write as pure byte. + +#### Short + +- size: 2 byte +- byte order: little endian order + +#### Char + +- size: 2 byte +- byte order: little endian order + +#### Unsigned int + +- size: 1~5 byte +- Format: The most significant bit (MSB) in every byte indicates whether to have the next byte. If first bit is set + i.e. `b & 0x80 == 0x80`, then + the next byte should be read until the first bit of the next byte is unset. + +#### Signed int + +- size: 1~5 byte +- Format: First convert the number into positive unsigned int by `(v << 1) ^ (v >> 31)` ZigZag algorithm, then encoding + it as an unsigned int. + +#### Unsigned long + +- size: 1~9 byte +- Fury PVL(Progressive Variable-length Long) Encoding: + - positive long format: first bit in every byte indicates whether to have the next byte. If first bit is set + i.e. `b & 0x80 == 0x80`, then the next byte should be read until the first bit is unset. + +#### Signed long + +- size: 1~9 byte +- Fury SLI(Small long as int) Encoding: + - If long is in [-1073741824, 1073741823], encode as 4 bytes int: `| little-endian: ((int) value) << 1 |` + - Otherwise write as 9 bytes: `| 0b1 | little-endian 8 bytes long |` +- Fury PVL(Progressive Variable-length Long) Encoding: + - First convert the number into positive unsigned long by `(v << 1) ^ (v >> 63)` ZigZag algorithm to reduce cost of + small negative numbers, then encoding it as an unsigned long. 
+ +#### Float + +- size: 4 byte +- format: convert float to 4 bytes int by `Float.floatToRawIntBits`, then write as binary by little endian order. + +#### Double + +- size: 8 byte +- format: convert double to 8 bytes int by `Double.doubleToRawLongBits`, then write as binary by little endian order. + +### String + +Format: + +``` +| header: size << 2 | 2 bits encoding flags | binary data | +``` + +- `size + encoding` will be concat as a long and encoded as an unsigned var long. The little 2 bits is used for + encoding: + 0 for `latin`, 1 for `utf-16`, 2 for `utf-8`. +- encoded string binary data based on encoding: `latin/utf-16/utf-8`. + +Which encoding to choose: + +- For JDK8: fury detect `latin` at runtime, if string is `latin` string, then use `latin` encoding, otherwise + use `utf-16`. +- For JDK9+: fury use `coder` in `String` object for encoding, `latin`/`utf-16` will be used for encoding. +- If the string is encoded by `utf-8`, then fury will use `utf-8` to decode the data. But currently fury doesn't enable + utf-8 encoding by default for java. Cross-language string serialization of fury uses `utf-8` by default. + +### Collection + +> All collection serializers must extend `AbstractCollectionSerializer`. + +Format: + +``` +length(unsigned varint) | collection header | elements header | elements data +``` + +#### Collection header + +- For `ArrayList/LinkedArrayList/HashSet/LinkedHashSet`, this will be empty. +- For `TreeSet`, this will be `Comparator` +- For subclass of `ArrayList`, this may be extra object field info. + +#### Elements header + +In most cases, all collection elements are same type and not null, elements header will encode those homogeneous +information to avoid the cost of writing it for every element. Specifically, there are four kinds of information +which will be encoded by elements header, each use one bit: + +- If track elements ref, use the first bit `0b1` of the header to flag it. 
+- If the collection has null, use the second bit `0b10` of the header to flag it. If ref tracking is enabled for this + element type, this flag is invalid. +- If the collection element types are not the declared type, use the 3rd bit `0b100` of the header to flag it. +- If the collection element types are different, use the 4th bit `0b1000` of the header to flag it. + +By default, all bits are unset, which means all elements won't track ref, all elements are same type, not null and +the actual element is the declared type in the custom class field. + +The implementation can generate different deserialization code based on the header read, and look up the generated code from a +linear map/list. + +#### Elements data + +Based on the elements header, the serialization of elements data may skip `ref flag`/`null flag`/`element class info`. + +`CollectionSerializer#write/read` can be taken as an example. + +### Array + +#### Primitive array + +Primitive arrays are taken as a binary buffer, serialization will just write the length of array size as an unsigned int, +then copy the whole buffer into the stream. + +Such serialization won't compress the array. If users want to compress primitive array, users need to register custom +serializers for such types. + +#### Object array + +Object array is serialized using the collection format. Object component type will be taken as collection element +generic +type. + +### Map + +> All Map serializers must extend `AbstractMapSerializer`. + +Format: + +``` +| length(unsigned varint) | map header | key value pairs data | +``` + +#### Map header + +- For `HashMap/LinkedHashMap`, this will be empty. +- For `TreeMap`, this will be `Comparator` +- For other `Map`, this may be extra object field info. + +#### Map Key-Value data + +Map iteration is too expensive, Fury won't compute the header like for collection before since it introduces +[considerable overhead](https://github.com/apache/fury/issues/925). 
+Users can use `MapFieldInfo` annotation to provide header in advance. Otherwise Fury will use first key-value pair to +predict header optimistically, and update the chunk header if the prediction failed at some pair. + +Fury will serialize map chunk by chunk, every chunk has 127 pairs at most. + +``` +| 1 byte | 1 byte | variable bytes | ++----------------+----------------+-----------------+ +| KV header | chunk size: N | N*2 objects | +``` + +KV header: + +- If track key ref, use the first bit `0b1` of the header to flag it. +- If the key has null, use the second bit `0b10` of the header to flag it. If ref tracking is enabled for this + key type, this flag is invalid. +- If the actual key type of map is not the declared key type, use the 3rd bit `0b100` of the header to flag it. +- If track value ref, use the 4th bit `0b1000` of the header to flag it. +- If the value has null, use the 5th bit `0b10000` of the header to flag it. If ref tracking is enabled for this + value type, this flag is invalid. +- If the value type of map is not the declared value type, use the 6th bit `0b100000` of the header to flag it. +- If key or value is null, that key and value will be written as a separate chunk, and chunk size writing will be + skipped too. + +If streaming write is enabled, Fury can't update the written `chunk size`. In such cases, map key-value data +format will be: + +``` +| 1 byte | variable bytes | ++----------------+-----------------+ +| KV header | N*2 objects | +``` + +`KV header` will be a header marked by `MapFieldInfo` in java. The implementation can generate different deserialization +code based on the header read, and look up the generated code from a linear map/list. + +### Enum + +Enums are serialized as an unsigned var int. If the order of enum values changes, the deserialized enum value may not be +the value users expect. In such cases, users must register an enum serializer that writes the enum value as an enumerated +string with unique hash disabled. 
+ +### Object + +Object means object of `pojo/struct/bean/record` type. +Object will be serialized by writing its fields data in fury order. + +Depending on schema compatibility, objects will have different formats. + +#### Field order + +Field will be ordered as following, every group of fields will have its own order: + +- primitive fields: larger size type first, smaller later, variable size type last. +- boxed primitive fields: same order as primitive fields +- final fields: same type together, then sorted by field name lexicographically. +- collection fields: same order as final fields +- map fields: same order as final fields +- other fields: same order as final fields + +#### Schema consistent + +Object fields will be serialized one by one using following format: + +``` +Primitive field value: +| var bytes | ++----------------+ +| value data | ++----------------+ +Boxed field value: +| one byte | var bytes | ++-----------+---------------+ +| null flag | field value | ++-----------+---------------+ +field value of final type with ref tracking: +| var bytes | var objects | ++-----------+-------------+ +| ref meta | value data | ++-----------+-------------+ +field value of final type without ref tracking: +| one byte | var objects | ++-----------+-------------+ +| null flag | field value | ++-----------+-------------+ +field value of non-final type with ref tracking: +| one byte | var bytes | var objects | ++-----------+-------------+-------------+ +| ref meta | class meta | value data | ++-----------+-------------+-------------+ +field value of non-final type without ref tracking: +| one byte | var bytes | var objects | ++-----------+------------+------------+ +| null flag | class meta | value data | ++-----------+------------+------------+ +``` + +#### Schema evolution + +Schema evolution have similar format as schema consistent mode for object except: + +- For this object type itself, `schema consistent` mode will write class by id/name, but `schema 
evolution` mode will + write class field names, types and other meta too, see [Class meta](#class-meta). +- Class meta of `final custom type` needs to be written too, because peers may not have this class defined. + +### Class + +Class will be serialized using class meta format. + +## Implementation guidelines + +- Try to merge multiple bytes into an int/long write before writing to reduce memory IO and bound check cost. +- Read multiple bytes as an int/long, then split into multiple bytes to reduce memory IO and bound check cost. +- Try to use one varint/long to write flags and length together to save one byte cost and reduce memory io. +- Condition branches are less expensive compared to memory IO cost unless there are too many branches. diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/specification/row_format_spec.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/specification/row_format_spec.md new file mode 100644 index 00000000000..eefd9d9793b --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/specification/row_format_spec.md @@ -0,0 +1,24 @@ +--- +title: Fury Row Format +sidebar_position: 2 +id: fury_row_format_spec +license: | + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--- + +## Row Format + +Coming soon diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/specification/xlang_serialization_spec.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/specification/xlang_serialization_spec.md new file mode 100644 index 00000000000..d15a3da9fd3 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/specification/xlang_serialization_spec.md @@ -0,0 +1,807 @@ +--- +title: Fury Xlang Serialization Format +sidebar_position: 0 +id: fury_xlang_serialization_spec +license: | + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--- + +## Cross-language Serialization Specification + +> Format Version History: +> +> - Version 0.1 - serialization spec formalized + +Fury xlang serialization is an automatic object serialization framework that supports reference and polymorphism. +Fury will convert an object from/to fury xlang serialization binary format. +Fury has two core concepts for xlang serialization: + +- **Fury xlang binary format** +- **Framework implemented in different languages to convert object to/from Fury xlang binary format** + +The serialization format is a dynamic binary format. 
The dynamics and reference/polymorphism support make Fury flexible, +much easier to use, but +also introduce more complexities compared to static serialization frameworks. So the format will be more complex. + +## Type Systems + +### Data Types + +- bool: a boolean value (true or false). +- int8: an 8-bit signed integer. +- int16: a 16-bit signed integer. +- int32: a 32-bit signed integer. +- var_int32: a 32-bit signed integer which uses fury var_int32 encoding. +- int64: a 64-bit signed integer. +- var_int64: a 64-bit signed integer which uses fury PVL encoding. +- sli_int64: a 64-bit signed integer which uses fury SLI encoding. +- float16: a 16-bit floating point number. +- float32: a 32-bit floating point number. +- float64: a 64-bit floating point number including NaN and Infinity. +- string: a text string encoded using Latin1/UTF16/UTF-8 encoding. +- enum: a data type consisting of a set of named values. Rust enum with non-predefined field values is not supported as + an enum. +- named_enum: an enum whose value will be serialized as the registered name. +- struct: a morphic(final) type serialized by Fury Struct serializer. i.e. it doesn't have subclasses. Suppose we're + deserializing `List<SomeClass>`, we can save dynamic serializer dispatch since `SomeClass` is morphic(final). +- compatible_struct: a morphic(final) type serialized by Fury compatible Struct serializer. +- named_struct: a `struct` whose type mapping will be encoded as a name. +- named_compatible_struct: a `compatible_struct` whose type mapping will be encoded as a name. +- ext: a type which will be serialized by a customized serializer. +- named_ext: an `ext` type whose type mapping will be encoded as a name. +- list: a sequence of objects. +- set: an unordered set of unique elements. +- map: a map of key-value pairs. Mutable types such as `list/map/set/array/tensor/arrow` are not allowed as key of map.
+- duration: an absolute length of time, independent of any calendar/timezone, as a count of nanoseconds. +- timestamp: a point in time, independent of any calendar/timezone, as a count of nanoseconds. The count is relative + to an epoch at UTC midnight on January 1, 1970. +- local_date: a naive date without timezone. The count is days relative to an epoch at UTC midnight on Jan 1, 1970. +- decimal: exact decimal value represented as an integer value in two's complement. +- binary: a variable-length array of bytes. +- array: only allow numeric components. Other arrays will be taken as List. The implementation should support the + interoperability between array and list. +- array: multidimensional array which every sub-array can have different sizes but all have same type. +- bool_array: one dimensional bool array. +- int8_array: one dimensional int8 array. +- int16_array: one dimensional int16 array. +- int32_array: one dimensional int32 array. +- int64_array: one dimensional int64 array. +- float16_array: one dimensional half_float_16 array. +- float32_array: one dimensional float32 array. +- float64_array: one dimensional float64 array. +- arrow record batch: an arrow [record batch](https://arrow.apache.org/docs/cpp/tables.html#record-batches) object. +- arrow table: an arrow [table](https://arrow.apache.org/docs/cpp/tables.html#tables) object. + +Note: + +- Unsigned int/long are not added here, since not every language supports those types. + +### Polymorphisms + +For polymorphism, if one non-final class is registered, and only one subclass is registered, then we can assume all +elements in List/Map have the same type, thus reducing runtime check cost. + +Collection/Array polymorphism is not fully supported, since some languages such as golang have only one collection +type. If users want to get exactly the type they passed, they must pass that type when deserializing or annotate that type +to the field of struct.
+ +### Type disambiguation + +Due to differences between type systems of languages, those types can't be mapped one-to-one between languages. When +deserializing, Fury use the target data structure type and the data type in the data jointly to determine how to +deserialize and populate the target data structure. For example: + +```java +class Foo { + int[] intArray; + Object[] objects; + List objectList; +} + +class Foo2 { + int[] intArray; + List objects; + List objectList; +} +``` + +`intArray` has an `int32_array` type. But both `objects` and `objectList` fields in the serialize data have `list` data +type. When deserializing, the implementation will create an `Object` array for `objects`, but create a `ArrayList` +for `objectList` to populate its elements. And the serialized data of `Foo` can be deserialized into `Foo2` too. + +Users can also provide meta hints for fields of a type, or the type whole. Here is an example in java which use +annotation to provide such information. + +```java +@FuryObject(fieldsNullable = false, trackingRef = false) +class Foo { + @FuryField(trackingRef = false) + int[] intArray; + @FuryField(polymorphic = true) + Object object; + @FuryField(tagId = 1, nullable = true) + List objectList; +} +``` + +Such information can be provided in other languages too: + +- cpp: use macro and template. +- golang: use struct tag. +- python: use typehint. +- rust: use macro. + +### Type ID + +All internal data types are expressed using an ID in range `0~64`. Users can use `0~4096` for representing their +types. + +### Type mapping + +See [Type mapping](../guide/xlang_type_mapping.md) + +## Spec overview + +Here is the overall format: + +``` +| fury header | object ref meta | object type meta | object value data | +``` + +The data are serialized using little endian byte order overall. If bytes swap is costly for some object, +Fury will write the byte order for that object into the data instead of converting it to little endian. 
+ +## Fury header + +Fury header has the following layout: + +``` +| 2 bytes | 4 bits | 1 bit | 1 bit | 1 bit | 1 bit | 1 byte | optional 4 bytes | ++--------------+---------------+-------+-------+--------+-------+------------+------------------------------------+ +| magic number | reserved bits | oob | xlang | endian | null | language | unsigned int for meta start offset | +``` + +- magic number: used to identify fury serialization protocol, current version uses `0x62d4`. +- null flag: 1 when object is null, 0 otherwise. If an object is null, other bits won't be set. +- endian flag: 1 when data is encoded by little endian, 0 for big endian. +- xlang flag: 1 when serialization uses xlang format, 0 when serialization uses Fury java format. +- oob flag: 1 when passed `BufferCallback` is not null, 0 otherwise. +- language: the language when serializing objects, such as JAVA, PYTHON, GO, etc. Fury can use this flag to determine whether to spend more time on serialization to make the deserialization faster for dynamic languages. + +If meta share mode is enabled, an uncompressed unsigned int is appended to indicate the start offset of metadata. + +## Reference Meta + +Reference tracking handles whether the object is null, and whether to track reference for the object by writing +corresponding flags and maintaining internal state. + +Reference flags: + +| Flag | Byte Value | Description | +|---------------------|------------|---------------------------------------------------------------------------------------------------------------------------------------------------------| +| NULL FLAG | `-3` | This flag indicates the object is a null value. We don't use another byte to indicate REF, so that we can save one byte.
| +| REF FLAG | `-2` | This flag indicates the object is already serialized previously, and fury will write a ref id with unsigned varint format instead of serialize it again | +| NOT_NULL VALUE FLAG | `-1` | This flag indicates the object is a non-null value and fury doesn't track ref for this type of object. | +| REF VALUE FLAG | `0` | This flag indicates the object is referencable and the first time to serialize. | + +When reference tracking is disabled globally or for specific types, or for certain types within a particular +context(e.g., a field of a type), only the `NULL` and `NOT_NULL VALUE` flags will be used for reference meta. + +For languages which doesn't support reference such as rust, reference tracking must be disabled for correct +deserialization by fury rust implementation. + +For languages whose object values are not null by default: + +- In rust, Fury takes `Option:None` as a null value +- In c++, Fury takes `std::nullopt` as a null value +- In golang, Fury takes `null interface/pointer` as a null value + +If one want to deserialize in languages like `Java/Python/JavaScript`, he should mark the type with all fields +not-null by default, or using schema-evolution mode to carry the not-null fields info in the data. + +## Type Meta + +For every type to be serialized, it must be registered with an optional ID first. The registered type will have a +user-provided or an auto-growing unsigned int i.e. `type_id`. The registration can be used for security check and type +identification. The id of user registered type will be added by `64` to make space for Fury internal data types. + +Depending on whether meta share mode and registration is enabled for current type, Fury will write type meta +differently. + +### Schema consistent + +- If schema consistent mode is enabled globally when creating fury, type meta will be written as a fury unsigned varint + of `type_id`. Schema evolution related meta will be ignored. 
+- If schema evolution mode is enabled globally when creating fury, and current class is configured to use schema + consistent mode like `struct` vs `table` in flatbuffers: + - Type meta will be added to `captured_type_defs`: `captured_type_defs[type def stub] = map size` ahead when + registering type. + - Get index of the meta in `captured_type_defs`, write that index as `| unsigned varint: index |`. + +### Schema evolution + +If schema evolution mode is enabled globally when creating fury, and enabled for current type, type meta will be written +using one of the following modes. Which mode to use is configured when creating fury. + +- Normal mode(meta share not enabled): + - If type meta hasn't been written before, add `type def` + to `captured_type_defs`: `captured_type_defs[type def] = map size`. + - Get index of the meta in `captured_type_defs`, write that index as `| unsigned varint: index |`. + - After finishing the serialization of the object graph, fury will start to write `captured_type_defs`: + - Firstly, set current to `meta start offset` of fury header + - Then write `captured_type_defs` one by one: + + ```python + buffer.write_var_uint32(len(writing_type_defs) - len(schema_consistent_type_def_stubs)) + for type_meta in writing_type_defs: + if not type_meta.is_stub(): + type_meta.write_type_def(buffer) + writing_type_defs = copy(schema_consistent_type_def_stubs) + ``` + +- Meta share mode: the writing steps are same as the normal mode, but `captured_type_defs` will be shared across + multiple serializations of different objects. For example, suppose we have a batch to serialize: + + ```python + captured_type_defs = {} + stream = ... + # add `Type1` to `captured_type_defs` and write `Type1` + fury.serialize(stream, [Type1()]) + # add `Type2` to `captured_type_defs` and write `Type2`, `Type1` is written before. + fury.serialize(stream, [Type1(), Type2()]) + # `Type1` and `Type2` are written before, no need to write meta.
+ fury.serialize(stream, [Type1(), Type2()]) + ``` + +- Streaming mode(streaming mode doesn't support meta share): + - If type meta hasn't been written before, the data will be written as: + + ``` + | unsigned varint: 0b11111111 | type def | + ``` + + - If type meta has been written before, the data will be written as: + + ``` + | unsigned varint: written index << 1 | + ``` + + `written index` is the id in `captured_type_defs`. + - With this mode, `meta start offset` can be omitted. + +> The normal mode and meta share mode will forbid streaming writing since it needs to look back to update the start +> offset after the whole object graph writing and meta collecting is finished. Only in this way we can ensure +> deserialization failure in meta share mode doesn't lose shared meta. + +#### Type Def + +Here we mainly describe the meta layout for schema evolution mode: + +``` +| 8 bytes meta header | variable bytes | variable bytes | variable bytes | ++-------------------------------+--------------------+-------------------+----------------+ +| 7 bytes hash + 1 bytes header | current type meta | parent type meta | ... | +``` + +Type meta are encoded from parent type to leaf type, only type with serializable fields will be encoded. + +##### Meta header + +Meta header is a 64 bits number value encoded in little endian order. + +- Lowest 4 digits `0b0000~0b1110` are used to record num classes. `0b1111` is reserved to indicate that Fury needs to + read more bytes for length using Fury unsigned int encoding. If current type doesn't have a parent type, or parent + type doesn't have fields to serialize, or we're in a context which serializes fields of current type + only, num classes will be 1. +- The 5th bit is used to indicate whether this type needs schema evolution. +- Other 56 bits are used to store the unique hash of `flags + all layers type meta`. + +##### Single layer type meta + +``` +| unsigned varint | var uint | field info: variable bytes | variable bytes | ...
| ++-----------------+----------+-------------------------------+-----------------+-----+ +| num_fields | type id | header + type id + field name | next field info | ... | +``` + +- num fields: encode `num fields` as unsigned varint. + - If the current type is schema consistent, then num_fields will be `0` to flag it. + - If the current type isn't schema consistent, then num_fields will be the number of compatible fields. For example, + users can use tag id to mark some fields as compatible fields in schema consistent context. In such cases, schema + consistent fields will be serialized first, then compatible fields will be serialized next. At deserialization, + Fury will use fields info of those fields which aren't annotated by tag id for deserializing schema consistent + fields, then use fields info in meta for deserializing compatible fields. +- type id: the registered id for the current type, which will be written as an unsigned varint. +- field info: + - header(8 + bits): `4 bits size + 2 bits field name encoding + nullability flag + ref tracking flag`. + Users can use annotation to provide those info. + - 2 bits field name encoding: + - encoding: `UTF8/ALL_TO_LOWER_SPECIAL/LOWER_UPPER_DIGIT_SPECIAL/TAG_ID` + - If tag id is used, i.e. field name is written by an unsigned varint tag id. 2 bits encoding will be `11`. + - size of field name: + - The `4 bits size: 0~14` will be used to indicate length `1~15`, the value `15` indicates to read more bytes, + the encoding will encode `size - 15` as a varint next. + - If encoding is `TAG_ID`, then num_bytes of field name will be used to store tag id. + - ref tracking: when set to 1, ref tracking will be enabled for this field. + - nullability: when set to 1, this field can be null. + - field name: If tag id is set, tag id will be used instead. Otherwise meta string encoding `[length]` and data will + be written instead. + - type id: + - Format: `id << 1 | polymorphic flag`. 
If field type is polymorphic, this flag is set to `0b1`, otherwise it's + `0b0` + - For registered type-consistent classes, it will be the registered type id. + - For struct type it will be written as `STRUCT`. + - The meta for struct type is written separately instead of inlining here is to reduce meta space cost if object of + this type is serialized in current object graph multiple times, and the field value may be null too. + - For enum type, it will be written as `ENUM`. + - For collection type, it will be written as `COLLECTION`, then write element type recursively. + - For map type, it will be written as `MAP`, then write key and value type recursively. + +Field order are left as implementation details, which is not exposed to specification, the deserialization need to +resort fields based on Fury field comparator. In this way, fury can compute statistics for field names or types and +using a more compact encoding. + +##### Other layers type meta + +Same encoding algorithm as the previous layer. + +## Meta String + +Meta string is mainly used to encode meta strings such as field names. 
+ +### Encoding Algorithms + +String binary encoding algorithm: + +| Algorithm | Pattern | Description | +|---------------------------|---------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| LOWER_SPECIAL | `a-z._$\|` | every char is written using 5 bits, `a-z`: `0b00000~0b11001`, `._$\|`: `0b11010~0b11101`, prepend one bit at the start to indicate whether strip last char since last byte may have 7 redundant bits(1 indicates strip last char) | +| LOWER_UPPER_DIGIT_SPECIAL | `a-zA-Z0~9._` | every char is written using 6 bits, `a-z`: `0b00000~0b11001`, `A-Z`: `0b11010~0b110011`, `0~9`: `0b110100~0b111101`, `._`: `0b111110~0b111111`, prepend one bit at the start to indicate whether strip last char since last byte may have 7 redundant bits(1 indicates strip last char) | +| UTF-8 | any chars | UTF-8 encoding | + +Encoding flags: + +| Encoding Flag | Pattern | Encoding Algorithm | +|---------------------------|----------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------| +| LOWER_SPECIAL | every char is in `a-z._\|` | `LOWER_SPECIAL` | +| FIRST_TO_LOWER_SPECIAL | every char is in `a-z._` except first char is upper case | replace first upper case char to lower case, then use `LOWER_SPECIAL` | +| ALL_TO_LOWER_SPECIAL | every char is in `a-zA-Z._` | replace every upper case char by `\|` + `lower case`, then use `LOWER_SPECIAL`, use this encoding if it's smaller than Encoding `LOWER_UPPER_DIGIT_SPECIAL` | +| LOWER_UPPER_DIGIT_SPECIAL | every char is in `a-zA-Z._` | use `LOWER_UPPER_DIGIT_SPECIAL` encoding if it's smaller than Encoding `FIRST_TO_LOWER_SPECIAL` | +| UTF8 | any 
utf-8 char | use `UTF-8` encoding | +| Compression | any utf-8 char | lossless compression | + +Notes: + +- Depending on cases, one can choose encoding `flags + data` jointly, uses 3 bits of first byte for flags and other + bytes + for data. + +## Value Format + +### Basic types + +#### bool + +- size: 1 byte +- format: 0 for `false`, 1 for `true` + +#### int8 + +- size: 1 byte +- format: write as pure byte. + +#### int16 + +- size: 2 byte +- byte order: raw bytes of little endian order + +#### unsigned int32 + +- size: 4 byte +- byte order: raw bytes of little endian order + +#### unsigned varint32 + +- size: 1~5 byte +- Format: The most significant bit (MSB) in every byte indicates whether to have the next byte. If first bit is set + i.e. `b & 0x80 == 0x80`, then + the next byte should be read until the first bit of the next byte is unset. + +#### signed int32 + +- size: 4 byte +- byte order: raw bytes of little endian order + +#### signed varint32 + +- size: 1~5 byte +- Format: First convert the number into positive unsigned int by `(v << 1) ^ (v >> 31)` ZigZag algorithm, then encode + it as an unsigned varint. + +#### unsigned int64 + +- size: 8 byte +- byte order: raw bytes of little endian order + +#### unsigned varint64 + +- size: 1~9 byte +- Fury SLI(Small long as int) Encoding: + - If long is in `[0, 2147483647]`, encode as 4 bytes int: `| little-endian: ((int) value) << 1 |` + - Otherwise write as 9 bytes: `| 0b1 | little-endian 8 bytes long |` +- Fury PVL(Progressive Variable-length Long) Encoding: + - positive long format: first bit in every byte indicates whether to have the next byte. If first bit is set + i.e. `b & 0x80 == 0x80`, then the next byte should be read until the first bit is unset. 
+ +#### signed int64 + +- size: 8 byte +- byte order: raw bytes of little endian order + +#### signed varint64 + +- size: 1~9 byte +- Fury SLI(Small long as int) Encoding: + - If long is in `[-1073741824, 1073741823]`, encode as 4 bytes int: `| little-endian: ((int) value) << 1 |` + - Otherwise write as 9 bytes: `| 0b1 | little-endian 8 bytes long |` +- Fury PVL(Progressive Variable-length Long) Encoding: + - First convert the number into positive unsigned long by `(v << 1) ^ (v >> 63)` ZigZag algorithm to reduce cost of + small negative numbers, then encoding it as an unsigned long. + +#### float32 + +- size: 4 byte +- format: encode the specified floating-point value according to the IEEE 754 floating-point "single format" bit layout, + preserving Not-a-Number (NaN) values, then write as binary by little endian order. + +#### float64 + +- size: 8 byte +- format: encode the specified floating-point value according to the IEEE 754 floating-point "double format" bit layout, + preserving Not-a-Number (NaN) values. then write as binary by little endian order. + +### string + +Format: + +``` +| unsigned varint64: size << 2 `bitor` 2 bits encoding flags | binary data | +``` + +- `size + encoding` will be concat as a long and encoded as an unsigned varint64. The little 2 bits is used for + encoding: + 0 for `latin1(ISO-8859-1)`, 1 for `utf-16`, 2 for `utf-8`. +- encoded string binary data based on encoding: `latin/utf-16/utf-8`. + +Which encoding to choose: + +- For JDK8: fury detect `latin` at runtime, if string is `latin` string, then use `latin` encoding, otherwise + use `utf-16`. +- For JDK9+: fury use `coder` in `String` object for encoding, `latin`/`utf-16` will be used for encoding. +- If the string is encoded by `utf-8`, then fury will use `utf-8` to decode the data. Cross-language string + serialization of fury uses `utf-8` by default. 
+ +### list + +Format: + +``` +| unsigned varint64: length << 4 `bitor` 4 bits elements header | elements data | +``` + +#### elements header + +In most cases, all elements are same type and not null, elements header will encode those homogeneous +information to avoid the cost of writing it for every element. Specifically, there are four kinds of information +which will be encoded by elements header, each using one bit: + +- If track elements ref, use the first bit `0b1` of the header to flag it. +- If the elements have null, use the second bit `0b10` of the header to flag it. If ref tracking is enabled for this + element type, this flag is invalid. +- If the element types are not the declared type, use the 3rd bit `0b100` of the header to flag it. +- If the element types are different, use the 4th bit `0b1000` of the header to flag it. + +By default, all bits are unset, which means all elements won't track ref, all elements are same type, not null and +the actual element is the declared type in the custom type field. + +The implementation can generate different deserialization code based on the read header, and look up the generated code from +a linear map/list. + +#### elements data + +Based on the elements header, the serialization of elements data may skip `ref flag`/`null flag`/`element type info`. + +```python +fury = ... +buffer = ... +elems = ... +if element_type_is_same: + if not is_declared_type: + fury.write_type(buffer, elem_type) + elem_serializer = get_serializer(...)
+ if track_ref: + for elem in elems: + if not ref_resolver.write_ref_or_null(buffer, elem): + elem_serializer.write(buffer, elem) + elif has_null: + for elem in elems: + if elem is None: + buffer.write_byte(null_flag) + else: + buffer.write_byte(not_null_flag) + elem_serializer.write(buffer, elem) + else: + for elem in elems: + elem_serializer.write(buffer, elem) +else: + if track_ref: + for elem in elems: + fury.write_ref(buffer, elem) + elif has_null: + for elem in elems: + fury.write_nullable(buffer, elem) + else: + for elem in elems: + fury.write_value(buffer, elem) +``` + +[`CollectionSerializer#writeElements`](https://github.com/apache/fury/blob/20a1a78b17a75a123a6f5b7094c06ff77defc0fe/java/fury-core/src/main/java/org/apache/fury/serializer/collection/AbstractCollectionSerializer.java#L302) +can be taken as an example. + +### array + +#### primitive array + +Primitive array are taken as a binary buffer, serialization will just write the length of array size as an unsigned int, +then copy the whole buffer into the stream. + +Such serialization won't compress the array. If users want to compress primitive array, users need to register custom +serializers for such types or mark it as list type. + +#### object array + +Object array is serialized using the list format. Object component type will be taken as list element +generic type. + +### map + +> All Map serializers must extend `AbstractMapSerializer`. + +Format: + +``` +| length(unsigned varint) | key value chunk data | ... | key value chunk data | +``` + +#### map key-value chunk data + +Map iteration is too expensive, Fury won't compute the header like for list since it introduce +[considerable overhead](https://github.com/apache/fury/issues/925). +Users can use `MapFieldInfo` annotation to provide the header in advance. Otherwise Fury will use first key-value pair +to predict header optimistically, and update the chunk header if the prediction failed at some pair. 
+ +Fury will serialize the map chunk by chunk, every chunk has 255 pairs at most. + +``` +| 1 byte | 1 byte | variable bytes | ++----------------+----------------+-----------------+ +| KV header | chunk size: N | N*2 objects | +``` + +KV header: + +- If track key ref, use the first bit `0b1` of the header to flag it. +- If the key has null, use the second bit `0b10` of the header to flag it. If ref tracking is enabled for this + key type, this flag is invalid. +- If the actual key type of map is not the declared key type, use the 3rd bit `0b100` of the header to flag it. +- If track value ref, use the 4th bit `0b1000` of the header to flag it. +- If the value has null, use the 5th bit `0b10000` of the header to flag it. If ref tracking is enabled for this + value type, this flag is invalid. +- If the value type of map is not the declared value type, use the 6th bit `0b100000` of the header to flag it. +- If key or value is null, that key and value will be written as a separate chunk, and chunk size writing will be + skipped too. + +If streaming write is enabled, Fury can't update the written `chunk size`. In such cases, map key-value data +format will be: + +``` +| 1 byte | variable bytes | ++----------------+-----------------+ +| KV header | N*2 objects | +``` + +`KV header` will be a header marked by `MapFieldInfo` in java. For languages such as golang, this can be computed in +advance for non-interface types most times. The implementation can generate different deserialization code based on the read +header, and look up the generated code from a linear map/list. + +#### Why serialize chunk by chunk? + +When fury uses the first key-value pair to predict header optimistically, it can't know how many pairs have same +meta(tracking key ref, key has null and so on).
If we don't write chunk by chunk with max chunk size, we must write at +least `X` bytes to take up a place for later to update the number which has same elements, `X` is the num_bytes for +encoding varint encoding of map size. + +And most map size are smaller than 255, if all pairs have same data, the chunk will be 1. This is common in golang/rust, +which object are not reference by default. + +Also, if only one or two keys have different meta, we can make it into a different chunk, so that most pairs can share +meta. + +The implementation can accumulate read count with map size to decide whether to read more chunks. + +### enum + +Enums are serialized as an unsigned var int. If the order of enum values change, the deserialized enum value may not be +the value users expect. In such cases, users must register enum serializer by make it write enum value as an enumerated +string with unique hash disabled. + +### decimal + +Not supported for now. + +### struct + +Struct means object of `class/pojo/struct/bean/record` type. +Struct will be serialized by writing its fields data in fury order. + +Depending on schema compatibility, structs will have different formats. + +#### field order + +Field will be ordered as following, every group of fields will have its own order: + +- primitive fields: larger size type first, smaller later, variable size type last. +- boxed primitive fields: same order as primitive fields +- final fields: same type together, then sorted by field name lexicographically. +- list fields: same order as final fields +- map fields: same order as final fields +- other fields: same order as final fields + +#### schema consistent + +Object will be written as: + +``` +| 4 byte | variable bytes | ++---------------+------------------+ +| type hash | field values | +``` + +Type hash is used to check the type schema consistency across languages. Type hash will be the first 32 bits of 56 bits +value of the type meta. 
+ +Object fields will be serialized one by one using following format: + +``` +not null primitive field value: +| var bytes | ++----------------+ +| value data | ++----------------+ +nullable primitive field value: +| one byte | var bytes | ++-----------+---------------+ +| null flag | field value | ++-----------+---------------+ +field value of final type with ref tracking: +| var bytes | var objects | ++-----------+-------------+ +| ref meta | value data | ++-----------+-------------+ +field value of final type without ref tracking: +| one byte | var objects | ++-----------+-------------+ +| null flag | field value | ++-----------+-------------+ +field value of non-final type with ref tracking: +| one byte | var bytes | var objects | ++-----------+-------------+-------------+ +| ref meta | type meta | value data | ++-----------+-------------+-------------+ +field value of non-final type without ref tracking: +| one byte | var bytes | var objects | ++-----------+------------+------------+ +| null flag | type meta | value data | ++-----------+------------+------------+ +``` + +#### Schema evolution + +Schema evolution have similar format as schema consistent mode for object except: + +- For the object type, `schema consistent` mode will write type by id only, but `schema evolution` mode will + write type consisting of field names, types and other meta too, see [Type meta](#type-meta). +- Type meta of `final custom type` needs to be written too, because peers may not have this type defined. + +### Type + +Type will be serialized using type meta format. + +## Implementation guidelines + +### How to reduce memory read/write code + +- Try to merge multiple bytes into an int/long write before writing to reduce memory IO and bound check cost. +- Read multiple bytes as an int/long, then split into multiple bytes to reduce memory IO and bound check cost. +- Try to use one varint/long to write flags and length together to save one byte cost and reduce memory io. 
+- Condition branches are less expensive compared to memory IO cost unless there are too many branches. + +### Fast deserialization for static languages without runtime codegen support + +For type evolution, the serializer will encode the type meta into the serialized data. The deserializer will compare +this meta with class meta in the current process, and use the diff to determine how to deserialize the data. + +For java/javascript/python, we can use the diff to generate serializer code at runtime and load it as class/function for +deserialization. In this way, the type evolution will be as fast as type consistent mode. + +For C++/Rust, we can't generate the serializer code at runtime. So we need to generate the code at compile-time using +meta programming. But at that time, we don't know the type schema in other processes, so we can't generate the +serializer code for such inconsistent types. We may need to generate the code which has a loop and compare field name +one by one to decide whether to deserialize and assign the field or skip the field value. + +One fast way is that we can optimize the string comparison into `jump` instructions: + +- Assume the current type has `n` fields, and the peer type has `n1` fields. +- Generate an auto growing `field id` from `0` for every sorted field in the current type at the compile time. +- Compare the received type meta with current type, generate same id if the field name is same, otherwise generate an + auto growing id starting from `n`, cache this meta at runtime. +- Iterate the fields of received type meta, use a `switch` to compare the `field id` to deserialize data + and `assign/skip` field value. **Continuous** field id will be optimized into `jump` in `switch` block, so it will + be very fast.
+ +Here is an example, suppose process A has a class `Foo` with version 1 defined as `Foo1`, process B has a class `Foo` +with version 2 defined as `Foo2`: + +```c++ +// class Foo with version 1 +class Foo1 { + int32_t v1; // id 0 + std::string v2; // id 1 +}; +// class Foo with version 2 +class Foo2 { + // id 0, but will have id 2 in process A + bool v0; + // id 1, but will have id 0 in process A + int32_t v1; + // id 2, but will have id 3 in process A + int64_t long_value; + // id 3, but will have id 1 in process A + std::string v2; + // id 4, but will have id 4 in process A + std::vector<std::string> list; +}; +``` + +When process A receives a serialized `Foo2` from process B, here is how it deserializes the data: + +```c++ +Foo1 foo1 = ...; +const std::vector<fury::FieldInfo> &field_infos = type_meta.field_infos; +for (const auto &field_info : field_infos) { + switch (field_info.field_id) { + case 0: + foo1.v1 = buffer.read_varint32(); + break; + case 1: + foo1.v2 = fury.read_string(); + break; + default: + fury.skip_data(field_info); + } +} +``` diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/community/community.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/community/community.md new file mode 100644 index 00000000000..eb3d4ed11d5 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/community/community.md @@ -0,0 +1,85 @@ +--- +title: 社区 +sidebar_position: 0 +id: community +--- + +Apache Fury 是一个由社区驱动的开源项目,项目的蓬勃发展得益于社区贡献。 +我们邀请您根据自己的意愿尽可能地参与项目。以下是几种贡献方式: + +- 使用 Apache Fury 并分享使用体验和反馈问题; +- 为项目提供最佳实践示例; +- 报告错误并修复; +- 贡献代码和参与文档建设。 + +## 邮件列表 + +| 邮件列表 | 描述 | 订阅 | 取消订阅 | 发送邮件 | 活动 | +|-------------------------|---------------------------------------------|-------------------------------------------------------|-----------------------------------------------------------|------------------------------------|-----------------------------------------------------------------------| +| dev@fury.apache.org | 开发相关讨论 |
[订阅](mailto:dev-subscribe@fury.apache.org) | [取消订阅](mailto:dev-unsubscribe@fury.apache.org) | [发送邮件](mailto:dev@fury.apache.org) | [邮件列表活动](https://lists.apache.org/list.html?dev@fury.apache.org) | +| commits@fury.apache.org | 仓库的所有 commits | [订阅](mailto:commits-subscribe@fury.apache.org) | [取消订阅](mailto:commits-unsubscribe@fury.apache.org) | 只读的邮件列表 | [邮件列表活动](https://lists.apache.org/list.html?commits@fury.apache.org) | + +在尝试发送邮件之前,请确保订阅上述的邮件列表。 + +**如果您没有订阅邮件列表,您的邮件将被拒绝或不会收到回复。** + +### 如何订阅邮件列表 + +要发送邮件至邮件列表,请先通过以下方式订阅: + +1. 发送电子邮件至 listname-subscribe@fury.apache.org,并相应替换 `listname`; +2. 回复您将收到的确认电子邮件,保持邮件主题行完整; +3. 然后您将收到一封欢迎的电子邮件,订阅成功。 + +在讨论电子邮件中的代码片段时,请确保: + +- 您不要链接到外部服务中的文件,因为此类文件可能会更改、被删除或链接可能会中断,从而使存档的电子邮件线程变得无用; +- 您粘贴文本而不是文本屏幕截图; +- 粘贴代码时保持格式,以保持代码可读; +- 有足够的导入语句以避免产生代码歧义。 + +## Slack + +您可以加入[Slack 上的 Apache Fury™ 社区](https://join.slack.com/t/fury-project/shared_invite/zt-1u8soj4qc-ieYEu7ciHOqA2mo47llS8A)。 + +这里有一些社区规则: + +- 保持尊重和友善; +- 所有重要的决定和结论都必须反映到邮件列表中。 “如果这没有在邮件列表中有相关的讨论记录,则代表它不生效” ; +- [The Apache Way](https://theapacheway.com/on-list/); +- 使用 Slack 线程来防止并行对话淹没当前的对话频道; +- 请不要直接向邮件列表发送 Bug fix、Issue 分配和 Code Review 消息。这些内容应该被社区贡献者自愿处理并分配。 + +## Issue 跟踪 + +我们使用 GitHub Issues 来跟踪所有 Issues: + +- 代码相关问题:https://github.com/apache/fury/issues +- 网站相关问题:https://github.com/apache/fury-site/issues + +您需要有一个 [GitHub 帐户](https://github.com/signup) 才能创建问题。 +如果您没有 [GitHub 帐户](https://github.com/signup),您可以发送电子邮件至 dev@fury.apache.org。 + +### 报告 Bug + +您在报告 Bug 之前,应该: + +- 验证该 Bug 确实存在; +- 搜索 [Issue List](https://github.com/apache/fury/issues) 以确保不存在相关 Bug。 +- 在 Issue List 中创建 [bug 报告](https://github.com/apache/fury/issues/new?assignees=&labels=bug&projects=&template=bug_report.yml)。 +- 如果可能的话,深入研究 Apache Fury 的源代码,并针对您报告的 Bug 提交补丁,这有助于快速修复 Bug。 + +### 报告安全漏洞 + +Apache Fury 是 [Apache 软件基金会](https://apache.org/) 的一个项目,遵循 [ASF 漏洞处理流程](https://apache.org/security/#vulnerability-handling)。 + +要报告您发现的新的安全漏洞,请遵循 [ASF 
漏洞报告流程](https://apache.org/security/#reporting-a-vulnerability),该流程解释了如何私下向社区维护者发送详细的漏洞信息。 + +### New Feature + +欢迎您增强功能或新功能建议。提案越具体、越合理,您在 Fury 社区的影响力就越大。它有可能在之后版本发布。 + +### 项目源代码 + +- Fury Core 存储库:https://github.com/apache/fury +- Fury 网站存储库:https://github.com/apache/fury-site diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/community/how_to_join_community.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/community/how_to_join_community.md new file mode 100644 index 00000000000..8203570f064 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/community/how_to_join_community.md @@ -0,0 +1,108 @@ +--- +title: 如何加入 Fury 社区 +sidebar_position: 0 +id: how_to_join_community +--- + +首先为你选择加入开源贡献行列的行为点赞 👍🏻。再者,十分感谢你选择参与到 Fury 社区,为这个开源项目做出贡献。 + +## Fury 贡献指南 + +Fury 团队通常在 github 上进行开发和 issue 维护,请打开 [Github 网站](https://github.com/),点击右上角 `Sign up` 按钮,注册一个自己的账号,开启你开源之旅的第一步。 + +在 [Fury仓库](https://github.com/apache/fury)中,我们有一份面向所有开源贡献者的[指南](https://fury.apache.org/zh-CN/docs/community/),介绍了有关版本管理、分支管理等内容,**请花几分钟时间阅读了解一下**。 + +## 你的第一个 Pull Request + +### Step0:安装 Git + +Git 是一种版本控制系统,用于跟踪和管理软件开发项目中的代码变更。它帮助开发者记录和管理代码的历史记录,方便团队协作、代码版本控制、合并代码等操作。通过 Git,您可以追踪每个文件的每个版本,并轻松地在不同版本之间进行切换和比较。Git 还提供了分支管理功能,使得可以同时进行多个并行开发任务。 + +- 访问 Git 官方网站:[https://git-scm.com](https://git-scm.com) +- 下载最新版本的 Git 安装程序。 +- 运行下载的安装程序,按照安装向导的提示进行安装。 +- 安装完成后,你可以通过命令行使用 `git version` 命令确认安装成功。 + +### Step1:Fork 项目 + +- 首先需要 fork 这个项目,进入[Fury项目页面](https://github.com/apache/fury),点击右上角的 Fork 按钮 +- 你的 github 帐号中会出现 xxxx(你的 github 用户名)/fury 这个项目 +- 在本地电脑上使用以下命令: 得到一个 fury 文件夹 + +``` +// ssh +git clone git@github.com:xxxx(你的github用户名)/fury.git +// https +git clone https://github.com/xxxx(你的github用户名)/fury.git +``` + +### Step2:获取项目代码 + +- 进入 fury 文件夹,添加 fury 的远程地址 + +``` +git remote add upstream https://github.com/apache/fury.git +``` + +### Step3:创建分支 + +- 好了,现在可以开始贡献我们的代码了。fury 默认分支为 main 分支。无论是功能开发、bug 修复、文档编写,都请新建立一个分支,再合并到 main 分支上。使用以下代码创建分支: + 
+```shell +// 创建功能开发分支 +git checkout -b feat/xxxx + +// 创建问题修复开发分支 +git checkout -b fix/xxxx + +// 创建文档、demo分支 +git checkout -b docs/add-java-demo +``` + +假设我们创建了文档修改分支 `docs/add-java-demo` + +- 假设我们已经添加了一些代码,提交到代码库 + +- git add . + +- git commit -a -m "docs: add java demo and related docs" 。 + +### Step4:合并修改 + +- 切换回自己的开发分支: + +``` +git checkout docs/add-java-demo +``` + +- 把更新代码提交到自己的分支中: + +``` +git push origin docs/add-java-demo +``` + +### Step5:提交 Pull Request + +你可以在你的 github 代码仓库页面点击 `Compare & pull request` 按钮。或通过 `contribute` 按钮创建。 + +- 填写这是什么类型的修改 +- 填写关联的 issue +- 若有复杂变更,请说明背景和解决方案 + +相关信息填写完成后,点击 Create pull request 提交。 + +## **轻松步入 Fury 开源贡献之旅** + +"**good first issue**" 是一个在开源社区常见的标签,这个标签的目的是帮助新贡献者找到适合入门的问题。 + +Fury 的入门问题,你可以通过 [issue 列表](https://github.com/apache/fury/issues)查看。 + +如果你当前**有时间和意愿**参与到社区贡献,可以在 issue 里看一看 **good first issue**,选择一个感兴趣、适合自己的认领。 + +## 拥抱 Apache Fury 社区 + +在你为 Fury 贡献代码之余,我们鼓励你参与其他让社区更加繁荣的事情,比如: + +- 为项目的发展、功能规划 等提建议。 +- 创作文章、视频,开办讲座来宣传 Fury。 +- 撰写推广计划,同团队一同执行。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/community/how_to_release.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/community/how_to_release.md new file mode 100644 index 00000000000..8439fe6fb4c --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/community/how_to_release.md @@ -0,0 +1,531 @@ +--- +title: 如何发布 +sidebar_position: 0 +id: how_to_release +--- + +本文主要介绍如何发布新版本的 Apache Fury。 + +## 介绍 + +源代码发布是 Apache 最重视以及最重要的部分。 + +请注意许可证和发布的软件签名问题。发布软件是一件严肃的事情,并会产生相应的法律后果。 + +## release manager 第一次发布 + +### 环境要求 + +此发布过程在 Ubuntu 系统中运行,需要以下几个环境依赖: + +- JDK 1.8+ +- Apache Maven 3.x+ +- Python 3.8 +- GnuPG 2.x +- Git +- SVN(Apache 基金会使用 svn 来托管项目发布) +- **设置环境变量**:如果您在不同的目录下配置了 gpg 密钥,请执行 `export GNUPGHOME=$(xxx)` 导出环境变量。 + +### 准备 GPG 密钥 + +如果您是第一次作为软件发布者,您需要准备一个 GPG 密钥。 + +您可以参考这里的[快速开始](https://infra.apache.org/openpgp.html)获取一个 GPG 密钥或者获取更多相关信息。 + +#### 安装 GPG + +```bash +sudo apt install gnupg2 +``` + 
+#### 生成 GPG 密钥 + +请使用您的 Apache 名字和电子邮件地址生成 GPG 密钥: + +```bash +$ gpg --full-gen-key +gpg (GnuPG) 2.2.20; Copyright (C) 2020 Free Software Foundation, Inc. +This is free software: you are free to change and redistribute it. +There is NO WARRANTY, to the extent permitted by law. + +Please select what kind of key you want: + (1) RSA and RSA (default) + (2) DSA and Elgamal + (3) DSA (sign only) + (4) RSA (sign only) + (14) Existing key from card +Your selection? 1 # input 1 +RSA keys may be between 1024 and 4096 bits long. +What keysize do you want? (2048) 4096 # input 4096 +Requested keysize is 4096 bits +Please specify how long the key should be valid. + 0 = key does not expire + = key expires in n days + w = key expires in n weeks + m = key expires in n months + y = key expires in n years +Key is valid for? (0) 0 # input 0 +Key does not expire at all +Is this correct? (y/N) y # input y + +GnuPG needs to construct a user ID to identify your key. + +Real name: Chaokun Yang # input your name +Email address: chaokunyang@apache.org # input your email +Comment: CODE SIGNING KEY # input some annotations, optional +You selected this USER-ID: + "Chaokun " + +Change (N)ame, (C)omment, (E)mail or (O)kay/(Q)uit? O # input O +We need to generate a lot of random bytes. It is a good idea to perform +some other action (type on the keyboard, move the mouse, utilize the +disks) during the prime generation; this gives the random number +generator a better chance to gain enough entropy. +We need to generate a lot of random bytes. It is a good idea to perform +some other action (type on the keyboard, move the mouse, utilize the +disks) during the prime generation; this gives the random number +generator a better chance to gain enough entropy. 
+ +# Input the security key +┌──────────────────────────────────────────────────────┐ +│ Please enter this passphrase │ +│ │ +│ Passphrase: _______________________________ │ +│ │ +│ │ +└──────────────────────────────────────────────────────┘ +# key generation will be done after your inputting the key with the following output +gpg: key E49B00F626B marked as ultimately trusted +gpg: revocation certificate stored as '/Users/chaokunyang/.gnupg/openpgp-revocs.d/1E2CDAE4C08AD7D694D1CB139D7BE8E45E580BA4.rev' +public and secret key created and signed. + +pub rsa4096 2022-07-12 [SC] + 1E2CDAE4C08AD7D694D1CB139D7BE8E45E580BA4 +uid [ultimate] Chaokun +sub rsa4096 2022-07-12 [E] +``` + +#### 上传公钥至 GPG 密钥服务器 + +首先,列出您所创建的 GPG 密钥: + +```bash +gpg --list-keys +``` + +执行相关命令之后,您将看到如下输出: + +```bash +-------------------------------------------------- +pub rsa4096 2024-03-27 [SC] + 1E2CDAE4C08AD7D694D1CB139D7BE8E45E580BA4 +uid [ultimate] chaokunyang (CODE SIGNING KEY) +sub rsa4096 2024-03-27 [E] +``` + +然后,将您的密钥 ID 发送到密钥服务器: + +```bash +gpg --keyserver keys.openpgp.org --send-key # e.g., 1E2CDAE4C08AD7D694D1CB139D7BE8E45E580BA4 +``` + +其中,`keys.openpgp.org` 是一个随机选择的密钥服务器,可以使用 keyserver.ubuntu.com 或任何其他功能完备的密钥服务器。 + +#### 检查密钥是否创建成功 + +上传大约需要一分钟;之后,您可以通过电子邮件在相应的密钥服务器上检查。 + +将密钥上传到密钥服务器的主要目的是为了加入一个可信的[信任网络](https://infra.apache.org/release-signing.html#web-of-trust)。 + +#### 将 GPG 公钥添加到项目 KEYS 文件中 + +发布分支的 svn 仓库是:https://dist.apache.org/repos/dist/release/incubator/fury + +请在发布分支的 KEYS 中添加公钥: + +```bash +svn co https://dist.apache.org/repos/dist/release/incubator/fury fury-dist +# As this step will copy all the versions, it will take some time. If the network is broken, please use svn cleanup to delete the lock before re-execute it. +cd fury-dist +(gpg --list-sigs YOUR_NAME@apache.org && gpg --export --armor YOUR_NAME@apache.org) >> KEYS # Append your key to the KEYS file +svn add . # It is not needed if the KEYS document exists before. 
+svn ci -m "add gpg key for YOUR_NAME" # Later on, if you are asked to enter a username and password, just use your apache username and password. +``` + +#### 将 GPG 公钥上传到您的 GitHub 帐户 + +- 输入 `https://github.com/settings/keys` 以添加您的 GPG 密钥。 +- 如果添加后发现“未验证”字样,请将 GPG 密钥中使用的电子邮件地址绑定到您的 GitHub 帐户(https://github.com/settings/emails)。 + +### 延伸阅读 + +建议您在发布之前阅读以下文档,了解有关 Apache 基金会发布软件的更多详细信息,但这不是必须的: + +- 发布政策:https://www.apache.org/legal/release-policy.html +- 孵化器发布:http://incubator.apache.org/guides/releasemanagement.html +- TLP 发布:https://infra.apache.org/release-distribution +- 发布签名:https://infra.apache.org/release-signing.html +- 发布流程:https://infra.apache.org/release-publishing.html +- 发布下载页面:https://infra.apache.org/release-download-pages.html +- 发布 maven artifacts:https://infra.apache.org/publishing-maven-artifacts.html + +## 开始有关发布的讨论 + +通过发送电子邮件至以下地址发起有关下一个版本的讨论:dev@fury.apache.org: + +标题: + +``` +[DISCUSS] Release Apache Fury(incubating) ${release_version} +``` + +内容: + +``` +Hello, Apache Fury(incubating) Community, + +This is a call for a discussion to release Apache Fury(incubating) version ${release_version}. + +The change lists about this release: + +https://github.com/apache/fury/compare/v0.4.1...v0.5.0 + +Please leave your comments here about this release plan. We will bump the version in repo and start the release process after the discussion. + +Thanks, + +${name} +``` + +## 准备发布 + +如果讨论结果中没有出现反对声音,您需要做一些发布版本的准备工作。 + +### Github 分支和标签 + +- 创建一个名为 `releases-0.5.0` 的新分支 +- 通过执行命令 `python ci/release.py -l all -version $version` 将版本升级到 `$version` +- 执行 git commit 并将分支推送到 `git@github.com:apache/fury.git` +- 通过 `git tag v0.5.0-rc1` 创建一个新标签,然后将其推送到 `git@github.com:apache/fury.git` + +### 构建 artifacts 并上传到 SVN dist/dev 仓库 + +首先,您需要通过 `python ci/release.py build -v $version` 构建预发布 artifacts。 + +然后您需要把它上传到 svn dist repo。dev 分支的 dist 仓库地址是:https://dist.apache.org/repos/dist/dev/incubator/fury + +```bash +# As this step will copy all the versions, it will take some time.
If the network is broken, please use svn cleanup to delete the lock before re-execute it. +svn co https://dist.apache.org/repos/dist/dev/incubator/fury fury-dist-dev +``` + +然后,上传项目: + +```bash +cd fury-dist-dev +# create a directory named by version +mkdir ${release_version}-${rc_version} +# copy source code and signature package to the versioned directory +cp ${repo_dir}/dist/* ${release_version}-${rc_version} +# check svn status +svn status +# add to svn +svn add ${release_version}-${rc_version} +# check svn status +svn status +# commit to SVN remote server +svn commit -m "Prepare for fury ${release_version}-${rc_version}" +``` + +访问 https://dist.apache.org/repos/dist/dev/incubator/fury/ 以检查 artifacts 是否正确上传。 + +### 如果出现问题该怎么办 + +如果某些文件是意外出现或者发生某些错误,则需要删除相关内容并执行 `svn delete`,然后重复上述上传过程。 + +## 投票 + +作为一个孵化项目,新版本发布需要 Apache Fury 社区和孵化器社区的投票。 + +- release_version:Fury 的版本,如 0.5.0。 +- release_candidate_version:投票的版本,如 0.5.0-rc1。 +- maven_artifact_number:Maven 暂存 artifacts 的数量。如 1001. 具体来说,可以通过搜索 “fury” 来找到 maven_artifact_number https://repository.apache.org/#stagingRepositories. + +### Fury 社区投票 + +发送电子邮件至 Fury Community:dev@fury.apache.org: + +标题: + +``` +[VOTE] Release Apache Fury(incubating) v${release_version}-${rc_version} +``` + +内容: + +``` +Hello, Apache Fury(incubating) Community: + +This is a call for vote to release Apache Fury(Incubating) +version release-0.5.0-rc3. + +Apache Fury(incubating) - A blazingly fast multi-language serialization +framework powered by JIT and zero-copy. 
+ +The change lists about this release: + +https://github.com/apache/fury/compare/v0.4.1...v0.5.0-rc3 + +The release candidates: +https://dist.apache.org/repos/dist/dev/incubator/fury/0.5.0-rc3/ + +The maven staging for this release: +https://repository.apache.org/content/repositories/orgapachefury-1003 + +Git tag for the release: +https://github.com/apache/fury/releases/tag/v0.5.0-rc3 + +Git commit for the release: +https://github.com/apache/fury/commit/fae06330edd049bb960536e978a45b97bca66faf + +The artifacts signed with PGP key [5E580BA4], corresponding to +[chaokunyang@apache.org], that can be found in keys file: +https://downloads.apache.org/incubator/fury/KEYS + +The vote will be open for at least 72 hours until the necessary number of votes are reached. + +Please vote accordingly: + +[ ] +1 approve +[ ] +0 no opinion +[ ] -1 disapprove with the reason + +To learn more about Fury, please see https://fury.apache.org/ + +*Valid check is a requirement for a vote. *Checklist for reference: + +[ ] Download Fury is valid. +[ ] Checksums and PGP signatures are valid. +[ ] Source code distributions have correct names matching the current release. +[ ] LICENSE and NOTICE files are correct. +[ ] All files have license headers if necessary. +[ ] No compiled archives bundled in source archive. +[ ] Can compile from source. + +More detail checklist please refer: +https://cwiki.apache.org/confluence/display/INCUBATOR/Incubator+Release+Checklist + +How to Build and Test, please refer to: https://github.com/apache/fury/blob/main/docs/guide/DEVELOPMENT.md + +Thanks, +Chaokun Yang +``` + +在至少获得 3 + 1 且具有约束力的投票(来自 Fury Podling PMC 成员和提交者)并没有收到否决票之后,发布投票结果: + +标题: + +``` +[RESULT][VOTE] Release Apache Fury(incubating) v${release_version}-${rc_version} +``` + +内容: + +``` +Hello, Apache Fury(incubating) Community, + +The vote to release Apache Fury(Incubating) v${release_version}-${rc_version} has passed. 
+ +The vote PASSED with 3 binding +1 and 0 -1 vote: + +Binding votes: + +- xxx +- yyy +- zzz + +Vote thread: ${vote_thread_url} + +Thanks, + +${name} +``` + +### 孵化器社区投票 + +发送电子邮件至:general@incubator.apache.org: + +标题: + +``` +[VOTE] Release Apache Fury(incubating) v${release_version}-${rc_version} +``` + +内容: + +``` +Hello everyone, + +This is a call for the vote to release Apache Fury(Incubating) v${release_version}-${rc_version}. + +The Apache Fury community has voted and approved the release of Apache +Fury(incubating) v${release_version}-${rc_version}. We now kindly request the IPMC members +review and vote for this release. + +Apache Fury(incubating) - A blazingly fast multi-language serialization +framework powered by JIT and zero-copy. + +Fury community vote thread: +${community_vote_thread_url} + +Vote result thread: +${community_vote_result_thread_url} + +The release candidate: +https://dist.apache.org/repos/dist/dev/incubator/fury/${release_version}-${rc_version}/ + +This release has been signed with a PGP available here: +https://downloads.apache.org/incubator/fury/KEYS + +Git tag for the release: +https://github.com/apache/fury/releases/tag/v${release_version}-${rc_version}/ + +Git commit for the release: +https://github.com/apache/fury/commit/$xxx + +Maven staging repo: +https://repository.apache.org/content/repositories/orgapachefury-${maven_artifact_number}/ + +How to Build and Test, please refer to: +https://github.com/apache/fury/blob/main/docs/guide/DEVELOPMENT.md + +Please download, verify, and test. + +The VOTE will pass after 3 binding approve. + +[ ] +1 approve +[ ] +0 no opinion +[ ] -1 disapprove with the reason + +To learn more about apache fury, please see https://fury.apache.org/ + +Checklist for reference: + +[ ] Download links are valid. +[ ] Checksums and signatures. 
+[ ] LICENSE/NOTICE files exist +[ ] No unexpected binary files +[ ] All source files have ASF headers +[ ] Can compile from source + +Thanks, + +${name} +``` + +至少 72 小时后,至少有 3 + 1 具有约束力的投票(来自孵化器 PMC 成员)且没有否决票,发布投票结果: + +标题: + +``` +[RESULT][VOTE] Release Apache Fury(incubating) v${release_version}-${rc_version} +``` + +内容: + +``` +Hi Incubator PMC, + +The vote to release Apache Fury(incubating) v${release_version}-${rc_version} has passed with +4 +1 binding and 3 +1 non-binding votes, no +0 or -1 votes. + +Binding votes: + +- xxx +- yyy +- zzz + +Non-Binding votes: + +- aaa + +Vote thread: ${incubator_vote_thread_url} + +Thanks for reviewing and voting for our release candidate. + +We will proceed with publishing the approved artifacts and sending out the announcement soon. +``` + +### 如果投票失败怎么办 + +如果投票失败,请单击“删除”以删除暂存的 Maven artifacts。 + +解决提出的问题,然后再次提出 `rc_version` 的新投票。 + +## 官方发布 + +### 将 artifacts 发布到 SVN 发布目录 + +- release_version:Fury 的发布版本,如 0.5.0 +- release_candidate_version:投票版本,如 0.5.0-rc1 + +```bash +svn mv https://dist.apache.org/repos/dist/dev/incubator/fury/${release_version}-${rc_version} https://dist.apache.org/repos/dist/release/incubator/fury/${release_version} -m "Release fury ${release_version}" +``` + +### 更改 Fury 网站下载链接 + +提交 PR 到 https://github.com/apache/fury-site 仓库更新 Fury 版本,[下载页面](https://fury.apache.org/download) + +### 发布 Maven artifacts + +- maven_artifact_number:Maven 暂存 artifacts 的数量。如 1001。 +- 打开https://repository.apache.org/#stagingRepositories. +- 找到 artifacts `orgapachefury-${maven_artifact_number}`,点击“发布”。 + +### 发送公告 + +将发布公告发送给 dev@fury.apache.org 并且抄送给 announce@apache.org。 + +标题: + +``` +[ANNOUNCE] Release Apache Fury(incubating) ${release_version} +``` + +内容: + +``` +Hi all, + +The Apache Fury(incubating) community is pleased to announce +that Apache Fury(incubating) {release_version} has been released! + +Apache Fury(incubating) - A blazingly fast multi-language serialization +framework powered by JIT and zero-copy. 
+ +The release notes are available here: +https://github.com/apache/fury/releases/tag/v${release_version} + +For the complete list of changes: +https://github.com/apache/fury/compare/v0.5.0...v${release_version} + +Apache Fury website: https://fury.apache.org/ + +Download Links: https://fury.apache.org/download + +Fury Resources: +- Fury github repo: https://github.com/apache/fury +- Issue: https://github.com/apache/fury/issues +- Mailing list: dev@fury.apache.org + +We are looking to grow our community and welcome new contributors. If +you are interested in contributing to Fury, please contact us on the +mailing list or on GitHub. We will be happy to help you get started. + +------------------ +Best Regards, +${your_name} +``` + +至此,整个发布流程结束。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/community/how_to_verify.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/community/how_to_verify.md new file mode 100644 index 00000000000..668063ca803 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/community/how_to_verify.md @@ -0,0 +1,121 @@ +--- +title: 如何验证 Apache Fury +sidebar_position: 0 +id: how_to_verify +--- + +详细的 Check list,请参阅[Apache 检查清单](https://cwiki.apache.org/confluence/display/INCUBATOR/Incubator+Release+Checklist) + +## 下载 Apache Fury + +```bash +# If there is svn locally, you can clone to the local +svn co https://dist.apache.org/repos/dist/dev/incubator/fury/${release_version}-${rc_version}/ +# You can download the material file directly +wget https://dist.apache.org/repos/dist/dev/incubator/fury/${release_version}-${rc_version}/xxx.xxx +``` + +## 验证 checksums 和 signatures + +首先,您需要安装 gpg: + +```bash +apt-get install gnupg +# or +yum install gnupg +# or +brew install gnupg +``` + +之后,导入 Apache Fury release manager 的公钥: + +```bash +curl https://downloads.apache.org/incubator/fury/KEYS > KEYS # Download KEYS +gpg --import KEYS # Import KEYS to local +# Then, trust the public key: +gpg --edit-key 
# Edit the key(mentioned in vote email) +# It will enter the interactive mode, use the following command to trust the key: +gpg (GnuPG) 2.0.22; Copyright (C) 2013 Free Software Foundation, Inc. +This is free software: you are free to change and redistribute it. +There is NO WARRANTY, to the extent permitted by law. + + +pub 4096R/5E580BA4 created: 2024-03-27 expires: never usage: SC + trust: unknown validity: unknown +sub 4096R/A31EF728 created: 2024-03-27 expires: never usage: E +[ unknown] (1). chaokunyang (CODE SIGNING KEY) + +gpg> trust +pub 4096R/5E580BA4 created: 2024-03-27 expires: never usage: SC + trust: unknown validity: unknown +sub 4096R/A31EF728 created: 2024-03-27 expires: never usage: E +[ unknown] (1). chaokunyang (CODE SIGNING KEY) + +Please decide how far you trust this user to correctly verify other users' keys +(by looking at passports, checking fingerprints from different sources, etc.) + + 1 = I don't know or won't say + 2 = I do NOT trust + 3 = I trust marginally + 4 = I trust fully + 5 = I trust ultimately + m = back to the main menu + +Your decision? 5 +Do you really want to set this key to ultimate trust? (y/N) y + +pub 4096R/5E580BA4 created: 2024-03-27 expires: never usage: SC + trust: ultimate validity: unknown +sub 4096R/A31EF728 created: 2024-03-27 expires: never usage: E +[ unknown] (1). chaokunyang (CODE SIGNING KEY) +Please note that the shown key validity is not necessarily correct +unless you restart the program. 
+``` + +接下来验证签名: + +```bash +for i in *.tar.gz; do echo $i; gpg --verify $i.asc $i; done +``` + +如果出现如下内容,则表示签名正确: + +```bash +apache-fury-incubating-0.5.0-src.tar.gz +gpg: Signature made Wed 17 Apr 2024 11:49:45 PM CST using RSA key ID 5E580BA4 +gpg: checking the trustdb +gpg: 3 marginal(s) needed, 1 complete(s) needed, PGP trust model +gpg: depth: 0 valid: 1 signed: 0 trust: 0-, 0q, 0n, 0m, 0f, 1u +gpg: Good signature from "chaokunyang (CODE SIGNING KEY) " +``` + +然后验证 checksum: + +```bash +for i in *.tar.gz; do echo $i; sha512sum --check $i.sha512; done +``` + +它应该输出如下内容: + +```bash +apache-fury-incubating-0.5.0-src.tar.gz +apache-fury-incubating-0.5.0-src.tar.gz: OK +``` + +## 检查源码包中的文件 + +解压缩 `apache-fury-${release_version}-${rc_version}-src.tar.gz` 并检查以下内容: + +- 此存储库 LICENSE 和 NOTICE 文件是正确的; +- 如有必要,所有文件都有 ASF 许可证标头; +- 项目构建通过。 + +## 检查 fury-java 的 Maven artifacts + +下载 Apache Fury:https://repository.apache.org/content/repositories/orgapachefury-${maven_artifact_number}/. + +您可以检查以下内容: + +- JAR 的 Checksum 与项目绑定的 checksum 文件一致。 +- JAR 的 signature 与项目绑定的 signature 文件一致。 +- JAR 在本地是可重复的。这意味着您可以在计算机上构建 JAR,并验证 checksum 和与项目绑定的相同。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/guide/DEVELOPMENT.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/guide/DEVELOPMENT.md new file mode 100644 index 00000000000..01093d5619b --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/guide/DEVELOPMENT.md @@ -0,0 +1,101 @@ +--- +title: 开发指南 +sidebar_position: 7 +id: development +--- + +## 本地构建 Apache Fury + +从 [Github 代码库](https://github.com/apache/fury) 拉取最新代码。 + +### 构建 Apache Fury Java + +```bash +cd java +mvn clean compile -DskipTests +``` + +#### 本地环境要求 + +- java 1.8+ +- maven 3.6.3+ + +### 构建 Apache Fury Python + +```bash +cd python +pip install pyarrow==14.0.0 Cython wheel numpy pytest +pip install -v -e . 
+``` + +#### 本地环境要求 + +- python 3.6+ + +### 构建 Apache Fury C++ + +Build fury row format: + +```bash +pip install pyarrow==14.0.0 +bazel build //cpp/fury/row:fury_row_format +``` + +Build fury row format encoder: + +```bash +pip install pyarrow==14.0.0 +bazel build //cpp/fury/encoder:fury_encoder +``` + +#### 本地环境要求 + +- compilers with C++17 support +- bazel 6.3.2 + +### 构建 Apache Fury GoLang + +```bash +cd go/fury +# run test +go test -v +# run xlang test +go test -v fury_xlang_test.go +``` + +#### 本地环境要求 + +- go 1.13+ + +### 构建 Apache Fury Rust + +```bash +cd rust +# build +cargo build +# run test +cargo test +``` + +#### 本地环境要求 + +```bash +curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh +``` + +### 构建 Apache Fury JavaScript + +```bash +cd javascript +npm install + +# run build +npm run build +# run test +npm run test +``` + +#### 本地环境要求 + +- node 14+ +- npm 8+ diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/guide/graalvm_guide.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/guide/graalvm_guide.md new file mode 100644 index 00000000000..b89ad663c8e --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/guide/graalvm_guide.md @@ -0,0 +1,231 @@ +--- +title: GraalVM 指南 +sidebar_position: 6 +id: graalvm_guide +--- + +## GraalVM Native Image 介绍 + +GraalVM Native Image 能够将 Java 应用代码编译成为原生的本地应用程序代码,以构建更快、更小、更精简的应用程序。 +其不能使用 JIT 编译器将字节码编译为机器码,并且在没有配置相关反射文件的前提下不支持反射,在很多情况下使用较为复杂。 + +Apache Fury 对 GraalVM Native Image 支持非常完善。Apache Fury 在 Graalvm 构建时能够为 `Fury JIT framework` 和 `MethodHandle/LambdaMetafactory` 生成所有的序列化代码。然后在运行时使用这些生成的代码进行序列化,无需任何额外成本,性能非常出色。 + +为了在 Graalvm Native Images 上使用 Fury,您必须将 Apache Fury 创建为**静态**的类字段,并且在 `enclosing class` 初始化时间期间完成所有的类**注册**。 然后在`resources/META-INF/native-image/$xxx/` 目录下添加 `native-image.properties` 配置文件。指导 GraalVM 在构建 Native Images 时初始化配置的类。 + +例如,这里我们在配置文件中加入 `org.apache.fury.graalvm.Example` 类: + +```properties +Args = 
--initialize-at-build-time=org.apache.fury.graalvm.Example +``` + +使用 Apache Fury 的另一个好处是,您不必配置[反射 JSON](https://www.graalvm.org/latest/reference-manual/native-image/metadata/#specifying-reflection-metadata-in-json)和[序列化 JSON](https://www.graalvm.org/latest/reference-manual/native-image/metadata/#serialization),这非常乏味、繁琐且不方便。使用 Apache Fury 时,您只需为要序列化的每个类型调用 `org.apache.fury.Fury.register(Class, boolean)` 即可。 + +请注意,由于 GraalVM Native Image 在镜像运行时不支持 JIT,因此 Apache Fury 的 `asyncCompilationEnabled` 选项将在使用 GraalVM Native Image 构建应用时自动禁用。 + +## 线程不安全 + +Example: + +```java +import org.apache.fury.Fury; +import org.apache.fury.util.Preconditions; + +import java.util.List; +import java.util.Map; + +public class Example { + public record Record ( + int f1, + String f2, + List f3, + Map f4) { + } + + static Fury fury; + + static { + fury = Fury.builder().build(); + // register and generate serializer code. + fury.register(Record.class, true); + } + + public static void main(String[] args) { + Record record = new Record(10, "abc", List.of("str1", "str2"), Map.of("k1", 10L, "k2", 20L)); + System.out.println(record); + byte[] bytes = fury.serialize(record); + Object o = fury.deserialize(bytes); + System.out.println(o); + Preconditions.checkArgument(record.equals(o)); + } +} +``` + +之后在 `native-image.properties` 中加入 `org.apache.fury.graalvm.Example` 配置: + +```properties +Args = --initialize-at-build-time=org.apache.fury.graalvm.Example +``` + +## 线程安全 + +```java +import org.apache.fury.Fury; +import org.apache.fury.ThreadLocalFury; +import org.apache.fury.ThreadSafeFury; +import org.apache.fury.util.Preconditions; + +import java.util.List; +import java.util.Map; + +public class ThreadSafeExample { + public record Foo ( + int f1, + String f2, + List f3, + Map f4) { + } + + static ThreadSafeFury fury; + + static { + fury = new ThreadLocalFury(classLoader -> { + Fury f = Fury.builder().build(); + // register and generate serializer code. 
+ f.register(Foo.class, true); + return f; + }); + } + + public static void main(String[] args) { + System.out.println(fury.deserialize(fury.serialize("abc"))); + System.out.println(fury.deserialize(fury.serialize(List.of(1,2,3)))); + System.out.println(fury.deserialize(fury.serialize(Map.of("k1", 1, "k2", 2)))); + Foo foo = new Foo(10, "abc", List.of("str1", "str2"), Map.of("k1", 10L, "k2", 20L)); + System.out.println(foo); + byte[] bytes = fury.serialize(foo); + Object o = fury.deserialize(bytes); + System.out.println(o); + } +} +``` + +之后在 `native-image.properties` 中加入 `org.apache.fury.graalvm.ThreadSafeExample` 配置: + +```properties +Args = --initialize-at-build-time=org.apache.fury.graalvm.ThreadSafeExample +``` + +## 框架集成 + +对于框架开发人员,如果您想集成 Apache Fury 进行序列化。您可以提供一个配置文件,让用户列出他们想要序列化的所有类,然后您可以加载这些类并调用 `org.apache.fury.Fury.register(Class, boolean)` 在您的 Fury 集成类中注册这些类,并配置该类在 GraalVM Native Image 构建时进行初始化。 + +## 基准测试 + +在这里,我们给出了 Apache Fury 和 Graalvm 序列化之间的两个类基准测试。 + +禁用 Apache Fury compression 时: + +- Struct:Fury 与 `46x speed, 43% size` JDK 进行比较。 +- Pojo:Fury 与 `12x speed, 56% size` JDK进行比较。 + +启用 Apache Fury compression 时: + +- Struct:Fury 与 `24x speed, 31% size` JDK进行比较。 +- Pojo:Fury 与 `12x speed, 48% size` JDK进行比较。 + +有关基准测试代码,请参阅 [Benchmark.java](https://github.com/apache/fury/blob/main/integration_tests/graalvm_tests/src/main/java/org/apache/fury/graalvm/Benchmark.java)。 + +### 结构体基准测试 + +#### 类字段 + +```java +public class Struct implements Serializable { + public int f1; + public long f2; + public float f3; + public double f4; + public int f5; + public long f6; + public float f7; + public double f8; + public int f9; + public long f10; + public float f11; + public double f12; +} +``` + +#### 基准测试结果 + +不开启压缩时测试结果: + +``` +Benchmark repeat number: 400000 +Object type: class org.apache.fury.graalvm.Struct +Compress number: false +Fury size: 76.0 +JDK size: 178.0 +Fury serialization took mills: 49 +JDK serialization took mills: 2254 +Compare speed: Fury is 
45.70x speed of JDK +Compare size: Fury is 0.43x size of JDK +``` + +开启压缩时测试结果: + +``` +Benchmark repeat number: 400000 +Object type: class org.apache.fury.graalvm.Struct +Compress number: true +Fury size: 55.0 +JDK size: 178.0 +Fury serialization took mills: 130 +JDK serialization took mills: 3161 +Compare speed: Fury is 24.16x speed of JDK +Compare size: Fury is 0.31x size of JDK +``` + +### Pojo 基准测试 + +#### 类字段 + +```java +public class Foo implements Serializable { + int f1; + String f2; + List f3; + Map f4; +} +``` + +#### 基准测试结果 + +不开启压缩时测试结果: + +``` +Benchmark repeat number: 400000 +Object type: class org.apache.fury.graalvm.Foo +Compress number: false +Fury size: 541.0 +JDK size: 964.0 +Fury serialization took mills: 1663 +JDK serialization took mills: 16266 +Compare speed: Fury is 12.19x speed of JDK +Compare size: Fury is 0.56x size of JDK +``` + +开启压缩时测试结果: + +``` +Benchmark repeat number: 400000 +Object type: class org.apache.fury.graalvm.Foo +Compress number: true +Fury size: 459.0 +JDK size: 964.0 +Fury serialization took mills: 1289 +JDK serialization took mills: 15069 +Compare speed: Fury is 12.11x speed of JDK +Compare size: Fury is 0.48x size of JDK +``` diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/guide/java_serialization_guide.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/guide/java_serialization_guide.md new file mode 100644 index 00000000000..d6daf42282d --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/guide/java_serialization_guide.md @@ -0,0 +1,404 @@ +--- +title: Java 序列化指南 +sidebar_position: 0 +id: java_object_graph_guide +--- + +## Java 对象图序列化 + +当只需要 Java 对象序列化时,其相比跨语言的图序列化拥有更好的性能。 + +## 快速开始 + +注意:Fury 对象创建的代价很高, 因此 **Fury 对象应该尽可能被复用**,而不是每次都重新创建。 + +您应该为 Fury 创建一个全局的静态变量,或者有限的的 Fury 实例对象。Fury本身占用一定内存,请不要创建上万个Fury对象 + +使用单线程版本 Fury: + +```java +import java.util.List; +import java.util.Arrays; + +import org.apache.fury.*; +import org.apache.fury.config.*; + +public 
class Example { + public static void main(String[] args) { + SomeClass object = new SomeClass(); + // Note that Fury instances should be reused between + // multiple serializations of different objects. + Fury fury = Fury.builder().withLanguage(Language.JAVA) + .requireClassRegistration(true) + .build(); + // Registering types can reduce class name serialization overhead, but not mandatory. + // If class registration enabled, all custom types must be registered. + fury.register(SomeClass.class); + byte[] bytes = fury.serialize(object); + System.out.println(fury.deserialize(bytes)); + } +} +``` + +使用多线程版本 Fury: + +```java +import java.util.List; +import java.util.Arrays; + +import org.apache.fury.*; +import org.apache.fury.config.*; + +public class Example { + public static void main(String[] args) { + SomeClass object = new SomeClass(); + // Note that Fury instances should be reused between + // multiple serializations of different objects. + ThreadSafeFury fury = new ThreadLocalFury(classLoader -> { + Fury f = Fury.builder().withLanguage(Language.JAVA) + .withClassLoader(classLoader).build(); + f.register(SomeClass.class); + return f; + }); + byte[] bytes = fury.serialize(object); + System.out.println(fury.deserialize(bytes)); + } +} +``` + +Fury 对象复用示例: + +```java +import java.util.List; +import java.util.Arrays; + +import org.apache.fury.*; +import org.apache.fury.config.*; + +public class Example { + // reuse fury. 
+ private static final ThreadSafeFury fury = new ThreadLocalFury(classLoader -> { + Fury f = Fury.builder().withLanguage(Language.JAVA) + .withClassLoader(classLoader).build(); + f.register(SomeClass.class); + return f; + }); + + public static void main(String[] args) { + SomeClass object = new SomeClass(); + byte[] bytes = fury.serialize(object); + System.out.println(fury.deserialize(bytes)); + } +} +``` + +## FuryBuilder 参数选项 + +| 参数选项名 | 描述 | 默认值 | +|-------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------| +| `timeRefIgnored` | 启用 reference tracking 时,是否忽略在 `TimeSerializers` 中注册的所有时间类型及其子类的引用跟踪。如果忽略,则可以通过调用 `Fury#registerSerializer(Class, Serializer)` 来启用对每种时间类型的引用跟踪。例如,`fury.registerSerializer(Date.class, new DateSerializer(fury, true))`。请注意,启用 ref tracking 功能应在任何包含时间字段的类型的序列化程序编码之前进行。否则,这些字段仍将跳过 reference tracking。 | `true` | +| `compressInt` | 启用或禁用 int 压缩,减小数据体积。 | `true` | +| `compressLong` | 启用或禁用 long 压缩,减小数据体积。 | `true` | +| `compressString` | 启用或禁用 String 压缩,减小数据体积。 | `true` | +| `classLoader` | 关联到当前 Fury 的类加载器,每个 Fury 会关联一个不可变的类加载器,用于缓存类元数据。如果需要切换类加载器,请使用 `LoaderBinding` 或 `ThreadSafeFury` 进行更新。 | `Thread.currentThread().getContextClassLoader()` | +| `compatibleMode` | 类型的向前/向后兼容性配置。也与 `checkClassVersion` 配置相关。`schema_consistent`: 类的Schema信息必须在序列化对等节点和反序列化对等节点之间保持一致。`COMPATIBLE`: 序列化对等节点和反序列化对等节点之间的类模式可以不同。它们可以独立添加/删除字段。 | `CompatibleMode.SCHEMA_CONSISTENT` | +| `checkClassVersion` | 
决定是否检查类模式的一致性。如果启用,Fury 将写入 `classVersionHash` 并基于它检查类型一致性。当启用 `CompatibleMode#COMPATIBLE` 时,它将自动禁用。除非能确保类不会演化,否则不建议禁用。 | `false` | +| `checkJdkClassSerializable` | 启用或禁用 `java.*` 下类的 `Serializable` 接口检查。如果 `java.*` 下的类不是 `Serializable`,Fury 将抛出 `UnsupportedOperationException`。 | `true` | +| `registerGuavaTypes` | 是否预先注册 Guava 类型,如 `RegularImmutableMap`/`RegularImmutableList`。这些类型不是公共 API,但似乎非常稳定。 | `true` | +| `requireClassRegistration` | 禁用可能会允许未知类被反序列化,从而带来潜在的安全风险。 | `true` | +| `suppressClassRegistrationWarnings` | 是否抑制类注册警告。这些警告可用于安全审计,但可能会较琐碎,默认情况下将启用此抑制功能。 | `true` | +| `metaShareEnabled` | 是否开启元数据共享。 | `false` | +| `scopedMetaShareEnabled` | 范围元数据共享侧重于单一序列化流程。在此过程中创建或识别的元数据为该过程独有,不会与其他序列化过程共享。 | `false` | +| `metaCompressor` | 元数据压缩器。请注意,传递的元压缩器应是线程安全的。默认情况下,将使用基于 `Deflater` 的压缩器 `DeflaterMetaCompressor`。用户可以使用其他压缩器,如 `zstd` 以获得更好的压缩率。 | `DeflaterMetaCompressor` | +| `deserializeNonexistentClass` | 启用或禁用反序列化/跳过不存在类的数据。 | 如果设置了 `CompatibleMode.Compatible` 则为 `true`,否则为 `false`。 | +| `codeGenEnabled` | 禁用后,初始序列化速度会加快,但后续序列化速度会减慢。 | `true` | +| `asyncCompilationEnabled` | 如果启用,序列化会首先使用解释器模式,并在类的异步序列化 JIT 完成后切换到 JIT 序列化。 | `false` | +| `scalaOptimizationEnabled` | 启用或禁用特定于 Scala 的序列化优化。 | `false` | +| `copyRef` | 禁用后,复制性能会更好。但 Fury 深度复制将忽略循环引用和共享引用。对象图中的相同引用将在一次 `Fury#copy` 中复制到不同的对象中。 | `true` | + +## 高级用法 + +### Fury 创建 + +单线程 Fury 创建: + +```java +Fury fury=Fury.builder() + .withLanguage(Language.JAVA) + // enable reference tracking for shared/circular reference. + // Disable it will have better performance if no duplicate reference. + .withRefTracking(false) + .withCompatibleMode(CompatibleMode.SCHEMA_CONSISTENT) + // enable type forward/backward compatibility + // disable it for small size and better performance. + // .withCompatibleMode(CompatibleMode.COMPATIBLE) + // enable async multi-threaded compilation.
+ .withAsyncCompilation(true) + .build(); + byte[]bytes=fury.serialize(object); + System.out.println(fury.deserialize(bytes)); +``` + +多线程 Fury 创建: + +```java +ThreadSafeFury fury=Fury.builder() + .withLanguage(Language.JAVA) + // enable reference tracking for shared/circular reference. + // Disable it will have better performance if no duplicate reference. + .withRefTracking(false) + // compress int for smaller size + // .withIntCompressed(true) + // compress long for smaller size + // .withLongCompressed(true) + .withCompatibleMode(CompatibleMode.SCHEMA_CONSISTENT) + // enable type forward/backward compatibility + // disable it for small size and better performance. + // .withCompatibleMode(CompatibleMode.COMPATIBLE) + // enable async multi-threaded compilation. + .withAsyncCompilation(true) + .buildThreadSafeFury(); + byte[]bytes=fury.serialize(object); + System.out.println(fury.deserialize(bytes)); +``` + +### 配置Fury生成更小的序列化体积: + +`FuryBuilder#withIntCompressed`/`FuryBuilder#withLongCompressed` 可用于压缩 `int/long`,使其体积更小。通常压缩 int 类型就足够了。 + +这两个压缩属性默认启用。如果序列化大小不重要,比如你之前使用flatbuffers进行序列化,flatbuffers不会压缩任何东西,那么这种情况下建议关闭压缩。如果数据都是数字,压缩可能会带来 80%以上的性能损耗。 + +对于 int 压缩,Fury 使用 1~5 字节进行编码。每个字节的第一位表示是否有下一个字节位,如果下一个字节位被设置,则将读取下一个字节,直到下一个字节位未被设置时停止。 + +对于 long 压缩,Fury 支持两种编码方式: + +- Fury SLI(Small long as int)编码(**默认使用**): + - 如果 long 在 [-1073741824, 1073741823] 范围内,则编码为 4 字节 int:`| little-endian: ((int) value) << 1 |` + - 否则写成 9 字节: `| 0b1 | little-endian 8 bit long |` +- Fury PVL(渐进可变长)编码: + - 每个字节的第一位表示是否有下一个字节。如果第一位被设置,则将读取下一个字节。 + 直到下一字节的第一位未设置。 + - 负数将通过 `(v << 1) ^ (v >> 63)` 转换为正数,以减少小负数的编码空间占用。 + +如果一个数字是 `Long` 类型,大多不能用更小的字节表示,压缩效果就不够好。 +与占用的性能开销相比,这是不值得的。如果您发现`Long`类型压缩并没有带来多少好处,也许您应该尝试关闭`Long`类型压缩,以提升性能。 + +### 对象深拷贝 + +深度拷贝示例: + +```java +Fury fury=Fury.builder() + ... 
+ .withRefCopy(true).build(); + SomeClass a=xxx; + SomeClass copied=fury.copy(a) +``` + +使 Fury 深度复制忽略循环引用和共享引用,此配置会将对象图中的相同引用在一次 `Fury#copy` 之后会被复制到不同的对象中。 + +```java +Fury fury=Fury.builder() + ... + .withRefCopy(false).build(); + SomeClass a=xxx; + SomeClass copied=fury.copy(a) +``` + +### 实现自定义的序列化器 + +在某些情况下,您可能希望为您的自定义类型实现一个序列化器,特别是某些通过 + JDK `writeObject/writeReplace/readObject/readResolve` 实现序列化的类,JDK序列化的性能和空间效率很低。比如说,如果您不想下面的 `Foo#writeObject` 被调用,您可以实现类似下面的 `FooSerializer`: + +```java +class Foo { + public long f1; + + private void writeObject(ObjectOutputStream s) throws IOException { + System.out.println(f1); + s.defaultWriteObject(); + } +} + +class FooSerializer extends Serializer<Foo> { + public FooSerializer(Fury fury) { + super(fury, Foo.class); + } + + @Override + public void write(MemoryBuffer buffer, Foo value) { + buffer.writeInt64(value.f1); + } + + @Override + public Foo read(MemoryBuffer buffer) { + Foo foo = new Foo(); + foo.f1 = buffer.readInt64(); + return foo; + } +} +``` + +注册序列化器: + +```java +Fury fury=getFury(); + fury.registerSerializer(Foo.class,new FooSerializer(fury)); +``` + +### 安全与类注册 + +可以使用 `FuryBuilder#requireClassRegistration` 来禁用类注册,这将允许反序列化未知类型的对象,使用更灵活。**但如果类中包含恶意代码,就会出现安全漏洞**。 + +**除非能确保运行环境和外部交互环境安全,否则请勿禁用类注册检查**。 + +如果禁用此选项,在反序列化未知/不可信任的类型时,可能会执行`init/equals/hashCode`中的恶意代码, +因此请谨慎禁用。 + +类注册不仅可以降低安全风险,还可以避免类名序列化成本。 + +您可以使用 `Fury#register` API 来注册类。 + +> 请注意:类注册顺序很重要,序列化端和反序列化端应具有相同的注册顺序。 + +```java +Fury fury=xxx; + fury.register(SomeClass.class); + fury.register(SomeClass1.class,200); +``` + +如果调用 `FuryBuilder#requireClassRegistration(false)` 来禁用类注册检查, +可以通过 `ClassResolver#setClassChecker` 设置 `org.apache.fury.resolver.ClassChecker` 来控制哪些类是允许序列化。例如,可以通过以下方式允许以 `org.example.*` 开头的类: + +```java +Fury fury=xxx; + fury.getClassResolver().setClassChecker((classResolver,className)->className.startsWith("org.example.")); +``` + +```java +AllowListChecker checker=new AllowListChecker(AllowListChecker.CheckLevel.STRICT); +
ThreadSafeFury fury=new ThreadLocalFury(classLoader->{ + Fury f=Fury.builder().requireClassRegistration(true).withClassLoader(classLoader).build(); + f.getClassResolver().setClassChecker(checker); + checker.addListener(f.getClassResolver()); + return f; + }); + checker.allowClass("org.example.*"); +``` + +Apache Fury 还提供了一个 `org.apache.fury.resolver.AllowListChecker`,它是一个基于允许/禁止列表的检查器,用于简化类检查机制的定制。您可以使用此检查器或自行实现更复杂的检查器。 + +### 序列化器注册 + +您还可以通过 `Fury#registerSerializer` API 为类注册自定义序列化器。或者为类实现 `java.io.Externalizable`。 + +### 零拷贝序列化 + +```java +import org.apache.fury.*; +import org.apache.fury.config.*; +import org.apache.fury.serializers.BufferObject; +import org.apache.fury.memory.MemoryBuffer; + +import java.util.*; +import java.util.stream.Collectors; + +public class ZeroCopyExample { + // Note that fury instance should be reused instead of creation every time. + static Fury fury = Fury.builder() + .withLanguage(Language.JAVA) + .build(); + + // mvn exec:java -Dexec.mainClass="io.ray.fury.examples.ZeroCopyExample" + public static void main(String[] args) { + List<Object> list = Arrays.asList("str", new byte[1000], new int[100], new double[100]); + Collection<BufferObject> bufferObjects = new ArrayList<>(); + byte[] bytes = fury.serialize(list, e -> !bufferObjects.add(e)); + List<MemoryBuffer> buffers = bufferObjects.stream() + .map(BufferObject::toBuffer).collect(Collectors.toList()); + System.out.println(fury.deserialize(bytes, buffers)); + } +} +``` + +### Meta 共享 + +Apache Fury 支持在同一个上下文(例如:`TCP Connection`)中的多次序列化中共享类型元数据(例如:类名称,字段名称,字段类型信息等),这些信息将在上下文中第一次序列化时发送给对端。根据这些元数据,对端方可重建相同的反序列化器,从而避免为后续序列化传输元数据,减少网络流量压力,并支持类型向前/向后兼容。 + +```java +// Fury.builder() +// .withLanguage(Language.JAVA) +// .withRefTracking(false) +// // share meta across serialization. +// .withMetaContextShare(true) +// Not thread-safe fury. +MetaContext context=xxx; + fury.getSerializationContext().setMetaContext(context); + byte[]bytes=fury.serialize(o); +// Not thread-safe fury.
+ MetaContext context=xxx; + fury.getSerializationContext().setMetaContext(context); + fury.deserialize(bytes) + +// Thread-safe fury + fury.setClassLoader(beanA.getClass().getClassLoader()); + byte[]serialized=fury.execute( + f->{ + f.getSerializationContext().setMetaContext(context); + return f.serialize(beanA); + } + ); +// thread-safe fury + fury.setClassLoader(beanA.getClass().getClassLoader()); + Object newObj=fury.execute( + f->{ + f.getSerializationContext().setMetaContext(context); + return f.deserialize(serialized); + } + ); +``` + +### 反序列化不存在的类 + +Apache Fury 支持反序列化不存在的类,通过`FuryBuilder#deserializeNonexistentClass(true)` 选项开启。当此选项开启的时候,同时也会开启元数据共享。Apache Fury 会将该类型的反序列化数据存储在 lazy Map 子类中。通过使用 Fury 实现的 lazy Map,可以避免在反序列化过程中填充 map 时 map 内部节点 rebalance 带来的开销,从而进一步提高性能。如果这些数据被发送到另一个进程,而该进程中存在该类,那么数据将被反序列化为该类型的对象,而不会丢失任何信息。 + +如果未启用元数据共享,新类数据将被跳过,并返回一个 `NonexistentSkipClass` 的 stub 对象。 + +## 序列化库迁移 + +### JDK 迁移 + +如果您之前使用 JDK 序列化,并且无法同时升级 client 和 server(这在线上应用中很常见),Apache Fury 提供了一个 `org.apache.fury.serializer.JavaSerializer.serializedByJDK` 工具方法来检查二进制文件是否由 JDK 序列化生成。您可以使用以下模式使已有的序列化具有探测运行协议的能力,然后以异步滚动升级的方式将序列化器逐步升级至 Apache Fury: + +```java +if(JavaSerializer.serializedByJDK(bytes)){ + ObjectInputStream objectInputStream=xxx; + return objectInputStream.readObject(); + }else{ + return fury.deserialize(bytes); + } +``` + +### Apache Fury 更新 + +当前只保证小版本之间的兼容性。例如:您使用的 Fury 版本为 `0.8.0`,当升级到 Fury `0.8.1` 版本,可以确保二进制协议的兼容性。但是,如果更新到 Fury `0.9.0` 版本,二进制协议兼容性能力不能得到保证。我们计划在1.0.0版本开始提供大版本内的二进制兼容性。 + +## 常见问题排查 + +### 类不一致和类版本检查 + +如果您在创建 fury 时未将 `CompatibleMode` 设置为 `org.apache.fury.config.CompatibleMode.COMPATIBLE` 而出现奇怪的序列化错误,可能是由于序列化端和反序列化端之间的类不一致造成的。 + +在这种情况下,您可以调用 `FuryBuilder#withClassVersionCheck` 来创建 Fury 以验证它,如果反序列化时抛出`org.apache.fury.exception.ClassNotCompatibleException`,则表明类是不一致的,您应该通过 +`FuryBuilder#withCompatibleMode(CompatibleMode.COMPATIBLE)` 创建 Fury 对象。 + +`CompatibleMode.COMPATIBLE` 会带来更多的性能和空间代价,如果您的类在序列化和反序列化之间保持一致,请不要设置此选项。 + +### 使用错误的 API 反序列化 + +如果您调用
`Fury#serialize` 来序列化对象,则应调用 `Fury#deserialize` 来反序列化对象,而不是使用 `Fury#deserializeJavaObject`。 + +如果调用 `Fury#serializeJavaObject` 来序列化对象,则应调用 `Fury#deserializeJavaObject` 来进行反序列化。而不是使用`Fury#deserializeJavaObjectAndClass` 或者 `Fury#deserialize`。 + +如果调用 `Fury#serializeJavaObjectAndClass` 来序列化对象,则应 +调用 `Fury#deserializeJavaObjectAndClass` 进行反序列化,而不是使用`Fury#deserializeJavaObject` 或者 `Fury#deserialize`。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/guide/row_format_guide.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/guide/row_format_guide.md new file mode 100644 index 00000000000..f45119fd723 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/guide/row_format_guide.md @@ -0,0 +1,139 @@ +--- +title: Row format 指南 +sidebar_position: 1 +id: row_format_guide +--- + +## Row format protocol + +### Java + +```java +public class Bar { + String f1; + List f2; +} + +public class Foo { + int f1; + List f2; + Map f3; + List f4; +} + +RowEncoder encoder = Encoders.bean(Foo.class); +Foo foo = new Foo(); +foo.f1 = 10; +foo.f2 = IntStream.range(0, 1000000).boxed().collect(Collectors.toList()); +foo.f3 = IntStream.range(0, 1000000).boxed().collect(Collectors.toMap(i -> "k"+i, i->i)); +List bars = new ArrayList<>(1000000); +for (int i = 0; i < 1000000; i++) { + Bar bar = new Bar(); + bar.f1 = "s"+i; + bar.f2 = LongStream.range(0, 10).boxed().collect(Collectors.toList()); + bars.add(bar); +} +foo.f4 = bars; +// Can be zero-copy read by python +BinaryRow binaryRow = encoder.toRow(foo); +// can be data from python +Foo newFoo = encoder.fromRow(binaryRow); +// zero-copy read List f2 +BinaryArray binaryArray2 = binaryRow.getArray(1); +// zero-copy read List f4 +BinaryArray binaryArray4 = binaryRow.getArray(3); +// zero-copy read 11th element of `readList f4` +BinaryRow barStruct = binaryArray4.getStruct(10); + +// zero-copy read 6th of f2 of 11th element of `readList f4` +barStruct.getArray(1).getInt64(5); +RowEncoder barEncoder = 
Encoders.bean(Bar.class); +// deserialize part of data. +Bar newBar = barEncoder.fromRow(barStruct); +Bar newBar2 = barEncoder.fromRow(binaryArray4.getStruct(20)); +``` + +### Python + +```python +@dataclass +class Bar: + f1: str + f2: List[pa.int64] +@dataclass +class Foo: + f1: pa.int32 + f2: List[pa.int32] + f3: Dict[str, pa.int32] + f4: List[Bar] + +encoder = pyfury.encoder(Foo) +foo = Foo(f1=10, f2=list(range(1000_000)), + f3={f"k{i}": i for i in range(1000_000)}, + f4=[Bar(f1=f"s{i}", f2=list(range(10))) for i in range(1000_000)]) +binary: bytes = encoder.to_row(foo).to_bytes() +print(f"start: {datetime.datetime.now()}") +foo_row = pyfury.RowData(encoder.schema, binary) +print(foo_row.f2[100000], foo_row.f4[100000].f1, foo_row.f4[200000].f2[5]) +print(f"end: {datetime.datetime.now()}") + +binary = pickle.dumps(foo) +print(f"pickle start: {datetime.datetime.now()}") +new_foo = pickle.loads(binary) +print(new_foo.f2[100000], new_foo.f4[100000].f1, new_foo.f4[200000].f2[5]) +print(f"pickle end: {datetime.datetime.now()}") +``` + +### Apache Arrow 支持 + +Apache Fury Format 还支持从 Arrow Table/RecordBatch 自动转换。 + +Java: + +```java +Schema schema = TypeInference.inferSchema(BeanA.class); +ArrowWriter arrowWriter = ArrowUtils.createArrowWriter(schema); +Encoder encoder = Encoders.rowEncoder(BeanA.class); +for (int i = 0; i < 10; i++) { + BeanA beanA = BeanA.createBeanA(2); + arrowWriter.write(encoder.toRow(beanA)); +} +return arrowWriter.finishAsRecordBatch(); +``` + +Python: + +```python +import pyfury +encoder = pyfury.encoder(Foo) +encoder.to_arrow_record_batch([foo] * 10000) +encoder.to_arrow_table([foo] * 10000) +``` + +C++: + +```c++ +std::shared_ptr arrow_writer; +EXPECT_TRUE( + ArrowWriter::Make(schema, ::arrow::default_memory_pool(), &arrow_writer) + .ok()); +for (auto &row : rows) { + EXPECT_TRUE(arrow_writer->Write(row).ok()); +} +std::shared_ptr<::arrow::RecordBatch> record_batch; +EXPECT_TRUE(arrow_writer->Finish(&record_batch).ok()); 
+EXPECT_TRUE(record_batch->Validate().ok()); +EXPECT_EQ(record_batch->num_columns(), schema->num_fields()); +EXPECT_EQ(record_batch->num_rows(), row_nums); +``` + +```java +Schema schema = TypeInference.inferSchema(BeanA.class); +ArrowWriter arrowWriter = ArrowUtils.createArrowWriter(schema); +Encoder<BeanA> encoder = Encoders.rowEncoder(BeanA.class); +for (int i = 0; i < 10; i++) { + BeanA beanA = BeanA.createBeanA(2); + arrowWriter.write(encoder.toRow(beanA)); +} +return arrowWriter.finishAsRecordBatch(); +``` diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/guide/scala_guide.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/guide/scala_guide.md new file mode 100644 index 00000000000..c243bce7407 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/guide/scala_guide.md @@ -0,0 +1,138 @@ +--- +title: Scala 序列化指南 +sidebar_position: 4 +id: scala_guide +--- + +Apache Fury 支持所有 Scala 对象序列化: + +- 支持 `case` 类序列化; +- 支持 `pojo/bean` 类序列化; +- 支持 `object` 单例序列化; +- 支持 `collection` 序列化; +- 其他类型(如 `tuple/either` 以及基本类型)也都受支持。 + +Scala 2 和 3 均支持。 + +## 安装 + +```sbt +libraryDependencies += "org.apache.fury" % "fury-core" % "0.7.1" +``` + +## Fury 对象创建 + +当使用 Apache Fury 进行 Scala 序列化时,您应该至少使用以下选项创建 Fury 对象: + +```scala +val fury = Fury.builder() + .withScalaOptimizationEnabled(true) + .requireClassRegistration(true) + .withRefTracking(true) + .build() +``` + +根据您序列化的对象类型,您可能需要注册一些 Scala 的内部类型: + +```scala +fury.register(Class.forName("scala.collection.generic.DefaultSerializationProxy")) +fury.register(Class.forName("scala.Enumeration.Val")) +``` + +如果要避免此类注册,可以通过 `FuryBuilder#requireClassRegistration(false)` 禁用类注册来完成。 + +> 请注意:此选项可以反序列化未知的对象类型,使用更灵活。但如果类包含任何的恶意代码,会有安全风险。 + +循环引用在 Scala 中很常见,`Reference tracking` 应该由 `FuryBuilder#withRefTracking(true)` 配置选项开启。如果不启用 `Reference tracking`,则在序列化 Scala Enumeration 时,某些 Scala 版本可能会发生 [StackOverflowError 错误](https://github.com/apache/fury/issues/1032)。 + +> 注意:Fury 实例应该在多个序列化之间共享,创建 Fury
实例开销很大,应该尽量复用。 + +如果您在多个线程中使用共享的 Fury 实例,您应该通过 `FuryBuilder#buildThreadSafeFury()` 创建 `ThreadSafeFury` 来代替。 + +## 序列化 case 对象 + +```scala +case class Person(github: String, age: Int, id: Long) +val p = Person("https://github.com/chaokunyang", 18, 1) +println(fury.deserialize(fury.serialize(p))) +println(fury.deserializeJavaObject(fury.serializeJavaObject(p))) +``` + +## 序列化 pojo + +```scala +class Foo(f1: Int, f2: String) { + override def toString: String = s"Foo($f1, $f2)" +} +println(fury.deserialize(fury.serialize(Foo(1, "chaokunyang")))) +``` + +## 序列化对象单例 + +```scala +object singleton { +} +val o1 = fury.deserialize(fury.serialize(singleton)) +val o2 = fury.deserialize(fury.serialize(singleton)) +println(o1 == o2) +``` + +## 序列化集合 + +```scala +val seq = Seq(1,2) +val list = List("a", "b") +val map = Map("a" -> 1, "b" -> 2) +println(fury.deserialize(fury.serialize(seq))) +println(fury.deserialize(fury.serialize(list))) +println(fury.deserialize(fury.serialize(map))) +``` + +## 序列化元组 + +```scala +val tuple = Tuple2(100, 10000L) +println(fury.deserialize(fury.serialize(tuple))) +val tuple = Tuple4(100, 10000L, 10000L, "str") +println(fury.deserialize(fury.serialize(tuple))) +``` + +## 序列化枚举 + +### Scala3 枚举 + +```scala +enum Color { case Red, Green, Blue } +println(fury.deserialize(fury.serialize(Color.Green))) +``` + +### Scala2 枚举 + +```scala +object ColorEnum extends Enumeration { + type ColorEnum = Value + val Red, Green, Blue = Value +} +println(fury.deserialize(fury.serialize(ColorEnum.Green))) +``` + +## 序列化 Option 类型 + +```scala +val opt: Option[Long] = Some(100) +println(fury.deserialize(fury.serialize(opt))) +val opt1: Option[Long] = None +println(fury.deserialize(fury.serialize(opt1))) +``` + +## 性能 + +Apache Fury JIT 对 Scala `pojo/bean/case/object` 的支持很好,性能与 Apache Fury Java 一样优异。 + +Scala 集合和泛型不遵循 Java 集合框架,并且未与当前发行版中的 Apache Fury JIT 完全集成。性能不会像 Java 的 Fury collections 序列化那么好。 + +Scala 集合的执行将调用 Java 序列化 API
`writeObject/readObject/writeReplace/readResolve/readObjectNoData/Externalizable` 和 Fury `ObjectStream` 实现。虽然 `org.apache.fury.serializer.ObjectStreamSerializer` 比 JDK `ObjectOutputStream/ObjectInputStream` 快很多,但它仍然不知道如何使用 Scala 集合泛型。 + +未来我们计划为 Scala 类型提供更多优化,敬请期待,更多信息请参看 [#682](https://github.com/apache/fury/issues/682)! + +Scala 集合序列化已在 [#1073](https://github.com/apache/fury/pull/1073) 完成 ,如果您想获得更好的性能,请使用 Apache Fury snapshot 版本。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/guide/xlang_serialization_guide.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/guide/xlang_serialization_guide.md new file mode 100644 index 00000000000..dce3867683f --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/guide/xlang_serialization_guide.md @@ -0,0 +1,597 @@ +--- +title: 多语言序列化指南 +sidebar_position: 2 +id: xlang_object_graph_guide +--- + +## 跨语言对象图序列化 + +### 序列化内置类型 + +Apache Fury可以自动序列化编程语言的常见数据类型:primitive numeric types, string, binary, array, list, map 等。 + +**Java** + +```java +import org.apache.fury.*; +import org.apache.fury.config.*; + +import java.util.*; + +public class Example1 { + public static void main(String[] args) { + Fury fury = Fury.builder().withLanguage(Language.XLANG).build(); + List list = ofArrayList(true, false, "str", -1.1, 1, new int[100], new double[20]); + byte[] bytes = fury.serialize(list); + // bytes can be data serialized by other languages. + fury.deserialize(bytes); + Map map = new HashMap<>(); + map.put("k1", "v1"); + map.put("k2", list); + map.put("k3", -1); + bytes = fury.serialize(map); + // bytes can be data serialized by other languages. + fury.deserialize(bytes); + } +} +``` + +**Python** + +```python +import pyfury +import numpy as np + +fury = pyfury.Fury() +object_list = [True, False, "str", -1.1, 1, + np.full(100, 0, dtype=np.int32), np.full(20, 0.0, dtype=np.double)] +data = fury.serialize(object_list) +# bytes can be data serialized by other languages. 
+new_list = fury.deserialize(data) +object_map = {"k1": "v1", "k2": object_list, "k3": -1} +data = fury.serialize(object_map) +# bytes can be data serialized by other languages. +new_map = fury.deserialize(data) +print(new_map) +``` + +**Golang** + +```go +package main + +import furygo "github.com/apache/fury/fury/go/fury" +import "fmt" + +func main() { + list := []interface{}{true, false, "str", -1.1, 1, make([]int32, 10), make([]float64, 20)} + fury := furygo.NewFury() + bytes, err := fury.Marshal(list) + if err != nil { + panic(err) + } + var newValue interface{} + // bytes can be data serialized by other languages. + if err := fury.Unmarshal(bytes, &newValue); err != nil { + panic(err) + } + fmt.Println(newValue) + dict := map[string]interface{}{ + "k1": "v1", + "k2": list, + "k3": -1, + } + bytes, err = fury.Marshal(dict) + if err != nil { + panic(err) + } + // bytes can be data serialized by other languages. + if err := fury.Unmarshal(bytes, &newValue); err != nil { + panic(err) + } + fmt.Println(newValue) +} +``` + +**JavaScript** + +```javascript +import Fury from '@furyjs/fury'; + +/** + * @furyjs/hps use v8's fast-calls-api that can be called directly by jit, ensure that the version of Node is 20 or above. 
+ * Experimental feature, installation success cannot be guaranteed at this moment + * If you are unable to install the module, replace it with `const hps = null;` + **/ +import hps from '@furyjs/hps'; + +const fury = new Fury({ hps }); +const input = fury.serialize('hello fury'); +const result = fury.deserialize(input); +console.log(result); +``` + +**Rust** + +```rust +use chrono::{NaiveDate, NaiveDateTime}; +use fury::{from_buffer, to_buffer, Fury}; +use std::collections::HashMap; + +fn run() { + let bin: Vec = to_buffer(&"hello".to_string()); + let obj: String = from_buffer(&bin).expect("should success"); + assert_eq!("hello".to_string(), obj); +} +``` + +### 序列化自定义类型 + +序列化用户定义的类型需要使用注册 API 注册自定义类型,以便在不同语言中建立类型之间的映射关系。 + +**Java** + +```java +import org.apache.fury.*; +import org.apache.fury.config.*; +import java.util.*; + +public class Example2 { + public static class SomeClass1 { + Object f1; + Map f2; + } + + public static class SomeClass2 { + Object f1; + String f2; + List f3; + Map f4; + Byte f5; + Short f6; + Integer f7; + Long f8; + Float f9; + Double f10; + short[] f11; + List f12; + } + + public static Object createObject() { + SomeClass1 obj1 = new SomeClass1(); + obj1.f1 = true; + obj1.f2 = ofHashMap((byte) -1, 2); + SomeClass2 obj = new SomeClass2(); + obj.f1 = obj1; + obj.f2 = "abc"; + obj.f3 = ofArrayList("abc", "abc"); + obj.f4 = ofHashMap((byte) 1, 2); + obj.f5 = Byte.MAX_VALUE; + obj.f6 = Short.MAX_VALUE; + obj.f7 = Integer.MAX_VALUE; + obj.f8 = Long.MAX_VALUE; + obj.f9 = 1.0f / 2; + obj.f10 = 1 / 3.0; + obj.f11 = new short[]{(short) 1, (short) 2}; + obj.f12 = ofArrayList((short) -1, (short) 4); + return obj; + } + + // mvn exec:java -Dexec.mainClass="org.apache.fury.examples.Example2" + public static void main(String[] args) { + Fury fury = Fury.builder().withLanguage(Language.XLANG).build(); + fury.register(SomeClass1.class, "example.SomeClass1"); + fury.register(SomeClass2.class, "example.SomeClass2"); + byte[] bytes = 
fury.serialize(createObject()); + // bytes can be data serialized by other languages. + System.out.println(fury.deserialize(bytes)); + } +} +``` + +**Python** + +```python +from dataclasses import dataclass +from typing import List, Dict, Any +import pyfury, array + + +@dataclass +class SomeClass1: + f1: Any + f2: Dict[pyfury.Int8Type, pyfury.Int32Type] + + +@dataclass +class SomeClass2: + f1: Any = None + f2: str = None + f3: List[str] = None + f4: Dict[pyfury.Int8Type, pyfury.Int32Type] = None + f5: pyfury.Int8Type = None + f6: pyfury.Int16Type = None + f7: pyfury.Int32Type = None + # int type will be taken as `pyfury.Int64Type`. + # use `pyfury.Int32Type` for type hint if peer + # are more narrow type. + f8: int = None + f9: pyfury.Float32Type = None + # float type will be taken as `pyfury.Float64Type` + f10: float = None + f11: pyfury.Int16ArrayType = None + f12: List[pyfury.Int16Type] = None + + +if __name__ == "__main__": + f = pyfury.Fury() + f.register_class(SomeClass1, type_tag="example.SomeClass1") + f.register_class(SomeClass2, type_tag="example.SomeClass2") + obj1 = SomeClass1(f1=True, f2={-1: 2}) + obj = SomeClass2( + f1=obj1, + f2="abc", + f3=["abc", "abc"], + f4={1: 2}, + f5=2 ** 7 - 1, + f6=2 ** 15 - 1, + f7=2 ** 31 - 1, + f8=2 ** 63 - 1, + f9=1.0 / 2, + f10=1 / 3.0, + f11=array.array("h", [1, 2]), + f12=[-1, 4], + ) + data = f.serialize(obj) + # bytes can be data serialized by other languages. 
+ print(f.deserialize(data)) +``` + +**Golang** + +```go +package main + +import furygo "github.com/apache/fury/fury/go/fury" +import "fmt" + +func main() { + type SomeClass1 struct { + F1 interface{} + F2 string + F3 []interface{} + F4 map[int8]int32 + F5 int8 + F6 int16 + F7 int32 + F8 int64 + F9 float32 + F10 float64 + F11 []int16 + F12 fury.Int16Slice + } + + type SomeClas2 struct { + F1 interface{} + F2 map[int8]int32 + } + fury := furygo.NewFury() + if err := fury.RegisterTagType("example.SomeClass1", SomeClass1{}); err != nil { + panic(err) + } + if err := fury.RegisterTagType("example.SomeClass2", SomeClass2{}); err != nil { + panic(err) + } + obj1 := &SomeClass1{} + obj1.F1 = true + obj1.F2 = map[int8]int32{-1: 2} + obj := &SomeClass1{} + obj.F1 = obj1 + obj.F2 = "abc" + obj.F3 = []interface{}{"abc", "abc"} + f4 := map[int8]int32{1: 2} + obj.F4 = f4 + obj.F5 = fury.MaxInt8 + obj.F6 = fury.MaxInt16 + obj.F7 = fury.MaxInt32 + obj.F8 = fury.MaxInt64 + obj.F9 = 1.0 / 2 + obj.F10 = 1 / 3.0 + obj.F11 = []int16{1, 2} + obj.F12 = []int16{-1, 4} + bytes, err := fury.Marshal(obj); + if err != nil { + panic(err) + } + var newValue interface{} + // bytes can be data serialized by other languages. + if err := fury.Unmarshal(bytes, &newValue); err != nil { + panic(err) + } + fmt.Println(newValue) +} +``` + +**JavaScript** + +```javascript +import Fury, { Type, InternalSerializerType } from '@furyjs/fury'; + +/** + * @furyjs/hps use v8's fast-calls-api that can be called directly by jit, ensure that the version of Node is 20 or above. + * Experimental feature, installation success cannot be guaranteed at this moment + * If you are unable to install the module, replace it with `const hps = null;` + **/ +import hps from '@furyjs/hps'; + +// Now we describe data structures using JSON, but in the future, we will use more ways. 
+const description = Type.object('example.foo', { + foo: Type.string(), +}); +const fury = new Fury({ hps }); +const { serialize, deserialize } = fury.registerSerializer(description); +const input = serialize({ foo: 'hello fury' }); +const result = deserialize(input); +console.log(result); +``` + +**Rust** + +```rust +use chrono::{NaiveDate, NaiveDateTime}; +use fury::{from_buffer, to_buffer, Fury}; +use std::collections::HashMap; + +#[test] +fn complex_struct() { + #[derive(Fury, Debug, PartialEq)] + #[tag("example.foo2")] + struct Animal { + category: String, + } + + #[derive(Fury, Debug, PartialEq)] + #[tag("example.foo")] + struct Person { + c1: Vec, // binary + c2: Vec, // primitive array + animal: Vec, + c3: Vec>, + name: String, + c4: HashMap, + age: u16, + op: Option, + op2: Option, + date: NaiveDate, + time: NaiveDateTime, + c5: f32, + c6: f64, + } + let person: Person = Person { + c1: vec![1, 2, 3], + c2: vec![5, 6, 7], + c3: vec![vec![1, 2], vec![1, 3]], + animal: vec![Animal { + category: "Dog".to_string(), + }], + c4: HashMap::from([ + ("hello1".to_string(), "hello2".to_string()), + ("hello2".to_string(), "hello3".to_string()), + ]), + age: 12, + name: "helo".to_string(), + op: Some("option".to_string()), + op2: None, + date: NaiveDate::from_ymd_opt(2025, 12, 12).unwrap(), + time: NaiveDateTime::from_timestamp_opt(1689912359, 0).unwrap(), + c5: 2.0, + c6: 4.0, + }; + + let bin: Vec = to_buffer(&person); + let obj: Person = from_buffer(&bin).expect("should success"); + assert_eq!(person, obj); +} +``` + +### 序列化共享引用和循环引用 + +共享引用和循环引用可自动序列化,不会出现重复数据或递归错误。 + +**Java** + +```java +import org.apache.fury.*; +import org.apache.fury.config.*; +import java.util.*; + +public class ReferenceExample { + public static class SomeClass { + SomeClass f1; + Map f2; + Map f3; + } + + public static Object createObject() { + SomeClass obj = new SomeClass(); + obj.f1 = obj; + obj.f2 = ofHashMap("k1", "v1", "k2", "v2"); + obj.f3 = obj.f2; + return obj; + } + + // mvn 
exec:java -Dexec.mainClass="org.apache.fury.examples.ReferenceExample" + public static void main(String[] args) { + Fury fury = Fury.builder().withLanguage(Language.XLANG) + .withRefTracking(true).build(); + fury.register(SomeClass.class, "example.SomeClass"); + byte[] bytes = fury.serialize(createObject()); + // bytes can be data serialized by other languages. + System.out.println(fury.deserialize(bytes)); + } +} +``` + +**Python** + +```python +from typing import Dict +import pyfury + +class SomeClass: + f1: "SomeClass" + f2: Dict[str, str] + f3: Dict[str, str] + +fury = pyfury.Fury(ref_tracking=True) +fury.register_class(SomeClass, type_tag="example.SomeClass") +obj = SomeClass() +obj.f2 = {"k1": "v1", "k2": "v2"} +obj.f1, obj.f3 = obj, obj.f2 +data = fury.serialize(obj) +# bytes can be data serialized by other languages. +print(fury.deserialize(data)) +``` + +**Golang** + +```go +package main + +import furygo "github.com/apache/fury/fury/go/fury" +import "fmt" + +func main() { + type SomeClass struct { + F1 *SomeClass + F2 map[string]string + F3 map[string]string + } + fury := furygo.NewFury(true) + if err := fury.RegisterTagType("example.SomeClass", SomeClass{}); err != nil { + panic(err) + } + value := &SomeClass{F2: map[string]string{"k1": "v1", "k2": "v2"}} + value.F3 = value.F2 + value.F1 = value + bytes, err := fury.Marshal(value) + if err != nil { + } + var newValue interface{} + // bytes can be data serialized by other languages. + if err := fury.Unmarshal(bytes, &newValue); err != nil { + panic(err) + } + fmt.Println(newValue) +} +``` + +**JavaScript** + +```javascript +import Fury, { Type } from '@furyjs/fury'; +/** + * @furyjs/hps use v8's fast-calls-api that can be called directly by jit, ensure that the version of Node is 20 or above. 
+ * Experimental feature, installation success cannot be guaranteed at this moment + * If you are unable to install the module, replace it with `const hps = null;` + **/ +import hps from '@furyjs/hps'; + +const description = Type.object('example.foo', { + foo: Type.string(), + bar: Type.object('example.foo'), +}); + +const fury = new Fury({ hps }); +const { serialize, deserialize } = fury.registerSerializer(description); +const data: any = { + foo: 'hello fury', +}; +data.bar = data; +const input = serialize(data); +const result = deserialize(input); +console.log(result.bar.foo === result.foo); +``` + +**Rust** +References cannot be implemented because of Rust ownership restrictions. + +### Zero-Copy Serialization + +**Java** + +```java +import org.apache.fury.*; +import org.apache.fury.config.*; +import org.apache.fury.serializers.BufferObject; +import org.apache.fury.memory.MemoryBuffer; + +import java.util.*; +import java.util.stream.Collectors; + +public class ZeroCopyExample { + // mvn exec:java -Dexec.mainClass="io.ray.fury.examples.ZeroCopyExample" + public static void main(String[] args) { + Fury fury = Fury.builder().withLanguage(Language.XLANG).build(); + List list = ofArrayList("str", new byte[1000], new int[100], new double[100]); + Collection bufferObjects = new ArrayList<>(); + byte[] bytes = fury.serialize(list, e -> !bufferObjects.add(e)); + // bytes can be data serialized by other languages.
+ List buffers = bufferObjects.stream() + .map(BufferObject::toBuffer).collect(Collectors.toList()); + System.out.println(fury.deserialize(bytes, buffers)); + } +} +``` + +**Python** + +```python +import array +import pyfury +import numpy as np + +fury = pyfury.Fury() +list_ = ["str", bytes(bytearray(1000)), + array.array("i", range(100)), np.full(100, 0.0, dtype=np.double)] +serialized_objects = [] +data = fury.serialize(list_, buffer_callback=serialized_objects.append) +buffers = [o.to_buffer() for o in serialized_objects] +# bytes can be data serialized by other languages. +print(fury.deserialize(data, buffers=buffers)) +``` + +**Golang** + +```go +package main + +import furygo "github.com/apache/fury/fury/go/fury" +import "fmt" + +func main() { + fury := furygo.NewFury() + list := []interface{}{"str", make([]byte, 1000)} + buf := fury.NewByteBuffer(nil) + var bufferObjects []fury.BufferObject + fury.Serialize(buf, list, func(o fury.BufferObject) bool { + bufferObjects = append(bufferObjects, o) + return false + }) + var newList []interface{} + var buffers []*fury.ByteBuffer + for _, o := range bufferObjects { + buffers = append(buffers, o.ToBuffer()) + } + if err := fury.Deserialize(buf, &newList, buffers); err != nil { + panic(err) + } + fmt.Println(newList) +} +``` + +**JavaScript** + +```javascript +// Coming soon +``` diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/guide/xlang_type_mapping.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/guide/xlang_type_mapping.md new file mode 100644 index 00000000000..82b36cb83a4 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/guide/xlang_type_mapping.md @@ -0,0 +1,95 @@ +--- +title: 多语言序列化的类型映射 +sidebar_position: 3 +id: xlang_type_mapping +--- + +注意: + +- 有关类型定义,请参阅 [Spec 中的类型系统](https://fury.apache.org/docs/specification/fury_xlang_serialization_spec#type-systems) +- `int16_t[n]/vector<T>` 表示 `int16_t[n]/vector<int16_t>`。
+- 跨语言序列化并不稳定,请勿在生产环境中使用。 + +## Type Mapping + +| Fury 类型 | Fury 类型 ID | Java | Python | Javascript | C++ | Golang | Rust | +|--------------------|--------------|-----------------|----------------------|-----------------|--------------------------------|------------------|------------------| +| bool | 1 | bool/Boolean | bool | Boolean | bool | bool | bool | +| int8 | 2 | byte/Byte | int/pyfury.Int8 | Type.int8() | int8_t | int8 | i8 | +| int16 | 3 | short/Short | int/pyfury.Int16 | Type.int16() | int16_t | int16 | i16 | +| int32 | 4 | int/Integer | int/pyfury.Int32 | Type.int32() | int32_t | int32 | i32 | +| var_int32 | 5 | int/Integer | int/pyfury.VarInt32 | Type.varint32() | fury::varint32_t | fury.varint32 | fury::varint32 | +| int64 | 6 | long/Long | int/pyfury.Int64 | Type.int64() | int64_t | int64 | i64 | +| var_int64 | 7 | long/Long | int/pyfury.VarInt64 | Type.varint64() | fury::varint64_t | fury.varint64 | fury::varint64 | +| sli_int64 | 8 | long/Long | int/pyfury.SliInt64 | Type.sliint64() | fury::sliint64_t | fury.sliint64 | fury::sliint64 | +| float16 | 9 | float/Float | float/pyfury.Float16 | Type.float16() | fury::float16_t | fury.float16 | fury::f16 | +| float32 | 10 | float/Float | float/pyfury.Float32 | Type.float32() | float | float32 | f32 | +| float64 | 11 | double/Double | float/pyfury.Float64 | Type.float64() | double | float64 | f64 | +| string | 12 | String | str | String | string | string | String/str | +| enum | 13 | Enum subclasses | enum subclasses | / | enum | / | enum | +| list | 14 | List/Collection | list/tuple | array | vector | slice | Vec | +| set | 15 | Set | set | / | set | fury.Set | Set | +| map | 16 | Map | dict | Map | unordered_map | map | HashMap | +| duration | 17 | Duration | timedelta | Number | duration | Duration | Duration | +| timestamp | 18 | Instant | datetime | Number | std::chrono::nanoseconds | Time | DateTime | +| decimal | 19 | BigDecimal | Decimal | bigint | / | / | / | +| binary | 20 | byte[] | bytes | / |
`uint8_t[n]/vector` | `[n]uint8/[]T` | `Vec` | +| array | 21 | array | np.ndarray | / | / | array/slice | Vec | +| bool_array | 22 | bool[] | ndarray(np.bool_) | / | `bool[n]` | `[n]bool/[]T` | `Vec` | +| int8_array | 23 | byte[] | ndarray(int8) | / | `int8_t[n]/vector` | `[n]int8/[]T` | `Vec` | +| int16_array | 24 | short[] | ndarray(int16) | / | `int16_t[n]/vector` | `[n]int16/[]T` | `Vec` | +| int32_array | 25 | int[] | ndarray(int32) | / | `int32_t[n]/vector` | `[n]int32/[]T` | `Vec` | +| int64_array | 26 | long[] | ndarray(int64) | / | `int64_t[n]/vector` | `[n]int64/[]T` | `Vec` | +| float16_array | 27 | float[] | ndarray(float16) | / | `fury::float16_t[n]/vector` | `[n]float16/[]T` | `Vec` | +| float32_array | 28 | float[] | ndarray(float32) | / | `float[n]/vector` | `[n]float32/[]T` | `Vec` | +| float64_array | 29 | double[] | ndarray(float64) | / | `double[n]/vector` | `[n]float64/[]T` | `Vec` | +| tensor | 30 | / | / | / | / | / | / | +| sparse tensor | 31 | / | / | / | / | / | / | +| arrow record batch | 32 | / | / | / | / | / | / | +| arrow table | 33 | / | / | / | / | / | / | + +### 类型信息(目前尚未实现) + +由于语言类型系统之间的差异,这些类型无法在语言之间一对一地映射。 + +如果用户看到一种语言中的一种类型对应 Apache Fury 类型系统中的多种类型。 + +例如:java 中的 `long` 类型对应 `int64/varint64/sliint64` 类型。类型为 `int64/varint64/sliint64` 这意味着该语言缺少某些类型,用户在使用 Fury 时必须提供额外的类型信息。 + +### 类型注解 + +如果类型是另一个类的字段,用户可以为类型的字段或整个类型提供 meta hints。 +这些信息也可以用其他语言提供: + +- java:使用注解; +- cpp:使用宏和模板; +- golang:使用 struct tag; +- python: 使用 typehint; +- rust:使用宏。 + +下面是一个例子: + +- Java: + + ```java + class Foo { + @Int32Type(varint = true) + int f1; + List<@Int32Type(varint = true) Integer> f2; + } + ``` + +- Python: + + ```python + class Foo: + f1: Int32Type(varint=True) + f2: List[Int32Type(varint=True)] + ``` + +## 类型包装器 + +如果类型不是类的字段,用户必须用 Fury 类型来包装该类型,以传递额外的类型信息。 + +例如:假设 Apache Fury Java 提供了 `VarInt64` 类型,当用户调用 `fury.serialize(long_value)` 时,需要像下面这样调用 +调用 `fury.serialize(new VarInt64(long_value))`。 diff --git 
a/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/introduction/benchmark.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/introduction/benchmark.md new file mode 100644 index 00000000000..5723f9e53f7 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/introduction/benchmark.md @@ -0,0 +1,35 @@ +--- +id: benchmark +title: Benchmark +sidebar_position: 2 +--- + +不同的序列化框架适用于不同的场景,这里的 benchmark 结果仅供参考。 + +如果你需要针对特定场景进行 benchmark 测试,请确保所有序列化框架都针对该场景进行了适当配置。 + +动态序列化框架支持多态和引用,与静态序列化框架相比,其成本更高,除非它和 Fury 一样使用 JIT 技术。由于 Fury 在运行时生成代码,因此请在收集 benchmark 统计信息之前进行系统预热。 + +### Java 序列化 + + + + + + +### Java 反序列化 + + + + + + +有关类型向前/向后兼容性、堆外支持、零拷贝序列化的更多 benchmark 测试,请参见 [benchmarks](https://github.com/apache/fury/tree/main/docs/benchmarks)。 + +### JavaScript + + + +该条形图使用的数据包括一个具有多种字段类型的复杂对象,JSON 数据的大小为 3KB。 + +benchmark 测试代码请参阅 [benchmarks](https://github.com/apache/fury/blob/main/javascript/benchmark/index.js)。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/introduction/features.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/introduction/features.md new file mode 100644 index 00000000000..f4c467b9f0d --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/introduction/features.md @@ -0,0 +1,21 @@ +--- +id: features +title: Features +sidebar_position: 3 +--- + +- 多种语言:Java/Python/C++/Golang/Javascript/Rust。 +- 零拷贝:类似[pickle5](https://peps.python.org/pep-0574/),支持堆外读/写的跨语言序列化。 +- 高性能:高度可扩展的 JIT 框架,可在运行时以异步多线程方式生成序列化程序代码,以加快序列化速度,通过以下方式提升 20-170 倍的速度: + - 通过在生成的代码中内联变量来减少内存访问; + - 通过在生成的代码中内联调用来减少虚拟方法调用; + - 减少条件分支; + - 减少哈希查找; +- 二进制协议:对象图、行格式等。 + +除了跨语言序列化之外,Fury 还支持以下功能: + +- 直接替换 Java 序列化框架,如 JDK/Kryo/Hessian,无需修改任何代码,但速度提高 100 倍。它可以大大提高 RPC 调用性能、数据传输和对象持久化的效率; +- 100% 兼容 JDK 序列化,原生支持 java 自定义序列化 `writeObject/readObject/writeReplace/readResolve/readObjectNoData`; +- 支持 golang 的共享和循环引用对象序列化; +- 支持 golang 的自动对象序列化。 diff --git
a/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/introduction/introduction.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/introduction/introduction.md new file mode 100644 index 00000000000..e20d40dc873 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/introduction/introduction.md @@ -0,0 +1,56 @@ +--- +id: introduction +title: Apache Fury 介绍 +sidebar_position: 1 +--- + +Fury 是一个由 JIT(即时编译)和零拷贝提供支持的高性能多语言序列化框架。 + +## 协议 + +不同的场景有不同的序列化要求。Apache Fury 为这些需求设计并实现了多个二进制协议: + +- 跨语言对象图协议: + - 跨语言自动序列化任何对象,无需 IDL 定义、Schema编译和对象协议转换; + - 支持共享引用和循环引用,无重复数据或递归错误; + - 原生支持对象多态。 +- 原生 java/python 对象图协议:基于单一语言的完整类型系统进行高度优化; +- 行格式协议:缓存友好的二进制随机访问格式,支持跳过序列化和部分序列化,并且可以自动转换为列格式。 + +基于 fury 现有的 buffer、encoding、meta、codegen 和其他功能,可以轻松添加新协议。所有这些协议都共享相同的代码库,并且一个协议的优化可以被另一个协议重用。 + +## 兼容性 + +### Schema兼容性 + +Apache Fury java 对象图序列化支持类架构向前/向后兼容。序列化 Peer 节点和反序列化 Peer 节点可以独立添加/删除字段。 + +我们计划在[元数据压缩](https://github.com/apache/fury/issues/203)完成后支持跨语言序列化Schema兼容性。 + +### 二进制兼容性 + +我们仍在改进我们的协议,目前无法确保 fury 版本之间的二进制兼容性。如果您将来要升级 fury,请 `shade` fury。 + +在 fury 1.0 之前将确保二进制兼容性。 + +## 安全 + +静态序列化(如行格式)本质上是安全的。但动态对象图序列化支持反序列化未注册的类型,这可能会带来安全风险。 + +例如:反序列化可能会调用 `init` constructor 或 `equals`/ `hashCode` 方法,如果方法体中包含恶意代码,系统将处于危险之中。 + +Apache Fury 提供了一个类注册选项,并默认开启该选项,它只允许反序列化受信任的注册类型或内置类型。**不要禁用类注册或类注册检查,除非您可以确保您的环境确实是安全的**。如果您禁用了 class 注册选项,你需要自行负责序列化的安全性。 + +## 路线图 + +- 元压缩、自动元共享和跨语言兼容性; +- 用于 c++/golang 的 AOT 框架,用于静态生成代码; +- C++/Rust 对象图序列化支持; +- Golang/Rust/NodeJS 行存格式支持; +- ProtoBuffer 兼容性支持; +- 特征和模型序列化协议; +- 不断改进我们的序列化基础设施,以更快支持任何新的协议。 + +## 如何贡献 + +请阅读[贡献](https://github.com/apache/fury/blob/main/CONTRIBUTING.md)指南以获取有关如何贡献的说明。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/specification/java_serialization_spec.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/specification/java_serialization_spec.md new file mode 100644 index 00000000000..82709374608 --- /dev/null +++ 
b/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/specification/java_serialization_spec.md @@ -0,0 +1,557 @@ +--- +title: Fury Java Serialization Format +sidebar_position: 1 +id: fury_java_serialization_spec +license: | + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--- + +## Spec overview + +Fury Java Serialization is an automatic object serialization framework that supports reference and polymorphism. Fury +will +convert an object from/to fury java serialization binary format. Fury has two core concepts for java serialization: + +- **Fury Java Binary format** +- **Framework to convert object to/from Fury Java Binary format** + +The serialization format is a dynamic binary format. The dynamics and reference/polymorphism support make Fury flexible, +much more easy to use, but +also introduce more complexities compared to static serialization frameworks. So the format will be more complex. + +Here is the overall format: + +``` +| fury header | object ref meta | object class meta | object value data | +``` + +The data are serialized using little endian byte order overall. If bytes swap is costly for some object, +Fury will write the byte order for that object into the data instead of converting it to little endian. 
+ +## Fury header + +Fury header starts with one byte: + +``` +| 4 bits | 1 bit | 1 bit | 1 bit | 1 bit | optional 4 bytes | ++---------------+-------+-------+--------+-------+------------------------------------+ +| reserved bits | oob | xlang | endian | null | unsigned int for meta start offset | +``` + +- null flag: 1 when object is null, 0 otherwise. If an object is null, other bits won't be set. +- endian flag: 1 when data is encoded by little endian, 0 for big endian. +- xlang flag: 1 when serialization uses xlang format, 0 when serialization uses Fury java format. +- oob flag: 1 when passed `BufferCallback` is not null, 0 otherwise. + +If meta share mode is enabled, an uncompressed unsigned int is appended to indicate the start offset of metadata. + +## Reference Meta + +Reference tracking handles whether the object is null, and whether to track reference for the object by writing +corresponding flags and maintaining internal state. + +Reference flags: + +| Flag | Byte Value | Description | +|---------------------|------------|---------------------------------------------------------------------------------------------------------------------------------------------------------| +| NULL FLAG | `-3` | This flag indicates the object is a null value. We don't use another byte to indicate REF, so that we can save one byte. | +| REF FLAG | `-2` | This flag indicates the object is already serialized previously, and fury will write a ref id with unsigned varint format instead of serializing it again | +| NOT_NULL VALUE FLAG | `-1` | This flag indicates the object is a non-null value and fury doesn't track ref for this type of object. | +| REF VALUE FLAG | `0` | This flag indicates the object is referencable and is being serialized for the first time. | + +When reference tracking is disabled globally or for specific types, or for certain types within a particular +context(e.g., a field of a class), only the `NULL` and `NOT_NULL VALUE` flags will be used for reference meta.
+ +## Class Meta + +Fury supports to register class by an optional id, the registration can be used for security check and class +identification. +If a class is registered, it will have a user-provided or an auto-growing unsigned int i.e. `class_id`. + +Depending on whether meta share mode and registration is enabled for current class, Fury will write class meta +differently. + +### Schema consistent + +If schema consistent mode is enabled globally or enabled for current class, class meta will be written as follows: + +- If class is registered, it will be written as a fury unsigned varint: `class_id << 1`. +- If class is not registered: + - If class is not an array, fury will write one byte `0bxxxxxxx1` first, then write class name. + - The first little bit is `1`, which is different from first bit `0` of + encoded class id. Fury can use this information to determine whether to read class by class id for + deserialization. + - If class is not registered and class is an array, fury will write one byte `dimensions << 1 | 1` first, then write + component + class subsequently. This can reduce array class name cost if component class is or will be serialized. + - Class will be written as two enumerated fury unsigned by default: `package name` and `class name`. If meta share + mode is + enabled, + class will be written as an unsigned varint which points to index in `MetaContext`. + +### Schema evolution + +If schema evolution mode is enabled globally or enabled for current class, class meta will be written as follows: + +- If meta share mode is not enabled, class meta will be written as schema consistent mode. Additionally, field meta such + as field type + and name will be written with the field value using a key-value like layout. +- If meta share mode is enabled, class meta will be written as a meta-share encoded binary if class hasn't been written + before, otherwise an unsigned varint id which references to previous written class meta will be written. 
+ +## Meta share + +> This mode will forbid streaming writing since it needs to look back to update the start offset after the whole object +> graph +> writing and meta collecting is finished. Only in this way we can ensure deserialization failure doesn't lose shared +> meta. +> Meta streamline will be supported in the future for enclosed meta sharing which doesn't cross multiple serializations +> of different objects. + +For Schema consistent mode, class will be encoded as an enumerated string by full class name. Here we mainly describe +the meta layout for schema evolution mode: + +``` +| 8 bytes meta header | meta size | variable bytes | variable bytes | variable bytes | ++-------------------------------+-----------|--------------------+-------------------+----------------+ +| 7 bytes hash + 1 bytes header | 1~2 bytes | current class meta | parent class meta | ... | +``` + +Class meta are encoded from parent class to leaf class, only class with serializable fields will be encoded. + +### Meta header + +Meta header is a 64 bits number value encoded in little endian order. + +- Lowest 4 digits `0b0000~0b1110` are used to record num classes. `0b1111` is reserved to indicate that Fury needs to + read more bytes for length using Fury unsigned int encoding. If current class doesn't have parent class, or parent + class doesn't have fields to serialize, or we're in a context which serializes fields of current class + only( `ObjectStreamSerializer#SlotInfo` is an example), num classes will be 1. +- 5th bit is used to indicate whether this class needs schema evolution. +- 6th bit is used to indicate whether the size sum of all layers meta is less than 256. +- The other 56 bits are used to store the unique hash of `flags + all layers class meta`. + +### Meta size + +- If the size sum of all layers meta is less than 256, then one byte is written next to indicate the length of meta. +- Otherwise, write size as two bytes in little endian.
+ +### Single layer class meta + +``` +| unsigned varint | meta string | meta string | field info: variable bytes | variable bytes | ... | ++----------------------------+-----------------------+---------------------+-------------------------------+-----------------+-----+ +| num fields + register flag | header + package name | header + class name | header + type id + field name | next field info | ... | +``` + +- num fields: encode `num fields << 1 | register flag(1 when class registered)` as unsigned varint. + - If class is registered, then an unsigned varint class id will be written next, package and class name will be + omitted. + - If current class is schema consistent, then num field will be `0` to flag it. + - If current class isn't schema consistent, then num field will be the number of compatible fields. For example, + users + can use tag id to mark some field as compatible field in schema consistent context. In such cases, schema + consistent + fields will be serialized first, then compatible fields will be serialized next. At deserialization, Fury will use + fields info of those fields which aren't annotated by tag id for deserializing schema consistent fields, then use + fields info in meta for deserializing compatible fields. +- Package name encoding(omitted when class is registered): + - encoding algorithm: `UTF8/ALL_TO_LOWER_SPECIAL/LOWER_UPPER_DIGIT_SPECIAL` + - Header: `6 bits size | 2 bits encoding flags`. The `6 bits size: 0~63` will be used to indicate size `0~63`, + the value `63` the size need more byte to read, the encoding will encode `size - 63` as a varint next. +- Class name encoding(omitted when class is registered): + - encoding algorithm: `UTF8/LOWER_UPPER_DIGIT_SPECIAL/FIRST_TO_LOWER_SPECIAL/ALL_TO_LOWER_SPECIAL` + - header: `6 bits size | 2 bits encoding flags`. The `6 bits size: 0~63` will be used to indicate size `0~63`, + the value `63` the size need more byte to read, the encoding will encode `size - 63` as a varint next. 
+- Field info: + - header(8 + bits): `3 bits size + 2 bits field name encoding + polymorphism flag + nullability flag + ref tracking flag`. + Users can use annotation to provide those info. + - 2 bits field name encoding: + - encoding: `UTF8/ALL_TO_LOWER_SPECIAL/LOWER_UPPER_DIGIT_SPECIAL/TAG_ID` + - If tag id is used, i.e. field name is written by an unsigned varint tag id. 2 bits encoding will be `11`. + - size of field name: + - The `3 bits size: 0~7` will be used to indicate length `1~7`, the value `6` the size read more bytes, + the encoding will encode `size - 7` as a varint next. + - If encoding is `TAG_ID`, then num_bytes of field name will be used to store tag id. + - ref tracking: when set to 1, ref tracking will be enabled for this field. + - nullability: when set to 1, this field can be null. + - polymorphism: when set to 1, the actual type of field will be the declared field type even the type if + not `final`. + - type id: + - For registered type-consistent classes, it will be the registered class id. + - Otherwise it will be encoded as `OBJECT_ID` if it isn't `final` and `FINAL_OBJECT_ID` if it's `final`. The + meta for such types is written separately instead of inlining here is to reduce meta space cost if object of + this type is serialized in current object graph multiple times, and the field value may be null too. + - Field name: If type id is set, type id will be used instead. Otherwise meta string encoding length and data will + be written instead. + +Field order are left as implementation details, which is not exposed to specification, the deserialization need to +resort fields based on Fury field comparator. In this way, fury can compute statistics for field names or types and +using a more compact encoding. 
+ +### Other layers class meta + +Same encoding algorithm as the previous layer except: + +- header + package name: + - Header: + - If package name has been written before: `varint index + sharing flag(set)` will be written + - If package name hasn't been written before: + - If meta string encoding is `LOWER_SPECIAL` and the length of encoded string `<=` 64, then header will be + `6 bits size + encoding flag(set) + sharing flag(unset)`. + - Otherwise, header will + be `3 bits unset + 3 bits encoding flags + encoding flag(unset) + sharing flag(unset)` + +## Meta String + +Meta string is mainly used to encode meta strings such as class name and field names. + +### Encoding Algorithms + +String binary encoding algorithm: + +| Algorithm | Pattern | Description | +|---------------------------|---------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| LOWER_SPECIAL | `a-z._$\|` | every char is written using 5 bits, `a-z`: `0b00000~0b11001`, `._$\|`: `0b11010~0b11101`, prepend one bit at the start to indicate whether strip last char since last byte may have 7 redundant bits(1 indicates strip last char) | +| LOWER_UPPER_DIGIT_SPECIAL | `a-zA-Z0~9._` | every char is written using 6 bits, `a-z`: `0b00000~0b11001`, `A-Z`: `0b11010~0b110011`, `0~9`: `0b110100~0b111101`, `._`: `0b111110~0b111111`, prepend one bit at the start to indicate whether strip last char since last byte may have 7 redundant bits(1 indicates strip last char) | +| UTF-8 | any chars | UTF-8 encoding | + +Encoding flags: + +| Encoding Flag | Pattern | Encoding Algorithm | 
+|---------------------------|---------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------| +| LOWER_SPECIAL | every char is in `a-z._$\|` | `LOWER_SPECIAL` | +| FIRST_TO_LOWER_SPECIAL | every char is in `a-z[c1,c2]` except first char is upper case | replace first upper case char to lower case, then use `LOWER_SPECIAL` | +| ALL_TO_LOWER_SPECIAL | every char is in `a-zA-Z[c1,c2]` | replace every upper case char by `\|` + `lower case`, then use `LOWER_SPECIAL`, use this encoding if it's smaller than Encoding `LOWER_UPPER_DIGIT_SPECIAL` | +| LOWER_UPPER_DIGIT_SPECIAL | every char is in `a-zA-Z[c1,c2]` | use `LOWER_UPPER_DIGIT_SPECIAL` encoding if it's smaller than Encoding `FIRST_TO_LOWER_SPECIAL` | +| UTF8 | any utf-8 char | use `UTF-8` encoding | +| Compression | any utf-8 char | lossless compression | + +Notes: + +- For package name encoding, `c1,c2` should be `._`; For field/type name encoding, `c1,c2` should be `_$`; +- Depending on cases, one can choose encoding `flags + data` jointly, uses 3 bits of first byte for flags and other + bytes + for data. + +### Shared meta string + +The shared meta string format consists of header and encoded string binary. Header of encoded string binary will be +inlined +in shared meta header. + +Header is written using little endian order, Fury can read this flag first to determine how to deserialize the data. + +#### Write by data + +If string hasn't been written before, the data will be written as follows: + +``` +| unsigned varint: string binary size + 1 bit: not written before | 56 bits: unique hash | 3 bits encoding flags + string binary | +``` + +If string binary size is less than `16` bytes, the hash will be omitted to save spaces. Unique hash can be omitted too +if caller pass a flag to disable it. 
In such cases, the format will be: + +``` +| unsigned varint: string binary size + 1 bit: not written before | 3 bits encoding flags + string binary | +``` + +#### Write by ref + +If string has been written before, the data will be written as follows: + +``` +| unsigned varint: written string id + 1 bit: written before | +``` + +## Value Format + +### Basic types + +#### Bool + +- size: 1 byte +- format: 0 for `false`, 1 for `true` + +#### Byte + +- size: 1 byte +- format: write as pure byte. + +#### Short + +- size: 2 byte +- byte order: little endian order + +#### Char + +- size: 2 byte +- byte order: little endian order + +#### Unsigned int + +- size: 1~5 byte +- Format: The most significant bit (MSB) in every byte indicates whether to have the next byte. If first bit is set + i.e. `b & 0x80 == 0x80`, then + the next byte should be read until the first bit of the next byte is unset. + +#### Signed int + +- size: 1~5 byte +- Format: First convert the number into positive unsigned int by `(v << 1) ^ (v >> 31)` ZigZag algorithm, then encoding + it as an unsigned int. + +#### Unsigned long + +- size: 1~9 byte +- Fury PVL(Progressive Variable-length Long) Encoding: + - positive long format: first bit in every byte indicates whether to have the next byte. If first bit is set + i.e. `b & 0x80 == 0x80`, then the next byte should be read until the first bit is unset. + +#### Signed long + +- size: 1~9 byte +- Fury SLI(Small long as int) Encoding: + - If long is in [-1073741824, 1073741823], encode as 4 bytes int: `| little-endian: ((int) value) << 1 |` + - Otherwise write as 9 bytes: `| 0b1 | little-endian 8 bytes long |` +- Fury PVL(Progressive Variable-length Long) Encoding: + - First convert the number into positive unsigned long by `(v << 1) ^ (v >> 63)` ZigZag algorithm to reduce cost of + small negative numbers, then encoding it as an unsigned long. 
+ +#### Float + +- size: 4 byte +- format: convert float to 4 bytes int by `Float.floatToRawIntBits`, then write as binary by little endian order. + +#### Double + +- size: 8 byte +- format: convert double to 8 bytes int by `Double.doubleToRawLongBits`, then write as binary by little endian order. + +### String + +Format: + +``` +| header: size << 2 | 2 bits encoding flags | binary data | +``` + +- `size + encoding` will be concat as a long and encoded as an unsigned var long. The little 2 bits is used for + encoding: + 0 for `latin`, 1 for `utf-16`, 2 for `utf-8`. +- encoded string binary data based on encoding: `latin/utf-16/utf-8`. + +Which encoding to choose: + +- For JDK8: fury detect `latin` at runtime, if string is `latin` string, then use `latin` encoding, otherwise + use `utf-16`. +- For JDK9+: fury use `coder` in `String` object for encoding, `latin`/`utf-16` will be used for encoding. +- If the string is encoded by `utf-8`, then fury will use `utf-8` to decode the data. But currently fury doesn't enable + utf-8 encoding by default for java. Cross-language string serialization of fury uses `utf-8` by default. + +### Collection + +> All collection serializers must extend `AbstractCollectionSerializer`. + +Format: + +``` +length(unsigned varint) | collection header | elements header | elements data +``` + +#### Collection header + +- For `ArrayList/LinkedArrayList/HashSet/LinkedHashSet`, this will be empty. +- For `TreeSet`, this will be `Comparator` +- For subclass of `ArrayList`, this may be extra object field info. + +#### Elements header + +In most cases, all collection elements are same type and not null, elements header will encode those homogeneous +information to avoid the cost of writing it for every element. Specifically, there are four kinds of information +which will be encoded by elements header, each use one bit: + +- If track elements ref, use the first bit `0b1` of the header to flag it. 
+- If the collection has null, use the second bit `0b10` of the header to flag it. If ref tracking is enabled for this + element type, this flag is invalid. +- If the collection element types are not the declared type, use the 3rd bit `0b100` of the header to flag it. +- If the collection element types are different, use the 4th bit `0b1000` of the header to flag it. + +By default, all bits are unset, which means all elements won't track ref, all elements are same type, not null and +the actual element is the declared type in the custom class field. + +The implementation can generate different deserialization code based on the header read, and look up the generated code from a +linear map/list. + +#### Elements data + +Based on the elements header, the serialization of elements data may skip `ref flag`/`null flag`/`element class info`. + +`CollectionSerializer#write/read` can be taken as an example. + +### Array + +#### Primitive array + +Primitive arrays are treated as a binary buffer, serialization will just write the length of array size as an unsigned int, +then copy the whole buffer into the stream. + +Such serialization won't compress the array. If users want to compress primitive array, users need to register custom +serializers for such types. + +#### Object array + +Object array is serialized using the collection format. Object component type will be taken as collection element +generic +type. + +### Map + +> All Map serializers must extend `AbstractMapSerializer`. + +Format: + +``` +| length(unsigned varint) | map header | key value pairs data | +``` + +#### Map header + +- For `HashMap/LinkedHashMap`, this will be empty. +- For `TreeMap`, this will be `Comparator` +- For other `Map`, this may be extra object field info. + +#### Map Key-Value data + +Map iteration is too expensive, Fury won't compute the header like for collection before since it introduces +[considerable overhead](https://github.com/apache/fury/issues/925).
+Users can use `MapFieldInfo` annotation to provide header in advance. Otherwise Fury will use first key-value pair to +predict header optimistically, and update the chunk header if the prediction failed at some pair. + +Fury will serialize map chunk by chunk, every chunk has 127 pairs at most. + +``` +| 1 byte | 1 byte | variable bytes | ++----------------+----------------+-----------------+ +| KV header | chunk size: N | N*2 objects | +``` + +KV header: + +- If track key ref, use the first bit `0b1` of the header to flag it. +- If the key has null, use the second bit `0b10` of the header to flag it. If ref tracking is enabled for this + key type, this flag is invalid. +- If the actual key type of map is not the declared key type, use the 3rd bit `0b100` of the header to flag it. +- If track value ref, use the 4th bit `0b1000` of the header to flag it. +- If the value has null, use the 5th bit `0b10000` of the header to flag it. If ref tracking is enabled for this + value type, this flag is invalid. +- If the value type of map is not the declared value type, use the 6rd bit `0b100000` of the header to flag it. +- If key or value is null, that key and value will be written as a separate chunk, and chunk size writing will be + skipped too. + +If streaming write is enabled, which means Fury can't update written `chunk size`. In such cases, map key-value data +format will be: + +``` +| 1 byte | variable bytes | ++----------------+-----------------+ +| KV header | N*2 objects | +``` + +`KV header` will be a header marked by `MapFieldInfo` in java. The implementation can generate different deserialization +code based read header, and look up the generated code from a linear map/list. + +### Enum + +Enums are serialized as an unsigned var int. If the order of enum values change, the deserialized enum value may not be +the value users expect. In such cases, users must register enum serializer by make it write enum value as an enumerated +string with unique hash disabled. 
+ +### Object + +Object means object of `pojo/struct/bean/record` type. +Object will be serialized by writing its fields data in fury order. + +Depending on schema compatibility, objects will have different formats. + +#### Field order + +Field will be ordered as following, every group of fields will have its own order: + +- primitive fields: larger size type first, smaller later, variable size type last. +- boxed primitive fields: same order as primitive fields +- final fields: same type together, then sorted by field name lexicographically. +- collection fields: same order as final fields +- map fields: same order as final fields +- other fields: same order as final fields + +#### Schema consistent + +Object fields will be serialized one by one using following format: + +``` +Primitive field value: +| var bytes | ++----------------+ +| value data | ++----------------+ +Boxed field value: +| one byte | var bytes | ++-----------+---------------+ +| null flag | field value | ++-----------+---------------+ +field value of final type with ref tracking: +| var bytes | var objects | ++-----------+-------------+ +| ref meta | value data | ++-----------+-------------+ +field value of final type without ref tracking: +| one byte | var objects | ++-----------+-------------+ +| null flag | field value | ++-----------+-------------+ +field value of non-final type with ref tracking: +| one byte | var bytes | var objects | ++-----------+-------------+-------------+ +| ref meta | class meta | value data | ++-----------+-------------+-------------+ +field value of non-final type without ref tracking: +| one byte | var bytes | var objects | ++-----------+------------+------------+ +| null flag | class meta | value data | ++-----------+------------+------------+ +``` + +#### Schema evolution + +Schema evolution have similar format as schema consistent mode for object except: + +- For this object type itself, `schema consistent` mode will write class by id/name, but `schema 
evolution` mode will + write class field names, types and other meta too, see [Class meta](#class-meta). +- Class meta of `final custom type` needs to be written too, because peers may not have this class defined. + +### Class + +Class will be serialized using class meta format. + +## Implementation guidelines + +- Try to merge multiple bytes into an int/long write before writing to reduce memory IO and bound check cost. +- Read multiple bytes as an int/long, then split into multiple bytes to reduce memory IO and bound check cost. +- Try to use one varint/long to write flags and length together to save one byte cost and reduce memory io. +- Condition branches are less expensive compared to memory IO cost unless there are too many branches. diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/specification/row_format_spec.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/specification/row_format_spec.md new file mode 100644 index 00000000000..eefd9d9793b --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/specification/row_format_spec.md @@ -0,0 +1,24 @@ +--- +title: Fury Row Format +sidebar_position: 2 +id: fury_row_format_spec +license: | + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--- + +## Row Format + +Coming soon diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/specification/xlang_serialization_spec.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/specification/xlang_serialization_spec.md new file mode 100644 index 00000000000..d15a3da9fd3 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/specification/xlang_serialization_spec.md @@ -0,0 +1,807 @@ +--- +title: Fury Xlang Serialization Format +sidebar_position: 0 +id: fury_xlang_serialization_spec +license: | + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--- + +## Cross-language Serialization Specification + +> Format Version History: +> +> - Version 0.1 - serialization spec formalized + +Fury xlang serialization is an automatic object serialization framework that supports reference and polymorphism. +Fury will convert an object from/to fury xlang serialization binary format. +Fury has two core concepts for xlang serialization: + +- **Fury xlang binary format** +- **Framework implemented in different languages to convert object to/from Fury xlang binary format** + +The serialization format is a dynamic binary format. 
The dynamics and reference/polymorphism support make Fury flexible, +much easier to use, but +also introduce more complexities compared to static serialization frameworks. So the format will be more complex. + +## Type Systems + +### Data Types + +- bool: a boolean value (true or false). +- int8: an 8-bit signed integer. +- int16: a 16-bit signed integer. +- int32: a 32-bit signed integer. +- var_int32: a 32-bit signed integer which uses fury var_int32 encoding. +- int64: a 64-bit signed integer. +- var_int64: a 64-bit signed integer which uses fury PVL encoding. +- sli_int64: a 64-bit signed integer which uses fury SLI encoding. +- float16: a 16-bit floating point number. +- float32: a 32-bit floating point number. +- float64: a 64-bit floating point number including NaN and Infinity. +- string: a text string encoded using Latin1/UTF16/UTF-8 encoding. +- enum: a data type consisting of a set of named values. Rust enums with non-predefined field values are not supported as + an enum. +- named_enum: an enum whose value will be serialized as the registered name. +- struct: a morphic(final) type serialized by Fury Struct serializer. i.e. it doesn't have subclasses. Suppose we're + deserializing a `List` of `SomeClass`, we can save dynamic serializer dispatch since `SomeClass` is morphic(final). +- compatible_struct: a morphic(final) type serialized by Fury compatible Struct serializer. +- named_struct: a `struct` whose type mapping will be encoded as a name. +- named_compatible_struct: a `compatible_struct` whose type mapping will be encoded as a name. +- ext: a type which will be serialized by a customized serializer. +- named_ext: an `ext` type whose type mapping will be encoded as a name. +- list: a sequence of objects. +- set: an unordered set of unique elements. +- map: a map of key-value pairs. Mutable types such as `list/map/set/array/tensor/arrow` are not allowed as key of map. 
+- duration: an absolute length of time, independent of any calendar/timezone, as a count of nanoseconds. +- timestamp: a point in time, independent of any calendar/timezone, as a count of nanoseconds. The count is relative + to an epoch at UTC midnight on January 1, 1970. +- local_date: a naive date without timezone. The count is days relative to an epoch at UTC midnight on Jan 1, 1970. +- decimal: exact decimal value represented as an integer value in two's complement. +- binary: a variable-length array of bytes. +- array: only allow numeric components. Other arrays will be taken as List. The implementation should support the + interoperability between array and list. +- array: multidimensional array in which every sub-array can have different sizes but all have the same type. +- bool_array: one dimensional bool array. +- int8_array: one dimensional int8 array. +- int16_array: one dimensional int16 array. +- int32_array: one dimensional int32 array. +- int64_array: one dimensional int64 array. +- float16_array: one dimensional half_float_16 array. +- float32_array: one dimensional float32 array. +- float64_array: one dimensional float64 array. +- arrow record batch: an arrow [record batch](https://arrow.apache.org/docs/cpp/tables.html#record-batches) object. +- arrow table: an arrow [table](https://arrow.apache.org/docs/cpp/tables.html#tables) object. + +Note: + +- Unsigned int/long are not added here, since not every language supports those types. + +### Polymorphisms + +For polymorphism, if one non-final class is registered, and only one subclass is registered, then we can assume all +elements in List/Map have the same type, thus reducing runtime check cost. + +Collection/Array polymorphism is not fully supported, since some languages such as golang have only one collection +type. If users want to get exactly the type they passed, they must pass that type when deserializing or annotate that type +to the field of struct. 
+ +### Type disambiguation + +Due to differences between type systems of languages, those types can't be mapped one-to-one between languages. When +deserializing, Fury use the target data structure type and the data type in the data jointly to determine how to +deserialize and populate the target data structure. For example: + +```java +class Foo { + int[] intArray; + Object[] objects; + List objectList; +} + +class Foo2 { + int[] intArray; + List objects; + List objectList; +} +``` + +`intArray` has an `int32_array` type. But both `objects` and `objectList` fields in the serialize data have `list` data +type. When deserializing, the implementation will create an `Object` array for `objects`, but create a `ArrayList` +for `objectList` to populate its elements. And the serialized data of `Foo` can be deserialized into `Foo2` too. + +Users can also provide meta hints for fields of a type, or the type whole. Here is an example in java which use +annotation to provide such information. + +```java +@FuryObject(fieldsNullable = false, trackingRef = false) +class Foo { + @FuryField(trackingRef = false) + int[] intArray; + @FuryField(polymorphic = true) + Object object; + @FuryField(tagId = 1, nullable = true) + List objectList; +} +``` + +Such information can be provided in other languages too: + +- cpp: use macro and template. +- golang: use struct tag. +- python: use typehint. +- rust: use macro. + +### Type ID + +All internal data types are expressed using an ID in range `0~64`. Users can use `0~4096` for representing their +types. + +### Type mapping + +See [Type mapping](../guide/xlang_type_mapping.md) + +## Spec overview + +Here is the overall format: + +``` +| fury header | object ref meta | object type meta | object value data | +``` + +The data are serialized using little endian byte order overall. If bytes swap is costly for some object, +Fury will write the byte order for that object into the data instead of converting it to little endian. 
+ +## Fury header + +The Fury header has the following layout: + +``` +| 2 bytes | 4 bits | 1 bit | 1 bit | 1 bit | 1 bit | 1 byte | optional 4 bytes | ++--------------+---------------+-------+-------+--------+-------+------------+------------------------------------+ +| magic number | reserved bits | oob | xlang | endian | null | language | unsigned int for meta start offset | +``` + +- magic number: used to identify fury serialization protocol, the current version uses `0x62d4`. +- null flag: 1 when object is null, 0 otherwise. If an object is null, other bits won't be set. +- endian flag: 1 when data is encoded by little endian, 0 for big endian. +- xlang flag: 1 when serialization uses xlang format, 0 when serialization uses Fury java format. +- oob flag: 1 when passed `BufferCallback` is not null, 0 otherwise. +- language: the language when serializing objects, such as JAVA, PYTHON, GO, etc. Fury can use this flag to determine whether to spend more time on serialization to make the deserialization faster for dynamic languages. + +If meta share mode is enabled, an uncompressed unsigned int is appended to indicate the start offset of metadata. + +## Reference Meta + +Reference tracking handles whether the object is null, and whether to track reference for the object by writing +corresponding flags and maintaining internal state. + +Reference flags: + +| Flag | Byte Value | Description | +|---------------------|------------|---------------------------------------------------------------------------------------------------------------------------------------------------------| +| NULL FLAG | `-3` | This flag indicates the object is a null value. We don't use another byte to indicate REF, so that we can save one byte. 
| +| REF FLAG | `-2` | This flag indicates the object is already serialized previously, and fury will write a ref id with unsigned varint format instead of serialize it again | +| NOT_NULL VALUE FLAG | `-1` | This flag indicates the object is a non-null value and fury doesn't track ref for this type of object. | +| REF VALUE FLAG | `0` | This flag indicates the object is referencable and the first time to serialize. | + +When reference tracking is disabled globally or for specific types, or for certain types within a particular +context(e.g., a field of a type), only the `NULL` and `NOT_NULL VALUE` flags will be used for reference meta. + +For languages which doesn't support reference such as rust, reference tracking must be disabled for correct +deserialization by fury rust implementation. + +For languages whose object values are not null by default: + +- In rust, Fury takes `Option:None` as a null value +- In c++, Fury takes `std::nullopt` as a null value +- In golang, Fury takes `null interface/pointer` as a null value + +If one want to deserialize in languages like `Java/Python/JavaScript`, he should mark the type with all fields +not-null by default, or using schema-evolution mode to carry the not-null fields info in the data. + +## Type Meta + +For every type to be serialized, it must be registered with an optional ID first. The registered type will have a +user-provided or an auto-growing unsigned int i.e. `type_id`. The registration can be used for security check and type +identification. The id of user registered type will be added by `64` to make space for Fury internal data types. + +Depending on whether meta share mode and registration is enabled for current type, Fury will write type meta +differently. + +### Schema consistent + +- If schema consistent mode is enabled globally when creating fury, type meta will be written as a fury unsigned varint + of `type_id`. Schema evolution related meta will be ignored. 
+- If schema evolution mode is enabled globally when creating fury, and current class is configured to use schema + consistent mode like `struct` vs `table` in flatbuffers: + - Type meta will be add to `captured_type_defs`: `captured_type_defs[type def stub] = map size` ahead when + registering type. + - Get index of the meta in `captured_type_defs`, write that index as `| unsigned varint: index |`. + +### Schema evolution + +If schema evolution mode is enabled globally when creating fury, and enabled for current type, type meta will be written +using one of the following mode. Which mode to use is configured when creating fury. + +- Normal mode(meta share not enabled): + - If type meta hasn't been written before, add `type def` + to `captured_type_defs`: `captured_type_defs[type def] = map size`. + - Get index of the meta in `captured_type_defs`, write that index as `| unsigned varint: index |`. + - After finished the serialization of the object graph, fury will start to write `captured_type_defs`: + - Firstly, set current to `meta start offset` of fury header + - Then write `captured_type_defs` one by one: + + ```python + buffer.write_var_uint32(len(writting_type_defs) - len(schema_consistent_type_def_stubs)) + for type_meta in writting_type_defs: + if not type_meta.is_stub(): + type_meta.write_type_def(buffer) + writing_type_defs = copy(schema_consistent_type_def_stubs) + ``` + +- Meta share mode: the writing steps are same as the normal mode, but `captured_type_defs` will be shared across + multiple serializations of different objects. For example, suppose we have a batch to serialize: + + ```python + captured_type_defs = {} + stream = ... + # add `Type1` to `captured_type_defs` and write `Type1` + fury.serialize(stream, [Type1()]) + # add `Type2` to `captured_type_defs` and write `Type2`, `Type1` is written before. + fury.serialize(stream, [Type1(), Type2()]) + # `Type1` and `Type2` are written before, no need to write meta. 
+ fury.serialize(stream, [Type1(), Type2()]) + ``` + +- Streaming mode(streaming mode doesn't support meta share): + - If type meta hasn't been written before, the data will be written as: + + ``` + | unsigned varint: 0b11111111 | type def | + ``` + + - If type meta has been written before, the data will be written as: + + ``` + | unsigned varint: written index << 1 | + ``` + + `written index` is the id in `captured_type_defs`. + - With this mode, `meta start offset` can be omitted. + +> The normal mode and meta share mode will forbid streaming writing since it needs to look back for update the start +> offset after the whole object graph writing and meta collecting is finished. Only in this way we can ensure +> deserialization failure in meta share mode doesn't lost shared meta. + +#### Type Def + +Here we mainly describe the meta layout for schema evolution mode: + +``` +| 8 bytes meta header | variable bytes | variable bytes | variable bytes | ++-------------------------------+--------------------+-------------------+----------------+ +| 7 bytes hash + 1 bytes header | current type meta | parent type meta | ... | +``` + +Type meta are encoded from parent type to leaf type, only type with serializable fields will be encoded. + +##### Meta header + +Meta header is a 64 bits number value encoded in little endian order. + +- Lowest 4 digits `0b0000~0b1110` are used to record num classes. `0b1111` is preserved to indicate that Fury need to + read more bytes for length using Fury unsigned int encoding. If current type doesn't has parent type, or parent + type doesn't have fields to serialize, or we're in a context which serialize fields of current type + only, num classes will be 1. +- The 5th bit is used to indicate whether this type needs schema evolution. +- Other 56 bits are used to store the unique hash of `flags + all layers type meta`. + +##### Single layer type meta + +``` +| unsigned varint | var uint | field info: variable bytes | variable bytes | ... 
| ++-----------------+----------+-------------------------------+-----------------+-----+ +| num_fields | type id | header + type id + field name | next field info | ... | +``` + +- num fields: encode `num fields` as unsigned varint. + - If the current type is schema consistent, then num_fields will be `0` to flag it. + - If the current type isn't schema consistent, then num_fields will be the number of compatible fields. For example, + users can use tag id to mark some fields as compatible fields in schema consistent context. In such cases, schema + consistent fields will be serialized first, then compatible fields will be serialized next. At deserialization, + Fury will use fields info of those fields which aren't annotated by tag id for deserializing schema consistent + fields, then use fields info in meta for deserializing compatible fields. +- type id: the registered id for the current type, which will be written as an unsigned varint. +- field info: + - header(8 + bits): `4 bits size + 2 bits field name encoding + nullability flag + ref tracking flag`. + Users can use annotation to provide those info. + - 2 bits field name encoding: + - encoding: `UTF8/ALL_TO_LOWER_SPECIAL/LOWER_UPPER_DIGIT_SPECIAL/TAG_ID` + - If tag id is used, i.e. field name is written by an unsigned varint tag id. 2 bits encoding will be `11`. + - size of field name: + - The `4 bits size: 0~14` will be used to indicate length `1~15`, the value `15` indicates to read more bytes, + the encoding will encode `size - 15` as a varint next. + - If encoding is `TAG_ID`, then num_bytes of field name will be used to store tag id. + - ref tracking: when set to 1, ref tracking will be enabled for this field. + - nullability: when set to 1, this field can be null. + - field name: If tag id is set, tag id will be used instead. Otherwise meta string encoding `[length]` and data will + be written instead. + - type id: + - Format: `id << 1 | polymorphic flag`. 
If field type is polymorphic, this flag is set to `0b1`, otherwise it's + `0b0` + - For registered type-consistent classes, it will be the registered type id. + - For struct type it will be written as `STRUCT`. + - The meta for struct type is written separately instead of being inlined here, to reduce meta space cost if an object of + this type is serialized in the current object graph multiple times, and the field value may be null too. + - For enum type, it will be written as `ENUM`. + - For collection type, it will be written as `COLLECTION`, then write element type recursively. + - For map type, it will be written as `MAP`, then write key and value type recursively. + +Field order is left as an implementation detail and is not exposed in the specification; deserialization needs to +re-sort fields based on the Fury field comparator. In this way, fury can compute statistics for field names or types and +use a more compact encoding. + +##### Other layers type meta + +Same encoding algorithm as the previous layer. + +## Meta String + +Meta string is mainly used to encode meta strings such as field names. 
+ +### Encoding Algorithms + +String binary encoding algorithm: + +| Algorithm | Pattern | Description | +|---------------------------|---------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| LOWER_SPECIAL | `a-z._$\|` | every char is written using 5 bits, `a-z`: `0b00000~0b11001`, `._$\|`: `0b11010~0b11101`, prepend one bit at the start to indicate whether strip last char since last byte may have 7 redundant bits(1 indicates strip last char) | +| LOWER_UPPER_DIGIT_SPECIAL | `a-zA-Z0~9._` | every char is written using 6 bits, `a-z`: `0b00000~0b11001`, `A-Z`: `0b11010~0b110011`, `0~9`: `0b110100~0b111101`, `._`: `0b111110~0b111111`, prepend one bit at the start to indicate whether strip last char since last byte may have 7 redundant bits(1 indicates strip last char) | +| UTF-8 | any chars | UTF-8 encoding | + +Encoding flags: + +| Encoding Flag | Pattern | Encoding Algorithm | +|---------------------------|----------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------| +| LOWER_SPECIAL | every char is in `a-z._\|` | `LOWER_SPECIAL` | +| FIRST_TO_LOWER_SPECIAL | every char is in `a-z._` except first char is upper case | replace first upper case char to lower case, then use `LOWER_SPECIAL` | +| ALL_TO_LOWER_SPECIAL | every char is in `a-zA-Z._` | replace every upper case char by `\|` + `lower case`, then use `LOWER_SPECIAL`, use this encoding if it's smaller than Encoding `LOWER_UPPER_DIGIT_SPECIAL` | +| LOWER_UPPER_DIGIT_SPECIAL | every char is in `a-zA-Z._` | use `LOWER_UPPER_DIGIT_SPECIAL` encoding if it's smaller than Encoding `FIRST_TO_LOWER_SPECIAL` | +| UTF8 | any 
utf-8 char | use `UTF-8` encoding | +| Compression | any utf-8 char | lossless compression | + +Notes: + +- Depending on cases, one can choose encoding `flags + data` jointly, uses 3 bits of first byte for flags and other + bytes + for data. + +## Value Format + +### Basic types + +#### bool + +- size: 1 byte +- format: 0 for `false`, 1 for `true` + +#### int8 + +- size: 1 byte +- format: write as pure byte. + +#### int16 + +- size: 2 byte +- byte order: raw bytes of little endian order + +#### unsigned int32 + +- size: 4 byte +- byte order: raw bytes of little endian order + +#### unsigned varint32 + +- size: 1~5 byte +- Format: The most significant bit (MSB) in every byte indicates whether to have the next byte. If first bit is set + i.e. `b & 0x80 == 0x80`, then + the next byte should be read until the first bit of the next byte is unset. + +#### signed int32 + +- size: 4 byte +- byte order: raw bytes of little endian order + +#### signed varint32 + +- size: 1~5 byte +- Format: First convert the number into positive unsigned int by `(v << 1) ^ (v >> 31)` ZigZag algorithm, then encode + it as an unsigned varint. + +#### unsigned int64 + +- size: 8 byte +- byte order: raw bytes of little endian order + +#### unsigned varint64 + +- size: 1~9 byte +- Fury SLI(Small long as int) Encoding: + - If long is in `[0, 2147483647]`, encode as 4 bytes int: `| little-endian: ((int) value) << 1 |` + - Otherwise write as 9 bytes: `| 0b1 | little-endian 8 bytes long |` +- Fury PVL(Progressive Variable-length Long) Encoding: + - positive long format: first bit in every byte indicates whether to have the next byte. If first bit is set + i.e. `b & 0x80 == 0x80`, then the next byte should be read until the first bit is unset. 
+ +#### signed int64 + +- size: 8 byte +- byte order: raw bytes of little endian order + +#### signed varint64 + +- size: 1~9 byte +- Fury SLI(Small long as int) Encoding: + - If long is in `[-1073741824, 1073741823]`, encode as 4 bytes int: `| little-endian: ((int) value) << 1 |` + - Otherwise write as 9 bytes: `| 0b1 | little-endian 8 bytes long |` +- Fury PVL(Progressive Variable-length Long) Encoding: + - First convert the number into positive unsigned long by `(v << 1) ^ (v >> 63)` ZigZag algorithm to reduce cost of + small negative numbers, then encoding it as an unsigned long. + +#### float32 + +- size: 4 byte +- format: encode the specified floating-point value according to the IEEE 754 floating-point "single format" bit layout, + preserving Not-a-Number (NaN) values, then write as binary by little endian order. + +#### float64 + +- size: 8 byte +- format: encode the specified floating-point value according to the IEEE 754 floating-point "double format" bit layout, + preserving Not-a-Number (NaN) values. then write as binary by little endian order. + +### string + +Format: + +``` +| unsigned varint64: size << 2 `bitor` 2 bits encoding flags | binary data | +``` + +- `size + encoding` will be concat as a long and encoded as an unsigned varint64. The little 2 bits is used for + encoding: + 0 for `latin1(ISO-8859-1)`, 1 for `utf-16`, 2 for `utf-8`. +- encoded string binary data based on encoding: `latin/utf-16/utf-8`. + +Which encoding to choose: + +- For JDK8: fury detect `latin` at runtime, if string is `latin` string, then use `latin` encoding, otherwise + use `utf-16`. +- For JDK9+: fury use `coder` in `String` object for encoding, `latin`/`utf-16` will be used for encoding. +- If the string is encoded by `utf-8`, then fury will use `utf-8` to decode the data. Cross-language string + serialization of fury uses `utf-8` by default. 
+ +### list + +Format: + +``` +| unsigned varint64: length << 4 `bitor` 4 bits elements header | elements data | +``` + +#### elements header + +In most cases, all elements are the same type and not null, so the elements header will encode this homogeneous +information to avoid the cost of writing it for every element. Specifically, there are four kinds of information +which will be encoded by elements header, each uses one bit: + +- If track elements ref, use the first bit `0b1` of the header to flag it. +- If the elements have null, use the second bit `0b10` of the header to flag it. If ref tracking is enabled for this + element type, this flag is invalid. +- If the element types are not the declared type, use the 3rd bit `0b100` of the header to flag it. +- If the element types are different, use the 4th bit `0b1000` of the header to flag it. + +By default, all bits are unset, which means all elements won't track ref, all elements are the same type, not null and +the actual element is the declared type in the custom type field. + +The implementation can generate different deserialization code based on the read header, and look up the generated code from +a linear map/list. + +#### elements data + +Based on the elements header, the serialization of elements data may skip `ref flag`/`null flag`/`element type info`. + +```python +fury = ... +buffer = ... +elems = ... +if element_type_is_same: + if not is_declared_type: + fury.write_type(buffer, elem_type) + elem_serializer = get_serializer(...) 
+ if track_ref: + for elem in elems: + if not ref_resolver.write_ref_or_null(buffer, elem): + elem_serializer.write(buffer, elem) + elif has_null: + for elem in elems: + if elem is None: + buffer.write_byte(null_flag) + else: + buffer.write_byte(not_null_flag) + elem_serializer.write(buffer, elem) + else: + for elem in elems: + elem_serializer.write(buffer, elem) +else: + if track_ref: + for elem in elems: + fury.write_ref(buffer, elem) + elif has_null: + for elem in elems: + fury.write_nullable(buffer, elem) + else: + for elem in elems: + fury.write_value(buffer, elem) +``` + +[`CollectionSerializer#writeElements`](https://github.com/apache/fury/blob/20a1a78b17a75a123a6f5b7094c06ff77defc0fe/java/fury-core/src/main/java/org/apache/fury/serializer/collection/AbstractCollectionSerializer.java#L302) +can be taken as an example. + +### array + +#### primitive array + +Primitive array are taken as a binary buffer, serialization will just write the length of array size as an unsigned int, +then copy the whole buffer into the stream. + +Such serialization won't compress the array. If users want to compress primitive array, users need to register custom +serializers for such types or mark it as list type. + +#### object array + +Object array is serialized using the list format. Object component type will be taken as list element +generic type. + +### map + +> All Map serializers must extend `AbstractMapSerializer`. + +Format: + +``` +| length(unsigned varint) | key value chunk data | ... | key value chunk data | +``` + +#### map key-value chunk data + +Map iteration is too expensive, Fury won't compute the header like for list since it introduce +[considerable overhead](https://github.com/apache/fury/issues/925). +Users can use `MapFieldInfo` annotation to provide the header in advance. Otherwise Fury will use first key-value pair +to predict header optimistically, and update the chunk header if the prediction failed at some pair. 
+ +Fury will serialize the map chunk by chunk, every chunk has 255 pairs at most. + +``` +| 1 byte | 1 byte | variable bytes | ++----------------+----------------+-----------------+ +| KV header | chunk size: N | N*2 objects | +``` + +KV header: + +- If track key ref, use the first bit `0b1` of the header to flag it. +- If the key has null, use the second bit `0b10` of the header to flag it. If ref tracking is enabled for this + key type, this flag is invalid. +- If the actual key type of map is not the declared key type, use the 3rd bit `0b100` of the header to flag it. +- If track value ref, use the 4th bit `0b1000` of the header to flag it. +- If the value has null, use the 5th bit `0b10000` of the header to flag it. If ref tracking is enabled for this + value type, this flag is invalid. +- If the value type of map is not the declared value type, use the 6th bit `0b100000` of the header to flag it. +- If key or value is null, that key and value will be written as a separate chunk, and chunk size writing will be + skipped too. + +If streaming write is enabled, Fury can't update the written `chunk size`. In such cases, map key-value data +format will be: + +``` +| 1 byte | variable bytes | ++----------------+-----------------+ +| KV header | N*2 objects | +``` + +`KV header` will be a header marked by `MapFieldInfo` in java. For languages such as golang, this can be computed in +advance for non-interface types most times. The implementation can generate different deserialization code based on the read +header, and look up the generated code from a linear map/list. + +#### Why serialize chunk by chunk? + +When Fury uses the first key-value pair to predict the header optimistically, it can't know how many pairs have the same +meta (tracking key ref, key has null and so on).
 If we don't write chunk by chunk with max chunk size, we must write at +least `X` bytes to take up a place for later to update the number which has same elements, where `X` is the number of bytes for +the varint encoding of the map size. + +And most map sizes are smaller than 255, so if all pairs have same data, the number of chunks will be 1. This is common in golang/rust, +where objects are not references by default. + +Also, if only one or two keys have different meta, we can make it into a different chunk, so that most pairs can share +meta. + +The implementation can accumulate read count with map size to decide whether to read more chunks. + +### enum + +Enums are serialized as an unsigned var int. If the order of enum values changes, the deserialized enum value may not be +the value users expect. In such cases, users must register an enum serializer by making it write the enum value as an enumerated +string with unique hash disabled. + +### decimal + +Not supported for now. + +### struct + +Struct means object of `class/pojo/struct/bean/record` type. +Struct will be serialized by writing its fields data in fury order. + +Depending on schema compatibility, structs will have different formats. + +#### field order + +Fields will be ordered as follows, and every group of fields will have its own order: + +- primitive fields: larger size type first, smaller later, variable size type last. +- boxed primitive fields: same order as primitive fields +- final fields: same type together, then sorted by field name lexicographically. +- list fields: same order as final fields +- map fields: same order as final fields +- other fields: same order as final fields + +#### schema consistent + +Object will be written as: + +``` +| 4 byte | variable bytes | ++---------------+------------------+ +| type hash | field values | +``` + +Type hash is used to check the type schema consistency across languages. Type hash will be the first 32 bits of the 56-bit +value of the type meta. 
+ +Object fields will be serialized one by one using following format: + +``` +not null primitive field value: +| var bytes | ++----------------+ +| value data | ++----------------+ +nullable primitive field value: +| one byte | var bytes | ++-----------+---------------+ +| null flag | field value | ++-----------+---------------+ +field value of final type with ref tracking: +| var bytes | var objects | ++-----------+-------------+ +| ref meta | value data | ++-----------+-------------+ +field value of final type without ref tracking: +| one byte | var objects | ++-----------+-------------+ +| null flag | field value | ++-----------+-------------+ +field value of non-final type with ref tracking: +| one byte | var bytes | var objects | ++-----------+-------------+-------------+ +| ref meta | type meta | value data | ++-----------+-------------+-------------+ +field value of non-final type without ref tracking: +| one byte | var bytes | var objects | ++-----------+------------+------------+ +| null flag | type meta | value data | ++-----------+------------+------------+ +``` + +#### Schema evolution + +Schema evolution have similar format as schema consistent mode for object except: + +- For the object type, `schema consistent` mode will write type by id only, but `schema evolution` mode will + write type consisting of field names, types and other meta too, see [Type meta](#type-meta). +- Type meta of `final custom type` needs to be written too, because peers may not have this type defined. + +### Type + +Type will be serialized using type meta format. + +## Implementation guidelines + +### How to reduce memory read/write code + +- Try to merge multiple bytes into an int/long write before writing to reduce memory IO and bound check cost. +- Read multiple bytes as an int/long, then split into multiple bytes to reduce memory IO and bound check cost. +- Try to use one varint/long to write flags and length together to save one byte cost and reduce memory io. 
+- Condition branches are less expensive compared to memory IO cost unless there are too many branches. + +### Fast deserialization for static languages without runtime codegen support + +For type evolution, the serializer will encode the type meta into the serialized data. The deserializer will compare +this meta with class meta in the current process, and use the diff to determine how to deserialize the data. + +For java/javascript/python, we can use the diff to generate serializer code at runtime and load it as class/function for +deserialization. In this way, the type evolution will be as fast as schema consistent mode. + +For C++/Rust, we can't generate the serializer code at runtime. So we need to generate the code at compile-time using +meta programming. But at that time, we don't know the type schema in other processes, so we can't generate the +serializer code for such inconsistent types. We may need to generate the code which has a loop and compares field names +one by one to decide whether to deserialize and assign the field or skip the field value. + +One fast way is that we can optimize the string comparison into `jump` instructions: + +- Assume the current type has `n` fields, and the peer type has `n1` fields. +- Generate an auto growing `field id` from `0` for every sorted field in the current type at the compile time. +- Compare the received type meta with current type, generate same id if the field name is same, otherwise generate an + auto growing id starting from `n`, cache this meta at runtime. +- Iterate the fields of received type meta, use a `switch` to compare the `field id` to deserialize data + and `assign/skip` field value. **Continuous** field id will be optimized into `jump` in `switch` block, so it will + be very fast. 
+ +Here is an example, suppose process A has a class `Foo` with version 1 defined as `Foo1`, process B has a class `Foo` +with version 2 defined as `Foo2`: + +```c++ +// class Foo with version 1 +class Foo1 { + int32_t v1; // id 0 + std::string v2; // id 1 +}; +// class Foo with version 2 +class Foo2 { + // id 0, but will have id 2 in process A + bool v0; + // id 1, but will have id 0 in process A + int32_t v1; + // id 2, but will have id 3 in process A + int64_t long_value; + // id 3, but will have id 1 in process A + std::string v2; + // id 4, but will have id 4 in process A + std::vector<std::string> list; +}; +``` + +When process A receives serialized `Foo2` from process B, here is how it deserializes the data: + +```c++ +Foo1 foo1 = ...; +const std::vector<fury::FieldInfo> &field_infos = type_meta.field_infos; +for (const auto &field_info : field_infos) { + switch (field_info.field_id) { + case 0: + foo1.v1 = buffer.read_varint32(); + break; + case 1: + foo1.v2 = fury.read_string(); + break; + default: + fury.skip_data(field_info); + } +} +``` diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/start/install.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/start/install.md new file mode 100644 index 00000000000..373b73f597a --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/start/install.md @@ -0,0 +1,71 @@ +--- +id: install +title: 安装 Apache Fury +sidebar_position: 0 +--- + +Apache Fury 源码下载请参见 Apache Fury [download](https://github.com/apache/fury/releases)页面。 + +### Fury Java 安装 + +要使用 Maven 添加对 Apache Fury 的依赖,请使用以下配置: + +```xml +<dependency> + <groupId>org.apache.fury</groupId> + <artifactId>fury-core</artifactId> + <version>0.10.0</version> +</dependency> + + +``` + +### Fury Scala 安装 + +要使用 Maven 添加 scala 2.13 的 Fury scala 依赖,请使用以下配置: + +```xml +<dependency> + <groupId>org.apache.fury</groupId> + <artifactId>fury-scala_2.13</artifactId> + <version>0.10.0</version> +</dependency> +``` + +要使用 Maven 添加 scala 3 的 Fury scala 依赖,请使用以下配置: + +```xml +<dependency> + <groupId>org.apache.fury</groupId> + <artifactId>fury-scala_3</artifactId> + <version>0.10.0</version> +</dependency> +``` + +要使用 sbt 添加 scala 2.13 的 Fury scala 依赖,请使用以下配置: + +```sbt +libraryDependencies += "org.apache.fury" % 
"fury-scala_2.13" % "0.10.0" +``` + +要使用 sbt 添加 scala 3 的 Fury scala 依赖,请使用以下配置: + +```sbt +libraryDependencies += "org.apache.fury" % "fury-scala_3" % "0.10.0" +``` + +### Fury Kotlin 安装 + +要使用 Maven 添加 Fury kotlin 依赖,请使用以下配置: + +```xml +<dependency> + <groupId>org.apache.fury</groupId> + <artifactId>fury-kotlin</artifactId> + <version>0.10.0</version> +</dependency> +``` diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/start/usage.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/start/usage.md new file mode 100644 index 00000000000..134acb2c39b --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-0.10.0/start/usage.md @@ -0,0 +1,236 @@ +--- +id: usage +title: Apache Fury 使用 +sidebar_position: 1 +--- + +本章节演示不同编程语言使用 Apache Fury 进行序列化。 + +## Java 序列化 + +```java +import java.util.List; +import java.util.Arrays; +import io.fury.*; + +public class Example { + public static void main(String[] args) { + SomeClass object = new SomeClass(); + // Note that Fury instances should be reused between + // multiple serializations of different objects. + Fury fury = Fury.builder().withLanguage(Language.JAVA) + // Allow to deserialize objects unknown types, + // more flexible but less secure. + // .requireClassRegistration(false) + .build(); + // Registering types can reduce class name serialization overhead, but not mandatory. + // If secure mode enabled, all custom types must be registered. 
+ fury.register(SomeClass.class); + byte[] bytes = fury.serialize(object); + System.out.println(fury.deserialize(bytes)); + } +} +``` + +## Scala序列化 + +```scala +import org.apache.fury.Fury +import org.apache.fury.serializer.scala.ScalaSerializers + +case class Person(name: String, id: Long, github: String) +case class Point(x : Int, y : Int, z : Int) + +object ScalaExample { + val fury: Fury = Fury.builder().withScalaOptimizationEnabled(true).build() + // Register optimized fury serializers for scala + ScalaSerializers.registerSerializers(fury) + fury.register(classOf[Person]) + fury.register(classOf[Point]) + + def main(args: Array[String]): Unit = { + val p = Person("Shawn Yang", 1, "https://github.com/chaokunyang") + println(fury.deserialize(fury.serialize(p))) + println(fury.deserialize(fury.serialize(Point(1, 2, 3)))) + } +} +``` + +## Kotlin序列化 + +```kotlin +import org.apache.fury.Fury +import org.apache.fury.ThreadSafeFury +import org.apache.fury.serializer.kotlin.KotlinSerializers + +data class Person(val name: String, val id: Long, val github: String) +data class Point(val x : Int, val y : Int, val z : Int) + +fun main(args: Array<String>) { + // 注意: 下面的Fury初始化代码应该只执行一次,而不是在每次序列化前都运行 + val fury: ThreadSafeFury = Fury.builder().requireClassRegistration(true).buildThreadSafeFury() + KotlinSerializers.registerSerializers(fury) + fury.register(Person::class.java) + fury.register(Point::class.java) + + val p = Person("Shawn Yang", 1, "https://github.com/chaokunyang") + println(fury.deserialize(fury.serialize(p))) + println(fury.deserialize(fury.serialize(Point(1, 2, 3)))) +} +``` + +## 跨语言序列化 + +### Java + +```java +import com.google.common.collect.ImmutableMap; +import io.fury.*; + +import java.util.Map; + +public class ReferenceExample { + public static class SomeClass { + SomeClass f1; + Map<String, String> f2; + Map<String, String> f3; + } + + public static Object createObject() { + SomeClass obj = new SomeClass(); + obj.f1 = obj; + obj.f2 = ImmutableMap.of("k1", "v1", "k2", "v2"); + obj.f3 = obj.f2; + 
 return obj; + } + + // mvn exec:java -Dexec.mainClass="io.fury.examples.ReferenceExample" + public static void main(String[] args) { + Fury fury = Fury.builder().withLanguage(Language.XLANG) + .withRefTracking(true).build(); + fury.register(SomeClass.class, "example.SomeClass"); + byte[] bytes = fury.serialize(createObject()); + // bytes can be data serialized by other languages. + System.out.println(fury.deserialize(bytes)); + ; + } +} +``` + +### Python + +```python +from typing import Dict +import pyfury + +class SomeClass: + f1: "SomeClass" + f2: Dict[str, str] + f3: Dict[str, str] + +fury = pyfury.Fury(ref_tracking=True) +fury.register_class(SomeClass, "example.SomeClass") +obj = SomeClass() +obj.f2 = {"k1": "v1", "k2": "v2"} +obj.f1, obj.f3 = obj, obj.f2 +data = fury.serialize(obj) +# bytes can be data serialized by other languages. +print(fury.deserialize(data)) +``` + +### Golang + +```go +package main + +import ( + "fmt" + furygo "github.com/apache/fury/go/fury" +) + +func main() { + type SomeClass struct { + F1 *SomeClass + F2 map[string]string + F3 map[string]string + } + fury := furygo.NewFury(true) + if err := fury.RegisterTagType("example.SomeClass", SomeClass{}); err != nil { + panic(err) + } + value := &SomeClass{F2: map[string]string{"k1": "v1", "k2": "v2"}} + value.F3 = value.F2 + value.F1 = value + bytes, err := fury.Marshal(value) + if err != nil { + } + var newValue interface{} + // bytes can be data serialized by other languages. + if err := fury.Unmarshal(bytes, &newValue); err != nil { + panic(err) + } + fmt.Println(newValue) +} +``` + +### JavaScript + +```typescript +import Fury, { Type } from '@furyjs/fury'; + +/** + * @furyjs/hps use v8's fast-calls-api that can be called directly by jit, ensure that the version of Node is 20 or above. 
+ * Experimental feature, installation success cannot be guaranteed at this moment + * If you are unable to install the module, replace it with `const hps = null;` + **/ +import hps from '@furyjs/hps'; + +// Now we describe data structures using JSON, but in the future, we will use more ways. +const description = Type.object('example.foo', { + foo: Type.string(), +}); +const fury = new Fury({ hps }); +const { serialize, deserialize } = fury.registerSerializer(description); +const input = serialize({ foo: 'hello fury' }); +const result = deserialize(input); +console.log(result); +``` + +### Rust + +```rust +use fury::{from_buffer, to_buffer, Fury}; + +#[derive(Fury, Debug, PartialEq)] +#[tag("example.foo")] +struct Animal { + name: String, + category: String, +} + +#[derive(Fury, Debug, PartialEq)] +#[tag("example.bar")] +struct Person { + name: String, + age: u32, + pets: Vec<Animal>, +} + +fn main() { + let penson = Person { + name: "hello".to_string(), + age: 12, + pets: vec![ + Animal { + name: "world1".to_string(), + category: "cat".to_string(), + }, + Animal { + name: "world2".to_string(), + category: "dog".to_string(), + }, + ], + }; + let bin = to_buffer(&penson); + let obj: Person = from_buffer(&bin).expect("should success"); + assert_eq!(obj, penson); +} +``` diff --git a/src/pages/user/index.css b/src/pages/user/index.css index 45cdfdb5256..eefda5ebdae 100644 --- a/src/pages/user/index.css +++ b/src/pages/user/index.css @@ -5,8 +5,8 @@ line-height: 46px; display: flex; flex-direction: column; - align-items: center; /* 垂直居中 */ - text-align: center; /* 水平居中文本 */ + align-items: center; + text-align: center; } .user-main h3, .user-main .divider, diff --git a/versioned_docs/version-0.10.0/community/community.md b/versioned_docs/version-0.10.0/community/community.md new file mode 100644 index 00000000000..ef3f56eb61d --- /dev/null +++ b/versioned_docs/version-0.10.0/community/community.md @@ -0,0 +1,91 @@ +--- +title: Community +sidebar_position: 0 +id: community +--- 
+ + +Apache Fury is a volunteer project and it thrives on the contributions of its community. +We invite you to participate as much or as little as you wish. Here are several ways to contribute: + +- Use our project and share feedback. +- Provide use-cases for the project. +- Report bugs and contribute fixes. +- Contribute code and documentation improvements. + +## Mailing list + +| Name | Desc | Subscribe | Unsubscribe | Post | Archive | +|-------------------------|---------------------------------------------|-------------------------------------------------------|-----------------------------------------------------------|------------------------------------|-----------------------------------------------------------------------| +| dev@fury.apache.org | Development related discussions | [Subscribe](mailto:dev-subscribe@fury.apache.org) | [Unsubscribe](mailto:dev-unsubscribe@fury.apache.org) | [Post](mailto:dev@fury.apache.org) | [Archive](https://lists.apache.org/list.html?dev@fury.apache.org) | +| commits@fury.apache.org | All commits to our repositories | [Subscribe](mailto:commits-subscribe@fury.apache.org) | [Unsubscribe](mailto:commits-unsubscribe@fury.apache.org) | Read only list | [Archive](https://lists.apache.org/list.html?commits@fury.apache.org) | + +Please make sure subscribe to any list before attempting to post. + +If you are not subscribed to the mailing list, your message will either be rejected or you won't receive the response. + +### How to subscribe to a mailing list + +To post messages, subscribe first by: + +1. Sending an email to listname-subscribe@fury.apache.org with `listname` replaced accordingly. +2. Replying to the confirmation email you'll receive, keeping the subject line intact. +3. You'll then get a welcome email, and the subscription succeeds. 
+ +When discussing code snippets in emails, ensure: + +- You do not link to files in external services, as such files can change, get deleted or the link might break and thus + make an archived email thread useless. +- You paste text instead of screenshots of text. +- You keep formatting when pasting code in order to keep the code readable. +- There are enough import statements to avoid ambiguities. + +## Slack + +You can join +the [Apache Fury™ community on Slack](https://join.slack.com/t/fury-project/shared_invite/zt-1u8soj4qc-ieYEu7ciHOqA2mo47llS8A). + +There are a couple of community rules: + +- Be respectful and nice. +- All important decisions and conclusions must be reflected back to the mailing lists. "If it didn't happen on a mailing + list, it didn't happen." - The [Apache Mottos](https://theapacheway.com/on-list/). +- Use Slack threads to keep parallel conversations from overwhelming a channel. +- Please do not direct message people for troubleshooting, issue assigning and PR review. These should be picked-up + voluntarily. + +## Issue tracker + +We use GitHub Issues to track all issues: + +- code related issues: https://github.com/apache/fury/issues +- website related issues: https://github.com/apache/fury-site/issues + +You need to have a [GitHub account](https://github.com/signup) in order to create issues. +If you don't have a [GitHub account](https://github.com/signup), you can post an email to dev@fury.apache.org. + +### Bug reports + +To report a bug: + +- Verify that the bug does in fact exist. +- Search the [issue tracker](https://github.com/apache/fury/issues) to verify there is no existing issue reporting the bug you've found. +- Create a [bug report](https://github.com/apache/fury/issues/new?assignees=&labels=bug&projects=&template=bug_report.yml) on issue tracker. +- If possible, dive into the source code of fury, and submit a patch for the bug you reported, this helps ensure the bug + will be fixed quickly. 
+ +### Reporting a Vulnerability + +Apache Fury is a project of the [Apache Software Foundation](https://apache.org/) and follows the [ASF vulnerability handling process](https://apache.org/security/#vulnerability-handling). + +To report a new vulnerability you have discovered, please follow the [ASF vulnerability reporting process](https://apache.org/security/#reporting-a-vulnerability), which explains how to send us details privately. + +### Enhancement + +Enhancements or new feature proposals are also welcome. The more concrete and rational the proposal is, the greater the +chance it will be incorporated into future releases. + +## Source code + +- fury core repository: https://github.com/apache/fury +- fury website repository: https://github.com/apache/fury-site diff --git a/versioned_docs/version-0.10.0/community/how_to_join_community.md b/versioned_docs/version-0.10.0/community/how_to_join_community.md new file mode 100644 index 00000000000..d41c2e71959 --- /dev/null +++ b/versioned_docs/version-0.10.0/community/how_to_join_community.md @@ -0,0 +1,105 @@ +--- +title: How to join Fury +sidebar_position: 0 +id: how_to_join_community +--- + +First of all, kudos to you for choosing to join the open source contribution ranks. Secondly, we are very grateful that you have chosen to participate in the Fury community and contribute to this open source project. + +## Fury Contribution Guide + +The Fury team usually conducts development and issue maintenance on GitHub. Please open the [GitHub website](https://github.com/), click the `Sign up` button in the upper right corner, register your own account, and take the first step of your open source journey. + +In the [Fury repository](https://github.com/apache/fury), we have a [guide](https://fury.apache.org/zh-CN/docs/community/) for all open source contributors, introducing contents such as version management and branch management. **Please take a few minutes to read and understand it**. 
+ +## Your First Pull Request + +### Step 0: Install Git + +Git is a version control system used to track and manage code changes in software development projects. It helps developers record and manage the history of the code, facilitating team collaboration, code version control, code merging, and other operations. With Git, you can track each version of each file and easily switch and compare between different versions. Git also provides branch management functionality, allowing multiple concurrent development tasks to be carried out simultaneously. + +- Visit the official Git website: [https://git-scm.com/](https://git-scm.com/) +- Download the latest version of the Git installer. +- Run the downloaded installer and follow the prompts of the installation wizard to install. +- After the installation is complete, you can use the `git version` command in the command line to confirm the successful installation. + +### Step 1: Fork the Project + +- First, you need to fork this project. Enter the [Fury project page](https://github.com/apache/fury), and click the Fork button in the upper right corner. +- In your GitHub account, the project xxxx (your GitHub username)/fury will appear. +- On your local computer, use the following commands to obtain a fury folder: + +``` +// ssh +git clone git@github.com:xxxx (your GitHub username)/fury.git +// https +git clone https://github.com/xxxx (your GitHub username)/fury.git +``` + +### Step 2: Obtain the Project Code + +- Enter the fury folder and add the remote address of fury: + +``` +git remote add upstream https://github.com/apache/fury.git +``` + +### Step 3: Create a Branch + +- Alright, now you can start contributing code. The default branch of Fury is the main branch. Whether it is for function development, bug fixes, or documentation writing, please create a new branch and then merge it to the main branch.
Use the following code to create a branch: + +``` +// Create a function development branch +git checkout -b feat/xxxx + +// Create a problem-fixing development branch +git checkout -b fix/xxxx + +// Create a documentation, demo branch +git checkout -b docs/add-java-demo +``` + +Suppose we have created the documentation modification branch `docs/add-java-demo` and we have added some code and submitted it to the code repository. + +- `git add .` +- `git commit -a -m "docs: add java demo and related docs"` + +### Step 4: Merge the Modifications + +- Switch back to your development branch: + +``` +git checkout docs/add-java-demo +``` + +- Submit the updated code to your branch: + +``` +git push origin docs/add-java-demo +``` + +### Step 5: Submit a Pull Request + +You can click the `Compare & pull request` button on your GitHub code repository page. Or create it through the `contribute` button. + +- Fill in what type of modification this is. +- Fill in the associated issue. +- If there are complex changes, please explain the background and solution. + +After filling in the relevant information, click Create pull request to submit. + +## **Easily Step into the Fury Open Source Contribution Journey** + +"**good first issue**" is a common label in the open source community, and the purpose of this label is to help new contributors find entry-level issues that are suitable for them. + +The entry-level issues of Fury can be viewed through the [issue list](https://github.com/apache/fury/issues). + +If you currently **have the time and willingness** to participate in community contributions, you can take a look at **good first issue** in the issues and select one that interests you and is suitable for you to claim. + +## Embrace the Apache Fury Community + +While you contribute code to Fury, we encourage you to participate in other things that make the community more prosperous, such as: + +- Offer suggestions for the project's development, functional planning, etc. 
+- Create articles, videos, and hold lectures to promote Fury. +- Write promotion plans and execute them together with the team. diff --git a/versioned_docs/version-0.10.0/community/how_to_release.md b/versioned_docs/version-0.10.0/community/how_to_release.md new file mode 100644 index 00000000000..a954142d16e --- /dev/null +++ b/versioned_docs/version-0.10.0/community/how_to_release.md @@ -0,0 +1,531 @@ +--- +title: How to release +sidebar_position: 0 +id: how_to_release +--- + +This document mainly introduces how the release manager releases a new version of Apache Fury. + +## Introduction + +Source Release is the most important part which Apache values. + +Please pay more attention to license and signing issues. +Publishing software is a serious thing and has legal consequences. + +## First-time as a release manager + +### Environmental requirements + +This release process is operated in the Ubuntu OS, and the following tools are required: + +- JDK 1.8 +- Apache Maven 3.x +- Python 3.8 +- GnuPG 2.x +- Git +- SVN (apache uses svn to host project releases) +- Pay attention to setting environment variables: if you configure gpg keys under a different directory, please `export GNUPGHOME=$(xxx)` + +### Prepare GPG Key + +If you are the first to become a release manager, you need to prepare a gpg key. + +Following is a quick setup, you can refer to [Apache openpgp doc](https://infra.apache.org/openpgp.html) for further details. + +#### Install GPG + +```bash +sudo apt install gnupg2 +``` + +#### Generate GPG Key + +Please use your apache name and email for generate key + +```bash +$ gpg --full-gen-key +gpg (GnuPG) 2.2.20; Copyright (C) 2020 Free Software Foundation, Inc. +This is free software: you are free to change and redistribute it. +There is NO WARRANTY, to the extent permitted by law. 
+ +Please select what kind of key you want: + (1) RSA and RSA (default) + (2) DSA and Elgamal + (3) DSA (sign only) + (4) RSA (sign only) + (14) Existing key from card +Your selection? 1 # input 1 +RSA keys may be between 1024 and 4096 bits long. +What keysize do you want? (2048) 4096 # input 4096 +Requested keysize is 4096 bits +Please specify how long the key should be valid. + 0 = key does not expire + = key expires in n days + w = key expires in n weeks + m = key expires in n months + y = key expires in n years +Key is valid for? (0) 0 # input 0 +Key does not expire at all +Is this correct? (y/N) y # input y + +GnuPG needs to construct a user ID to identify your key. + +Real name: Chaokun Yang # input your name +Email address: chaokunyang@apache.org # input your email +Comment: CODE SIGNING KEY # input some annotations, optional +You selected this USER-ID: + "Chaokun " + +Change (N)ame, (C)omment, (E)mail or (O)kay/(Q)uit? O # input O +We need to generate a lot of random bytes. It is a good idea to perform +some other action (type on the keyboard, move the mouse, utilize the +disks) during the prime generation; this gives the random number +generator a better chance to gain enough entropy. +We need to generate a lot of random bytes. It is a good idea to perform +some other action (type on the keyboard, move the mouse, utilize the +disks) during the prime generation; this gives the random number +generator a better chance to gain enough entropy. 
+ +# Input the security key +┌──────────────────────────────────────────────────────┐ +│ Please enter this passphrase │ +│ │ +│ Passphrase: _______________________________ │ +│ │ +│ │ +└──────────────────────────────────────────────────────┘ +# key generation will be done after your inputting the key with the following output +gpg: key E49B00F626B marked as ultimately trusted +gpg: revocation certificate stored as '/Users/chaokunyang/.gnupg/openpgp-revocs.d/1E2CDAE4C08AD7D694D1CB139D7BE8E45E580BA4.rev' +public and secret key created and signed. + +pub rsa4096 2022-07-12 [SC] + 1E2CDAE4C08AD7D694D1CB139D7BE8E45E580BA4 +uid [ultimate] Chaokun +sub rsa4096 2022-07-12 [E] +``` + +#### Upload your public key to public GPG keyserver + +Firstly, list your key: + +```bash +gpg --list-keys +``` + +The output is like: + +```bash +-------------------------------------------------- +pub rsa4096 2024-03-27 [SC] + 1E2CDAE4C08AD7D694D1CB139D7BE8E45E580BA4 +uid [ultimate] chaokunyang (CODE SIGNING KEY) +sub rsa4096 2024-03-27 [E] +``` + +Then, send your key id to key server: + +```bash +gpg --keyserver keys.openpgp.org --send-key # e.g., 1E2CDAE4C08AD7D694D1CB139D7BE8E45E580BA4 +``` + +Among them, `keys.openpgp.org` is a randomly selected keyserver, you can use keyserver.ubuntu.com or any other full-featured keyserver. + +#### Check whether the key is created successfully + +Uploading takes about one minute; after that, you can check by email at the corresponding keyserver. + +Uploading keys to the keyserver is mainly for joining a [Web of Trust](https://infra.apache.org/release-signing.html#web-of-trust). + +#### Add your GPG public key to the project KEYS file + +The svn repository of the release branch is: https://dist.apache.org/repos/dist/release/incubator/fury + +Please add the public key to KEYS in the release branch: + +```bash +svn co https://dist.apache.org/repos/dist/release/incubator/fury fury-dist +# As this step will copy all the versions, it will take some time. 
If the network is broken, please use svn cleanup to delete the lock before re-execute it. +cd fury-dist +(gpg --list-sigs YOUR_NAME@apache.org && gpg --export --armor YOUR_NAME@apache.org) >> KEYS # Append your key to the KEYS file +svn add . # It is not needed if the KEYS document exists before. +svn ci -m "add gpg key for YOUR_NAME" # Later on, if you are asked to enter a username and password, just use your apache username and password. +``` + +#### Upload the GPG public key to your GitHub account + +- Enter https://github.com/settings/keys to add your GPG key. +- Please remember to bind the email address used in the GPG key to your GitHub account (https://github.com/settings/emails) if you find "unverified" after adding it. + +### Further reading + +It's recommended but not mandatory to read following documents before making a release to know more details about apache release: + +- Release policy: https://www.apache.org/legal/release-policy.html +- Incubator release: http://incubator.apache.org/guides/releasemanagement.html +- TLP release: https://infra.apache.org/release-distribution +- Release sign: https://infra.apache.org/release-signing.html +- Release publish: https://infra.apache.org/release-publishing.html +- Release download pages: https://infra.apache.org/release-download-pages.html +- Publishing maven artifacts: https://infra.apache.org/publishing-maven-artifacts.html + +## Start discussion about the release + +Start a discussion about the next release via sending email to: dev@fury.apache.org: + +Title: + +``` +[DISCUSS] Release Apache Fury(incubating) ${release_version} +``` + +Content: + +``` +Hello, Apache Fury(incubating) Community, + +This is a call for a discussion to release Apache Fury(incubating) version ${release_version}. + +The change lists about this release: + +https://github.com/apache/fury/compare/v0.4.1...v0.5.0 + +Please leave your comments here about this release plan. 
We will bump the version in repo and start the release process after the discussion. + +Thanks, + +${name} +``` + +## Preparing for release + +If the discussion goes positive, you will need to prepare the release artifacts. + +### GitHub branch and tag + +- Create a new branch named `releases-0.5.0` +- Bump version to `$version` by executing command `python ci/release.py -l all -version $version` +- Make a git commit and push the branch to `git@github.com:apache/fury.git` +- Create a new tag by `git tag v0.5.0-rc1`, then push it to `git@github.com:apache/fury.git` + +### Build and upload artifacts to SVN dist/dev repo + +First you need to build source release artifacts by `python ci/release.py build -v $version`. + +Then you need to upload it to svn dist repo. The dist repo of the dev branch is: https://dist.apache.org/repos/dist/dev/incubator/fury + +```bash +# As this step will copy all the versions, it will take some time. If the network is broken, please use svn cleanup to delete the lock before re-executing it. +svn co https://dist.apache.org/repos/dist/dev/incubator/fury fury-dist-dev +``` + +Then, upload the artifacts: + +```bash +cd fury-dist-dev +# create a directory named by version +mkdir ${release_version}-${rc_version} +# copy source code and signature package to the versioned directory +cp ${repo_dir}/dist/* ${release_version}-${rc_version} +# check svn status +svn status +# add to svn +svn add ${release_version}-${rc_version} +# check svn status +svn status +# commit to SVN remote server +svn commit -m "Prepare for fury ${release_version}-${rc_version}" +``` + +Visit https://dist.apache.org/repos/dist/dev/incubator/fury/ to check the artifacts are uploaded correctly. + +### What to do if something goes wrong + +If some files are unexpected, you need to remove them with `svn delete` and repeat the above upload process. + +## Voting + +As an incubating project, Fury requires votes from both the Fury Community and Incubator Community. 
+ +- release_version: the version for fury, like 0.5.0. +- release_candidate_version: the version for voting, like 0.5.0-rc1. +- maven_artifact_number: the number for Maven staging artifacts, like 1001. Specifically, the maven_artifact_number can be found by searching "fury" on https://repository.apache.org/#stagingRepositories. + +### Fury Community Vote + +Send an email to Fury Community: dev@fury.apache.org: + +Title: + +``` +[VOTE] Release Apache Fury(incubating) v${release_version}-${rc_version} +``` + +Content: + +``` +Hello, Apache Fury(incubating) Community: + +This is a call for vote to release Apache Fury(Incubating) +version release-0.5.0-rc3. + +Apache Fury(incubating) - A blazingly fast multi-language serialization +framework powered by JIT and zero-copy. + +The change lists about this release: + +https://github.com/apache/fury/compare/v0.4.1...v0.5.0-rc3 + +The release candidates: +https://dist.apache.org/repos/dist/dev/incubator/fury/0.5.0-rc3/ + +The maven staging for this release: +https://repository.apache.org/content/repositories/orgapachefury-1003 + +Git tag for the release: +https://github.com/apache/fury/releases/tag/v0.5.0-rc3 + +Git commit for the release: +https://github.com/apache/fury/commit/fae06330edd049bb960536e978a45b97bca66faf + +The artifacts signed with PGP key [5E580BA4], corresponding to +[chaokunyang@apache.org], that can be found in keys file: +https://downloads.apache.org/incubator/fury/KEYS + +The vote will be open for at least 72 hours until the necessary number of votes are reached. + +Please vote accordingly: + +[ ] +1 approve +[ ] +0 no opinion +[ ] -1 disapprove with the reason + +To learn more about Fury, please see https://fury.apache.org/ + +*Valid check is a requirement for a vote. *Checklist for reference: + +[ ] Download Fury is valid. +[ ] Checksums and PGP signatures are valid. +[ ] Source code distributions have correct names matching the current release. +[ ] LICENSE and NOTICE files are correct. 
+[ ] All files have license headers if necessary. +[ ] No compiled archives bundled in source archive. +[ ] Can compile from source. + +More detail checklist please refer: +https://cwiki.apache.org/confluence/display/INCUBATOR/Incubator+Release+Checklist + +How to Build and Test, please refer to: https://github.com/apache/fury/blob/main/docs/guide/DEVELOPMENT.md + + +Thanks, +Chaokun Yang +``` + +After at least 3 +1 binding vote (from Fury Podling PMC member and committers) and no veto, claim the vote result: + +Title: + +``` +[RESULT][VOTE] Release Apache Fury(incubating) v${release_version}-${rc_version} +``` + +Content: + +``` +Hello, Apache Fury(incubating) Community, + +The vote to release Apache Fury(Incubating) v${release_version}-${rc_version} has passed. + +The vote PASSED with 3 binding +1 and 0 -1 vote: + +Binding votes: + +- xxx +- yyy +- zzz + +Vote thread: ${vote_thread_url} + +Thanks, + +${name} +``` + +### Incubator Community Vote + +Send an email to: general@incubator.apache.org: + +Title: + +``` +[VOTE] Release Apache Fury(incubating) v${release_version}-${rc_version} +``` + +Content: + +``` +Hello everyone, + +This is a call for the vote to release Apache Fury(Incubating) v${release_version}-${rc_version}. + +The Apache Fury community has voted and approved the release of Apache +Fury(incubating) v${release_version}-${rc_version}. We now kindly request the IPMC members +review and vote for this release. + +Apache Fury(incubating) - A blazingly fast multi-language serialization +framework powered by JIT and zero-copy. 
+ +Fury community vote thread: +${community_vote_thread_url} + +Vote result thread: +${community_vote_result_thread_url} + +The release candidate: +https://dist.apache.org/repos/dist/dev/incubator/fury/${release_version}-${rc_version}/ + +This release has been signed with a PGP available here: +https://downloads.apache.org/incubator/fury/KEYS + +Git tag for the release: +https://github.com/apache/fury/releases/tag/v${release_version}-${rc_version}/ + +Git commit for the release: +https://github.com/apache/fury/commit/$xxx + +Maven staging repo: +https://repository.apache.org/content/repositories/orgapachefury-${maven_artifact_number}/ + +How to Build and Test, please refer to: +https://github.com/apache/fury/blob/main/docs/guide/DEVELOPMENT.md + +Please download, verify, and test. + +The VOTE will pass after 3 binding approve. + +[ ] +1 approve +[ ] +0 no opinion +[ ] -1 disapprove with the reason + +To learn more about apache fury, please see https://fury.apache.org/ + +Checklist for reference: + +[ ] Download links are valid. +[ ] Checksums and signatures. +[ ] LICENSE/NOTICE files exist +[ ] No unexpected binary files +[ ] All source files have ASF headers +[ ] Can compile from source + +Thanks, + +${name} +``` + +After at least 72 hours with at least 3 +1 binding vote (from Incubator PMC member) and no veto, claim the vote result: + +Title: + +``` +[RESULT][VOTE] Release Apache Fury(incubating) v${release_version}-${rc_version} +``` + +Content: + +``` +Hi Incubator PMC, + +The vote to release Apache Fury(incubating) v${release_version}-${rc_version} has passed with +4 +1 binding and 3 +1 non-binding votes, no +0 or -1 votes. + +Binding votes: + +- xxx +- yyy +- zzz + +Non-Binding votes: + +- aaa + +Vote thread: ${incubator_vote_thread_url} + +Thanks for reviewing and voting for our release candidate. + +We will proceed with publishing the approved artifacts and sending out the announcement soon. 
+``` + +### What if vote fail + +If the vote fails, click "Drop" to drop the staging Maven artifacts. + +Address the raised issues, then bump `rc_version` and file a new vote again. + +## Official Release + +### Publish artifacts to SVN Release Directory + +- release_version: the release version for fury, like 0.5.0 +- release_candidate_version: the version for voting, like 0.5.0-rc1 + +```bash +svn mv https://dist.apache.org/repos/dist/dev/incubator/fury/${release_version}-${rc_version} https://dist.apache.org/repos/dist/release/incubator/fury/${release_version} -m "Release fury ${release_version}" +``` + +### Change Fury Website download link + +Submit a PR to https://github.com/apache/fury-site to update [Download page](https://fury.apache.org/download) + +### Release Maven artifacts + +- maven_artifact_number: the number for Maven staging artifacts, like 1001. +- Open https://repository.apache.org/#stagingRepositories. +- Find the artifact `orgapachefury-${maven_artifact_number}`, click "Release". + +### Send the announcement + +Send the release announcement to dev@fury.apache.org and CC announce@apache.org. + +Title: + +``` +[ANNOUNCE] Release Apache Fury(incubating) ${release_version} +``` + +Content: + +``` +Hi all, + +The Apache Fury(incubating) community is pleased to announce +that Apache Fury(incubating) ${release_version} has been released! + +Apache Fury(incubating) - A blazingly fast multi-language serialization +framework powered by JIT and zero-copy. 
+ +The release notes are available here: +https://github.com/apache/fury/releases/tag/v${release_version} + +For the complete list of changes: +https://github.com/apache/fury/compare/v0.5.0...v${release_version} + +Apache Fury website: https://fury.apache.org/ + +Download Links: https://fury.apache.org/download + +Fury Resources: +- Fury github repo: https://github.com/apache/fury +- Issue: https://github.com/apache/fury/issues +- Mailing list: dev@fury.apache.org + +We are looking to grow our community and welcome new contributors. If +you are interested in contributing to Fury, please contact us on the +mailing list or on GitHub. We will be happy to help you get started. + +------------------ +Best Regards, +${your_name} +``` diff --git a/versioned_docs/version-0.10.0/community/how_to_verify.md b/versioned_docs/version-0.10.0/community/how_to_verify.md new file mode 100644 index 00000000000..23b7395b1bc --- /dev/null +++ b/versioned_docs/version-0.10.0/community/how_to_verify.md @@ -0,0 +1,122 @@ +--- +title: How to verify +sidebar_position: 0 +id: how_to_verify +--- + + +For detailed check list, please refer to the [official check list](https://cwiki.apache.org/confluence/display/INCUBATOR/Incubator+Release+Checklist) + +## Download the candidate version + +```bash +#If there is svn locally, you can clone to the local +svn co https://dist.apache.org/repos/dist/dev/incubator/fury/${release_version}-${rc_version}/ +# You can download the material file directly +wget https://dist.apache.org/repos/dist/dev/incubator/fury/${release_version}-${rc_version}/xxx.xxx +``` + +## Verify checksums and signatures + +First you need to install gpg: + +```bash +apt-get install gnupg +# or +yum install gnupg +# or +brew install gnupg +``` + +Then import the Fury release manager's public key: + +```bash +curl https://downloads.apache.org/incubator/fury/KEYS > KEYS # Download KEYS +gpg --import KEYS # Import KEYS to local +# Then, trust the public key: +gpg --edit-key # Edit the 
key(mentioned in vote email) +# It will enter the interactive mode, use the following command to trust the key: +gpg (GnuPG) 2.0.22; Copyright (C) 2013 Free Software Foundation, Inc. +This is free software: you are free to change and redistribute it. +There is NO WARRANTY, to the extent permitted by law. + + +pub 4096R/5E580BA4 created: 2024-03-27 expires: never usage: SC + trust: unknown validity: unknown +sub 4096R/A31EF728 created: 2024-03-27 expires: never usage: E +[ unknown] (1). chaokunyang (CODE SIGNING KEY) + +gpg> trust +pub 4096R/5E580BA4 created: 2024-03-27 expires: never usage: SC + trust: unknown validity: unknown +sub 4096R/A31EF728 created: 2024-03-27 expires: never usage: E +[ unknown] (1). chaokunyang (CODE SIGNING KEY) + +Please decide how far you trust this user to correctly verify other users' keys +(by looking at passports, checking fingerprints from different sources, etc.) + + 1 = I don't know or won't say + 2 = I do NOT trust + 3 = I trust marginally + 4 = I trust fully + 5 = I trust ultimately + m = back to the main menu + +Your decision? 5 +Do you really want to set this key to ultimate trust? (y/N) y + +pub 4096R/5E580BA4 created: 2024-03-27 expires: never usage: SC + trust: ultimate validity: unknown +sub 4096R/A31EF728 created: 2024-03-27 expires: never usage: E +[ unknown] (1). chaokunyang (CODE SIGNING KEY) +Please note that the shown key validity is not necessarily correct +unless you restart the program. 
+``` + +Next verify signature: + +```bash +for i in *.tar.gz; do echo $i; gpg --verify $i.asc $i; done +``` + +If something like the following appears, it means the signature is correct: + +```bash +apache-fury-incubating-0.5.0-src.tar.gz +gpg: Signature made Wed 17 Apr 2024 11:49:45 PM CST using RSA key ID 5E580BA4 +gpg: checking the trustdb +gpg: 3 marginal(s) needed, 1 complete(s) needed, PGP trust model +gpg: depth: 0 valid: 1 signed: 0 trust: 0-, 0q, 0n, 0m, 0f, 1u +gpg: Good signature from "chaokunyang (CODE SIGNING KEY) " +``` + +Then verify checksum: + +```bash +for i in *.tar.gz; do echo $i; sha512sum --check $i.sha512; done +``` + +It should output something like: + +```bash +apache-fury-incubating-0.5.0-src.tar.gz +apache-fury-incubating-0.5.0-src.tar.gz: OK +``` + +## Check the file content of the source package + +Unzip `apache-fury-${release_version}-${rc_version}-src.tar.gz` and check the following: + +- LICENSE and NOTICE files are correct for the repository. +- All files have ASF license headers if necessary. +- Building is OK. + +## Check the Maven artifacts of fury-java + +Download the artifacts from https://repository.apache.org/content/repositories/orgapachefury-${maven_artifact_number}/. + +You can check the following: + +- Checksums of JARs match the bundled checksum file. +- Signatures of JARs match the bundled signature file. +- JARs are reproducible locally. This means you can build the JARs on your machine and verify the checksum is the same as the bundled one. diff --git a/versioned_docs/version-0.10.0/guide/DEVELOPMENT.md b/versioned_docs/version-0.10.0/guide/DEVELOPMENT.md new file mode 100644 index 00000000000..013811fa7fa --- /dev/null +++ b/versioned_docs/version-0.10.0/guide/DEVELOPMENT.md @@ -0,0 +1,122 @@ +--- +title: Development +sidebar_position: 7 +id: development +license: | + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. 
See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--- + +## How to build Fury + +Please checkout the source tree from https://github.com/apache/fury. + +### Build Fury Java + +```bash +cd java +mvn clean compile -DskipTests +``` + +#### Environment Requirements + +- java 1.8+ +- maven 3.6.3+ + +### Build Fury Python + +```bash +cd python +# Uninstall numpy first so that when we install pyarrow, it will install the correct numpy version automatically. +# For Python versions less than 3.13, numpy 2 is not currently supported. +pip uninstall -y numpy +# Install necessary environment for Python < 3.13. +pip install pyarrow==15.0.0 Cython wheel pytest +# For Python 3.13, pyarrow 18.0.0 is available and requires numpy version greater than 2. +# pip install pyarrow==18.0.0 Cython wheel pytest +pip install -v -e . 
+``` + +#### Environment Requirements + +- python 3.6+ + +### Build Fury C++ + +Build fury row format: + +```bash +pip install pyarrow==15.0.0 +bazel build //cpp/fury/row:fury_row_format +``` + +Build fury row format encoder: + +```bash +pip install pyarrow==15.0.0 +bazel build //cpp/fury/encoder:fury_encoder +``` + +#### Environment Requirements + +- compilers with C++17 support +- bazel 6.3.2 + +### Build Fury GoLang + +```bash +cd go/fury +# run test +go test -v +# run xlang test +go test -v fury_xlang_test.go +``` + +#### Environment Requirements + +- go 1.13+ + +### Build Fury Rust + +```bash +cd rust +# build +cargo build +# run test +cargo test +``` + +#### Environment Requirements + +```bash +curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh +``` + +### Build Fury JavaScript + +```bash +cd javascript +npm install + +# run build +npm run build +# run test +npm run test +``` + +#### Environment Requirements + +- node 14+ +- npm 8+ diff --git a/versioned_docs/version-0.10.0/guide/graalvm_guide.md b/versioned_docs/version-0.10.0/guide/graalvm_guide.md new file mode 100644 index 00000000000..e55bcb6a082 --- /dev/null +++ b/versioned_docs/version-0.10.0/guide/graalvm_guide.md @@ -0,0 +1,256 @@ +--- +title: GraalVM Guide +sidebar_position: 6 +id: graalvm_guide +license: | + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +--- + +## GraalVM Native Image + +GraalVM `native image` can compile Java code into native code ahead of time to build faster, smaller, leaner applications. +The native image doesn't have a JIT compiler to compile bytecode into machine code, and doesn't support +reflection unless a reflection configuration file is provided. + +Fury runs on GraalVM native image pretty well. Fury generates all serializer code for `Fury JIT framework` and `MethodHandle/LambdaMetafactory` at graalvm build time. It then uses the generated code for serialization at runtime without +any extra cost, so the performance is great. + +In order to use Fury on graalvm native image, you must create Fury as a **static** field of a class, and **register** all classes at + the enclosing class initialization time. Then configure `native-image.properties` under +`resources/META-INF/native-image/$xxx/native-image.properties` to tell graalvm to init the class at native image +build time. For example, here we configure `org.apache.fury.graalvm.Example` class to be initialized at build time: + +```properties +Args = --initialize-at-build-time=org.apache.fury.graalvm.Example +``` + +Another benefit of using fury is that you don't have to configure [reflection json](https://www.graalvm.org/latest/reference-manual/native-image/metadata/#specifying-reflection-metadata-in-json) and +[serialization json](https://www.graalvm.org/latest/reference-manual/native-image/metadata/#serialization), which is +very tedious, cumbersome and inconvenient. When using fury, you just need to invoke +`org.apache.fury.Fury.register(Class, boolean)` for every type you want to serialize. + +Note that Fury `asyncCompilationEnabled` option will be disabled automatically for graalvm native image since graalvm +native image doesn't support JIT at the image run time. 
+ +## Not thread-safe Fury + +Example: + +```java +import org.apache.fury.Fury; +import org.apache.fury.util.Preconditions; + +import java.util.List; +import java.util.Map; + +public class Example { + public record Record ( + int f1, + String f2, + List f3, + Map f4) { + } + + static Fury fury; + + static { + fury = Fury.builder().build(); + // register and generate serializer code. + fury.register(Record.class, true); + } + + public static void main(String[] args) { + Record record = new Record(10, "abc", List.of("str1", "str2"), Map.of("k1", 10L, "k2", 20L)); + System.out.println(record); + byte[] bytes = fury.serialize(record); + Object o = fury.deserialize(bytes); + System.out.println(o); + Preconditions.checkArgument(record.equals(o)); + } +} +``` + +Then add `org.apache.fury.graalvm.Example` build time init to `native-image.properties` configuration: + +```properties +Args = --initialize-at-build-time=org.apache.fury.graalvm.Example +``` + +## Thread-safe Fury + +```java +import org.apache.fury.Fury; +import org.apache.fury.ThreadLocalFury; +import org.apache.fury.ThreadSafeFury; +import org.apache.fury.util.Preconditions; + +import java.util.List; +import java.util.Map; + +public class ThreadSafeExample { + public record Foo ( + int f1, + String f2, + List f3, + Map f4) { + } + + static ThreadSafeFury fury; + + static { + fury = new ThreadLocalFury(classLoader -> { + Fury f = Fury.builder().build(); + // register and generate serializer code. 
+ f.register(Foo.class, true); + return f; + }); + } + + public static void main(String[] args) { + System.out.println(fury.deserialize(fury.serialize("abc"))); + System.out.println(fury.deserialize(fury.serialize(List.of(1,2,3)))); + System.out.println(fury.deserialize(fury.serialize(Map.of("k1", 1, "k2", 2)))); + Foo foo = new Foo(10, "abc", List.of("str1", "str2"), Map.of("k1", 10L, "k2", 20L)); + System.out.println(foo); + byte[] bytes = fury.serialize(foo); + Object o = fury.deserialize(bytes); + System.out.println(o); + } +} +``` + +Then add `org.apache.fury.graalvm.ThreadSafeExample` build time init to `native-image.properties` configuration: + +```properties +Args = --initialize-at-build-time=org.apache.fury.graalvm.ThreadSafeExample +``` + +## Framework Integration + +For framework developers, if you want to integrate fury for serialization, you can provide a configuration file to let +users list all the classes they want to serialize. Then you can load those classes and invoke +`org.apache.fury.Fury.register(Class, boolean)` to register those classes in your Fury integration class, and configure that +class to be initialized at graalvm native image build time. + +## Benchmark + +Here we give two class benchmarks between Fury and Graalvm Serialization. + +When Fury compression is disabled: + +- Struct: Fury is `46x speed, 43% size` compared to JDK. +- Pojo: Fury is `12x speed, 56% size` compared to JDK. + +When Fury compression is enabled: + +- Struct: Fury is `24x speed, 31% size` compared to JDK. +- Pojo: Fury is `12x speed, 48% size` compared to JDK. + +See [[Benchmark.java](https://github.com/apache/fury/blob/main/integration_tests/graalvm_tests/src/main/java/org/apache/fury/graalvm/Benchmark.java)] for benchmark code. 
+ +### Struct Benchmark + +#### Class Fields + +```java +public class Struct implements Serializable { + public int f1; + public long f2; + public float f3; + public double f4; + public int f5; + public long f6; + public float f7; + public double f8; + public int f9; + public long f10; + public float f11; + public double f12; +} +``` + +#### Benchmark Results + +No compression: + +``` +Benchmark repeat number: 400000 +Object type: class org.apache.fury.graalvm.Struct +Compress number: false +Fury size: 76.0 +JDK size: 178.0 +Fury serialization took mills: 49 +JDK serialization took mills: 2254 +Compare speed: Fury is 45.70x speed of JDK +Compare size: Fury is 0.43x size of JDK +``` + +Compress number: + +``` +Benchmark repeat number: 400000 +Object type: class org.apache.fury.graalvm.Struct +Compress number: true +Fury size: 55.0 +JDK size: 178.0 +Fury serialization took mills: 130 +JDK serialization took mills: 3161 +Compare speed: Fury is 24.16x speed of JDK +Compare size: Fury is 0.31x size of JDK +``` + +### Pojo Benchmark + +#### Class Fields + +```java +public class Foo implements Serializable { + int f1; + String f2; + List f3; + Map f4; +} +``` + +#### Benchmark Results + +No compression: + +``` +Benchmark repeat number: 400000 +Object type: class org.apache.fury.graalvm.Foo +Compress number: false +Fury size: 541.0 +JDK size: 964.0 +Fury serialization took mills: 1663 +JDK serialization took mills: 16266 +Compare speed: Fury is 12.19x speed of JDK +Compare size: Fury is 0.56x size of JDK +``` + +Compress number: + +``` +Benchmark repeat number: 400000 +Object type: class org.apache.fury.graalvm.Foo +Compress number: true +Fury size: 459.0 +JDK size: 964.0 +Fury serialization took mills: 1289 +JDK serialization took mills: 15069 +Compare speed: Fury is 12.11x speed of JDK +Compare size: Fury is 0.48x size of JDK +``` diff --git a/versioned_docs/version-0.10.0/guide/java_serialization_guide.md b/versioned_docs/version-0.10.0/guide/java_serialization_guide.md 
new file mode 100644 index 00000000000..b9ece7e9bae --- /dev/null +++ b/versioned_docs/version-0.10.0/guide/java_serialization_guide.md @@ -0,0 +1,628 @@ +--- +title: Java Serialization Guide +sidebar_position: 0 +id: java_object_graph_guide +license: | + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--- + +## Java object graph serialization + +When only Java object serialization is needed, this mode will have better performance compared to cross-language object +graph serialization. + +## Quick Start + +Note that fury creation is not cheap, so the **fury instances should be reused between serializations** instead of creating +a new one every time. +You should keep fury in a static global variable, or an instance variable of some singleton object or limited objects. + +Fury for single-thread usage: + +```java +import java.util.List; +import java.util.Arrays; + +import org.apache.fury.*; +import org.apache.fury.config.*; + +public class Example { + public static void main(String[] args) { + SomeClass object = new SomeClass(); + // Note that Fury instances should be reused between + // multiple serializations of different objects. 
+ Fury fury = Fury.builder().withLanguage(Language.JAVA) + .requireClassRegistration(true) + .build(); + // Registering types can reduce class name serialization overhead, but not mandatory. + // If class registration enabled, all custom types must be registered. + fury.register(SomeClass.class); + byte[] bytes = fury.serialize(object); + System.out.println(fury.deserialize(bytes)); + } +} +``` + +Fury for multiple-thread usage: + +```java +import java.util.List; +import java.util.Arrays; + +import org.apache.fury.*; +import org.apache.fury.config.*; + +public class Example { + public static void main(String[] args) { + SomeClass object = new SomeClass(); + // Note that Fury instances should be reused between + // multiple serializations of different objects. + ThreadSafeFury fury = new ThreadLocalFury(classLoader -> { + Fury f = Fury.builder().withLanguage(Language.JAVA) + .withClassLoader(classLoader).build(); + f.register(SomeClass.class); + return f; + }); + byte[] bytes = fury.serialize(object); + System.out.println(fury.deserialize(bytes)); + } +} +``` + +Fury instances reuse example: + +```java +import java.util.List; +import java.util.Arrays; + +import org.apache.fury.*; +import org.apache.fury.config.*; + +public class Example { + // reuse fury. 
+ private static final ThreadSafeFury fury = new ThreadLocalFury(classLoader -> { + Fury f = Fury.builder().withLanguage(Language.JAVA) + .withClassLoader(classLoader).build(); + f.register(SomeClass.class); + return f; + }); + + public static void main(String[] args) { + SomeClass object = new SomeClass(); + byte[] bytes = fury.serialize(object); + System.out.println(fury.deserialize(bytes)); + } +} +``` + +## FuryBuilder options + +| Option Name | Description | Default Value | +|-------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------| +| `timeRefIgnored` | Whether to ignore reference tracking of all time types registered in `TimeSerializers` and subclasses of those types when ref tracking is enabled. If ignored, ref tracking of every time type can be enabled by invoking `Fury#registerSerializer(Class, Serializer)`. For example, `fury.registerSerializer(Date.class, new DateSerializer(fury, true))`. Note that enabling ref tracking should happen before serializer codegen of any types which contain time fields. Otherwise, those fields will still skip ref tracking. | `true` | +| `compressInt` | Enables or disables int compression for smaller size. | `true` | +| `compressLong` | Enables or disables long compression for smaller size. | `true` | +| `compressString` | Enables or disables string compression for smaller size. | `false` | +| `classLoader` | The classloader should not be updated; Fury caches class metadata. 
Use `LoaderBinding` or `ThreadSafeFury` for classloader updates. | `Thread.currentThread().getContextClassLoader()` | +| `compatibleMode` | Type forward/backward compatibility config. Also Related to `checkClassVersion` config. `SCHEMA_CONSISTENT`: Class schema must be consistent between serialization peer and deserialization peer. `COMPATIBLE`: Class schema can be different between serialization peer and deserialization peer. They can add/delete fields independently. [See more](#class-inconsistency-and-class-version-check). | `CompatibleMode.SCHEMA_CONSISTENT` | +| `checkClassVersion` | Determines whether to check the consistency of the class schema. If enabled, Fury checks, writes, and checks consistency using the `classVersionHash`. It will be automatically disabled when `CompatibleMode#COMPATIBLE` is enabled. Disabling is not recommended unless you can ensure the class won't evolve. | `false` | +| `checkJdkClassSerializable` | Enables or disables checking of `Serializable` interface for classes under `java.*`. If a class under `java.*` is not `Serializable`, Fury will throw an `UnsupportedOperationException`. | `true` | +| `registerGuavaTypes` | Whether to pre-register Guava types such as `RegularImmutableMap`/`RegularImmutableList`. These types are not public API, but seem pretty stable. | `true` | +| `requireClassRegistration` | Disabling may allow unknown classes to be deserialized, potentially causing security risks. | `true` | +| `suppressClassRegistrationWarnings` | Whether to suppress class registration warnings. The warnings can be used for security audit, but may be annoying, this suppression will be enabled by default. | `true` | +| `metaShareEnabled` | Enables or disables meta share mode. | `true` if `CompatibleMode.Compatible` is set, otherwise false. | +| `scopedMetaShareEnabled` | Scoped meta share focuses on a single serialization process. 
Metadata created or identified during this process is exclusive to it and is not shared with other serializations. | `true` if `CompatibleMode.Compatible` is set, otherwise false. | +| `metaCompressor` | Set a compressor for meta compression. Note that the passed MetaCompressor should be thread-safe. By default, a `Deflater` based compressor `DeflaterMetaCompressor` will be used. Users can pass another compressor such as `zstd` for a better compression rate. | `DeflaterMetaCompressor` | +| `deserializeNonexistentClass` | Enables or disables deserialization/skipping of data for non-existent classes. | `true` if `CompatibleMode.Compatible` is set, otherwise false. | +| `codeGenEnabled` | Disabling may result in faster initial serialization but slower subsequent serializations. | `true` | +| `asyncCompilationEnabled` | If enabled, serialization uses interpreter mode first and switches to JIT serialization after async serializer JIT for a class is finished. | `false` | +| `scalaOptimizationEnabled` | Enables or disables Scala-specific serialization optimization. | `false` | +| `copyRef` | When disabled, the copy performance will be better. But fury deep copy will ignore circular and shared references. The same reference in an object graph will be copied into different objects in one `Fury#copy`. | `true` | +| `serializeEnumByName` | When enabled, Fury serializes enums by name instead of ordinal. | `false` | + +## Advanced Usage + +### Fury creation + +Single thread fury: + +```java +Fury fury = Fury.builder() + .withLanguage(Language.JAVA) + // enable reference tracking for shared/circular reference. + // Disable it will have better performance if no duplicate reference. + .withRefTracking(false) + .withCompatibleMode(CompatibleMode.SCHEMA_CONSISTENT) + // enable type forward/backward compatibility + // disable it for small size and better performance. + // .withCompatibleMode(CompatibleMode.COMPATIBLE) + // enable async multi-threaded compilation. 
+ .withAsyncCompilation(true) + .build(); +byte[] bytes = fury.serialize(object); +System.out.println(fury.deserialize(bytes)); +``` + +Thread-safe fury: + +```java +ThreadSafeFury fury = Fury.builder() + .withLanguage(Language.JAVA) + // enable reference tracking for shared/circular reference. + // Disable it will have better performance if no duplicate reference. + .withRefTracking(false) + // compress int for smaller size + // .withIntCompressed(true) + // compress long for smaller size + // .withLongCompressed(true) + .withCompatibleMode(CompatibleMode.SCHEMA_CONSISTENT) + // enable type forward/backward compatibility + // disable it for small size and better performance. + // .withCompatibleMode(CompatibleMode.COMPATIBLE) + // enable async multi-threaded compilation. + .withAsyncCompilation(true) + .buildThreadSafeFury(); +byte[] bytes = fury.serialize(object); +System.out.println(fury.deserialize(bytes)); +``` + +### Handling Class Schema Evolution in Serialization + +In many systems, the schema of a class used for serialization may change over time. For instance, fields within a class +may be added or removed. When serialization and deserialization processes use different versions of jars, the schema of +the class being deserialized may differ from the one used during serialization. + +By default, Fury serializes objects using the `CompatibleMode.SCHEMA_CONSISTENT` mode. This mode assumes that the +deserialization process uses the same class schema as the serialization process, minimizing payload overhead. +However, if there is a schema inconsistency, deserialization will fail. + +If the schema is expected to change, to make deserialization succeed, i.e. schema forward/backward compatibility. +Users must configure Fury to use `CompatibleMode.COMPATIBLE`. This can be done using the +`FuryBuilder#withCompatibleMode(CompatibleMode.COMPATIBLE)` method. 
+In this compatible mode, deserialization can handle schema changes such as missing or extra fields, allowing it to +succeed even when the serialization and deserialization processes have different class schemas. + +Here is an example of creating Fury to support schema evolution: + +```java +Fury fury = Fury.builder() + .withCompatibleMode(CompatibleMode.COMPATIBLE) + .build(); + +byte[] bytes = fury.serialize(object); +System.out.println(fury.deserialize(bytes)); +``` + +This compatible mode involves serializing class metadata into the serialized output. Despite Fury's use of +sophisticated compression techniques to minimize overhead, there is still some additional space cost associated with +class metadata. + +To further reduce metadata costs, Fury introduces a class metadata sharing mechanism, which allows the metadata to be +sent to the deserialization process only once. For more details, please refer to the [Meta Sharing](#meta-sharing) +section. + +### Smaller size + +`FuryBuilder#withIntCompressed`/`FuryBuilder#withLongCompressed` can be used to compress int/long for smaller size. +Normally compressing int is enough. + +Both compressions are enabled by default. If the serialized size is not important, for example, you used flatbuffers for +serialization before, which doesn't compress anything, then you should disable compression. If your data are all +numbers, +the compression may bring 80% performance regression. + +For int compression, Fury uses 1~5 bytes for encoding. The first bit in every byte indicates whether there is a next byte. If the first +bit is set, then the next byte will be read until the first bit of the next byte is unset.
+ +For long compression, Fury supports two encodings: + +- Fury SLI(Small long as int) Encoding (**used by default**): + - If long is in `[-1073741824, 1073741823]`, encode as 4 bytes int: `| little-endian: ((int) value) << 1 |` + - Otherwise write as 9 bytes: `| 0b1 | little-endian 8bytes long |` +- Fury PVL(Progressive Variable-length Long) Encoding: + - The first bit in every byte indicates whether there is a next byte. If the first bit is set, then the next byte will be read until + the first bit of the next byte is unset. + - Negative numbers will be converted to positive numbers by `(v << 1) ^ (v >> 63)` to reduce the cost of small negative + numbers. + +If a number is of `long` type, it mostly can't be represented in fewer bytes, so the compression won't give a good enough +result, +and is not worth the performance cost. You may want to disable long compression if you find it doesn't bring +much +space savings. + +### Object deep copy + +Deep copy example: + +```java +Fury fury = Fury.builder().withRefCopy(true).build(); +SomeClass a = xxx; +SomeClass copied = fury.copy(a); +``` + +To make Fury deep copy ignore circular and shared references, disable ref copy as shown below. In this mode, the +same reference of an object graph will be copied into different objects in one `Fury#copy`. + +```java +Fury fury = Fury.builder().withRefCopy(false).build(); +SomeClass a = xxx; +SomeClass copied = fury.copy(a); +``` + +### Implement a customized serializer + +In some cases, you may want to implement a serializer for your type, especially for classes that customize serialization by +JDK +writeObject/writeReplace/readObject/readResolve, which is very inefficient.
For example, you don't want +following `Foo#writeObject` +got invoked, you can take following `FooSerializer` as an example: + +```java +class Foo { + public long f1; + + private void writeObject(ObjectOutputStream s) throws IOException { + System.out.println(f1); + s.defaultWriteObject(); + } +} + +class FooSerializer extends Serializer { + public FooSerializer(Fury fury) { + super(fury, Foo.class); + } + + @Override + public void write(MemoryBuffer buffer, Foo value) { + buffer.writeInt64(value.f1); + } + + @Override + public Foo read(MemoryBuffer buffer) { + Foo foo = new Foo(); + foo.f1 = buffer.readInt64(); + return foo; + } +} +``` + +Register serializer: + +```java +Fury fury = getFury(); +fury.registerSerializer(Foo.class, new FooSerializer(fury)); +``` + +### Security & Class Registration + +`FuryBuilder#requireClassRegistration` can be used to disable class registration, this will allow to deserialize objects +unknown types, +more flexible but **may be insecure if the classes contains malicious code**. + +**Do not disable class registration unless you can ensure your environment is secure**. +Malicious code in `init/equals/hashCode` can be executed when deserializing unknown/untrusted types when this option +disabled. + +Class registration can not only reduce security risks, but also avoid classname serialization cost. + +You can register class with API `Fury#register`. + +Note that class registration order is important, serialization and deserialization peer +should have same registration order. + +```java +Fury fury = xxx; +fury.register(SomeClass.class); +fury.register(SomeClass1.class, 200); +``` + +If you invoke `FuryBuilder#requireClassRegistration(false)` to disable class registration check, +you can set `org.apache.fury.resolver.ClassChecker` by `ClassResolver#setClassChecker` to control which classes are +allowed +for serialization. 
For example, you can allow classes started with `org.example.*` by: + +```java +Fury fury = xxx; +fury.getClassResolver().setClassChecker( + (classResolver, className) -> className.startsWith("org.example.")); +``` + +```java +AllowListChecker checker = new AllowListChecker(AllowListChecker.CheckLevel.STRICT); +ThreadSafeFury fury = new ThreadLocalFury(classLoader -> { + Fury f = Fury.builder().requireClassRegistration(true).withClassLoader(classLoader).build(); + f.getClassResolver().setClassChecker(checker); + checker.addListener(f.getClassResolver()); + return f; +}); +checker.allowClass("org.example.*"); +``` + +Fury also provided a `org.apache.fury.resolver.AllowListChecker` which is allowed/disallowed list based checker to +simplify +the customization of class check mechanism. You can use this checker or implement more sophisticated checker by +yourself. + +### Register class by name + +Register class by id will have better performance and smaller space overhead. But in some cases, management for a bunch +of type id is complex. In such cases, registering class by name using API +`register(Class cls, String namespace, String typeName)` is recommended. + +```java +fury.register(Foo.class, "demo", "Foo"); +``` + +If there are no duplicate name for type, `namespace` can be left as empty to reduce serialized size. + +**Do not use this API to register class since it will increase serialized size a lot compared to register +class by id** + +### Serializer Registration + +You can also register a custom serializer for a class by `Fury#registerSerializer` API. + +Or implement `java.io.Externalizable` for a class. 
+ +### Zero-Copy Serialization + +```java +import org.apache.fury.*; +import org.apache.fury.config.*; +import org.apache.fury.serializer.BufferObject; +import org.apache.fury.memory.MemoryBuffer; + +import java.util.*; +import java.util.stream.Collectors; + +public class ZeroCopyExample { + // Note that fury instance should be reused instead of creation every time. + static Fury fury = Fury.builder() + .withLanguage(Language.JAVA) + .build(); + + // mvn exec:java -Dexec.mainClass="io.ray.fury.examples.ZeroCopyExample" + public static void main(String[] args) { + List list = Arrays.asList("str", new byte[1000], new int[100], new double[100]); + Collection bufferObjects = new ArrayList<>(); + byte[] bytes = fury.serialize(list, e -> !bufferObjects.add(e)); + List buffers = bufferObjects.stream() + .map(BufferObject::toBuffer).collect(Collectors.toList()); + System.out.println(fury.deserialize(bytes, buffers)); + } +} +``` + +### Meta Sharing + +Fury supports share type metadata (class name, field name, final field type information, etc.) between multiple +serializations in a context (ex. TCP connection), and this information will be sent to the peer during the first +serialization in the context. Based on this metadata, the peer can rebuild the same deserializer, which avoids +transmitting metadata for subsequent serializations and reduces network traffic pressure and supports type +forward/backward compatibility automatically. + +```java +// Fury.builder() +// .withLanguage(Language.JAVA) +// .withRefTracking(false) +// // share meta across serialization. +// .withMetaContextShare(true) +// Not thread-safe fury. +MetaContext context = xxx; +fury.getSerializationContext().setMetaContext(context); +byte[] bytes = fury.serialize(o); +// Not thread-safe fury. 
+MetaContext context = xxx; +fury.getSerializationContext().setMetaContext(context); +fury.deserialize(bytes); + +// Thread-safe fury +fury.setClassLoader(beanA.getClass().getClassLoader()); +byte[] serialized = fury.execute( + f -> { + f.getSerializationContext().setMetaContext(context); + return f.serialize(beanA); + } +); +// thread-safe fury +fury.setClassLoader(beanA.getClass().getClassLoader()); +Object newObj = fury.execute( + f -> { + f.getSerializationContext().setMetaContext(context); + return f.deserialize(serialized); + } +); +``` + +### Deserialize non-existent classes + +Fury supports deserializing non-existent classes; this feature can be enabled +by `FuryBuilder#deserializeNonexistentClass(true)`. When this is enabled and metadata sharing is enabled, Fury will store +the deserialized data of this type in a lazy subclass of Map. By using the lazy map implemented by Fury, the rebalance +cost of filling map during deserialization can be avoided, which further improves performance. If this data is sent to +another process and the class exists in this process, the data will be deserialized into the object of this type without +losing any information. + +If metadata sharing is not enabled, the new class data will be skipped and a `NonexistentSkipClass` stub object will be +returned. + +### Copying/Mapping object from one type to another type + +Fury supports mapping objects from one type to another type. +> Notes: +> +> 1. This mapping will execute a deep copy, all mapped fields are serialized into binary and + deserialized from that binary to map into another type. +> 2. All struct types must be registered with the same ID, otherwise Fury cannot map to the correct struct type. + > Be careful when you use `Fury#register(Class)`, because Fury will allocate an auto-grown ID which might be + > inconsistent if you register classes in a different order between Fury instances.
+ +```java +public class StructMappingExample { + static class Struct1 { + int f1; + String f2; + + public Struct1(int f1, String f2) { + this.f1 = f1; + this.f2 = f2; + } + } + + static class Struct2 { + int f1; + String f2; + double f3; + } + + static ThreadSafeFury fury1 = Fury.builder() + .withCompatibleMode(CompatibleMode.COMPATIBLE).buildThreadSafeFury(); + static ThreadSafeFury fury2 = Fury.builder() + .withCompatibleMode(CompatibleMode.COMPATIBLE).buildThreadSafeFury(); + + static { + fury1.register(Struct1.class); + fury2.register(Struct2.class); + } + + public static void main(String[] args) { + Struct1 struct1 = new Struct1(10, "abc"); + Struct2 struct2 = (Struct2) fury2.deserialize(fury1.serialize(struct1)); + Assert.assertEquals(struct2.f1, struct1.f1); + Assert.assertEquals(struct2.f2, struct1.f2); + struct1 = (Struct1) fury1.deserialize(fury2.serialize(struct2)); + Assert.assertEquals(struct1.f1, struct2.f1); + Assert.assertEquals(struct1.f2, struct2.f2); + } +} +``` + +## Migration + +### JDK migration + +If you used JDK serialization before and you can't upgrade your client and server at the same time, which is common for +online applications, Fury provides a utility method `org.apache.fury.serializer.JavaSerializer.serializedByJDK` to check +whether +the binary is generated by JDK serialization. You can use the following pattern to make existing serialization protocol-aware, +then upgrade serialization to Fury in an async rolling-up way: + +```java +if (JavaSerializer.serializedByJDK(bytes)) { + ObjectInputStream objectInputStream=xxx; + return objectInputStream.readObject(); +} else { + return fury.deserialize(bytes); +} +``` + +### Upgrade fury + +Currently binary compatibility is ensured for minor versions only. For example, if you are using fury `v0.2.0`, binary +compatibility will +be provided if you upgrade to fury `v0.2.1`. But if you upgrade to fury `v0.4.1`, no binary compatibility is ensured.
+Most of the time there is no need to upgrade fury to a newer major version, the current version is fast and compact +enough, +and we provide some minor fixes for recent older versions. + +But if you do want to upgrade fury for better performance and smaller size, you need to write fury version as header to +serialized data +using code like following to keep binary compatibility: + +```java +MemoryBuffer buffer = xxx; +buffer.writeVarInt32(2); +fury.serialize(buffer, obj); +``` + +Then for deserialization, you need: + +```java +MemoryBuffer buffer = xxx; +int furyVersion = buffer.readVarInt32(); +Fury fury = getFury(furyVersion); +fury.deserialize(buffer); +``` + +`getFury` is a method to load corresponding fury, you can shade and relocate different versions of fury to different +packages, and load fury by version. + +If you upgrade fury by minor version, or you won't have data serialized by older fury, you can upgrade fury directly, +no need to `versioning` the data. + +## Trouble shooting + +### Class inconsistency and class version check + +If you create fury without setting `CompatibleMode` to `org.apache.fury.config.CompatibleMode.COMPATIBLE`, and you got a +strange +serialization error, it may be caused by class inconsistency between serialization peer and deserialization peer. + +In such cases, you can invoke `FuryBuilder#withClassVersionCheck` to create fury to validate it, if deserialization +throws `org.apache.fury.exception.ClassNotCompatibleException`, it shows the classes are inconsistent, and you should create +fury with +`FuryBuilder#withCompatibleMode(CompatibleMode.COMPATIBLE)`. + +`CompatibleMode.COMPATIBLE` has more performance and space cost, do not set it by default if your classes are always +consistent between serialization and deserialization. + +### Deserialize POJO into another type + +Fury allows you to serialize one POJO and deserialize it into a different POJO. The different POJO means the schema inconsistency.
Users must to configure Fury with +`CompatibleMode` set to `org.apache.fury.config.CompatibleMode.COMPATIBLE`. + +```java +public class DeserializeIntoType { + static class Struct1 { + int f1; + String f2; + + public Struct1(int f1, String f2) { + this.f1 = f1; + this.f2 = f2; + } + } + + static class Struct2 { + int f1; + String f2; + double f3; + } + + static ThreadSafeFury fury = Fury.builder() + .withCompatibleMode(CompatibleMode.COMPATIBLE).buildThreadSafeFury(); + + public static void main(String[] args) { + Struct1 struct1 = new Struct1(10, "abc"); + byte[] data = fury.serializeJavaObject(struct1); + Struct2 struct2 = (Struct2) fury.deserializeJavaObject(bytes, Struct2.class); + } +} +``` + +### Use wrong API for deserialization + +If you serialize an object by invoking `Fury#serialize`, you should invoke `Fury#deserialize` for deserialization +instead of +`Fury#deserializeJavaObject`. + +If you serialize an object by invoking `Fury#serializeJavaObject`, you should invoke `Fury#deserializeJavaObject` for +deserialization instead of `Fury#deserializeJavaObjectAndClass`/`Fury#deserialize`. + +If you serialize an object by invoking `Fury#serializeJavaObjectAndClass`, you should +invoke `Fury#deserializeJavaObjectAndClass` for deserialization instead +of `Fury#deserializeJavaObject`/`Fury#deserialize`. diff --git a/versioned_docs/version-0.10.0/guide/row_format_guide.md b/versioned_docs/version-0.10.0/guide/row_format_guide.md new file mode 100644 index 00000000000..e8de148f284 --- /dev/null +++ b/versioned_docs/version-0.10.0/guide/row_format_guide.md @@ -0,0 +1,154 @@ +--- +title: Row Format Guide +sidebar_position: 1 +id: row_format_guide +license: | + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. 
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--- + +## Row format protocol + +### Java + +```java +public class Bar { + String f1; + List f2; +} + +public class Foo { + int f1; + List f2; + Map f3; + List f4; +} + +RowEncoder encoder = Encoders.bean(Foo.class); +Foo foo = new Foo(); +foo.f1 = 10; +foo.f2 = IntStream.range(0, 1000000).boxed().collect(Collectors.toList()); +foo.f3 = IntStream.range(0, 1000000).boxed().collect(Collectors.toMap(i -> "k"+i, i->i)); +List bars = new ArrayList<>(1000000); +for (int i = 0; i < 1000000; i++) { + Bar bar = new Bar(); + bar.f1 = "s"+i; + bar.f2 = LongStream.range(0, 10).boxed().collect(Collectors.toList()); + bars.add(bar); +} +foo.f4 = bars; +// Can be zero-copy read by python +BinaryRow binaryRow = encoder.toRow(foo); +// can be data from python +Foo newFoo = encoder.fromRow(binaryRow); +// zero-copy read List f2 +BinaryArray binaryArray2 = binaryRow.getArray(1); +// zero-copy read List f4 +BinaryArray binaryArray4 = binaryRow.getArray(3); +// zero-copy read 11th element of `readList f4` +BinaryRow barStruct = binaryArray4.getStruct(10); + +// zero-copy read 6th of f2 of 11th element of `readList f4` +barStruct.getArray(1).getInt64(5); +RowEncoder barEncoder = Encoders.bean(Bar.class); +// deserialize part of data. 
+Bar newBar = barEncoder.fromRow(barStruct); +Bar newBar2 = barEncoder.fromRow(binaryArray4.getStruct(20)); +``` + +### Python + +```python +@dataclass +class Bar: + f1: str + f2: List[pa.int64] +@dataclass +class Foo: + f1: pa.int32 + f2: List[pa.int32] + f3: Dict[str, pa.int32] + f4: List[Bar] + +encoder = pyfury.encoder(Foo) +foo = Foo(f1=10, f2=list(range(1000_000)), + f3={f"k{i}": i for i in range(1000_000)}, + f4=[Bar(f1=f"s{i}", f2=list(range(10))) for i in range(1000_000)]) +binary: bytes = encoder.to_row(foo).to_bytes() +print(f"start: {datetime.datetime.now()}") +foo_row = pyfury.RowData(encoder.schema, binary) +print(foo_row.f2[100000], foo_row.f4[100000].f1, foo_row.f4[200000].f2[5]) +print(f"end: {datetime.datetime.now()}") + +binary = pickle.dumps(foo) +print(f"pickle start: {datetime.datetime.now()}") +new_foo = pickle.loads(binary) +print(new_foo.f2[100000], new_foo.f4[100000].f1, new_foo.f4[200000].f2[5]) +print(f"pickle end: {datetime.datetime.now()}") +``` + +### Apache Arrow Support + +Fury Format also supports automatic conversion from/to Arrow Table/RecordBatch. 
+ +Java: + +```java +Schema schema = TypeInference.inferSchema(BeanA.class); +ArrowWriter arrowWriter = ArrowUtils.createArrowWriter(schema); +Encoder encoder = Encoders.rowEncoder(BeanA.class); +for (int i = 0; i < 10; i++) { + BeanA beanA = BeanA.createBeanA(2); + arrowWriter.write(encoder.toRow(beanA)); +} +return arrowWriter.finishAsRecordBatch(); +``` + +Python: + +```python +import pyfury +encoder = pyfury.encoder(Foo) +encoder.to_arrow_record_batch([foo] * 10000) +encoder.to_arrow_table([foo] * 10000) +``` + +C++ + +```c++ +std::shared_ptr arrow_writer; +EXPECT_TRUE( + ArrowWriter::Make(schema, ::arrow::default_memory_pool(), &arrow_writer) + .ok()); +for (auto &row : rows) { + EXPECT_TRUE(arrow_writer->Write(row).ok()); +} +std::shared_ptr<::arrow::RecordBatch> record_batch; +EXPECT_TRUE(arrow_writer->Finish(&record_batch).ok()); +EXPECT_TRUE(record_batch->Validate().ok()); +EXPECT_EQ(record_batch->num_columns(), schema->num_fields()); +EXPECT_EQ(record_batch->num_rows(), row_nums); +``` + +```java +Schema schema = TypeInference.inferSchema(BeanA.class); +ArrowWriter arrowWriter = ArrowUtils.createArrowWriter(schema); +Encoder encoder = Encoders.rowEncoder(BeanA.class); +for (int i = 0; i < 10; i++) { + BeanA beanA = BeanA.createBeanA(2); + arrowWriter.write(encoder.toRow(beanA)); +} +return arrowWriter.finishAsRecordBatch(); +``` diff --git a/versioned_docs/version-0.10.0/guide/scala_guide.md b/versioned_docs/version-0.10.0/guide/scala_guide.md new file mode 100644 index 00000000000..373fbd9203a --- /dev/null +++ b/versioned_docs/version-0.10.0/guide/scala_guide.md @@ -0,0 +1,170 @@ +--- +title: Scala Serialization Guide +sidebar_position: 4 +id: scala_guide +license: | + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. 
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--- + +Fury supports all scala object serialization: + +- `case` class serialization supported +- `pojo/bean` class serialization supported +- `object` singleton serialization supported +- `collection` serialization supported +- other types such as `tuple/either` and basic types are all supported too. + +Scala 2 and 3 are both supported. + +## Install + +To add a dependency on Fury scala for scala 2 with sbt, use the following: + +```sbt +libraryDependencies += "org.apache.fury" % "fury-scala_2.13" % "0.10.0" +``` + +To add a dependency on Fury scala for scala 3 with sbt, use the following: + +```sbt +libraryDependencies += "org.apache.fury" % "fury-scala_3" % "0.10.0" +``` + +## Quick Start + +```scala +case class Person(name: String, id: Long, github: String) +case class Point(x : Int, y : Int, z : Int) + +object ScalaExample { + val fury: Fury = Fury.builder().withScalaOptimizationEnabled(true).build() + // Register optimized fury serializers for scala + ScalaSerializers.registerSerializers(fury) + fury.register(classOf[Person]) + fury.register(classOf[Point]) + + def main(args: Array[String]): Unit = { + val p = Person("Shawn Yang", 1, "https://github.com/chaokunyang") + println(fury.deserialize(fury.serialize(p))) + println(fury.deserialize(fury.serialize(Point(1, 2, 3)))) + } +} +``` + +## Fury creation + +When using fury for scala serialization, you should create fury at least with following options: +
+```scala +import org.apache.fury.Fury +import org.apache.fury.serializer.scala.ScalaSerializers + +val fury = Fury.builder().withScalaOptimizationEnabled(true).build() + +// Register optimized fury serializers for scala +ScalaSerializers.registerSerializers(fury) +``` + +Depending on the object types you serialize, you may need to register some scala internal types: + +```scala +fury.register(Class.forName("scala.Enumeration.Val")) +``` + +If you want to avoid such registration, you can disable class registration by `FuryBuilder#requireClassRegistration(false)`. +Note that this option allows deserializing objects of unknown types, which is more flexible but may be insecure if the classes contain malicious code. + +Circular references are common in scala, so `Reference tracking` should be enabled by `FuryBuilder#withRefTracking(true)`. If you don't enable reference tracking, [StackOverflowError](https://github.com/apache/fury/issues/1032) may happen for some scala versions when serializing scala Enumeration. + +Note that the fury instance should be shared between multiple serializations, since the creation of a fury instance is not cheap. + +If you use a shared fury instance across multiple threads, you should create `ThreadSafeFury` by `FuryBuilder#buildThreadSafeFury()` instead.
+ +## Serialize case object + +```scala +case class Person(github: String, age: Int, id: Long) +val p = Person("https://github.com/chaokunyang", 18, 1) +println(fury.deserialize(fury.serialize(p))) +println(fury.deserializeJavaObject(fury.serializeJavaObject(p))) +``` + +## Serialize pojo + +```scala +class Foo(f1: Int, f2: String) { + override def toString: String = s"Foo($f1, $f2)" +} +println(fury.deserialize(fury.serialize(Foo(1, "chaokunyang")))) +``` + +## Serialize object singleton + +```scala +object singleton { +} +val o1 = fury.deserialize(fury.serialize(singleton)) +val o2 = fury.deserialize(fury.serialize(singleton)) +println(o1 == o2) +``` + +## Serialize collection + +```scala +val seq = Seq(1,2) +val list = List("a", "b") +val map = Map("a" -> 1, "b" -> 2) +println(fury.deserialize(fury.serialize(seq))) +println(fury.deserialize(fury.serialize(list))) +println(fury.deserialize(fury.serialize(map))) +``` + +## Serialize Tuple + +```scala +val tuple = Tuple2(100, 10000L) +println(fury.deserialize(fury.serialize(tuple))) +val tuple = Tuple4(100, 10000L, 10000L, "str") +println(fury.deserialize(fury.serialize(tuple))) +``` + +## Serialize Enum + +### Scala3 Enum + +```scala +enum Color { case Red, Green, Blue } +println(fury.deserialize(fury.serialize(Color.Green))) +``` + +### Scala2 Enum + +```scala +object ColorEnum extends Enumeration { + type ColorEnum = Value + val Red, Green, Blue = Value +} +println(fury.deserialize(fury.serialize(ColorEnum.Green))) +``` + +## Serialize Option + +```scala +val opt: Option[Long] = Some(100) +println(fury.deserialize(fury.serialize(opt))) +val opt1: Option[Long] = None +println(fury.deserialize(fury.serialize(opt1))) +``` diff --git a/versioned_docs/version-0.10.0/guide/xlang_serialization_guide.md b/versioned_docs/version-0.10.0/guide/xlang_serialization_guide.md new file mode 100644 index 00000000000..99bd4d1acad --- /dev/null +++ b/versioned_docs/version-0.10.0/guide/xlang_serialization_guide.md @@ -0,0 +1,612 
@@ +--- +title: Xlang Serialization Guide +sidebar_position: 2 +id: xlang_object_graph_guide +license: | + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--- + +## Cross-language object graph serialization + +### Serialize built-in types + +Common types can be serialized automatically: primitive numeric types, string, binary, array, list, map and so on. + +**Java** + +```java +import org.apache.fury.*; +import org.apache.fury.config.*; + +import java.util.*; + +public class Example1 { + public static void main(String[] args) { + Fury fury = Fury.builder().withLanguage(Language.XLANG).build(); + List list = ofArrayList(true, false, "str", -1.1, 1, new int[100], new double[20]); + byte[] bytes = fury.serialize(list); + // bytes can be data serialized by other languages. + fury.deserialize(bytes); + Map map = new HashMap<>(); + map.put("k1", "v1"); + map.put("k2", list); + map.put("k3", -1); + bytes = fury.serialize(map); + // bytes can be data serialized by other languages. 
+ fury.deserialize(bytes); + } +} +``` + +**Python** + +```python +import pyfury +import numpy as np + +fury = pyfury.Fury() +object_list = [True, False, "str", -1.1, 1, + np.full(100, 0, dtype=np.int32), np.full(20, 0.0, dtype=np.double)] +data = fury.serialize(object_list) +# bytes can be data serialized by other languages. +new_list = fury.deserialize(data) +object_map = {"k1": "v1", "k2": object_list, "k3": -1} +data = fury.serialize(object_map) +# bytes can be data serialized by other languages. +new_map = fury.deserialize(data) +print(new_map) +``` + +**Golang** + +```go +package main + +import furygo "github.com/apache/fury/fury/go/fury" +import "fmt" + +func main() { + list := []interface{}{true, false, "str", -1.1, 1, make([]int32, 10), make([]float64, 20)} + fury := furygo.NewFury() + bytes, err := fury.Marshal(list) + if err != nil { + panic(err) + } + var newValue interface{} + // bytes can be data serialized by other languages. + if err := fury.Unmarshal(bytes, &newValue); err != nil { + panic(err) + } + fmt.Println(newValue) + dict := map[string]interface{}{ + "k1": "v1", + "k2": list, + "k3": -1, + } + bytes, err = fury.Marshal(dict) + if err != nil { + panic(err) + } + // bytes can be data serialized by other languages. + if err := fury.Unmarshal(bytes, &newValue); err != nil { + panic(err) + } + fmt.Println(newValue) +} +``` + +**JavaScript** + +```javascript +import Fury from '@furyjs/fury'; + +/** + * @furyjs/hps use v8's fast-calls-api that can be called directly by jit, ensure that the version of Node is 20 or above. 
+ * Experimental feature, installation success cannot be guaranteed at this moment + * If you are unable to install the module, replace it with `const hps = null;` + **/ +import hps from '@furyjs/hps'; + +const fury = new Fury({ hps }); +const input = fury.serialize('hello fury'); +const result = fury.deserialize(input); +console.log(result); +``` + +**Rust** + +```rust +use chrono::{NaiveDate, NaiveDateTime}; +use fury::{from_buffer, to_buffer, Fury}; +use std::collections::HashMap; + +fn run() { + let bin: Vec = to_buffer(&"hello".to_string()); + let obj: String = from_buffer(&bin).expect("should success"); + assert_eq!("hello".to_string(), obj); +} +``` + +### Serialize custom types + +Serializing user-defined types needs registering the custom type using the register API to establish the mapping relationship between the type in different languages. + +**Java** + +```java +import org.apache.fury.*; +import org.apache.fury.config.*; +import java.util.*; + +public class Example2 { + public static class SomeClass1 { + Object f1; + Map f2; + } + + public static class SomeClass2 { + Object f1; + String f2; + List f3; + Map f4; + Byte f5; + Short f6; + Integer f7; + Long f8; + Float f9; + Double f10; + short[] f11; + List f12; + } + + public static Object createObject() { + SomeClass1 obj1 = new SomeClass1(); + obj1.f1 = true; + obj1.f2 = ofHashMap((byte) -1, 2); + SomeClass2 obj = new SomeClass2(); + obj.f1 = obj1; + obj.f2 = "abc"; + obj.f3 = ofArrayList("abc", "abc"); + obj.f4 = ofHashMap((byte) 1, 2); + obj.f5 = Byte.MAX_VALUE; + obj.f6 = Short.MAX_VALUE; + obj.f7 = Integer.MAX_VALUE; + obj.f8 = Long.MAX_VALUE; + obj.f9 = 1.0f / 2; + obj.f10 = 1 / 3.0; + obj.f11 = new short[]{(short) 1, (short) 2}; + obj.f12 = ofArrayList((short) -1, (short) 4); + return obj; + } + + // mvn exec:java -Dexec.mainClass="org.apache.fury.examples.Example2" + public static void main(String[] args) { + Fury fury = Fury.builder().withLanguage(Language.XLANG).build(); + 
fury.register(SomeClass1.class, "example.SomeClass1"); + fury.register(SomeClass2.class, "example.SomeClass2"); + byte[] bytes = fury.serialize(createObject()); + // bytes can be data serialized by other languages. + System.out.println(fury.deserialize(bytes)); + } +} +``` + +**Python** + +```python +from dataclasses import dataclass +from typing import List, Dict, Any +import pyfury, array + + +@dataclass +class SomeClass1: + f1: Any + f2: Dict[pyfury.Int8Type, pyfury.Int32Type] + + +@dataclass +class SomeClass2: + f1: Any = None + f2: str = None + f3: List[str] = None + f4: Dict[pyfury.Int8Type, pyfury.Int32Type] = None + f5: pyfury.Int8Type = None + f6: pyfury.Int16Type = None + f7: pyfury.Int32Type = None + # int type will be taken as `pyfury.Int64Type`. + # use `pyfury.Int32Type` for type hint if peer + # are more narrow type. + f8: int = None + f9: pyfury.Float32Type = None + # float type will be taken as `pyfury.Float64Type` + f10: float = None + f11: pyfury.Int16ArrayType = None + f12: List[pyfury.Int16Type] = None + + +if __name__ == "__main__": + f = pyfury.Fury() + f.register_type(SomeClass1, typename="example.SomeClass1") + f.register_type(SomeClass2, typename="example.SomeClass2") + obj1 = SomeClass1(f1=True, f2={-1: 2}) + obj = SomeClass2( + f1=obj1, + f2="abc", + f3=["abc", "abc"], + f4={1: 2}, + f5=2 ** 7 - 1, + f6=2 ** 15 - 1, + f7=2 ** 31 - 1, + f8=2 ** 63 - 1, + f9=1.0 / 2, + f10=1 / 3.0, + f11=array.array("h", [1, 2]), + f12=[-1, 4], + ) + data = f.serialize(obj) + # bytes can be data serialized by other languages. 
+ print(f.deserialize(data)) +``` + +**Golang** + +```go +package main + +import furygo "github.com/apache/fury/fury/go/fury" +import "fmt" + +func main() { + type SomeClass1 struct { + F1 interface{} + F2 string + F3 []interface{} + F4 map[int8]int32 + F5 int8 + F6 int16 + F7 int32 + F8 int64 + F9 float32 + F10 float64 + F11 []int16 + F12 fury.Int16Slice + } + + type SomeClas2 struct { + F1 interface{} + F2 map[int8]int32 + } + fury := furygo.NewFury() + if err := fury.RegisterTagType("example.SomeClass1", SomeClass1{}); err != nil { + panic(err) + } + if err := fury.RegisterTagType("example.SomeClass2", SomeClass2{}); err != nil { + panic(err) + } + obj1 := &SomeClass1{} + obj1.F1 = true + obj1.F2 = map[int8]int32{-1: 2} + obj := &SomeClass1{} + obj.F1 = obj1 + obj.F2 = "abc" + obj.F3 = []interface{}{"abc", "abc"} + f4 := map[int8]int32{1: 2} + obj.F4 = f4 + obj.F5 = fury.MaxInt8 + obj.F6 = fury.MaxInt16 + obj.F7 = fury.MaxInt32 + obj.F8 = fury.MaxInt64 + obj.F9 = 1.0 / 2 + obj.F10 = 1 / 3.0 + obj.F11 = []int16{1, 2} + obj.F12 = []int16{-1, 4} + bytes, err := fury.Marshal(obj); + if err != nil { + panic(err) + } + var newValue interface{} + // bytes can be data serialized by other languages. + if err := fury.Unmarshal(bytes, &newValue); err != nil { + panic(err) + } + fmt.Println(newValue) +} +``` + +**JavaScript** + +```javascript +import Fury, { Type, InternalSerializerType } from '@furyjs/fury'; + +/** + * @furyjs/hps use v8's fast-calls-api that can be called directly by jit, ensure that the version of Node is 20 or above. + * Experimental feature, installation success cannot be guaranteed at this moment + * If you are unable to install the module, replace it with `const hps = null;` + **/ +import hps from '@furyjs/hps'; + +// Now we describe data structures using JSON, but in the future, we will use more ways. 
+const description = Type.object('example.foo', { + foo: Type.string(), +}); +const fury = new Fury({ hps }); +const { serialize, deserialize } = fury.registerSerializer(description); +const input = serialize({ foo: 'hello fury' }); +const result = deserialize(input); +console.log(result); +``` + +**Rust** + +```rust +use chrono::{NaiveDate, NaiveDateTime}; +use fury::{from_buffer, to_buffer, Fury}; +use std::collections::HashMap; + +#[test] +fn complex_struct() { + #[derive(Fury, Debug, PartialEq)] + #[tag("example.foo2")] + struct Animal { + category: String, + } + + #[derive(Fury, Debug, PartialEq)] + #[tag("example.foo")] + struct Person { + c1: Vec, // binary + c2: Vec, // primitive array + animal: Vec, + c3: Vec>, + name: String, + c4: HashMap, + age: u16, + op: Option, + op2: Option, + date: NaiveDate, + time: NaiveDateTime, + c5: f32, + c6: f64, + } + let person: Person = Person { + c1: vec![1, 2, 3], + c2: vec![5, 6, 7], + c3: vec![vec![1, 2], vec![1, 3]], + animal: vec![Animal { + category: "Dog".to_string(), + }], + c4: HashMap::from([ + ("hello1".to_string(), "hello2".to_string()), + ("hello2".to_string(), "hello3".to_string()), + ]), + age: 12, + name: "helo".to_string(), + op: Some("option".to_string()), + op2: None, + date: NaiveDate::from_ymd_opt(2025, 12, 12).unwrap(), + time: NaiveDateTime::from_timestamp_opt(1689912359, 0).unwrap(), + c5: 2.0, + c6: 4.0, + }; + + let bin: Vec = to_buffer(&person); + let obj: Person = from_buffer(&bin).expect("should success"); + assert_eq!(person, obj); +} +``` + +### Serialize Shared Reference and Circular Reference + +Shared reference and circular reference can be serialized automatically, no duplicate data or recursion error. 
+ +**Java** + +```java +import org.apache.fury.*; +import org.apache.fury.config.*; +import java.util.*; + +public class ReferenceExample { + public static class SomeClass { + SomeClass f1; + Map f2; + Map f3; + } + + public static Object createObject() { + SomeClass obj = new SomeClass(); + obj.f1 = obj; + obj.f2 = ofHashMap("k1", "v1", "k2", "v2"); + obj.f3 = obj.f2; + return obj; + } + + // mvn exec:java -Dexec.mainClass="org.apache.fury.examples.ReferenceExample" + public static void main(String[] args) { + Fury fury = Fury.builder().withLanguage(Language.XLANG) + .withRefTracking(true).build(); + fury.register(SomeClass.class, "example.SomeClass"); + byte[] bytes = fury.serialize(createObject()); + // bytes can be data serialized by other languages. + System.out.println(fury.deserialize(bytes)); + } +} +``` + +**Python** + +```python +from typing import Dict +import pyfury + +class SomeClass: + f1: "SomeClass" + f2: Dict[str, str] + f3: Dict[str, str] + +fury = pyfury.Fury(ref_tracking=True) +fury.register_type(SomeClass, typename="example.SomeClass") +obj = SomeClass() +obj.f2 = {"k1": "v1", "k2": "v2"} +obj.f1, obj.f3 = obj, obj.f2 +data = fury.serialize(obj) +# bytes can be data serialized by other languages. +print(fury.deserialize(data)) +``` + +**Golang** + +```go +package main + +import furygo "github.com/apache/fury/fury/go/fury" +import "fmt" + +func main() { + type SomeClass struct { + F1 *SomeClass + F2 map[string]string + F3 map[string]string + } + fury := furygo.NewFury(true) + if err := fury.RegisterTagType("example.SomeClass", SomeClass{}); err != nil { + panic(err) + } + value := &SomeClass{F2: map[string]string{"k1": "v1", "k2": "v2"}} + value.F3 = value.F2 + value.F1 = value + bytes, err := fury.Marshal(value) + if err != nil { + } + var newValue interface{} + // bytes can be data serialized by other languages. 
+ if err := fury.Unmarshal(bytes, &newValue); err != nil { + panic(err) + } + fmt.Println(newValue) +} +``` + +**JavaScript** + +```javascript +import Fury, { Type } from '@furyjs/fury'; +/** + * @furyjs/hps use v8's fast-calls-api that can be called directly by jit, ensure that the version of Node is 20 or above. + * Experimental feature, installation success cannot be guaranteed at this moment + * If you are unable to install the module, replace it with `const hps = null;` + **/ +import hps from '@furyjs/hps'; + +const description = Type.object('example.foo', { + foo: Type.string(), + bar: Type.object('example.foo'), +}); + +const fury = new Fury({ hps }); +const { serialize, deserialize } = fury.registerSerializer(description); +const data: any = { + foo: 'hello fury', +}; +data.bar = data; +const input = serialize(data); +const result = deserialize(input); +console.log(result.bar.foo === result.foo); +``` + +**Rust** +Reference tracking cannot be implemented because of Rust ownership restrictions. + +### Zero-Copy Serialization + +**Java** + +```java +import org.apache.fury.*; +import org.apache.fury.config.*; +import org.apache.fury.serializer.BufferObject; +import org.apache.fury.memory.MemoryBuffer; + +import java.util.*; +import java.util.stream.Collectors; + +public class ZeroCopyExample { + // mvn exec:java -Dexec.mainClass="io.ray.fury.examples.ZeroCopyExample" + public static void main(String[] args) { + Fury fury = Fury.builder().withLanguage(Language.XLANG).build(); + List list = ofArrayList("str", new byte[1000], new int[100], new double[100]); + Collection bufferObjects = new ArrayList<>(); + byte[] bytes = fury.serialize(list, e -> !bufferObjects.add(e)); + // bytes can be data serialized by other languages. 
+ List buffers = bufferObjects.stream() + .map(BufferObject::toBuffer).collect(Collectors.toList()); + System.out.println(fury.deserialize(bytes, buffers)); + } +} +``` + +**Python** + +```python +import array +import pyfury +import numpy as np + +fury = pyfury.Fury() +list_ = ["str", bytes(bytearray(1000)), + array.array("i", range(100)), np.full(100, 0.0, dtype=np.double)] +serialized_objects = [] +data = fury.serialize(list_, buffer_callback=serialized_objects.append) +buffers = [o.to_buffer() for o in serialized_objects] +# bytes can be data serialized by other languages. +print(fury.deserialize(data, buffers=buffers)) +``` + +**Golang** + +```go +package main + +import furygo "github.com/apache/fury/fury/go/fury" +import "fmt" + +func main() { + fury := furygo.NewFury() + list := []interface{}{"str", make([]byte, 1000)} + buf := fury.NewByteBuffer(nil) + var bufferObjects []fury.BufferObject + fury.Serialize(buf, list, func(o fury.BufferObject) bool { + bufferObjects = append(bufferObjects, o) + return false + }) + var newList []interface{} + var buffers []*fury.ByteBuffer + for _, o := range bufferObjects { + buffers = append(buffers, o.ToBuffer()) + } + if err := fury.Deserialize(buf, &newList, buffers); err != nil { + panic(err) + } + fmt.Println(newList) +} +``` + +**JavaScript** + +```javascript +// Coming soon +``` diff --git a/versioned_docs/version-0.10.0/guide/xlang_type_mapping.md b/versioned_docs/version-0.10.0/guide/xlang_type_mapping.md new file mode 100644 index 00000000000..b6acb9a9695 --- /dev/null +++ b/versioned_docs/version-0.10.0/guide/xlang_type_mapping.md @@ -0,0 +1,116 @@ +--- +title: Type Mapping of Xlang Serialization +sidebar_position: 3 +id: xlang_type_mapping +license: | + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. 
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--- + +Note: + +- For type definition, see [Type Systems in Spec](../specification/xlang_serialization_spec.md#type-systems) +- `int16_t[n]/vector<T>` indicates `int16_t[n]/vector<int16_t>` +- The cross-language serialization is not stable, do not use it in your production environment. + +## Type Mapping + +| Fury Type | Fury Type ID | Java | Python | Javascript | C++ | Golang | Rust | +|-------------------------|--------------|-----------------|-----------------------------------|-----------------|--------------------------------|------------------|------------------| +| bool | 1 | bool/Boolean | bool | Boolean | bool | bool | bool | +| int8 | 2 | byte/Byte | int/pyfury.Int8 | Type.int8() | int8_t | int8 | i8 | +| int16 | 3 | short/Short | int/pyfury.Int16 | Type.int16() | int16_t | int16 | i16 | +| int32 | 4 | int/Integer | int/pyfury.Int32 | Type.int32() | int32_t | int32 | i32 | +| var_int32 | 5 | int/Integer | int/pyfury.VarInt32 | Type.varint32() | fury::varint32_t | fury.varint32 | fury::varint32 | +| int64 | 6 | long/Long | int/pyfury.Int64 | Type.int64() | int64_t | int64 | i64 | +| var_int64 | 7 | long/Long | int/pyfury.VarInt64 | Type.varint64() | fury::varint64_t | fury.varint64 | fury::varint64 | +| sli_int64 | 8 | long/Long | int/pyfury.SliInt64 | Type.sliint64() | fury::sliint64_t | fury.sliint64 | fury::sliint64 | +| float16 | 9 | float/Float | float/pyfury.Float16 | Type.float16() | fury::float16_t | 
fury.float16 | fury::f16 | +| float32 | 10 | float/Float | float/pyfury.Float32 | Type.float32() | float | float32 | f32 | +| float64 | 11 | double/Double | float/pyfury.Float64 | Type.float64() | double | float64 | f64 | +| string | 12 | String | str | String | string | string | String/str | +| enum | 13 | Enum subclasses | enum subclasses | / | enum | / | enum | +| named_enum | 14 | Enum subclasses | enum subclasses | / | enum | / | enum | +| struct | 15 | pojo/record | data class / type with type hints | object | struct/class | struct | struct | +| compatible_struct | 16 | pojo/record | data class / type with type hints | object | struct/class | struct | struct | +| named_struct | 17 | pojo/record | data class / type with type hints | object | struct/class | struct | struct | +| named_compatible_struct | 18 | pojo/record | data class / type with type hints | object | struct/class | struct | struct | +| ext | 19 | pojo/record | data class / type with type hints | object | struct/class | struct | struct | +| named_ext | 20 | pojo/record | data class / type with type hints | object | struct/class | struct | struct | +| list | 21 | List/Collection | list/tuple | array | vector | slice | Vec | +| set | 22 | Set | set | / | set | fury.Set | Set | +| map | 23 | Map | dict | Map | unordered_map | map | HashMap | +| duration | 24 | Duration | timedelta | Number | duration | Duration | Duration | +| timestamp | 25 | Instant | datetime | Number | std::chrono::nanoseconds | Time | DateTime | +| local_date | 26 | Date | datetime | Number | std::chrono::nanoseconds | Time | DateTime | +| decimal | 27 | BigDecimal | Decimal | bigint | / | / | / | +| binary | 28 | byte[] | bytes | / | `uint8_t[n]/vector` | `[n]uint8/[]T` | `Vec` | +| array | 29 | array | np.ndarray | / | / | array/slice | Vec | +| bool_array | 30 | bool[] | ndarray(np.bool_) | / | `bool[n]` | `[n]bool/[]T` | `Vec` | +| int8_array | 31 | byte[] | ndarray(int8) | / | `int8_t[n]/vector` | `[n]int8/[]T` | `Vec` | 
+| int16_array | 32 | short[] | ndarray(int16) | / | `int16_t[n]/vector` | `[n]int16/[]T` | `Vec` | +| int32_array | 33 | int[] | ndarray(int32) | / | `int32_t[n]/vector` | `[n]int32/[]T` | `Vec` | +| int64_array | 34 | long[] | ndarray(int64) | / | `int64_t[n]/vector` | `[n]int64/[]T` | `Vec` | +| float16_array | 35 | float[] | ndarray(float16) | / | `fury::float16_t[n]/vector` | `[n]float16/[]T` | `Vec` | +| float32_array | 36 | float[] | ndarray(float32) | / | `float[n]/vector` | `[n]float32/[]T` | `Vec` | +| float64_array | 37 | double[] | ndarray(float64) | / | `double[n]/vector` | `[n]float64/[]T` | `Vec` | +| arrow record batch | 38 | / | / | / | / | / | / | +| arrow table | 39 | / | / | / | / | / | / | + +## Type info(not implemented currently) + +Due to differences between type systems of languages, those types can't be mapped one-to-one between languages. + +If the user notices that one type on a language corresponds to multiple types in Fury type systems, for example, `long` +in java has type `int64/varint64/sliint64`, it means the language lacks some types, and the user must provide extra type +info when using Fury. + +## Type annotation + +If the type is a field of another class, users can provide meta hints for fields of a type, or for the whole type. +Such information can be provided in other languages too: + +- java: use annotation. +- cpp: use macro and template. +- golang: use struct tag. +- python: use typehint. +- rust: use macro. + +Here is an example: + +- Java: + + ```java + class Foo { + @Int32Type(varint = true) + int f1; + List<@Int32Type(varint = true) Integer> f2; + } + ``` + +- Python: + + ```python + class Foo: + f1: Int32Type(varint=True) + f2: List[Int32Type(varint=True)] + ``` + +## Type wrapper + +If the type is not a field of a class, the user must wrap this type with a Fury type to pass the extra type info. 
+ +For example, suppose Fury Java provides a `VarInt64` type; when a user invokes `fury.serialize(long_value)`, they need to +invoke it like `fury.serialize(new VarInt64(long_value))`. diff --git a/versioned_docs/version-0.10.0/introduction/benchmark.md b/versioned_docs/version-0.10.0/introduction/benchmark.md new file mode 100644 index 00000000000..34bce8f04aa --- /dev/null +++ b/versioned_docs/version-0.10.0/introduction/benchmark.md @@ -0,0 +1,37 @@ +--- +id: benchmark +title: Benchmark +sidebar_position: 2 +--- + +Different serialization frameworks are suitable for different scenarios, and benchmark results here are for reference only. + +If you need to benchmark for your specific scenario, make sure all serialization frameworks are appropriately configured for that scenario. + +Dynamic serialization frameworks support polymorphism and references, which have more cost compared +to static serialization frameworks, unless they use JIT techniques as fury does. +Since fury will generate code at runtime, please warm up before collecting benchmark statistics. + +### Java Serialization + + + + + + +### Java Deserialization + + + + + + +See [benchmarks](https://github.com/apache/fury/tree/main/docs/benchmarks) for more benchmarks about type forward/backward compatibility, off-heap support, zero-copy serialization. + +### JavaScript + + + +The data used for this bar graph includes a complex object that has many kinds of field types, and the size of the JSON data is 3KB. + +See [benchmarks](https://github.com/apache/fury/blob/main/javascript/benchmark/index.js) for the benchmark code. diff --git a/versioned_docs/version-0.10.0/introduction/features.md b/versioned_docs/version-0.10.0/introduction/features.md new file mode 100644 index 00000000000..1c03b0f9ff4 --- /dev/null +++ b/versioned_docs/version-0.10.0/introduction/features.md @@ -0,0 +1,24 @@ +--- +id: features +title: Features +sidebar_position: 3 +--- + +- Multiple languages: Java/Python/C++/Golang/Javascript/Rust. 
+- Zero-copy: cross-language out-of-band serialization inspired + by [pickle5](https://peps.python.org/pep-0574/) and off-heap read/write. +- High performance: A highly-extensible JIT framework to generate serializer code at runtime in an async multi-thread way to speed serialization, providing 20-170x speed up by: + - reduce memory access by inline variable in generated code. + - reduce virtual method invocation by inline call in generated code. + - reduce conditional branching. + - reduce hash lookup. + - binary protocols: object graph, row format and so on. + +In addition to cross-language serialization, Fury also features at: + +- Drop-in replace Java serialization frameworks such as JDK/Kryo/Hessian without modifying any code, but 100x faster. + It can greatly improve the efficiency of high-performance RPC calls, data transfer and object persistence. +- JDK serialization 100% compatible, support java custom serialization + `writeObject/readObject/writeReplace/readResolve/readObjectNoData` natively. +- Supports shared and circular reference object serialization for golang. +- Supports automatic object serialization for golang. diff --git a/versioned_docs/version-0.10.0/introduction/introduction.md b/versioned_docs/version-0.10.0/introduction/introduction.md new file mode 100644 index 00000000000..02bdde7fcce --- /dev/null +++ b/versioned_docs/version-0.10.0/introduction/introduction.md @@ -0,0 +1,62 @@ +--- +id: introduction +title: Introduction +sidebar_position: 1 +--- + +Fury is a blazing fast multi-language serialization framework powered by jit(just-in-time compilation) and zero-copy. + +## Protocols + +Different scenarios have different serialization requirements. Fury designed and implemented +multiple binary protocols for those requirements: + +- Cross-language object graph protocol: + - Cross-language serialize any object automatically, no need for IDL definition, schema compilation and object to/from protocol + conversion. 
+ - Support shared reference and circular reference, no duplicate data or recursion error. + - Support object polymorphism. +- Native java/python object graph protocol: highly-optimized based on type system of the language. +- Row format protocol: a cache-friendly binary random access format, supports skipping serialization and partial serialization, + and can convert to column-format automatically. + +New protocols can be added easily based on fury existing buffer, encoding, meta, codegen and other capabilities. All of those share same codebase, and the optimization for one protocol +can be reused by another protocol. + +## Compatibility + +### Schema Compatibility + +Fury java object graph serialization support class schema forward/backward compatibility. The serialization peer and deserialization peer can add/delete fields independently. + +We plan to add support cross-language serialization after [meta compression](https://github.com/apache/fury/issues/203) are finished. + +### Binary Compatibility + +We are still improving our protocols, binary compatibility are not ensured between fury releases for now. Please `shade` fury if you will upgrade fury in the future. + +Binary compatibility will be ensured before fury 1.0. + +## Security + +Static serialization such as row format are secure by nature. But dynamic object graph serialization supports deserialize unregistered types, which can introduce security risks. + +For example, the deserialization may invoke `init` constructor or `equals`/`hashCode` method, if the method body contains malicious code, the system will be at risks. + +Fury provides a class registration mode option and enabled by default for this protocol, which allows deserializing trusted registered types or built-in types only for security. + +Fury provides a class registration option and enabled by default for such protocols, which allows only deserializing trusted registered types or built-in types. 
**Do not disable class registration or class registration checks unless you can ensure your environment is indeed secure**. We are not responsible for security if you disabled the class registration option. + +## RoadMap + +- Meta compression, auto meta sharing and cross-language schema compatibility. +- AOT Framework for c++/golang to generate code statically. +- C++/Rust object graph serialization support +- Golang/Rust/NodeJS row format support +- ProtoBuffer compatibility support +- Protocols for features and knowledge graph serialization +- Continuously improve our serialization infrastructure for any new protocols + +## How to Contribute + +Please read the [CONTRIBUTING](https://github.com/apache/fury/blob/main/CONTRIBUTING.md) guide for instructions on how to contribute. diff --git a/versioned_docs/version-0.10.0/specification/java_serialization_spec.md b/versioned_docs/version-0.10.0/specification/java_serialization_spec.md new file mode 100644 index 00000000000..82709374608 --- /dev/null +++ b/versioned_docs/version-0.10.0/specification/java_serialization_spec.md @@ -0,0 +1,557 @@ +--- +title: Fury Java Serialization Format +sidebar_position: 1 +id: fury_java_serialization_spec +license: | + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--- + +## Spec overview + +Fury Java Serialization is an automatic object serialization framework that supports reference and polymorphism. Fury +will +convert an object from/to fury java serialization binary format. Fury has two core concepts for java serialization: + +- **Fury Java Binary format** +- **Framework to convert object to/from Fury Java Binary format** + +The serialization format is a dynamic binary format. The dynamics and reference/polymorphism support make Fury flexible, +much easier to use, but +also introduce more complexities compared to static serialization frameworks. So the format will be more complex. + +Here is the overall format: + +``` +| fury header | object ref meta | object class meta | object value data | +``` + +The data are serialized using little endian byte order overall. If bytes swap is costly for some object, +Fury will write the byte order for that object into the data instead of converting it to little endian. + +## Fury header + +Fury header starts with one byte: + +``` +| 4 bits | 1 bit | 1 bit | 1 bit | 1 bit | optional 4 bytes | ++---------------+-------+-------+--------+-------+------------------------------------+ +| reserved bits | oob | xlang | endian | null | unsigned int for meta start offset | +``` + +- null flag: 1 when object is null, 0 otherwise. If an object is null, other bits won't be set. +- endian flag: 1 when data is encoded by little endian, 0 for big endian. +- xlang flag: 1 when serialization uses xlang format, 0 when serialization uses Fury java format. +- oob flag: 1 when passed `BufferCallback` is not null, 0 otherwise. + +If meta share mode is enabled, an uncompressed unsigned int is appended to indicate the start offset of metadata. + +## Reference Meta + +Reference tracking handles whether the object is null, and whether to track reference for the object by writing +corresponding flags and maintaining internal state. 
+ +Reference flags: + +| Flag | Byte Value | Description | +|---------------------|------------|---------------------------------------------------------------------------------------------------------------------------------------------------------| +| NULL FLAG | `-3` | This flag indicates the object is a null value. We don't use another byte to indicate REF, so that we can save one byte. | +| REF FLAG | `-2` | This flag indicates the object is already serialized previously, and fury will write a ref id with unsigned varint format instead of serialize it again | +| NOT_NULL VALUE FLAG | `-1` | This flag indicates the object is a non-null value and fury doesn't track ref for this type of object. | +| REF VALUE FLAG | `0` | This flag indicates the object is referencable and the first time to serialize. | + +When reference tracking is disabled globally or for specific types, or for certain types within a particular +context(e.g., a field of a class), only the `NULL` and `NOT_NULL VALUE` flags will be used for reference meta. + +## Class Meta + +Fury supports to register class by an optional id, the registration can be used for security check and class +identification. +If a class is registered, it will have a user-provided or an auto-growing unsigned int i.e. `class_id`. + +Depending on whether meta share mode and registration is enabled for current class, Fury will write class meta +differently. + +### Schema consistent + +If schema consistent mode is enabled globally or enabled for current class, class meta will be written as follows: + +- If class is registered, it will be written as a fury unsigned varint: `class_id << 1`. +- If class is not registered: + - If class is not an array, fury will write one byte `0bxxxxxxx1` first, then write class name. + - The first little bit is `1`, which is different from first bit `0` of + encoded class id. Fury can use this information to determine whether to read class by class id for + deserialization. 
+ - If class is not registered and class is an array, fury will write one byte `dimensions << 1 | 1` first, then write + component + class subsequently. This can reduce array class name cost if component class is or will be serialized. + - Class will be written as two enumerated fury unsigned by default: `package name` and `class name`. If meta share + mode is + enabled, + class will be written as an unsigned varint which points to index in `MetaContext`. + +### Schema evolution + +If schema evolution mode is enabled globally or enabled for current class, class meta will be written as follows: + +- If meta share mode is not enabled, class meta will be written as schema consistent mode. Additionally, field meta such + as field type + and name will be written with the field value using a key-value like layout. +- If meta share mode is enabled, class meta will be written as a meta-share encoded binary if class hasn't been written + before, otherwise an unsigned varint id which references to previous written class meta will be written. + +## Meta share + +> This mode will forbid streaming writing since it needs to look back for update the start offset after the whole object +> graph +> writing and meta collecting is finished. Only in this way we can ensure deserialization failure doesn't lost shared +> meta. +> Meta streamline will be supported in the future for enclosed meta sharing which doesn't cross multiple serializations +> of different objects. + +For Schema consistent mode, class will be encoded as an enumerated string by full class name. Here we mainly describe +the meta layout for schema evolution mode: + +``` +| 8 bytes meta header | meta size | variable bytes | variable bytes | variable bytes | ++-------------------------------+-----------|--------------------+-------------------+----------------+ +| 7 bytes hash + 1 bytes header | 1~2 bytes | current class meta | parent class meta | ... 
| +``` + +Class meta is encoded from parent class to leaf class; only classes with serializable fields will be encoded. + +### Meta header + +Meta header is a 64 bits number value encoded in little endian order. + +- Lowest 4 bits `0b0000~0b1110` are used to record num classes. `0b1111` is reserved to indicate that Fury needs to + read more bytes for length using Fury unsigned int encoding. If current class doesn't have a parent class, or parent + class doesn't have fields to serialize, or we're in a context which serializes fields of current class + only (`ObjectStreamSerializer#SlotInfo` is an example), num classes will be 1. +- 5th bit is used to indicate whether this class needs schema evolution. +- 6th bit is used to indicate whether the size sum of all layers meta is less than 256. +- Other 56 bits are used to store the unique hash of `flags + all layers class meta`. + +### Meta size + +- If the size sum of all layers meta is less than 256, then one byte is written next to indicate the length of meta. +- Otherwise, write size as two bytes in little endian. + +### Single layer class meta + +``` +| unsigned varint | meta string | meta string | field info: variable bytes | variable bytes | ... | ++----------------------------+-----------------------+---------------------+-------------------------------+-----------------+-----+ +| num fields + register flag | header + package name | header + class name | header + type id + field name | next field info | ... | +``` + +- num fields: encode `num fields << 1 | register flag(1 when class registered)` as unsigned varint. + - If class is registered, then an unsigned varint class id will be written next, package and class name will be + omitted. + - If current class is schema consistent, then num field will be `0` to flag it. + - If current class isn't schema consistent, then num field will be the number of compatible fields.
For example, + users + can use tag id to mark some field as compatible field in schema consistent context. In such cases, schema + consistent + fields will be serialized first, then compatible fields will be serialized next. At deserialization, Fury will use + fields info of those fields which aren't annotated by tag id for deserializing schema consistent fields, then use + fields info in meta for deserializing compatible fields. +- Package name encoding(omitted when class is registered): + - encoding algorithm: `UTF8/ALL_TO_LOWER_SPECIAL/LOWER_UPPER_DIGIT_SPECIAL` + - Header: `6 bits size | 2 bits encoding flags`. The `6 bits size: 0~63` will be used to indicate size `0~63`, + the value `63` the size need more byte to read, the encoding will encode `size - 63` as a varint next. +- Class name encoding(omitted when class is registered): + - encoding algorithm: `UTF8/LOWER_UPPER_DIGIT_SPECIAL/FIRST_TO_LOWER_SPECIAL/ALL_TO_LOWER_SPECIAL` + - header: `6 bits size | 2 bits encoding flags`. The `6 bits size: 0~63` will be used to indicate size `0~63`, + the value `63` the size need more byte to read, the encoding will encode `size - 63` as a varint next. +- Field info: + - header(8 + bits): `3 bits size + 2 bits field name encoding + polymorphism flag + nullability flag + ref tracking flag`. + Users can use annotation to provide those info. + - 2 bits field name encoding: + - encoding: `UTF8/ALL_TO_LOWER_SPECIAL/LOWER_UPPER_DIGIT_SPECIAL/TAG_ID` + - If tag id is used, i.e. field name is written by an unsigned varint tag id. 2 bits encoding will be `11`. + - size of field name: + - The `3 bits size: 0~7` will be used to indicate length `1~7`, the value `6` the size read more bytes, + the encoding will encode `size - 7` as a varint next. + - If encoding is `TAG_ID`, then num_bytes of field name will be used to store tag id. + - ref tracking: when set to 1, ref tracking will be enabled for this field. + - nullability: when set to 1, this field can be null. 
+ - polymorphism: when set to 1, the actual type of field will be the declared field type even if the type is + not `final`. + - type id: + - For registered type-consistent classes, it will be the registered class id. + - Otherwise it will be encoded as `OBJECT_ID` if it isn't `final` and `FINAL_OBJECT_ID` if it's `final`. The + meta for such types is written separately instead of being inlined here, in order to reduce meta space cost if an object of + this type is serialized in current object graph multiple times, and the field value may be null too. + - Field name: If type id is set, type id will be used instead. Otherwise meta string encoding length and data will + be written instead. + +Field order is left as an implementation detail, which is not exposed in the specification; the deserialization needs to +re-sort fields based on Fury field comparator. In this way, fury can compute statistics for field names or types and +use a more compact encoding. + +### Other layers class meta + +Same encoding algorithm as the previous layer except: + +- header + package name: + - Header: + - If package name has been written before: `varint index + sharing flag(set)` will be written + - If package name hasn't been written before: + - If meta string encoding is `LOWER_SPECIAL` and the length of encoded string `<=` 64, then header will be + `6 bits size + encoding flag(set) + sharing flag(unset)`. + - Otherwise, header will + be `3 bits unset + 3 bits encoding flags + encoding flag(unset) + sharing flag(unset)` + +## Meta String + +Meta string is mainly used to encode meta strings such as class name and field names.
+ +### Encoding Algorithms + +String binary encoding algorithm: + +| Algorithm | Pattern | Description | +|---------------------------|---------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| LOWER_SPECIAL | `a-z._$\|` | every char is written using 5 bits, `a-z`: `0b00000~0b11001`, `._$\|`: `0b11010~0b11101`, prepend one bit at the start to indicate whether strip last char since last byte may have 7 redundant bits(1 indicates strip last char) | +| LOWER_UPPER_DIGIT_SPECIAL | `a-zA-Z0~9._` | every char is written using 6 bits, `a-z`: `0b00000~0b11001`, `A-Z`: `0b11010~0b110011`, `0~9`: `0b110100~0b111101`, `._`: `0b111110~0b111111`, prepend one bit at the start to indicate whether strip last char since last byte may have 7 redundant bits(1 indicates strip last char) | +| UTF-8 | any chars | UTF-8 encoding | + +Encoding flags: + +| Encoding Flag | Pattern | Encoding Algorithm | +|---------------------------|---------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------| +| LOWER_SPECIAL | every char is in `a-z._$\|` | `LOWER_SPECIAL` | +| FIRST_TO_LOWER_SPECIAL | every char is in `a-z[c1,c2]` except first char is upper case | replace first upper case char to lower case, then use `LOWER_SPECIAL` | +| ALL_TO_LOWER_SPECIAL | every char is in `a-zA-Z[c1,c2]` | replace every upper case char by `\|` + `lower case`, then use `LOWER_SPECIAL`, use this encoding if it's smaller than Encoding `LOWER_UPPER_DIGIT_SPECIAL` | +| LOWER_UPPER_DIGIT_SPECIAL | every char is in `a-zA-Z[c1,c2]` | use `LOWER_UPPER_DIGIT_SPECIAL` encoding if it's smaller than Encoding `FIRST_TO_LOWER_SPECIAL` 
| +| UTF8 | any utf-8 char | use `UTF-8` encoding | +| Compression | any utf-8 char | lossless compression | + +Notes: + +- For package name encoding, `c1,c2` should be `._`; For field/type name encoding, `c1,c2` should be `_$`; +- Depending on cases, one can choose encoding `flags + data` jointly, uses 3 bits of first byte for flags and other + bytes + for data. + +### Shared meta string + +The shared meta string format consists of header and encoded string binary. Header of encoded string binary will be +inlined +in shared meta header. + +Header is written using little endian order, Fury can read this flag first to determine how to deserialize the data. + +#### Write by data + +If string hasn't been written before, the data will be written as follows: + +``` +| unsigned varint: string binary size + 1 bit: not written before | 56 bits: unique hash | 3 bits encoding flags + string binary | +``` + +If string binary size is less than `16` bytes, the hash will be omitted to save spaces. Unique hash can be omitted too +if caller pass a flag to disable it. In such cases, the format will be: + +``` +| unsigned varint: string binary size + 1 bit: not written before | 3 bits encoding flags + string binary | +``` + +#### Write by ref + +If string has been written before, the data will be written as follows: + +``` +| unsigned varint: written string id + 1 bit: written before | +``` + +## Value Format + +### Basic types + +#### Bool + +- size: 1 byte +- format: 0 for `false`, 1 for `true` + +#### Byte + +- size: 1 byte +- format: write as pure byte. + +#### Short + +- size: 2 byte +- byte order: little endian order + +#### Char + +- size: 2 byte +- byte order: little endian order + +#### Unsigned int + +- size: 1~5 byte +- Format: The most significant bit (MSB) in every byte indicates whether to have the next byte. If first bit is set + i.e. `b & 0x80 == 0x80`, then + the next byte should be read until the first bit of the next byte is unset. 
+ +#### Signed int + +- size: 1~5 byte +- Format: First convert the number into positive unsigned int by `(v << 1) ^ (v >> 31)` ZigZag algorithm, then encoding + it as an unsigned int. + +#### Unsigned long + +- size: 1~9 byte +- Fury PVL(Progressive Variable-length Long) Encoding: + - positive long format: first bit in every byte indicates whether to have the next byte. If first bit is set + i.e. `b & 0x80 == 0x80`, then the next byte should be read until the first bit is unset. + +#### Signed long + +- size: 1~9 byte +- Fury SLI(Small long as int) Encoding: + - If long is in [-1073741824, 1073741823], encode as 4 bytes int: `| little-endian: ((int) value) << 1 |` + - Otherwise write as 9 bytes: `| 0b1 | little-endian 8 bytes long |` +- Fury PVL(Progressive Variable-length Long) Encoding: + - First convert the number into positive unsigned long by `(v << 1) ^ (v >> 63)` ZigZag algorithm to reduce cost of + small negative numbers, then encoding it as an unsigned long. + +#### Float + +- size: 4 byte +- format: convert float to 4 bytes int by `Float.floatToRawIntBits`, then write as binary by little endian order. + +#### Double + +- size: 8 byte +- format: convert double to 8 bytes int by `Double.doubleToRawLongBits`, then write as binary by little endian order. + +### String + +Format: + +``` +| header: size << 2 | 2 bits encoding flags | binary data | +``` + +- `size + encoding` will be concat as a long and encoded as an unsigned var long. The little 2 bits is used for + encoding: + 0 for `latin`, 1 for `utf-16`, 2 for `utf-8`. +- encoded string binary data based on encoding: `latin/utf-16/utf-8`. + +Which encoding to choose: + +- For JDK8: fury detect `latin` at runtime, if string is `latin` string, then use `latin` encoding, otherwise + use `utf-16`. +- For JDK9+: fury use `coder` in `String` object for encoding, `latin`/`utf-16` will be used for encoding. +- If the string is encoded by `utf-8`, then fury will use `utf-8` to decode the data. 
But currently fury doesn't enable + utf-8 encoding by default for java. Cross-language string serialization of fury uses `utf-8` by default. + +### Collection + +> All collection serializers must extend `AbstractCollectionSerializer`. + +Format: + +``` +length(unsigned varint) | collection header | elements header | elements data +``` + +#### Collection header + +- For `ArrayList/LinkedArrayList/HashSet/LinkedHashSet`, this will be empty. +- For `TreeSet`, this will be `Comparator` +- For subclass of `ArrayList`, this may be extra object field info. + +#### Elements header + +In most cases, all collection elements are of the same type and not null, so the elements header will encode this homogeneous +information to avoid the cost of writing it for every element. Specifically, there are four kinds of information +which will be encoded by elements header, each using one bit: + +- If track elements ref, use the first bit `0b1` of the header to flag it. +- If the collection has null, use the second bit `0b10` of the header to flag it. If ref tracking is enabled for this + element type, this flag is invalid. +- If the collection element types are not declared type, use the 3rd bit `0b100` of the header to flag it. +- If the collection element types are different, use the 4th bit `0b1000` of the header to flag it. + +By default, all bits are unset, which means all elements won't track ref, all elements are same type, not null and +the actual element is the declared type in the custom class field. + +The implementation can generate different deserialization code based on the read header, and look up the generated code from a +linear map/list. + +#### Elements data + +Based on the elements header, the serialization of elements data may skip `ref flag`/`null flag`/`element class info`. + +`CollectionSerializer#write/read` can be taken as an example.
+ +### Array + +#### Primitive array + +Primitive array are taken as a binary buffer, serialization will just write the length of array size as an unsigned int, +then copy the whole buffer into the stream. + +Such serialization won't compress the array. If users want to compress primitive array, users need to register custom +serializers for such types. + +#### Object array + +Object array is serialized using the collection format. Object component type will be taken as collection element +generic +type. + +### Map + +> All Map serializers must extend `AbstractMapSerializer`. + +Format: + +``` +| length(unsigned varint) | map header | key value pairs data | +``` + +#### Map header + +- For `HashMap/LinkedHashMap`, this will be empty. +- For `TreeMap`, this will be `Comparator` +- For other `Map`, this may be extra object field info. + +#### Map Key-Value data + +Map iteration is too expensive, Fury won't compute the header like for collection before since it introduce +[considerable overhead](https://github.com/apache/fury/issues/925). +Users can use `MapFieldInfo` annotation to provide header in advance. Otherwise Fury will use first key-value pair to +predict header optimistically, and update the chunk header if the prediction failed at some pair. + +Fury will serialize map chunk by chunk, every chunk has 127 pairs at most. + +``` +| 1 byte | 1 byte | variable bytes | ++----------------+----------------+-----------------+ +| KV header | chunk size: N | N*2 objects | +``` + +KV header: + +- If track key ref, use the first bit `0b1` of the header to flag it. +- If the key has null, use the second bit `0b10` of the header to flag it. If ref tracking is enabled for this + key type, this flag is invalid. +- If the actual key type of map is not the declared key type, use the 3rd bit `0b100` of the header to flag it. +- If track value ref, use the 4th bit `0b1000` of the header to flag it. +- If the value has null, use the 5th bit `0b10000` of the header to flag it. 
If ref tracking is enabled for this + value type, this flag is invalid. +- If the value type of map is not the declared value type, use the 6th bit `0b100000` of the header to flag it. +- If key or value is null, that key and value will be written as a separate chunk, and chunk size writing will be + skipped too. + +If streaming write is enabled, Fury can't update the written `chunk size`. In such cases, map key-value data +format will be: + +``` +| 1 byte | variable bytes | ++----------------+-----------------+ +| KV header | N*2 objects | +``` + +`KV header` will be a header marked by `MapFieldInfo` in java. The implementation can generate different deserialization +code based on the read header, and look up the generated code from a linear map/list. + +### Enum + +Enums are serialized as an unsigned var int. If the order of enum values changes, the deserialized enum value may not be +the value users expect. In such cases, users must register enum serializer by making it write enum value as an enumerated +string with unique hash disabled. + +### Object + +Object means object of `pojo/struct/bean/record` type. +Object will be serialized by writing its fields data in fury order. + +Depending on schema compatibility, objects will have different formats. + +#### Field order + +Fields will be ordered as follows, every group of fields will have its own order: + +- primitive fields: larger size type first, smaller later, variable size type last. +- boxed primitive fields: same order as primitive fields +- final fields: same type together, then sorted by field name lexicographically.
+- collection fields: same order as final fields +- map fields: same order as final fields +- other fields: same order as final fields + +#### Schema consistent + +Object fields will be serialized one by one using following format: + +``` +Primitive field value: +| var bytes | ++----------------+ +| value data | ++----------------+ +Boxed field value: +| one byte | var bytes | ++-----------+---------------+ +| null flag | field value | ++-----------+---------------+ +field value of final type with ref tracking: +| var bytes | var objects | ++-----------+-------------+ +| ref meta | value data | ++-----------+-------------+ +field value of final type without ref tracking: +| one byte | var objects | ++-----------+-------------+ +| null flag | field value | ++-----------+-------------+ +field value of non-final type with ref tracking: +| one byte | var bytes | var objects | ++-----------+-------------+-------------+ +| ref meta | class meta | value data | ++-----------+-------------+-------------+ +field value of non-final type without ref tracking: +| one byte | var bytes | var objects | ++-----------+------------+------------+ +| null flag | class meta | value data | ++-----------+------------+------------+ +``` + +#### Schema evolution + +Schema evolution have similar format as schema consistent mode for object except: + +- For this object type itself, `schema consistent` mode will write class by id/name, but `schema evolution` mode will + write class field names, types and other meta too, see [Class meta](#class-meta). +- Class meta of `final custom type` needs to be written too, because peers may not have this class defined. + +### Class + +Class will be serialized using class meta format. + +## Implementation guidelines + +- Try to merge multiple bytes into an int/long write before writing to reduce memory IO and bound check cost. +- Read multiple bytes as an int/long, then split into multiple bytes to reduce memory IO and bound check cost. 
+- Try to use one varint/long to write flags and length together to save one byte cost and reduce memory io. +- Condition branches are less expensive compared to memory IO cost unless there are too many branches. diff --git a/versioned_docs/version-0.10.0/specification/row_format_spec.md b/versioned_docs/version-0.10.0/specification/row_format_spec.md new file mode 100644 index 00000000000..eefd9d9793b --- /dev/null +++ b/versioned_docs/version-0.10.0/specification/row_format_spec.md @@ -0,0 +1,24 @@ +--- +title: Fury Row Format +sidebar_position: 2 +id: fury_row_format_spec +license: | + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--- + +## Row Format + +Coming soon diff --git a/versioned_docs/version-0.10.0/specification/xlang_serialization_spec.md b/versioned_docs/version-0.10.0/specification/xlang_serialization_spec.md new file mode 100644 index 00000000000..d15a3da9fd3 --- /dev/null +++ b/versioned_docs/version-0.10.0/specification/xlang_serialization_spec.md @@ -0,0 +1,807 @@ +--- +title: Fury Xlang Serialization Format +sidebar_position: 0 +id: fury_xlang_serialization_spec +license: | + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. 
See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--- + +## Cross-language Serialization Specification + +> Format Version History: +> +> - Version 0.1 - serialization spec formalized + +Fury xlang serialization is an automatic object serialization framework that supports reference and polymorphism. +Fury will convert an object from/to fury xlang serialization binary format. +Fury has two core concepts for xlang serialization: + +- **Fury xlang binary format** +- **Framework implemented in different languages to convert object to/from Fury xlang binary format** + +The serialization format is a dynamic binary format. The dynamics and reference/polymorphism support make Fury flexible, +much more easy to use, but +also introduce more complexities compared to static serialization frameworks. So the format will be more complex. + +## Type Systems + +### Data Types + +- bool: a boolean value (true or false). +- int8: a 8-bit signed integer. +- int16: a 16-bit signed integer. +- int32: a 32-bit signed integer. +- var_int32: a 32-bit signed integer which use fury var_int32 encoding. +- int64: a 64-bit signed integer. +- var_int64: a 64-bit signed integer which use fury PVL encoding. +- sli_int64: a 64-bit signed integer which use fury SLI encoding. +- float16: a 16-bit floating point number. +- float32: a 32-bit floating point number. 
+- float64: a 64-bit floating point number including NaN and Infinity. +- string: a text string encoded using Latin1/UTF16/UTF-8 encoding. +- enum: a data type consisting of a set of named values. Rust enums with non-predefined field values are not supported as + an enum. +- named_enum: an enum whose value will be serialized as the registered name. +- struct: a morphic(final) type serialized by Fury Struct serializer. i.e. it doesn't have subclasses. Suppose we're + deserializing `List<SomeClass>`, we can save dynamic serializer dispatch since `SomeClass` is morphic(final). +- compatible_struct: a morphic(final) type serialized by Fury compatible Struct serializer. +- named_struct: a `struct` whose type mapping will be encoded as a name. +- named_compatible_struct: a `compatible_struct` whose type mapping will be encoded as a name. +- ext: a type which will be serialized by a customized serializer. +- named_ext: an `ext` type whose type mapping will be encoded as a name. +- list: a sequence of objects. +- set: an unordered set of unique elements. +- map: a map of key-value pairs. Mutable types such as `list/map/set/array/tensor/arrow` are not allowed as key of map. +- duration: an absolute length of time, independent of any calendar/timezone, as a count of nanoseconds. +- timestamp: a point in time, independent of any calendar/timezone, as a count of nanoseconds. The count is relative + to an epoch at UTC midnight on January 1, 1970. +- local_date: a naive date without timezone. The count is days relative to an epoch at UTC midnight on Jan 1, 1970. +- decimal: exact decimal value represented as an integer value in two's complement. +- binary: a variable-length array of bytes. +- array: only allow numeric components. Other arrays will be taken as List. The implementation should support the + interoperability between array and list. +- array: multidimensional array which every sub-array can have different sizes but all have same type. +- bool_array: one dimensional bool array.
+- int8_array: one dimensional int8 array. +- int16_array: one dimensional int16 array. +- int32_array: one dimensional int32 array. +- int64_array: one dimensional int64 array. +- float16_array: one dimensional half_float_16 array. +- float32_array: one dimensional float32 array. +- float64_array: one dimensional float64 array. +- arrow record batch: an arrow [record batch](https://arrow.apache.org/docs/cpp/tables.html#record-batches) object. +- arrow table: an arrow [table](https://arrow.apache.org/docs/cpp/tables.html#tables) object. + +Note: + +- Unsigned int/long are not added here, since not every language support those types. + +### Polymorphisms + +For polymorphism, if one non-final class is registered, and only one subclass is registered, then we can take all +elements in List/Map have same type, thus reduce runtime check cost. + +Collection/Array polymorphism are not fully supported, since some languages such as golang have only one collection +type. If users want to get exactly the type he passed, he must pass that type when deserializing or annotate that type +to the field of struct. + +### Type disambiguation + +Due to differences between type systems of languages, those types can't be mapped one-to-one between languages. When +deserializing, Fury use the target data structure type and the data type in the data jointly to determine how to +deserialize and populate the target data structure. For example: + +```java +class Foo { + int[] intArray; + Object[] objects; + List objectList; +} + +class Foo2 { + int[] intArray; + List objects; + List objectList; +} +``` + +`intArray` has an `int32_array` type. But both `objects` and `objectList` fields in the serialize data have `list` data +type. When deserializing, the implementation will create an `Object` array for `objects`, but create a `ArrayList` +for `objectList` to populate its elements. And the serialized data of `Foo` can be deserialized into `Foo2` too. 
+ +Users can also provide meta hints for fields of a type, or the type whole. Here is an example in java which use +annotation to provide such information. + +```java +@FuryObject(fieldsNullable = false, trackingRef = false) +class Foo { + @FuryField(trackingRef = false) + int[] intArray; + @FuryField(polymorphic = true) + Object object; + @FuryField(tagId = 1, nullable = true) + List objectList; +} +``` + +Such information can be provided in other languages too: + +- cpp: use macro and template. +- golang: use struct tag. +- python: use typehint. +- rust: use macro. + +### Type ID + +All internal data types are expressed using an ID in range `0~64`. Users can use `0~4096` for representing their +types. + +### Type mapping + +See [Type mapping](../guide/xlang_type_mapping.md) + +## Spec overview + +Here is the overall format: + +``` +| fury header | object ref meta | object type meta | object value data | +``` + +The data are serialized using little endian byte order overall. If bytes swap is costly for some object, +Fury will write the byte order for that object into the data instead of converting it to little endian. + +## Fury header + +Fury header consists starts one byte: + +``` +| 2 bytes | 4 bits | 1 bit | 1 bit | 1 bit | 1 bit | 1 byte | optional 4 bytes | ++--------------+---------------+-------+-------+--------+-------+------------+------------------------------------+ +| magic number | reserved bits | oob | xlang | endian | null | language | unsigned int for meta start offset | +``` + +- magic number: used to identify fury serialization protocol, current version use `0x62d4`. +- null flag: 1 when object is null, 0 otherwise. If an object is null, other bits won't be set. +- endian flag: 1 when data is encoded by little endian, 0 for big endian. +- xlang flag: 1 when serialization uses xlang format, 0 when serialization uses Fury java format. +- oob flag: 1 when passed `BufferCallback` is not null, 0 otherwise. 
+- language: the language when serializing objects, such as JAVA, PYTHON, GO, etc. Fury can use this flag to determine whether spend more time on serialization to make the deserialization faster for dynamic languages. + +If meta share mode is enabled, an uncompressed unsigned int is appended to indicate the start offset of metadata. + +## Reference Meta + +Reference tracking handles whether the object is null, and whether to track reference for the object by writing +corresponding flags and maintaining internal state. + +Reference flags: + +| Flag | Byte Value | Description | +|---------------------|------------|---------------------------------------------------------------------------------------------------------------------------------------------------------| +| NULL FLAG | `-3` | This flag indicates the object is a null value. We don't use another byte to indicate REF, so that we can save one byte. | +| REF FLAG | `-2` | This flag indicates the object is already serialized previously, and fury will write a ref id with unsigned varint format instead of serialize it again | +| NOT_NULL VALUE FLAG | `-1` | This flag indicates the object is a non-null value and fury doesn't track ref for this type of object. | +| REF VALUE FLAG | `0` | This flag indicates the object is referencable and the first time to serialize. | + +When reference tracking is disabled globally or for specific types, or for certain types within a particular +context(e.g., a field of a type), only the `NULL` and `NOT_NULL VALUE` flags will be used for reference meta. + +For languages which doesn't support reference such as rust, reference tracking must be disabled for correct +deserialization by fury rust implementation. 
+ +For languages whose object values are not null by default: + +- In rust, Fury takes `Option::None` as a null value +- In c++, Fury takes `std::nullopt` as a null value +- In golang, Fury takes `nil interface/pointer` as a null value + +If one wants to deserialize in languages like `Java/Python/JavaScript`, one should mark the type with all fields +not-null by default, or use schema-evolution mode to carry the not-null fields info in the data. + +## Type Meta + +For every type to be serialized, it must be registered with an optional ID first. The registered type will have a +user-provided or an auto-growing unsigned int i.e. `type_id`. The registration can be used for security check and type +identification. The id of a user-registered type will be increased by `64` to make space for Fury internal data types. + +Depending on whether meta share mode and registration are enabled for the current type, Fury will write type meta +differently. + +### Schema consistent + +- If schema consistent mode is enabled globally when creating fury, type meta will be written as a fury unsigned varint + of `type_id`. Schema evolution related meta will be ignored. +- If schema evolution mode is enabled globally when creating fury, and current class is configured to use schema + consistent mode like `struct` vs `table` in flatbuffers: + - Type meta will be added to `captured_type_defs`: `captured_type_defs[type def stub] = map size` ahead when + registering type. + - Get index of the meta in `captured_type_defs`, write that index as `| unsigned varint: index |`. + +### Schema evolution + +If schema evolution mode is enabled globally when creating fury, and enabled for current type, type meta will be written +using one of the following modes. Which mode to use is configured when creating fury. + +- Normal mode(meta share not enabled): + - If type meta hasn't been written before, add `type def` + to `captured_type_defs`: `captured_type_defs[type def] = map size`. 
+ - Get index of the meta in `captured_type_defs`, write that index as `| unsigned varint: index |`. + - After finishing the serialization of the object graph, fury will start to write `captured_type_defs`: + - First, set the `meta start offset` of the fury header to the current write position + - Then write `captured_type_defs` one by one: + + ```python + buffer.write_var_uint32(len(writting_type_defs) - len(schema_consistent_type_def_stubs)) + for type_meta in writting_type_defs: + if not type_meta.is_stub(): + type_meta.write_type_def(buffer) + writing_type_defs = copy(schema_consistent_type_def_stubs) + ``` + +- Meta share mode: the writing steps are the same as the normal mode, but `captured_type_defs` will be shared across + multiple serializations of different objects. For example, suppose we have a batch to serialize: + + ```python + captured_type_defs = {} + stream = ... + # add `Type1` to `captured_type_defs` and write `Type1` + fury.serialize(stream, [Type1()]) + # add `Type2` to `captured_type_defs` and write `Type2`, `Type1` is written before. + fury.serialize(stream, [Type1(), Type2()]) + # `Type1` and `Type2` are written before, no need to write meta. + fury.serialize(stream, [Type1(), Type2()]) + ``` + +- Streaming mode(streaming mode doesn't support meta share): + - If type meta hasn't been written before, the data will be written as: + + ``` + | unsigned varint: 0b11111111 | type def | + ``` + + - If type meta has been written before, the data will be written as: + + ``` + | unsigned varint: written index << 1 | + ``` + + `written index` is the id in `captured_type_defs`. + - With this mode, `meta start offset` can be omitted. + +> The normal mode and meta share mode will forbid streaming writing since they need to look back to update the start +> offset after the whole object graph writing and meta collecting is finished. Only in this way can we ensure that a +> deserialization failure in meta share mode doesn't lose shared meta. 
+ +#### Type Def + +Here we mainly describe the meta layout for schema evolution mode: + +``` +| 8 bytes meta header | variable bytes | variable bytes | variable bytes | ++-------------------------------+--------------------+-------------------+----------------+ +| 7 bytes hash + 1 bytes header | current type meta | parent type meta | ... | +``` + +Type meta are encoded from parent type to leaf type, only types with serializable fields will be encoded. + +##### Meta header + +Meta header is a 64-bit number encoded in little endian order. + +- Lowest 4 bits `0b0000~0b1110` are used to record num classes. `0b1111` is reserved to indicate that Fury needs to + read more bytes for length using Fury unsigned int encoding. If current type doesn't have a parent type, or parent + type doesn't have fields to serialize, or we're in a context which serializes fields of current type + only, num classes will be 1. +- The 5th bit is used to indicate whether this type needs schema evolution. +- Other 56 bits are used to store the unique hash of `flags + all layers type meta`. + +##### Single layer type meta + +``` +| unsigned varint | var uint | field info: variable bytes | variable bytes | ... | ++-----------------+----------+-------------------------------+-----------------+-----+ +| num_fields | type id | header + type id + field name | next field info | ... | +``` + +- num fields: encode `num fields` as unsigned varint. + - If the current type is schema consistent, then num_fields will be `0` to flag it. + - If the current type isn't schema consistent, then num_fields will be the number of compatible fields. For example, + users can use tag id to mark some fields as compatible fields in schema consistent context. In such cases, schema + consistent fields will be serialized first, then compatible fields will be serialized next.
At deserialization, + Fury will use fields info of those fields which aren't annotated by tag id for deserializing schema consistent + fields, then use fields info in meta for deserializing compatible fields. +- type id: the registered id for the current type, which will be written as an unsigned varint. +- field info: + - header(8 + bits): `4 bits size + 2 bits field name encoding + nullability flag + ref tracking flag`. + Users can use annotation to provide those info. + - 2 bits field name encoding: + - encoding: `UTF8/ALL_TO_LOWER_SPECIAL/LOWER_UPPER_DIGIT_SPECIAL/TAG_ID` + - If tag id is used, i.e. field name is written by an unsigned varint tag id. 2 bits encoding will be `11`. + - size of field name: + - The `4 bits size: 0~14` will be used to indicate length `1~15`, the value `15` indicates to read more bytes, + the encoding will encode `size - 15` as a varint next. + - If encoding is `TAG_ID`, then num_bytes of field name will be used to store tag id. + - ref tracking: when set to 1, ref tracking will be enabled for this field. + - nullability: when set to 1, this field can be null. + - field name: If tag id is set, tag id will be used instead. Otherwise meta string encoding `[length]` and data will + be written instead. + - type id: + - Format: `id << 1 | polymorphic flag`. If field type is polymorphic, this flag is set to `0b1`, otherwise it's + `0b0` + - For registered type-consistent classes, it will be the registered type id. + - For struct type it will be written as `STRUCT`. + - The meta for struct type is written separately instead of inlining here is to reduce meta space cost if object of + this type is serialized in current object graph multiple times, and the field value may be null too. + - For enum type, it will be written as `ENUM`. + - For collection type, it will be written as `COLLECTION`, then write element type recursively. + - For map type, it will be written as `MAP`, then write key and value type recursively. 
+ +Field order are left as implementation details, which is not exposed to specification, the deserialization need to +resort fields based on Fury field comparator. In this way, fury can compute statistics for field names or types and +using a more compact encoding. + +##### Other layers type meta + +Same encoding algorithm as the previous layer. + +## Meta String + +Meta string is mainly used to encode meta strings such as field names. + +### Encoding Algorithms + +String binary encoding algorithm: + +| Algorithm | Pattern | Description | +|---------------------------|---------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| LOWER_SPECIAL | `a-z._$\|` | every char is written using 5 bits, `a-z`: `0b00000~0b11001`, `._$\|`: `0b11010~0b11101`, prepend one bit at the start to indicate whether strip last char since last byte may have 7 redundant bits(1 indicates strip last char) | +| LOWER_UPPER_DIGIT_SPECIAL | `a-zA-Z0~9._` | every char is written using 6 bits, `a-z`: `0b00000~0b11001`, `A-Z`: `0b11010~0b110011`, `0~9`: `0b110100~0b111101`, `._`: `0b111110~0b111111`, prepend one bit at the start to indicate whether strip last char since last byte may have 7 redundant bits(1 indicates strip last char) | +| UTF-8 | any chars | UTF-8 encoding | + +Encoding flags: + +| Encoding Flag | Pattern | Encoding Algorithm | +|---------------------------|----------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------| +| LOWER_SPECIAL | every char is in `a-z._\|` | `LOWER_SPECIAL` | +| FIRST_TO_LOWER_SPECIAL | every char is in `a-z._` except first char is upper case | replace first upper 
case char to lower case, then use `LOWER_SPECIAL` | +| ALL_TO_LOWER_SPECIAL | every char is in `a-zA-Z._` | replace every upper case char by `\|` + `lower case`, then use `LOWER_SPECIAL`, use this encoding if it's smaller than Encoding `LOWER_UPPER_DIGIT_SPECIAL` | +| LOWER_UPPER_DIGIT_SPECIAL | every char is in `a-zA-Z._` | use `LOWER_UPPER_DIGIT_SPECIAL` encoding if it's smaller than Encoding `FIRST_TO_LOWER_SPECIAL` | +| UTF8 | any utf-8 char | use `UTF-8` encoding | +| Compression | any utf-8 char | lossless compression | + +Notes: + +- Depending on cases, one can choose encoding `flags + data` jointly, uses 3 bits of first byte for flags and other + bytes + for data. + +## Value Format + +### Basic types + +#### bool + +- size: 1 byte +- format: 0 for `false`, 1 for `true` + +#### int8 + +- size: 1 byte +- format: write as pure byte. + +#### int16 + +- size: 2 byte +- byte order: raw bytes of little endian order + +#### unsigned int32 + +- size: 4 byte +- byte order: raw bytes of little endian order + +#### unsigned varint32 + +- size: 1~5 byte +- Format: The most significant bit (MSB) in every byte indicates whether to have the next byte. If first bit is set + i.e. `b & 0x80 == 0x80`, then + the next byte should be read until the first bit of the next byte is unset. + +#### signed int32 + +- size: 4 byte +- byte order: raw bytes of little endian order + +#### signed varint32 + +- size: 1~5 byte +- Format: First convert the number into positive unsigned int by `(v << 1) ^ (v >> 31)` ZigZag algorithm, then encode + it as an unsigned varint. 
+ +#### unsigned int64 + +- size: 8 byte +- byte order: raw bytes of little endian order + +#### unsigned varint64 + +- size: 1~9 byte +- Fury SLI(Small long as int) Encoding: + - If long is in `[0, 2147483647]`, encode as 4 bytes int: `| little-endian: ((int) value) << 1 |` + - Otherwise write as 9 bytes: `| 0b1 | little-endian 8 bytes long |` +- Fury PVL(Progressive Variable-length Long) Encoding: + - positive long format: first bit in every byte indicates whether to have the next byte. If first bit is set + i.e. `b & 0x80 == 0x80`, then the next byte should be read until the first bit is unset. + +#### signed int64 + +- size: 8 byte +- byte order: raw bytes of little endian order + +#### signed varint64 + +- size: 1~9 byte +- Fury SLI(Small long as int) Encoding: + - If long is in `[-1073741824, 1073741823]`, encode as 4 bytes int: `| little-endian: ((int) value) << 1 |` + - Otherwise write as 9 bytes: `| 0b1 | little-endian 8 bytes long |` +- Fury PVL(Progressive Variable-length Long) Encoding: + - First convert the number into positive unsigned long by `(v << 1) ^ (v >> 63)` ZigZag algorithm to reduce cost of + small negative numbers, then encoding it as an unsigned long. + +#### float32 + +- size: 4 byte +- format: encode the specified floating-point value according to the IEEE 754 floating-point "single format" bit layout, + preserving Not-a-Number (NaN) values, then write as binary by little endian order. + +#### float64 + +- size: 8 byte +- format: encode the specified floating-point value according to the IEEE 754 floating-point "double format" bit layout, + preserving Not-a-Number (NaN) values. then write as binary by little endian order. + +### string + +Format: + +``` +| unsigned varint64: size << 2 `bitor` 2 bits encoding flags | binary data | +``` + +- `size + encoding` will be concat as a long and encoded as an unsigned varint64. The little 2 bits is used for + encoding: + 0 for `latin1(ISO-8859-1)`, 1 for `utf-16`, 2 for `utf-8`. 
+- encoded string binary data based on encoding: `latin/utf-16/utf-8`. + +Which encoding to choose: + +- For JDK8: fury detects `latin` at runtime, if string is `latin` string, then use `latin` encoding, otherwise + use `utf-16`. +- For JDK9+: fury uses `coder` in `String` object for encoding, `latin`/`utf-16` will be used for encoding. +- If the string is encoded by `utf-8`, then fury will use `utf-8` to decode the data. Cross-language string + serialization of fury uses `utf-8` by default. + +### list + +Format: + +``` +| unsigned varint64: length << 4 `bitor` 4 bits elements header | elements data | +``` + +#### elements header + +In most cases, all elements are of the same type and not null, so the elements header will encode this homogeneous +information to avoid the cost of writing it for every element. Specifically, there are four kinds of information +which will be encoded by the elements header, each using one bit: + +- If track elements ref, use the first bit `0b1` of the header to flag it. +- If the elements have null, use the second bit `0b10` of the header to flag it. If ref tracking is enabled for this + element type, this flag is invalid. +- If the element types are not the declared type, use the 3rd bit `0b100` of the header to flag it. +- If the element types are different, use the 4th bit `0b1000` of the header to flag it. + +By default, all bits are unset, which means all elements won't track ref, all elements are same type, not null and +the actual element is the declared type in the custom type field. + +The implementation can generate different deserialization code based on the read header, and look up the generated code from +a linear map/list. + +#### elements data + +Based on the elements header, the serialization of elements data may skip `ref flag`/`null flag`/`element type info`. + +```python +fury = ... +buffer = ... +elems = ... +if element_type_is_same: + if not is_declared_type: + fury.write_type(buffer, elem_type) + elem_serializer = get_serializer(...) 
+ if track_ref: + for elem in elems: + if not ref_resolver.write_ref_or_null(buffer, elem): + elem_serializer.write(buffer, elem) + elif has_null: + for elem in elems: + if elem is None: + buffer.write_byte(null_flag) + else: + buffer.write_byte(not_null_flag) + elem_serializer.write(buffer, elem) + else: + for elem in elems: + elem_serializer.write(buffer, elem) +else: + if track_ref: + for elem in elems: + fury.write_ref(buffer, elem) + elif has_null: + for elem in elems: + fury.write_nullable(buffer, elem) + else: + for elem in elems: + fury.write_value(buffer, elem) +``` + +[`CollectionSerializer#writeElements`](https://github.com/apache/fury/blob/20a1a78b17a75a123a6f5b7094c06ff77defc0fe/java/fury-core/src/main/java/org/apache/fury/serializer/collection/AbstractCollectionSerializer.java#L302) +can be taken as an example. + +### array + +#### primitive array + +Primitive array are taken as a binary buffer, serialization will just write the length of array size as an unsigned int, +then copy the whole buffer into the stream. + +Such serialization won't compress the array. If users want to compress primitive array, users need to register custom +serializers for such types or mark it as list type. + +#### object array + +Object array is serialized using the list format. Object component type will be taken as list element +generic type. + +### map + +> All Map serializers must extend `AbstractMapSerializer`. + +Format: + +``` +| length(unsigned varint) | key value chunk data | ... | key value chunk data | +``` + +#### map key-value chunk data + +Map iteration is too expensive, Fury won't compute the header like for list since it introduce +[considerable overhead](https://github.com/apache/fury/issues/925). +Users can use `MapFieldInfo` annotation to provide the header in advance. Otherwise Fury will use first key-value pair +to predict header optimistically, and update the chunk header if the prediction failed at some pair. 
+ +Fury will serialize the map chunk by chunk, every chunk has 255 pairs at most. + +``` +| 1 byte | 1 byte | variable bytes | ++----------------+----------------+-----------------+ +| KV header | chunk size: N | N*2 objects | +``` + +KV header: + +- If track key ref, use the first bit `0b1` of the header to flag it. +- If the key has null, use the second bit `0b10` of the header to flag it. If ref tracking is enabled for this + key type, this flag is invalid. +- If the actual key type of map is not the declared key type, use the 3rd bit `0b100` of the header to flag it. +- If track value ref, use the 4th bit `0b1000` of the header to flag it. +- If the value has null, use the 5th bit `0b10000` of the header to flag it. If ref tracking is enabled for this + value type, this flag is invalid. +- If the value type of map is not the declared value type, use the 6th bit `0b100000` of the header to flag it. +- If key or value is null, that key and value will be written as a separate chunk, and chunk size writing will be + skipped too. + +If streaming write is enabled, Fury can't update the written `chunk size`. In such cases, the map key-value data +format will be: + +``` +| 1 byte | variable bytes | ++----------------+-----------------+ +| KV header | N*2 objects | +``` + +`KV header` will be a header marked by `MapFieldInfo` in java. For languages such as golang, this can be computed in +advance for non-interface types most of the time. The implementation can generate different deserialization code based on the read +header, and look up the generated code from a linear map/list. + +#### Why serialize chunk by chunk? + +When fury uses the first key-value pair to predict the header optimistically, it can't know how many pairs have the same +meta(tracking key ref, key has null and so on).
 If we don't write chunk by chunk with max chunk size, we must write at +least `X` bytes to reserve space for later updating the number of pairs which share the same meta, where `X` is the num_bytes for +the varint encoding of the map size. + +Most map sizes are smaller than 255, so if all pairs have the same meta, there will be only one chunk. This is common in golang/rust, +where objects are not references by default. + +Also, if only one or two keys have different meta, we can make it into a different chunk, so that most pairs can share +meta. + +The implementation can accumulate read count with map size to decide whether to read more chunks. + +### enum + +Enums are serialized as an unsigned var int. If the order of enum values changes, the deserialized enum value may not be +the value users expect. In such cases, users must register an enum serializer that writes the enum value as an enumerated +string with unique hash disabled. + +### decimal + +Not supported for now. + +### struct + +Struct means object of `class/pojo/struct/bean/record` type. +Struct will be serialized by writing its fields data in fury order. + +Depending on schema compatibility, structs will have different formats. + +#### field order + +Fields will be ordered as follows, every group of fields will have its own order: + +- primitive fields: larger size type first, smaller later, variable size type last. +- boxed primitive fields: same order as primitive fields +- final fields: same type together, then sorted by field name lexicographically. +- list fields: same order as final fields +- map fields: same order as final fields +- other fields: same order as final fields + +#### schema consistent + +Object will be written as: + +``` +| 4 byte | variable bytes | ++---------------+------------------+ +| type hash | field values | +``` + +Type hash is used to check the type schema consistency across languages. Type hash will be the first 32 bits of the 56-bit +hash value of the type meta. 
+ +Object fields will be serialized one by one using following format: + +``` +not null primitive field value: +| var bytes | ++----------------+ +| value data | ++----------------+ +nullable primitive field value: +| one byte | var bytes | ++-----------+---------------+ +| null flag | field value | ++-----------+---------------+ +field value of final type with ref tracking: +| var bytes | var objects | ++-----------+-------------+ +| ref meta | value data | ++-----------+-------------+ +field value of final type without ref tracking: +| one byte | var objects | ++-----------+-------------+ +| null flag | field value | ++-----------+-------------+ +field value of non-final type with ref tracking: +| one byte | var bytes | var objects | ++-----------+-------------+-------------+ +| ref meta | type meta | value data | ++-----------+-------------+-------------+ +field value of non-final type without ref tracking: +| one byte | var bytes | var objects | ++-----------+------------+------------+ +| null flag | type meta | value data | ++-----------+------------+------------+ +``` + +#### Schema evolution + +Schema evolution have similar format as schema consistent mode for object except: + +- For the object type, `schema consistent` mode will write type by id only, but `schema evolution` mode will + write type consisting of field names, types and other meta too, see [Type meta](#type-meta). +- Type meta of `final custom type` needs to be written too, because peers may not have this type defined. + +### Type + +Type will be serialized using type meta format. + +## Implementation guidelines + +### How to reduce memory read/write code + +- Try to merge multiple bytes into an int/long write before writing to reduce memory IO and bound check cost. +- Read multiple bytes as an int/long, then split into multiple bytes to reduce memory IO and bound check cost. +- Try to use one varint/long to write flags and length together to save one byte cost and reduce memory io. 
+- Condition branches are less expensive compared to memory IO cost unless there are too many branches. + +### Fast deserialization for static languages without runtime codegen support + +For type evolution, the serializer will encode the type meta into the serialized data. The deserializer will compare +this meta with class meta in the current process, and use the diff to determine how to deserialize the data. + +For java/javascript/python, we can use the diff to generate serializer code at runtime and load it as class/function for +deserialization. In this way, the type evolution will be as fast as type consistent mode. + +For C++/Rust, we can't generate the serializer code at runtime. So we need to generate the code at compile-time using +meta programming. But at that time, we don't know the type schema in other processes, so we can't generate the +serializer code for such inconsistent types. We may need to generate code which has a loop and compares field names +one by one to decide whether to deserialize and assign the field or skip the field value. + +One fast way is that we can optimize the string comparison into `jump` instructions: + +- Assume the current type has `n` fields, and the peer type has `n1` fields. +- Generate an auto growing `field id` from `0` for every sorted field in the current type at compile time. +- Compare the received type meta with current type, generate same id if the field name is same, otherwise generate an + auto growing id starting from `n`, cache this meta at runtime. +- Iterate the fields of received type meta, use a `switch` to compare the `field id` to deserialize data + and `assign/skip` field value. **Continuous** field id will be optimized into `jump` in `switch` block, so it will + be very fast. 
+ +Here is an example, suppose process A has a class `Foo` with version 1 defined as `Foo1`, process B has a class `Foo` +with version 2 defined as `Foo2`: + +```c++ +// class Foo with version 1 +class Foo1 { + int32_t v1; // id 0 + std::string v2; // id 1 +}; +// class Foo with version 2 +class Foo2 { + // id 0, but will have id 2 in process A + bool v0; + // id 1, but will have id 0 in process A + int32_t v1; + // id 2, but will have id 3 in process A + int64_t long_value; + // id 3, but will have id 1 in process A + std::string v2; + // id 4, but will have id 4 in process A + std::vector list; +}; +``` + +When process A received serialized `Foo2` from process B, here is how it deserialize the data: + +```c++ +Foo1 foo1 = ...; +const std::vector &field_infos = type_meta.field_infos; +for (const auto &field_info : field_infos) { + switch (field_info.field_id) { + case 0: + foo1.v1 = buffer.read_varint32(); + break; + case 1: + foo1.v2 = fury.read_string(); + break; + default: + fury.skip_data(field_info); + } +} +``` diff --git a/versioned_docs/version-0.10.0/start/install.md b/versioned_docs/version-0.10.0/start/install.md new file mode 100644 index 00000000000..1443db69d84 --- /dev/null +++ b/versioned_docs/version-0.10.0/start/install.md @@ -0,0 +1,73 @@ +--- +id: install +title: Install +sidebar_position: 0 +--- + +The official Apache Fury releases are provided as source artifacts. + +For source download, please see Fury [download](https://fury.apache.org/download) page. 
+ +## Java + +To add a dependency on Fury using Maven, use the following: + +```xml + + org.apache.fury + fury-core + 0.10.0 + + + +``` + +## Scala + +To add a dependency on Fury scala for scala 2.13 with maven, use the following: + +```xml + + org.apache.fury + fury-scala_2.13 + 0.10.0 + +``` + +To add a dependency on Fury scala for scala 3 with maven, use the following: + +```xml + + org.apache.fury + fury-scala_3 + 0.10.0 + +``` + +To add a dependency on Fury scala for scala 2.13 with sbt, use the following: + +```sbt +libraryDependencies += "org.apache.fury" % "fury-scala_2.13" % "0.10.0" +``` + +To add a dependency on Fury scala for scala 3 with sbt, use the following: + +```sbt +libraryDependencies += "org.apache.fury" % "fury-scala_3" % "0.10.0" +``` + +## Kotlin + +To add a dependency on Fury kotlin with maven, use the following: + +```xml + + org.apache.fury + fury-kotlin + 0.10.0 + +``` diff --git a/versioned_docs/version-0.10.0/start/usage.md b/versioned_docs/version-0.10.0/start/usage.md new file mode 100644 index 00000000000..ebbd4b20a5b --- /dev/null +++ b/versioned_docs/version-0.10.0/start/usage.md @@ -0,0 +1,237 @@ +--- +id: usage +title: Usage +sidebar_position: 1 +--- + +## Java Serialization + +```java +import java.util.List; +import java.util.Arrays; +import org.apache.fury.*; + +public class Example { + public static void main(String[] args) { + SomeClass object = new SomeClass(); + // Note that Fury instances should be reused between + // multiple serializations of different objects. + Fury fury = Fury.builder().withLanguage(Language.JAVA) + // Allow to deserialize objects unknown types, + // more flexible but less secure. + // .requireClassRegistration(false) + .build(); + // Registering types can reduce class name serialization overhead, but not mandatory. + // If secure mode enabled, all custom types must be registered. 
+ fury.register(SomeClass.class); + byte[] bytes = fury.serialize(object); + System.out.println(fury.deserialize(bytes)); + } +} +``` + +## Scala Serialization + +```scala +import org.apache.fury.Fury +import org.apache.fury.serializer.scala.ScalaSerializers + +case class Person(name: String, id: Long, github: String) +case class Point(x : Int, y : Int, z : Int) + +object ScalaExample { + val fury: Fury = Fury.builder().withScalaOptimizationEnabled(true).build() + // Register optimized fury serializers for scala + ScalaSerializers.registerSerializers(fury) + fury.register(classOf[Person]) + fury.register(classOf[Point]) + + def main(args: Array[String]): Unit = { + val p = Person("Shawn Yang", 1, "https://github.com/chaokunyang") + println(fury.deserialize(fury.serialize(p))) + println(fury.deserialize(fury.serialize(Point(1, 2, 3)))) + } +} +``` + +## Kotlin Serialization + +```kotlin +import org.apache.fury.Fury +import org.apache.fury.ThreadSafeFury +import org.apache.fury.serializer.kotlin.KotlinSerializers + +data class Person(val name: String, val id: Long, val github: String) +data class Point(val x : Int, val y : Int, val z : Int) + +fun main(args: Array) { + // Note: following fury init code should be executed only once in a global scope instead + // of initializing it everytime when serialization. 
+ val fury: ThreadSafeFury = Fury.builder().requireClassRegistration(true).buildThreadSafeFury() + KotlinSerializers.registerSerializers(fury) + fury.register(Person::class.java) + fury.register(Point::class.java) + + val p = Person("Shawn Yang", 1, "https://github.com/chaokunyang") + println(fury.deserialize(fury.serialize(p))) + println(fury.deserialize(fury.serialize(Point(1, 2, 3)))) +} +``` + +## CrossLanguage Serialization + +### Java + +```java +import com.google.common.collect.ImmutableMap; +import org.apache.fury.*; + +import java.util.Map; + +public class ReferenceExample { + public static class SomeClass { + SomeClass f1; + Map f2; + Map f3; + } + + public static Object createObject() { + SomeClass obj = new SomeClass(); + obj.f1 = obj; + obj.f2 = ImmutableMap.of("k1", "v1", "k2", "v2"); + obj.f3 = obj.f2; + return obj; + } + + // mvn exec:java -Dexec.mainClass="io.fury.examples.ReferenceExample" + public static void main(String[] args) { + Fury fury = Fury.builder().withLanguage(Language.XLANG) + .withRefTracking(true).build(); + fury.register(SomeClass.class, "example.SomeClass"); + byte[] bytes = fury.serialize(createObject()); + // bytes can be data serialized by other languages. + System.out.println(fury.deserialize(bytes)); + ; + } +} +``` + +### Python + +```python +from typing import Dict +import pyfury + +class SomeClass: + f1: "SomeClass" + f2: Dict[str, str] + f3: Dict[str, str] + +fury = pyfury.Fury(ref_tracking=True) +fury.register_class(SomeClass, "example.SomeClass") +obj = SomeClass() +obj.f2 = {"k1": "v1", "k2": "v2"} +obj.f1, obj.f3 = obj, obj.f2 +data = fury.serialize(obj) +# bytes can be data serialized by other languages. 
+print(fury.deserialize(data)) +``` + +### Golang + +```go +package main + +import ( + "fmt" + furygo "github.com/apache/fury/go/fury" +) + +func main() { + type SomeClass struct { + F1 *SomeClass + F2 map[string]string + F3 map[string]string + } + fury := furygo.NewFury(true) + if err := fury.RegisterTagType("example.SomeClass", SomeClass{}); err != nil { + panic(err) + } + value := &SomeClass{F2: map[string]string{"k1": "v1", "k2": "v2"}} + value.F3 = value.F2 + value.F1 = value + bytes, err := fury.Marshal(value) + if err != nil { + } + var newValue interface{} + // bytes can be data serialized by other languages. + if err := fury.Unmarshal(bytes, &newValue); err != nil { + panic(err) + } + fmt.Println(newValue) +} +``` + +### JavaScript + +```typescript +import Fury, { Type } from '@furyjs/fury'; + +/** + * @furyjs/hps use v8's fast-calls-api that can be called directly by jit, ensure that the version of Node is 20 or above. + * Experimental feature, installation success cannot be guaranteed at this moment + * If you are unable to install the module, replace it with `const hps = null;` + **/ +import hps from '@furyjs/hps'; + +// Now we describe data structures using JSON, but in the future, we will use more ways. 
+const description = Type.object('example.foo', { + foo: Type.string(), +}); +const fury = new Fury({ hps }); +const { serialize, deserialize } = fury.registerSerializer(description); +const input = serialize({ foo: 'hello fury' }); +const result = deserialize(input); +console.log(result); +``` + +### Rust + +```rust +use fury::{from_buffer, to_buffer, Fury}; + +#[derive(Fury, Debug, PartialEq)] +#[tag("example.foo")] +struct Animal { + name: String, + category: String, +} + +#[derive(Fury, Debug, PartialEq)] +#[tag("example.bar")] +struct Person { + name: String, + age: u32, + pets: Vec, +} + +fn main() { + let penson = Person { + name: "hello".to_string(), + age: 12, + pets: vec![ + Animal { + name: "world1".to_string(), + category: "cat".to_string(), + }, + Animal { + name: "world2".to_string(), + category: "dog".to_string(), + }, + ], + }; + let bin = to_buffer(&penson); + let obj: Person = from_buffer(&bin).expect("should success"); + assert_eq!(obj, penson); +} +``` diff --git a/versioned_sidebars/version-0.10.0-sidebars.json b/versioned_sidebars/version-0.10.0-sidebars.json new file mode 100644 index 00000000000..af81029c4bb --- /dev/null +++ b/versioned_sidebars/version-0.10.0-sidebars.json @@ -0,0 +1,32 @@ +{ + "startSidebar": [ + { + "type": "autogenerated", + "dirName": "start" + } + ], + "introductionSidebar": [ + { + "type": "autogenerated", + "dirName": "introduction" + } + ], + "specificationSidebar": [ + { + "type": "autogenerated", + "dirName": "specification" + } + ], + "guideSidebar": [ + { + "type": "autogenerated", + "dirName": "guide" + } + ], + "communitySidebar": [ + { + "type": "autogenerated", + "dirName": "community" + } + ] +} diff --git a/versions.json b/versions.json new file mode 100644 index 00000000000..d7be0f6e22c --- /dev/null +++ b/versions.json @@ -0,0 +1,3 @@ +[ + "0.10.0" +]