From 332ec23cd72a9916755d1b6ad6a934b27b3bcd95 Mon Sep 17 00:00:00 2001 From: chaokunyang Date: Fri, 5 May 2023 16:39:25 +0800 Subject: [PATCH] add enum string resolver support --- .../io/fury/resolver/EnumStringBytes.java | 96 +++++++++ .../io/fury/resolver/EnumStringResolver.java | 191 ++++++++++++++++++ .../fury/resolver/EnumStringResolverTest.java | 46 +++++ 3 files changed, 333 insertions(+) create mode 100644 java/fury-core/src/main/java/io/fury/resolver/EnumStringBytes.java create mode 100644 java/fury-core/src/main/java/io/fury/resolver/EnumStringResolver.java create mode 100644 java/fury-core/src/test/java/io/fury/resolver/EnumStringResolverTest.java diff --git a/java/fury-core/src/main/java/io/fury/resolver/EnumStringBytes.java b/java/fury-core/src/main/java/io/fury/resolver/EnumStringBytes.java new file mode 100644 index 0000000000..d8ce3eef00 --- /dev/null +++ b/java/fury-core/src/main/java/io/fury/resolver/EnumStringBytes.java @@ -0,0 +1,96 @@ +/* + * Copyright 2023 The Fury authors + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.fury.resolver; + +import com.google.common.base.Preconditions; +import io.fury.annotation.Internal; +import io.fury.util.MurmurHash3; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; + +@Internal +public final class EnumStringBytes { + static final short DEFAULT_DYNAMIC_WRITE_STRING_ID = -1; + + final byte[] bytes; + final long hashCode; + short dynamicWriteStringId = DEFAULT_DYNAMIC_WRITE_STRING_ID; + + /** + * Create a binary EnumString. + * + * @param bytes String encoded bytes. + * @param hashCode String hash code. This should be unique and has no hash collision, and be + * deterministic, so we can use cache to reduce hash loop up for read. + */ + public EnumStringBytes(byte[] bytes, long hashCode) { + assert hashCode != 0; + this.bytes = bytes; + this.hashCode = hashCode; + } + + public EnumStringBytes(String string) { + byte[] classNameBytes = string.getBytes(StandardCharsets.UTF_8); + Preconditions.checkArgument(classNameBytes.length <= Short.MAX_VALUE); + // Set seed to ensure hash is deterministic. + long hashCode = + MurmurHash3.murmurhash3_x64_128(classNameBytes, 0, classNameBytes.length, 47)[0]; + if (hashCode == 0) { + // Ensure hashcode is not 0, so we can do some optimization to avoid boxing. + hashCode += 1; + } + this.bytes = classNameBytes; + this.hashCode = hashCode; + } + + @Override + public boolean equals(Object o) { + // EnumStringBytes is used internally, skip unnecessary parameter check. + // if (this == o) { + // return true; + // } + // if (o == null || getClass() != o.getClass()) { + // return false; + // } + EnumStringBytes that = (EnumStringBytes) o; + // Skip compare data for equality for performance. + // Enum string such as classname are very common, compare hashcode only will have better + // performance. + // Java hashcode is 32-bit, so comparing hashCode equality is necessary here. + return hashCode == that.hashCode; + } + + public byte[] getBytes() { + return bytes; + } + + @Override + public int hashCode() { + // equals will compare 8 byte hash code. + return (int) hashCode; + } + + @Override + public String toString() { + // TODO support other str encoding. + String str = new String(bytes); + ; + return "string: " + str + " " + "size: " + bytes.length + " " + Arrays.toString(bytes); + } +} diff --git a/java/fury-core/src/main/java/io/fury/resolver/EnumStringResolver.java b/java/fury-core/src/main/java/io/fury/resolver/EnumStringResolver.java new file mode 100644 index 0000000000..669e7cd2b5 --- /dev/null +++ b/java/fury-core/src/main/java/io/fury/resolver/EnumStringResolver.java @@ -0,0 +1,191 @@ +/* + * Copyright 2023 The Fury authors + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.fury.resolver; + +import io.fury.collection.LongMap; +import io.fury.collection.ObjectMap; +import io.fury.memory.MemoryBuffer; +import java.nio.charset.StandardCharsets; + +/** + * A resolver for limited string value writing. Currently, we only support classname dynamic + * writing. In the future, we may profile string field value dynamically and writing by this + * resolver to reduce string cost. TODO add common inner package names and classnames here. TODO + * share common immutable datastructure globally across multiple fury. + */ +public final class EnumStringResolver { + public static final byte USE_STRING_VALUE = 0; + public static final byte USE_STRING_ID = 1; + private static final int initialCapacity = 8; + // use a lower load factor to minimize hash collision + private static final float furyMapLoadFactor = 0.25f; + + // Every deserialization for unregistered string will query it, performance is important. + private final ObjectMap enumStringBytes2StringMap = + new ObjectMap<>(initialCapacity, furyMapLoadFactor); + private final LongMap hash2EnumStringBytesMap = + new LongMap<>(initialCapacity, furyMapLoadFactor); + // Every enum bytes should be singleton at every fury, since we keep state in it. + private final ObjectMap enumString2BytesMap = + new ObjectMap<>(initialCapacity, furyMapLoadFactor); + private EnumStringBytes[] dynamicWrittenString = new EnumStringBytes[32]; + private EnumStringBytes[] dynamicReadStringIds = new EnumStringBytes[32]; + private short dynamicWriteStringId; + private short dynamicReadStringId; + + public EnumStringResolver() { + dynamicWriteStringId = 0; + dynamicReadStringId = 0; + } + + EnumStringBytes getOrCreateEnumStringBytes(String str) { + EnumStringBytes enumStringBytes = enumString2BytesMap.get(str); + if (enumStringBytes == null) { + enumStringBytes = new EnumStringBytes(str); + enumString2BytesMap.put(str, enumStringBytes); + } + return enumStringBytes; + } + + public void writeEnumString(MemoryBuffer buffer, String str) { + writeEnumStringBytes(buffer, getOrCreateEnumStringBytes(str)); + } + + public String readEnumString(MemoryBuffer buffer) { + EnumStringBytes byteString = readEnumStringBytes(buffer); + String str = enumStringBytes2StringMap.get(byteString); + if (str == null) { // TODO use io.fury.resolver.ObjectMap + str = new String(byteString.bytes, StandardCharsets.UTF_8); + enumStringBytes2StringMap.put(byteString, str); + } + return str; + } + + public void writeEnumStringBytes(MemoryBuffer buffer, EnumStringBytes byteString) { + short id = byteString.dynamicWriteStringId; + int writerIndex = buffer.writerIndex(); + if (id == EnumStringBytes.DEFAULT_DYNAMIC_WRITE_STRING_ID) { + id = dynamicWriteStringId++; + byteString.dynamicWriteStringId = id; + EnumStringBytes[] dynamicWrittenEnumString = this.dynamicWrittenString; + if (dynamicWrittenEnumString.length <= id) { + EnumStringBytes[] tmp = new EnumStringBytes[id * 2]; + System.arraycopy(dynamicWrittenEnumString, 0, tmp, 0, dynamicWrittenEnumString.length); + dynamicWrittenEnumString = tmp; + this.dynamicWrittenString = tmp; + } + dynamicWrittenEnumString[id] = byteString; + int bytesLen = byteString.bytes.length; + buffer.increaseWriterIndex(11 + bytesLen); + buffer.unsafePut(writerIndex, USE_STRING_VALUE); + // Since duplicate enum string writing are avoided by dynamic id, + // use 8-byte hash won't increase too much space. + buffer.unsafePutLong(writerIndex + 1, byteString.hashCode); + buffer.unsafePutShort(writerIndex + 9, (short) bytesLen); + buffer.put(writerIndex + 11, byteString.bytes, 0, bytesLen); + } else { + buffer.increaseWriterIndex(3); + buffer.unsafePut(writerIndex, USE_STRING_ID); + buffer.unsafePutShort(writerIndex + 1, id); + } + } + + EnumStringBytes readEnumStringBytes(MemoryBuffer buffer) { + if (buffer.readByte() == USE_STRING_VALUE) { + long hashCode = buffer.readLong(); + EnumStringBytes byteString = trySkipEnumStringBytes(buffer, hashCode); + updateDynamicString(byteString); + return byteString; + } else { + return dynamicReadStringIds[buffer.readShort()]; + } + } + + EnumStringBytes readEnumStringBytes(MemoryBuffer buffer, EnumStringBytes cache) { + if (buffer.readByte() == USE_STRING_VALUE) { + long hashCode = buffer.readLong(); + if (cache.hashCode == hashCode) { + // skip byteString data + buffer.increaseReaderIndex(2 + cache.bytes.length); + updateDynamicString(cache); + return cache; + } else { + EnumStringBytes byteString = trySkipEnumStringBytes(buffer, hashCode); + updateDynamicString(byteString); + return byteString; + } + } else { + return dynamicReadStringIds[buffer.readShort()]; + } + } + + /** Read enum string by try to reuse previous read {@link EnumStringBytes} object. */ + private EnumStringBytes trySkipEnumStringBytes(MemoryBuffer buffer, long hashCode) { + EnumStringBytes byteString = hash2EnumStringBytesMap.get(hashCode); + if (byteString == null) { + int strBytesLength = buffer.readShort(); + byte[] strBytes = buffer.readBytes(strBytesLength); + byteString = new EnumStringBytes(strBytes, hashCode); + hash2EnumStringBytesMap.put(hashCode, byteString); + } else { + // skip byteString data + buffer.increaseReaderIndex(2 + byteString.bytes.length); + } + return byteString; + } + + private void updateDynamicString(EnumStringBytes byteString) { + short currentDynamicReadId = dynamicReadStringId++; + EnumStringBytes[] dynamicReadStringIds = this.dynamicReadStringIds; + if (dynamicReadStringIds.length <= currentDynamicReadId) { + EnumStringBytes[] tmp = new EnumStringBytes[currentDynamicReadId * 2]; + System.arraycopy(dynamicReadStringIds, 0, tmp, 0, dynamicReadStringIds.length); + dynamicReadStringIds = tmp; + this.dynamicReadStringIds = tmp; + } + dynamicReadStringIds[currentDynamicReadId] = byteString; + } + + public void reset() { + resetRead(); + resetWrite(); + } + + public void resetRead() { + int dynamicReadId = this.dynamicReadStringId; + if (dynamicReadId != 0) { + for (int i = 0; i < dynamicReadId; i++) { + dynamicReadStringIds[i] = null; + } + this.dynamicReadStringId = 0; + } + } + + public void resetWrite() { + int dynamicWriteStringId = this.dynamicWriteStringId; + if (dynamicWriteStringId != 0) { + for (int i = 0; i < dynamicWriteStringId; i++) { + dynamicWrittenString[i].dynamicWriteStringId = + EnumStringBytes.DEFAULT_DYNAMIC_WRITE_STRING_ID; + dynamicWrittenString[i] = null; + } + this.dynamicWriteStringId = 0; + } + } +} diff --git a/java/fury-core/src/test/java/io/fury/resolver/EnumStringResolverTest.java b/java/fury-core/src/test/java/io/fury/resolver/EnumStringResolverTest.java new file mode 100644 index 0000000000..a28fa7c541 --- /dev/null +++ b/java/fury-core/src/test/java/io/fury/resolver/EnumStringResolverTest.java @@ -0,0 +1,46 @@ +/* + * Copyright 2023 The Fury authors + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.fury.resolver; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertTrue; + +import io.fury.memory.MemoryBuffer; +import io.fury.memory.MemoryUtils; +import io.fury.util.StringUtils; +import org.testng.annotations.Test; + +public class EnumStringResolverTest { + + @Test + public void testWriteEnumString() { + MemoryBuffer buffer = MemoryUtils.buffer(32); + String str = StringUtils.random(128, 0); + EnumStringResolver stringResolver = new EnumStringResolver(); + for (int i = 0; i < 128; i++) { + stringResolver.writeEnumString(buffer, str); + } + for (int i = 0; i < 128; i++) { + String enumString = stringResolver.readEnumString(buffer); + assertEquals(enumString.hashCode(), str.hashCode()); + assertEquals(enumString.getBytes(), str.getBytes()); + } + assertTrue(buffer.writerIndex() < str.getBytes().length + 128 * 4); + } +}