From 6ce4c4ebf2d4c0a27feb7f341c6eee2525a1cce7 Mon Sep 17 00:00:00 2001
From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com>
Date: Fri, 29 Jul 2022 09:05:02 +0100
Subject: [PATCH] Faster parquet DictEncoder (~20%) (#2123)

* Faster parquet DictEncoder

* Reserve dictionary capacity

* Split out interner

* Fix RAT
---
 parquet/Cargo.toml                            |   2 +
 .../src/encodings/encoding/dict_encoder.rs    | 185 ++++++++++++++
 .../{encoding.rs => encoding/mod.rs}          | 229 +-----------------
 parquet/src/util/hash_util.rs                 | 162 -------------
 parquet/src/util/interner.rs                  |  94 +++++++
 parquet/src/util/mod.rs                       |   4 +-
 6 files changed, 290 insertions(+), 386 deletions(-)
 create mode 100644 parquet/src/encodings/encoding/dict_encoder.rs
 rename parquet/src/encodings/{encoding.rs => encoding/mod.rs} (84%)
 delete mode 100644 parquet/src/util/hash_util.rs
 create mode 100644 parquet/src/util/interner.rs
diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml
index eb620773325..a0e164150aa 100644
--- a/parquet/Cargo.toml
+++ b/parquet/Cargo.toml
@@ -30,6 +30,7 @@ edition = "2021"
 rust-version = "1.62"
 
 [dependencies]
+ahash = "0.7"
 parquet-format = { version = "4.0.0", default-features = false }
 bytes = { version = "1.1", default-features = false, features = ["std"] }
 byteorder = { version = "1", default-features = false }
@@ -49,6 +50,7 @@ serde_json = { version = "1.0", default-features = false, features = ["std"], op
 rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] }
 futures = { version = "0.3", default-features = false, features = ["std"], optional = true }
 tokio = { version = "1.0", optional = true, default-features = false, features = ["macros", "fs", "rt", "io-util"] }
+hashbrown = { version = "0.12", default-features = false }
 
 [dev-dependencies]
 base64 = { version = "0.13", default-features = false, features = ["std"] }
diff --git a/parquet/src/encodings/encoding/dict_encoder.rs b/parquet/src/encodings/encoding/dict_encoder.rs
new file mode 100644
index 00000000000..7bf98325466
--- /dev/null
+++ b/parquet/src/encodings/encoding/dict_encoder.rs
@@ -0,0 +1,185 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// ----------------------------------------------------------------------
+// Dictionary encoding
+
+use crate::basic::{Encoding, Type};
+use crate::data_type::private::ParquetValueType;
+use crate::data_type::{AsBytes, DataType};
+use crate::encodings::encoding::{Encoder, PlainEncoder};
+use crate::encodings::rle::RleEncoder;
+use crate::errors::{ParquetError, Result};
+use crate::schema::types::ColumnDescPtr;
+use crate::util::bit_util::num_required_bits;
+use crate::util::interner::{Interner, Storage};
+use crate::util::memory::ByteBufferPtr;
+use std::io::Write;
+
+#[derive(Debug)]
+struct KeyStorage<T: DataType> {
+    uniques: Vec<T::T>,
+
+    size_in_bytes: usize,
+
+    type_length: usize,
+}
+
+impl<T: DataType> Storage for KeyStorage<T> {
+    type Key = u64;
+    type Value = T::T;
+
+    fn get(&self, idx: Self::Key) -> &Self::Value {
+        &self.uniques[idx as usize]
+    }
+
+    fn push(&mut self, value: &Self::Value) -> Self::Key {
+        let (base_size, num_elements) = value.dict_encoding_size();
+
+        let unique_size = match T::get_physical_type() {
+            Type::BYTE_ARRAY => base_size + num_elements,
+            Type::FIXED_LEN_BYTE_ARRAY => self.type_length,
+            _ => base_size,
+        };
+        self.size_in_bytes += unique_size;
+
+        let key = self.uniques.len() as u64;
+        self.uniques.push(value.clone());
+        key
+    }
+}
+
+/// Dictionary encoder.
+/// The dictionary encoding builds a dictionary of values encountered in a given column.
+/// The dictionary page is written first, before the data pages of the column chunk.
+///
+/// Dictionary page format: the entries in the dictionary - in dictionary order -
+/// using the plain encoding.
+///
+/// Data page format: the bit width used to encode the entry ids stored as 1 byte
+/// (max bit width = 32), followed by the values encoded using RLE/Bit packed described
+/// above (with the given bit width).
+pub struct DictEncoder<T: DataType> {
+    /// Descriptor for the column to be encoded.
+    desc: ColumnDescPtr,
+
+    interner: Interner<KeyStorage<T>>,
+
+    /// The buffered indices
+    indices: Vec<u64>,
+}
+
+impl<T: DataType> DictEncoder<T> {
+    /// Creates new dictionary encoder.
+    pub fn new(desc: ColumnDescPtr) -> Self {
+        let storage = KeyStorage {
+            uniques: vec![],
+            size_in_bytes: 0,
+            type_length: desc.type_length() as usize,
+        };
+
+        Self {
+            desc,
+            interner: Interner::new(storage),
+            indices: vec![],
+        }
+    }
+
+    /// Returns true if dictionary entries are sorted, false otherwise.
+    pub fn is_sorted(&self) -> bool {
+        // Sorting is not supported currently.
+        false
+    }
+
+    /// Returns number of unique values (keys) in the dictionary.
+    pub fn num_entries(&self) -> usize {
+        self.interner.storage().uniques.len()
+    }
+
+    /// Returns size of unique values (keys) in the dictionary, in bytes.
+    pub fn dict_encoded_size(&self) -> usize {
+        self.interner.storage().size_in_bytes
+    }
+
+    /// Writes out the dictionary values with PLAIN encoding in a byte buffer, and returns
+    /// the result.
+    pub fn write_dict(&self) -> Result<ByteBufferPtr> {
+        let mut plain_encoder = PlainEncoder::<T>::new(self.desc.clone(), vec![]);
+        plain_encoder.put(&self.interner.storage().uniques)?;
+        plain_encoder.flush_buffer()
+    }
+
+    /// Writes out the buffered indices with RLE encoding in a byte buffer, clears them,
+    /// and returns the result.
+    pub fn write_indices(&mut self) -> Result<ByteBufferPtr> {
+        let buffer_len = self.estimated_data_encoded_size();
+        let mut buffer = vec![0; buffer_len];
+        buffer[0] = self.bit_width() as u8;
+
+        // Write bit width in the first byte
+        buffer.write_all((self.bit_width() as u8).as_bytes())?;
+        let mut encoder = RleEncoder::new_from_buf(self.bit_width(), buffer, 1);
+        for index in &self.indices {
+            if !encoder.put(*index as u64)? {
+                return Err(general_err!("Encoder doesn't have enough space"));
+            }
+        }
+        self.indices.clear();
+        Ok(ByteBufferPtr::new(encoder.consume()?))
+    }
+
+    fn put_one(&mut self, value: &T::T) {
+        self.indices.push(self.interner.intern(value));
+    }
+
+    #[inline]
+    fn bit_width(&self) -> u8 {
+        let num_entries = self.num_entries();
+        if num_entries <= 1 {
+            num_entries as u8
+        } else {
+            num_required_bits(num_entries as u64 - 1)
+        }
+    }
+}
+
+impl<T: DataType> Encoder<T> for DictEncoder<T> {
+    fn put(&mut self, values: &[T::T]) -> Result<()> {
+        self.indices.reserve(values.len());
+        for i in values {
+            self.put_one(i)
+        }
+        Ok(())
+    }
+
+    // Performance Note:
+    // As far as can be seen these functions are rarely called and as such they don't need
+    // to be folded into hot locations in the final output; no inlining hint is applied.
+    fn encoding(&self) -> Encoding {
+        Encoding::PLAIN_DICTIONARY
+    }
+
+    fn estimated_data_encoded_size(&self) -> usize {
+        let bit_width = self.bit_width();
+        1 + RleEncoder::min_buffer_size(bit_width)
+            + RleEncoder::max_buffer_size(bit_width, self.indices.len())
+    }
+
+    fn flush_buffer(&mut self) -> Result<ByteBufferPtr> {
+        self.write_indices()
+    }
+}
diff --git a/parquet/src/encodings/encoding.rs b/parquet/src/encodings/encoding/mod.rs
similarity index 84%
rename from parquet/src/encodings/encoding.rs
rename to parquet/src/encodings/encoding/mod.rs
index 651635af59c..5cb94b7c0ae 100644
--- a/parquet/src/encodings/encoding.rs
+++ b/parquet/src/encodings/encoding/mod.rs
@@ -17,7 +17,7 @@
 
 //! Contains all supported encoders for Parquet.
 
-use std::{cmp, io::Write, marker::PhantomData};
+use std::{cmp, marker::PhantomData};
 
 use crate::basic::*;
 use crate::data_type::private::ParquetValueType;
@@ -27,10 +27,13 @@ use crate::errors::{ParquetError, Result};
 use crate::schema::types::ColumnDescPtr;
 use crate::util::{
     bit_util::{self, num_required_bits, BitWriter},
-    hash_util,
     memory::ByteBufferPtr,
 };
 
+pub use dict_encoder::DictEncoder;
+
+mod dict_encoder;
+
 // ----------------------------------------------------------------------
 // Encoders
 
@@ -154,225 +157,6 @@ impl<T: DataType> Encoder<T> for PlainEncoder<T> {
     }
 }
 
-// ----------------------------------------------------------------------
-// Dictionary encoding
-
-const INITIAL_HASH_TABLE_SIZE: usize = 1024;
-const MAX_HASH_LOAD: f32 = 0.7;
-const HASH_SLOT_EMPTY: i32 = -1;
-
-/// Dictionary encoder.
-/// The dictionary encoding builds a dictionary of values encountered in a given column.
-/// The dictionary page is written first, before the data pages of the column chunk.
-///
-/// Dictionary page format: the entries in the dictionary - in dictionary order -
-/// using the plain encoding.
-///
-/// Data page format: the bit width used to encode the entry ids stored as 1 byte
-/// (max bit width = 32), followed by the values encoded using RLE/Bit packed described
-/// above (with the given bit width).
-pub struct DictEncoder<T: DataType> {
-    // Descriptor for the column to be encoded.
-    desc: ColumnDescPtr,
-
-    // Size of the table. **Must be** a power of 2.
-    hash_table_size: usize,
-
-    // Store `hash_table_size` - 1, so that `j & mod_bitmask` is equivalent to
-    // `j % hash_table_size`, but uses far fewer CPU cycles.
-    mod_bitmask: u32,
-
-    // Stores indices which map (many-to-one) to the values in the `uniques` array.
-    // Here we are using fix-sized array with linear probing.
-    // A slot with `HASH_SLOT_EMPTY` indicates the slot is not currently occupied.
-    hash_slots: Vec<i32>,
-
-    // Indices that have not yet be written out by `write_indices()`.
-    buffered_indices: Vec<i32>,
-
-    // The unique observed values.
-    uniques: Vec<T::T>,
-
-    // Size in bytes needed to encode this dictionary.
-    uniques_size_in_bytes: usize,
-}
-
-impl<T: DataType> DictEncoder<T> {
-    /// Creates new dictionary encoder.
-    pub fn new(desc: ColumnDescPtr) -> Self {
-        let mut slots = vec![];
-        slots.resize(INITIAL_HASH_TABLE_SIZE, -1);
-        Self {
-            desc,
-            hash_table_size: INITIAL_HASH_TABLE_SIZE,
-            mod_bitmask: (INITIAL_HASH_TABLE_SIZE - 1) as u32,
-            hash_slots: slots,
-            buffered_indices: vec![],
-            uniques: vec![],
-            uniques_size_in_bytes: 0,
-        }
-    }
-
-    /// Returns true if dictionary entries are sorted, false otherwise.
-    #[inline]
-    pub fn is_sorted(&self) -> bool {
-        // Sorting is not supported currently.
-        false
-    }
-
-    /// Returns number of unique values (keys) in the dictionary.
-    pub fn num_entries(&self) -> usize {
-        self.uniques.len()
-    }
-
-    /// Returns size of unique values (keys) in the dictionary, in bytes.
-    pub fn dict_encoded_size(&self) -> usize {
-        self.uniques_size_in_bytes
-    }
-
-    /// Writes out the dictionary values with PLAIN encoding in a byte buffer, and return
-    /// the result.
-    #[inline]
-    pub fn write_dict(&self) -> Result<ByteBufferPtr> {
-        let mut plain_encoder = PlainEncoder::<T>::new(self.desc.clone(), vec![]);
-        plain_encoder.put(&self.uniques)?;
-        plain_encoder.flush_buffer()
-    }
-
-    /// Writes out the dictionary values with RLE encoding in a byte buffer, and return
-    /// the result.
-    pub fn write_indices(&mut self) -> Result<ByteBufferPtr> {
-        // TODO: the caller should allocate the buffer
-        let buffer_len = self.estimated_data_encoded_size();
-        let mut buffer: Vec<u8> = vec![0; buffer_len as usize];
-        buffer[0] = self.bit_width() as u8;
-
-        // Write bit width in the first byte
-        buffer.write_all((self.bit_width() as u8).as_bytes())?;
-        let mut encoder = RleEncoder::new_from_buf(self.bit_width(), buffer, 1);
-        for index in &self.buffered_indices {
-            if !encoder.put(*index as u64)? {
-                return Err(general_err!("Encoder doesn't have enough space"));
-            }
-        }
-        self.buffered_indices.clear();
-        Ok(ByteBufferPtr::new(encoder.consume()?))
-    }
-
-    #[inline]
-    #[allow(clippy::unnecessary_wraps)]
-    fn put_one(&mut self, value: &T::T) -> Result<()> {
-        let mut j = (hash_util::hash(value, 0) & self.mod_bitmask) as usize;
-        let mut index = self.hash_slots[j];
-
-        while index != HASH_SLOT_EMPTY && self.uniques[index as usize] != *value {
-            j += 1;
-            if j == self.hash_table_size {
-                j = 0;
-            }
-            index = self.hash_slots[j];
-        }
-
-        if index == HASH_SLOT_EMPTY {
-            index = self.insert_fresh_slot(j, value.clone());
-        }
-
-        self.buffered_indices.push(index);
-        Ok(())
-    }
-
-    #[inline(never)]
-    fn insert_fresh_slot(&mut self, slot: usize, value: T::T) -> i32 {
-        let index = self.uniques.len() as i32;
-        self.hash_slots[slot] = index;
-
-        let (base_size, num_elements) = value.dict_encoding_size();
-
-        let unique_size = match T::get_physical_type() {
-            Type::BYTE_ARRAY => base_size + num_elements,
-            Type::FIXED_LEN_BYTE_ARRAY => self.desc.type_length() as usize,
-            _ => base_size,
-        };
-
-        self.uniques_size_in_bytes += unique_size;
-        self.uniques.push(value);
-
-        if self.uniques.len() > (self.hash_table_size as f32 * MAX_HASH_LOAD) as usize {
-            self.double_table_size();
-        }
-
-        index
-    }
-
-    #[inline]
-    fn bit_width(&self) -> u8 {
-        let num_entries = self.uniques.len();
-        if num_entries <= 1 {
-            num_entries as u8
-        } else {
-            num_required_bits(num_entries as u64 - 1)
-        }
-    }
-
-    fn double_table_size(&mut self) {
-        let new_size = self.hash_table_size * 2;
-        let mut new_hash_slots = vec![];
-        new_hash_slots.resize(new_size, HASH_SLOT_EMPTY);
-        for i in 0..self.hash_table_size {
-            let index = self.hash_slots[i];
-            if index == HASH_SLOT_EMPTY {
-                continue;
-            }
-            let value = &self.uniques[index as usize];
-            let mut j = (hash_util::hash(value, 0) & ((new_size - 1) as u32)) as usize;
-            let mut slot = new_hash_slots[j];
-            while slot != HASH_SLOT_EMPTY && self.uniques[slot as usize] != *value {
-                j += 1;
-                if j == new_size {
-                    j = 0;
-                }
-                slot = new_hash_slots[j];
-            }
-
-            new_hash_slots[j] = index;
-        }
-
-        self.hash_table_size = new_size;
-        self.mod_bitmask = (new_size - 1) as u32;
-        self.hash_slots = new_hash_slots;
-    }
-}
-
-impl<T: DataType> Encoder<T> for DictEncoder<T> {
-    #[inline]
-    fn put(&mut self, values: &[T::T]) -> Result<()> {
-        for i in values {
-            self.put_one(i)?
-        }
-        Ok(())
-    }
-
-    // Performance Note:
-    // As far as can be seen these functions are rarely called and as such we can hint to the
-    // compiler that they dont need to be folded into hot locations in the final output.
-    #[cold]
-    fn encoding(&self) -> Encoding {
-        Encoding::PLAIN_DICTIONARY
-    }
-
-    #[inline]
-    fn estimated_data_encoded_size(&self) -> usize {
-        let bit_width = self.bit_width();
-        1 + RleEncoder::min_buffer_size(bit_width)
-            + RleEncoder::max_buffer_size(bit_width, self.buffered_indices.len())
-    }
-
-    #[inline]
-    fn flush_buffer(&mut self) -> Result<ByteBufferPtr> {
-        self.write_indices()
-    }
-}
-
 // ----------------------------------------------------------------------
 // RLE encoding
 
@@ -587,7 +371,8 @@ impl<T: DataType> DeltaBitPackEncoder<T> {
             }
 
             // Compute bit width to store (max_delta - min_delta)
-            let bit_width = num_required_bits(self.subtract_u64(max_delta, min_delta)) as usize;
+            let bit_width =
+                num_required_bits(self.subtract_u64(max_delta, min_delta)) as usize;
             self.bit_writer.write_at(offset + i, bit_width as u8);
 
             // Encode values in current mini block using min_delta and bit_width
diff --git a/parquet/src/util/hash_util.rs b/parquet/src/util/hash_util.rs
deleted file mode 100644
index dd23e7a65f4..00000000000
--- a/parquet/src/util/hash_util.rs
+++ /dev/null
@@ -1,162 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use crate::data_type::AsBytes;
-
-/// Computes hash value for `data`, with a seed value `seed`.
-/// The data type `T` must implement the `AsBytes` trait.
-pub fn hash<T: AsBytes>(data: &T, seed: u32) -> u32 {
-    hash_(data.as_bytes(), seed)
-}
-
-fn hash_(data: &[u8], seed: u32) -> u32 {
-    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-    {
-        if is_x86_feature_detected!("sse4.2") {
-            unsafe { crc32_hash(data, seed) }
-        } else {
-            murmur_hash2_64a(data, seed as u64) as u32
-        }
-    }
-
-    #[cfg(any(
-        target_arch = "aarch64",
-        target_arch = "arm",
-        target_arch = "riscv64",
-        target_arch = "wasm32"
-    ))]
-    {
-        murmur_hash2_64a(data, seed as u64) as u32
-    }
-}
-
-const MURMUR_PRIME: u64 = 0xc6a4a7935bd1e995;
-const MURMUR_R: i32 = 47;
-
-/// Rust implementation of MurmurHash2, 64-bit version for 64-bit platforms
-fn murmur_hash2_64a(data_bytes: &[u8], seed: u64) -> u64 {
-    let len = data_bytes.len();
-    let len_64 = (len / 8) * 8;
-
-    let mut h = seed ^ (MURMUR_PRIME.wrapping_mul(data_bytes.len() as u64));
-    for mut k in data_bytes
-        .chunks_exact(8)
-        .map(|chunk| u64::from_ne_bytes(chunk.try_into().unwrap()))
-    {
-        k = k.wrapping_mul(MURMUR_PRIME);
-        k ^= k >> MURMUR_R;
-        k = k.wrapping_mul(MURMUR_PRIME);
-        h ^= k;
-        h = h.wrapping_mul(MURMUR_PRIME);
-    }
-
-    let data2 = &data_bytes[len_64..];
-
-    let v = len & 7;
-    if v == 7 {
-        h ^= (data2[6] as u64) << 48;
-    }
-    if v >= 6 {
-        h ^= (data2[5] as u64) << 40;
-    }
-    if v >= 5 {
-        h ^= (data2[4] as u64) << 32;
-    }
-    if v >= 4 {
-        h ^= (data2[3] as u64) << 24;
-    }
-    if v >= 3 {
-        h ^= (data2[2] as u64) << 16;
-    }
-    if v >= 2 {
-        h ^= (data2[1] as u64) << 8;
-    }
-    if v >= 1 {
-        h ^= data2[0] as u64;
-    }
-    if v > 0 {
-        h = h.wrapping_mul(MURMUR_PRIME);
-    }
-
-    h ^= h >> MURMUR_R;
-    h = h.wrapping_mul(MURMUR_PRIME);
-    h ^= h >> MURMUR_R;
-    h
-}
-
-/// CRC32 hash implementation using SSE4 instructions. Borrowed from Impala.
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-#[target_feature(enable = "sse4.2")]
-unsafe fn crc32_hash(bytes: &[u8], seed: u32) -> u32 {
-    #[cfg(target_arch = "x86")]
-    use std::arch::x86::*;
-    #[cfg(target_arch = "x86_64")]
-    use std::arch::x86_64::*;
-
-    let mut hash = seed;
-    for chunk in bytes
-        .chunks_exact(4)
-        .map(|chunk| u32::from_le_bytes(chunk.try_into().unwrap()))
-    {
-        hash = _mm_crc32_u32(hash, chunk);
-    }
-
-    let remainder = bytes.len() % 4;
-
-    for byte in &bytes[bytes.len() - remainder..] {
-        hash = _mm_crc32_u8(hash, *byte);
-    }
-
-    // The lower half of the CRC hash has poor uniformity, so swap the halves
-    // for anyone who only uses the first several bits of the hash.
-    hash = (hash << 16) | (hash >> 16);
-    hash
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_murmur2_64a() {
-        let result = murmur_hash2_64a(b"hello", 123);
-        assert_eq!(result, 2597646618390559622);
-
-        let result = murmur_hash2_64a(b"helloworld", 123);
-        assert_eq!(result, 4934371746140206573);
-
-        let result = murmur_hash2_64a(b"helloworldparquet", 123);
-        assert_eq!(result, 2392198230801491746);
-    }
-
-    #[test]
-    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-    fn test_crc32() {
-        if is_x86_feature_detected!("sse4.2") {
-            unsafe {
-                let result = crc32_hash(b"hello", 123);
-                assert_eq!(result, 3359043980);
-
-                let result = crc32_hash(b"helloworld", 123);
-                assert_eq!(result, 3971745255);
-
-                let result = crc32_hash(b"helloworldparquet", 123);
-                assert_eq!(result, 1124504676);
-            }
-        }
-    }
-}
diff --git a/parquet/src/util/interner.rs b/parquet/src/util/interner.rs
new file mode 100644
index 00000000000..e64ae0179e6
--- /dev/null
+++ b/parquet/src/util/interner.rs
@@ -0,0 +1,94 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::data_type::AsBytes;
+use hashbrown::hash_map::RawEntryMut;
+use hashbrown::HashMap;
+use std::hash::Hash;
+
+/// Storage trait for [`Interner`]
+pub trait Storage {
+    type Key: Copy;
+
+    type Value: AsBytes + PartialEq + ?Sized;
+
+    /// Gets an element by its key
+    fn get(&self, idx: Self::Key) -> &Self::Value;
+
+    /// Adds a new element, returning the key
+    fn push(&mut self, value: &Self::Value) -> Self::Key;
+}
+
+/// A generic value interner supporting various different [`Storage`]
+pub struct Interner<S: Storage> {
+    state: ahash::RandomState,
+
+    /// Used to provide a lookup from value to unique value
+    ///
+    /// Note: `S::Key`'s hash implementation is not used; instead the raw entry
+    /// API is used to store keys w.r.t. the hash of the values they refer to
+    ///
+    dedup: HashMap<S::Key, (), ()>,
+
+    storage: S,
+}
+
+impl<S: Storage> Interner<S> {
+    /// Create a new `Interner` with the provided storage
+    pub fn new(storage: S) -> Self {
+        Self {
+            state: Default::default(),
+            dedup: Default::default(),
+            storage,
+        }
+    }
+
+    /// Intern the value, returning its key; a value not seen before is pushed to storage
+    pub fn intern(&mut self, value: &S::Value) -> S::Key {
+        let hash = compute_hash(&self.state, value);
+
+        let entry = self
+            .dedup
+            .raw_entry_mut()
+            .from_hash(hash, |index| value == self.storage.get(*index));
+
+        match entry {
+            RawEntryMut::Occupied(entry) => *entry.into_key(),
+            RawEntryMut::Vacant(entry) => {
+                let key = self.storage.push(value);
+
+                *entry
+                    .insert_with_hasher(hash, key, (), |key| {
+                        compute_hash(&self.state, self.storage.get(*key))
+                    })
+                    .0
+            }
+        }
+    }
+
+    /// Returns the storage for this interner
+    pub fn storage(&self) -> &S {
+        &self.storage
+    }
+}
+
+fn compute_hash<T: AsBytes + ?Sized>(state: &ahash::RandomState, value: &T) -> u64 {
+    use std::hash::{BuildHasher, Hasher};
+    let mut hasher = state.build_hasher();
+    value.as_bytes().hash(&mut hasher);
+    hasher.finish()
+}
diff --git a/parquet/src/util/mod.rs b/parquet/src/util/mod.rs
index b49e3251692..01ac39116dc 100644
--- a/parquet/src/util/mod.rs
+++ b/parquet/src/util/mod.rs
@@ -21,10 +21,10 @@ pub mod memory;
 pub mod bit_util;
 mod bit_packing;
 pub mod cursor;
-pub mod hash_util;
+pub(crate) mod interner;
+pub(crate) mod page_util;
 #[cfg(any(test, feature = "test_common"))]
 pub(crate) mod test_common;
-pub(crate)mod page_util;
 
 #[cfg(any(test, feature = "test_common"))]
 pub use self::test_common::page_util::{