Skip to content

Commit

Permalink
Create indexes that store only one of the bands (not finished)
Browse files Browse the repository at this point in the history
This allows creating an index that is distributed across different
processes/machines.
  • Loading branch information
ZJaume committed Jun 12, 2023
1 parent 810b976 commit d0c54b3
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 9 deletions.
22 changes: 16 additions & 6 deletions gaoya/src/minhash/minhash_index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -340,7 +340,7 @@ impl<T, Id> MinHashIndex<T, Id>
{
/// Create a new MinHashIndex that uses `HashSetContainer<Id>` as its id container.
///
/// Forwards `num_bands`, `band_width` and `jaccard_threshold` to `new_index`.
/// The four-argument call additionally passes `-1` as the band id; `new_index`
/// treats a negative band id as "build all `num_bands` bands", so this
/// constructor yields a full (non-partitioned) index.
// NOTE(review): the two calls below appear to be the before/after lines of the
// diff (three-argument form replaced by the four-argument form) — confirm
// against the actual file before relying on either one.
pub fn new(num_bands: usize, band_width: usize, jaccard_threshold: f64) -> Self {
MinHashIndex::<T, Id, HashSetContainer<Id>>::new_index(num_bands, band_width, jaccard_threshold)
MinHashIndex::<T, Id, HashSetContainer<Id>>::new_index(num_bands, band_width, jaccard_threshold, -1)
}
}

Expand All @@ -352,14 +352,24 @@ where
C: IdContainer<Id>
{
/// Create a new MinHashIndex
pub fn new_index(num_bands: usize, band_width: usize, jaccard_threshold: f64) -> Self {
let build_hasher = RandomState::new();
pub fn new_index(num_bands: usize,
band_width: usize,
jaccard_threshold: f64,
band_id: isize) -> Self {
let build_hasher = RandomState::with_seed(42);
let mut bands = Vec::new();
for i in 0..num_bands {
let (start, end) = (i * band_width, (i + 1) * band_width);
if band_id < 0 {
for i in 0..num_bands {
let (start, end) = (i * band_width, (i + 1) * band_width);
bands.push(MinHashBand::<T, Id, C>::new(start, end, build_hasher.clone()));
}
} else {
// Index with only one of all possible bands for partitioned indexing
let uband_id: usize = band_id.try_into().unwrap();
let (start, end) = (uband_id * band_width, (uband_id + 1) * band_width);
bands.push(MinHashBand::<T, Id, C>::new(start, end, build_hasher.clone()));
}
let mut hash_table = HashMap::with_hasher(ahash::RandomState::new());
let mut hash_table = HashMap::with_hasher(ahash::RandomState::with_seed(42));
hash_table.reserve(1000);
MinHashIndex {
bands,
Expand Down
8 changes: 5 additions & 3 deletions py-gaoya/src/min_hash.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@ macro_rules! py_minhash_index {
num_hashes = "126",
analyzer = "\"word\"",
lowercase = "false",
ngram_range = "(1,1)"
ngram_range = "(1,1)",
band_id = "-1",
)]
pub fn new(
jaccard_threshold: f64,
Expand All @@ -46,10 +47,11 @@ macro_rules! py_minhash_index {
analyzer: Option<&str>,
lowercase: Option<bool>,
ngram_range: Option<(usize, usize)>,
band_id: Option<isize>,
) -> PyResult<Self> {
if let (Some(num_bands), Some(band_width)) = (num_bands, band_width) {
let index = $name {
inner: gaoya::minhash::MinHashIndex::<_, _, $container_type>::new_index(num_bands, band_width, jaccard_threshold),
inner: gaoya::minhash::MinHashIndex::<_, _, $container_type>::new_index(num_bands, band_width, jaccard_threshold, band_id.unwrap()),
min_hash: $minhash::new(num_bands * band_width),
tokenizer: TokenizerSpecification::new(analyzer.unwrap_or("word"), ngram_range),
lowercase: lowercase.unwrap_or(false),
Expand All @@ -59,7 +61,7 @@ macro_rules! py_minhash_index {
if let Some(num_hashes) = num_hashes {
let (num_bands, band_width) = calculate_minhash_params(jaccard_threshold, num_hashes);
let index = $name {
inner: gaoya::minhash::MinHashIndex::<_,_, $container_type>::new_index(num_bands, band_width, jaccard_threshold),
inner: gaoya::minhash::MinHashIndex::<_,_, $container_type>::new_index(num_bands, band_width, jaccard_threshold, band_id.unwrap()),
min_hash: $minhash::new(num_bands * band_width),
tokenizer: TokenizerSpecification::new(analyzer.unwrap_or("word"), ngram_range),
lowercase: lowercase.unwrap_or(false),
Expand Down

0 comments on commit d0c54b3

Please sign in to comment.