# Characteristic Matrix Approach

In [1]:
import numpy as np
import hashlib
from collections import defaultdict
from joblib import Parallel, delayed
import hashlib
import re
from collections import defaultdict
from itertools import combinations
from utils.utils import clean_document, shingle, minhash
from joblib import Parallel, delayed

# Helper functions

def generate_shingles(text, k=2):
    """Return the set of word-level k-shingles contained in ``text``.

    The text is split on whitespace and every run of ``k`` consecutive words
    is joined with a single space to form one shingle. For a positive ``k``
    the result is empty when the text holds fewer than ``k`` words.
    """
    words = text.split()
    shingles = set()
    # One window per possible start position; the range is empty when the
    # text is too short to hold a full window.
    for start in range(len(words) - k + 1):
        shingles.add(' '.join(words[start:start + k]))
    return shingles

def create_characteristic_matrix(shingle_dict, docs):
    """Create a binary characteristic matrix based on shingles.

    Args:
        shingle_dict: Mapping of document id to an iterable of that
            document's shingles.
        docs: Mapping (or other iterable) of document ids; its iteration
            order defines the column order of the matrix.

    Returns:
        A tuple ``(matrix, all_shingles, doc_index_mapping)`` where
        ``matrix[r, c]`` is 1 when shingle ``all_shingles[r]`` occurs in the
        document stored at column ``c``, and ``doc_index_mapping`` maps each
        document id to its column index.

    Raises:
        KeyError: If a document id from ``docs`` is missing in
            ``shingle_dict``.
    """
    unique_shingles = set().union(*shingle_dict.values())
    try:
        # Sorting gives a reproducible row order; plain set iteration order
        # for strings changes between interpreter runs.
        all_shingles = sorted(unique_shingles)
    except TypeError:
        # Shingles that cannot be ordered keep the arbitrary set order.
        all_shingles = list(unique_shingles)

    matrix = np.zeros((len(all_shingles), len(docs)), dtype=int)
    shingle_index = {item: row for row, item in enumerate(all_shingles)}

    # Mapping of original doc_id to matrix column index.
    doc_index_mapping = {doc_id: col for col, doc_id in enumerate(docs)}

    for doc_id, col in doc_index_mapping.items():
        rows = [shingle_index[item] for item in shingle_dict[doc_id]]
        if rows:
            matrix[rows, col] = 1

    return matrix, all_shingles, doc_index_mapping

def hash_shingle(shingle, seed):
    """Return the MD5 digest of ``shingle`` salted with ``seed`` as an integer.

    The seed is converted with ``str`` and appended to the shingle before
    hashing, so each seed value acts as a separate hash function.
    """
    salted = shingle + str(seed)
    digest = hashlib.md5(salted.encode()).digest()
    # Big-endian interpretation of the raw digest equals parsing the hex form.
    return int.from_bytes(digest, "big")

def minhash_signature(characteristic_matrix, num_hashes, all_shingles):
    """Generate Minhash signatures for each document.

    Args:
        characteristic_matrix: 2-D array whose rows correspond to
            ``all_shingles`` and whose columns correspond to documents; an
            entry equal to 1 marks a shingle as present in a document.
        num_hashes: Number of hash functions, i.e. signature rows.
        all_shingles: Shingles in the same order as the matrix rows.

    Returns:
        A ``(num_hashes, num_docs)`` float array. A document without any
        shingle keeps ``np.inf`` in every row of its column.
    """
    _, num_docs = characteristic_matrix.shape
    signatures = np.full((num_hashes, num_docs), np.inf)  # Initialize signatures with infinity

    # The rows present in each document do not depend on the hash function,
    # so look them up once instead of once per hash.
    present_rows = [
        np.flatnonzero(characteristic_matrix[:, doc_idx] == 1)
        for doc_idx in range(num_docs)
    ]

    for i in range(num_hashes):
        # The loop index acts as the seed, giving a different hash function per row.
        hash_values = np.array([hash_shingle(shingle, i) for shingle in all_shingles])
        for doc_idx, rows in enumerate(present_rows):
            # np.min raises on an empty selection, so documents without
            # shingles are skipped and keep their infinity placeholder.
            if rows.size:
                signatures[i, doc_idx] = np.min(hash_values[rows])

    return signatures

def lsh(signatures, bands, rows, doc_index_mapping):
    """Apply Locality Sensitive Hashing (LSH) to group similar signatures.

    The signature matrix is cut into ``bands`` horizontal slices of ``rows``
    rows each. Two documents become a candidate pair when their columns are
    identical within at least one band.

    Args:
        signatures: 2-D array with one signature column per document.
        bands: Number of bands.
        rows: Number of signature rows per band.
        doc_index_mapping: Mapping of document id to column index.

    Returns:
        A set of ``(doc_id, doc_id)`` tuples; within a tuple the document
        with the smaller column index comes first.

    Raises:
        AssertionError: If ``bands * rows`` differs from the number of
            signature rows.
        KeyError: If a column index has no entry in ``doc_index_mapping``.
    """
    assert bands * rows == signatures.shape[0], "Number of hash functions must equal bands * rows"

    # Invert the mapping once so each pair costs a constant-time lookup.
    # setdefault keeps the first document id if an index appears twice.
    index_to_doc = {}
    for doc_id, idx in doc_index_mapping.items():
        index_to_doc.setdefault(idx, doc_id)

    candidate_pairs = set()

    for b in range(bands):
        # Buckets are rebuilt per band: equal values only count as a match
        # when they occur in the same band.
        buckets = defaultdict(list)
        band_rows = signatures[b * rows:(b + 1) * rows]
        for doc_idx in range(signatures.shape[1]):
            buckets[tuple(band_rows[:, doc_idx])].append(doc_idx)

        # Every pair of documents sharing a bucket is a candidate.
        for bucket in buckets.values():
            for first, second in combinations(bucket, 2):
                candidate_pairs.add((index_to_doc[first], index_to_doc[second]))

    return candidate_pairs


def compute_hash_values(i, all_shingles):
    """Hash every shingle with hash function ``i``.

    Each shingle is converted with ``str``, suffixed with ``str(i)`` and
    hashed with MD5. Returns ``(i, values)`` where ``values`` is an array
    holding one integer hash per shingle, in input order; returning ``i``
    lets the caller match results back to their hash function.
    """
    suffix = str(i)
    values = []
    for item in all_shingles:
        payload = (str(item) + suffix).encode()
        values.append(int(hashlib.md5(payload).hexdigest(), 16))
    return i, np.array(values)

from utils.utils import read_tsv

# LSH parameters: the signature rows are split evenly across the bands.
num_hashes = 100
bands = 20
rows_per_band = num_hashes // bands

tsv_dict = read_tsv('../../data/hundred.tsv')

# Separate exact duplicates first so that only one copy of each distinct
# text goes through shingling and minhashing.
seen_docs = {}         # document text -> id of its first occurrence
unique_docs = {}       # id -> text, first occurrences only
exact_duplicates = {}  # id of first occurrence -> ids of later identical docs

for doc_id, doc in tsv_dict.items():
    if doc not in seen_docs:
        unique_docs[doc_id] = doc
        seen_docs[doc] = doc_id  # Track first occurrence of the document
    else:
        exact_duplicates.setdefault(seen_docs[doc], []).append(doc_id)

cleaned_docs = {doc_id: clean_document(doc) for doc_id, doc in unique_docs.items()}
shingle_dict = {doc: generate_shingles(text, k=5) for doc, text in cleaned_docs.items()}
characteristic_matrix, all_shingles, doc_index_mapping = create_characteristic_matrix(shingle_dict, cleaned_docs)

num_shingles, num_docs = characteristic_matrix.shape

# Compute hash values for minhash signatures, one parallel task per hash function
hash_results = Parallel(n_jobs=-1)(delayed(compute_hash_values)(i, all_shingles) for i in range(num_hashes))

# Reconstruct the hash dictionary from the results
hash_dict = {i: hashes for i, hashes in hash_results}

# Rows present in each document are the same for every hash function,
# so they are looked up once up front.
present_rows = [np.where(characteristic_matrix[:, doc_idx] == 1)[0] for doc_idx in range(num_docs)]

# Apply Minhashing: a signature entry is the smallest hash among the
# document's shingles; documents without shingles keep infinity.
signatures = np.full((num_hashes, num_docs), np.inf)
for i in range(num_hashes):
    hash_values = hash_dict[i]
    for doc_idx, present_shingles in enumerate(present_rows):
        if len(present_shingles) > 0:
            signatures[i, doc_idx] = np.min(hash_values[present_shingles])

# Apply LSH on the signatures
candidate_pairs = lsh(signatures, bands, rows_per_band, doc_index_mapping)

In [2]:
from utils.utils import UnionFind
# Merge every candidate pair so that transitively linked documents end up
# in the same Union-Find set.
uf = UnionFind()
for doc1, doc2 in candidate_pairs:
    uf.union(doc1, doc2)

# Bucket each unique document under the representative of its set.
clusters = defaultdict(list)
for doc_id in unique_docs:
    clusters[uf.find(doc_id)].append(doc_id)

# Exact duplicates were held back before minhashing; attach them to the
# cluster of the document they duplicate.
for original_doc, duplicates in exact_duplicates.items():
    clusters[uf.find(original_doc)].extend(duplicates)

In [3]:
# Print one cluster per line as space-separated document ids.
for doc_ids in clusters.values():
    print(' '.join(str(member) for member in doc_ids))

1
2
3
4
5
6
7 19 42 49 51 77 87 93
8
9
10 22 85
11
12
13
14
15
16 26 34 48 57 72 75 89 98
17
18
20
21 33 80 86 96
23
24
25 32
27
28 37 97
29 31 35 36 40 62 67 68 95 100
30
38 53 99
39 94
41
43
44
45
46 76
47
50
52
54 69
55
56
58
59
60
61 81
63
64
65
66
70
71
73
74
78
79
82
83
84
88
90
91
92


# Base LSH Implementation Used in the Project

In [4]:
import hashlib
import re

# Helper functions
def clean_document(text):
    """Normalize a document for shingling.

    The text is lowercased and every character that is neither a lowercase
    ASCII letter nor whitespace is removed. Whitespace is left untouched.
    """
    lowered = text.lower()
    unwanted = re.compile(r'[^a-z\s]')
    return unwanted.sub('', lowered)

def shingle(text, k=5):
    """Return the set of word-level k-shingles of ``text``.

    The text is split on whitespace; each shingle is ``k`` consecutive words
    joined by a single space. For a positive ``k`` the set is empty when the
    text has fewer than ``k`` words.
    """
    words = text.split()
    windows = set()
    for start in range(len(words) - k + 1):
        windows.add(' '.join(words[start:start + k]))
    return windows

def minhash(shingles, num_hashes=100):
    """Generate minhash signature for the shingles.

    Hash function ``i`` is MD5 applied to ``str(shingle) + str(i)``; the
    ``i``-th signature entry is the smallest such hash over all shingles.

    Args:
        shingles: Iterable of shingles. It is materialised once, so a
            one-shot iterator is accepted.
        num_hashes: Number of hash functions, i.e. the signature length.

    Returns:
        A list of ``num_hashes`` integers. For an empty ``shingles`` the
        list holds ``float("inf")`` in every position, because ``min`` has
        no value to return for a document without shingles.
    """
    texts = [str(s) for s in shingles]
    if not texts:
        # Infinity placeholder instead of raising on min() of nothing.
        return [float("inf")] * num_hashes

    signature = []
    for i in range(num_hashes):
        seed = str(i)
        signature.append(
            min(int(hashlib.md5((text + seed).encode()).hexdigest(), 16) for text in texts)
        )
    return signature

class UnionFind:
    """Union-Find (Disjoint Set) implementation with path compression.

    Elements are registered lazily: ``find`` on an unknown element creates
    a singleton set for it. Elements must be hashable.
    """

    def __init__(self):
        # Maps each element to its parent; a root is its own parent.
        self.parent = {}

    def find(self, x):
        """Return the representative (root) of the set containing ``x``.

        Implemented iteratively so that long parent chains cannot exceed
        the interpreter's recursion limit.
        """
        parent = self.parent
        if x not in parent:
            parent[x] = x
            return x

        # First pass: walk up to the root.
        root = x
        while parent[root] != root:
            root = parent[root]

        # Second pass: path compression, point every visited node at the root.
        while x != root:
            next_node = parent[x]
            parent[x] = root
            x = next_node
        return root

    def union(self, x, y):
        """Merge the sets of ``x`` and ``y``; the root of ``y`` becomes the root."""
        rootX = self.find(x)
        rootY = self.find(y)
        if rootX != rootY:
            self.parent[rootX] = rootY

def read_tsv(tsv):
    """Read a two-column TSV file into a dict of ``{int id: text}``.

    Each non-blank line is split at its first tab: the part before it is
    parsed as the integer document id, the rest (which may contain further
    tabs) is the document text. The line terminator is not part of the
    text, so the last line of a file compares equal to an identical earlier
    document whether or not the file ends with a newline.

    Args:
        tsv: Path of the UTF-8 encoded file.

    Returns:
        Dict mapping document id to document text. A repeated id keeps the
        text of its last occurrence.

    Raises:
        ValueError: If a non-blank line has no tab or its id is not an
            integer.
    """
    tsv_dict = {}
    with open(tsv, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            if not line.strip():  # To skip empty lines
                continue
            index, separator, text = line.partition('\t')
            if not separator:
                raise ValueError(f"{tsv}: line {line_number} has no tab separator")
            tsv_dict[int(index)] = text.rstrip('\n')
    return tsv_dict

import hashlib
import re
from collections import defaultdict
from itertools import combinations
from utils.utils import clean_document, shingle, minhash
from joblib import Parallel, delayed

class LSH:
    """MinHash signatures combined with banding to find candidate pairs.

    State kept on the instance:
      * ``index`` - bucket table mapping ``(band index, band tuple)`` to the
        ids of the documents that produced that band.
      * ``unique_docs`` / ``exact_duplicates`` - result of exact-duplicate
        removal (first occurrences, and the ids that repeat them).
      * ``cleaned_docs`` - cleaned text of the unique documents.
      * ``candidate_pairs`` - pairs of document ids sharing a bucket.
    """

    def __init__(self, num_hashes=100, num_bands=20, rows_per_band=5, k=5):
        self.num_hashes = num_hashes
        self.num_bands = num_bands
        self.rows_per_band = rows_per_band
        self.k = k

        self.index = defaultdict(list)
        self.unique_docs = {}
        self.cleaned_docs = {}
        self.candidate_pairs = set()
        self.exact_duplicates = {}

        # The bands must tile the signature exactly.
        assert self.num_hashes == self.num_bands * self.rows_per_band, "Hash functions must equal bands * rows_per_band"

    def remove_duplicates(self, docs):
        """Split ``docs`` into first occurrences and exact duplicates.

        The first document with a given text is stored in ``unique_docs``;
        later documents with identical text are listed in
        ``exact_duplicates`` under the id of that first document.
        """
        first_seen = {}
        for doc_id, doc in docs.items():
            if doc in first_seen:
                self.exact_duplicates.setdefault(first_seen[doc], []).append(doc_id)
                continue
            self.unique_docs[doc_id] = doc
            first_seen[doc] = doc_id

    def compute_minhash_signatures(self, docs):
        """Deduplicate, clean and shingle ``docs``, then minhash them in parallel.

        Returns a dict mapping each unique document id to its signature; the
        same dict is stored as ``self.signatures``.
        """
        self.remove_duplicates(docs)

        self.cleaned_docs = {}
        for doc_id, raw in self.unique_docs.items():
            self.cleaned_docs[doc_id] = clean_document(raw)

        self.shingle_sets = {}
        for doc_id, cleaned in self.cleaned_docs.items():
            self.shingle_sets[doc_id] = shingle(cleaned, self.k)

        # One parallel task per document; results come back in submission
        # order, so they can be zipped with the ids again.
        tasks = (
            delayed(minhash)(shingle_set, self.num_hashes)
            for shingle_set in self.shingle_sets.values()
        )
        computed = Parallel(n_jobs=-1)(tasks)
        self.signatures = dict(zip(self.shingle_sets, computed))

        return self.signatures

    def banding(self, signatures):
        """Bucket signatures band by band and collect candidate pairs.

        Every signature is cut into ``num_bands`` slices of
        ``rows_per_band`` entries. Documents whose slice for the same band
        is identical share a bucket in ``self.index``, and all pairs within
        a bucket are added to ``self.candidate_pairs``, which is returned.
        """
        width = self.rows_per_band
        for doc_id, sig in signatures.items():
            for band_idx in range(self.num_bands):
                offset = band_idx * width
                # The band index is part of the key so equal slices from
                # different bands never share a bucket.
                key = (band_idx, tuple(sig[offset:offset + width]))
                self.index[key].append(doc_id)

        # Buckets with fewer than two documents yield no combinations.
        self.candidate_pairs.update(
            pair
            for bucket in self.index.values()
            for pair in combinations(bucket, 2)
        )
        return self.candidate_pairs
    
    
from collections import defaultdict
from utils.utils import UnionFind, clean_document, shingle, minhash

# Use Case 1: collection deduplication
def collection_deduplication(lsh):
    """Cluster documents that are connected through candidate pairs.

    Candidate pairs from ``lsh.candidate_pairs`` are merged with Union-Find,
    so documents linked through a chain of pairs share a cluster. Exact
    duplicates from ``lsh.exact_duplicates`` are appended to the cluster of
    the document they duplicate.

    Returns a ``defaultdict`` mapping a cluster's representative document id
    to the list of document ids in that cluster.
    """
    uf = UnionFind()
    for first, second in lsh.candidate_pairs:
        uf.union(first, second)

    # Every unique document lands under the representative of its set.
    clusters = defaultdict(list)
    for doc_id in lsh.unique_docs:
        clusters[uf.find(doc_id)].append(doc_id)

    # Exact duplicates never went through minhashing; add them afterwards.
    for original_id, duplicate_ids in lsh.exact_duplicates.items():
        clusters[uf.find(original_id)].extend(duplicate_ids)

    return clusters

# Use Case 2: approximate nearest-neighbor search
def nearest_neighbor_search(query_doc, lsh):
    """Return the ids of indexed documents that share a band with the query.

    The query is cleaned, shingled with ``lsh.k`` and minhashed with
    ``lsh.num_hashes`` hash functions. Each band of the resulting signature
    is then looked up in ``lsh.index``. The result is a set of document ids.
    """
    cleaned_query = clean_document(query_doc)
    signature = minhash(shingle(cleaned_query, lsh.k), lsh.num_hashes)

    width = lsh.rows_per_band
    matches = set()
    for band_idx in range(lsh.num_bands):
        offset = band_idx * width
        key = (band_idx, tuple(signature[offset:offset + width]))
        # .get() avoids creating an empty bucket in the defaultdict index.
        matches.update(lsh.index.get(key, ()))

    return matches

In [5]:
# Run the class-based pipeline end to end on the documents loaded earlier.
# NOTE: this rebinds the name `lsh`, which the first cell uses for a function.
lsh = LSH(num_hashes=100, num_bands=20, rows_per_band=5, k=3)
signatures = lsh.compute_minhash_signatures(tsv_dict)
lsh.banding(signatures)
clusters = collection_deduplication(lsh)

# Print one cluster per line as space-separated document ids.
for doc_ids in clusters.values():
    print(' '.join(str(member) for member in doc_ids))

1
2
3
4
5
6
7 19 42 49 51 77 87 93
8
9
10 22 85
11
12
13
14
15
16 26 34 48 57 72 75 89 98
17
18
20
21 33 80 86 96
23
24
25 32
27
28 37 97
29 31 35 36 40 62 67 68 95 100
30
38 53 99
39 94
41
43
44
45
46 76
47
50
52
54 69
55
56
58
59
60
61 81
63
64
65
66
70
71
73
74
78
79
82
83
84
88
90
91
92
