In [1]:
import torch

class Entity:
    def __init__(self, hllset, identifier, references=None):
        self.hllset = hllset
        self.identifier = identifier
        self.references = references if references is not None else []

    def add_reference(self, entity):
        self.references.append(entity)

    def save_to_redis(self, redis_client):
        serialized_tensor = self.serialize_tensor(self.hllset)
        redis_client.set(self.identifier, serialized_tensor)

    @staticmethod
    def load_from_redis(redis_client, identifier):
        serialized_tensor = redis_client.get(identifier)
        if serialized_tensor is None:
            return None
        hllset = Entity.deserialize_tensor(serialized_tensor).cuda()
        return Entity(hllset, identifier)

    @staticmethod
    def serialize_tensor(tensor):
        buffer = io.BytesIO()
        torch.save(tensor, buffer)
        return buffer.getvalue()

    @staticmethod
    def deserialize_tensor(buffer):
        buffer = io.BytesIO(buffer)
        return torch.load(buffer)

def elementwise_union(tensor1, tensor2):
    assert tensor1.shape == tensor2.shape, "Tensors must have the same shape"
    return tensor1 | tensor2

def elementwise_intersection(tensor1, tensor2):
    assert tensor1.shape == tensor2.shape, "Tensors must have the same shape"
    return tensor1 & tensor2

def elementwise_xor(tensor1, tensor2):
    assert tensor1.shape == tensor2.shape, "Tensors must have the same shape"
    return tensor1 ^ tensor2

def elementwise_complement(tensor):
    return ~tensor

In [2]:
# Example usage
max_int64 = 2**63 - 10
tensor1 = torch.randint(0, max_int64, (1024,), dtype=torch.int64).cuda()
tensor2 = torch.randint(0, max_int64, (1024,), dtype=torch.int64).cuda()

union_result = elementwise_union(tensor1, tensor2)
intersection_result = elementwise_intersection(tensor1, tensor2)
xor_result = elementwise_xor(tensor1, tensor2)
complement_result = elementwise_complement(tensor1)

print("Union Result:\n", union_result)
print("Intersection Result:\n", intersection_result)
print("XOR Result:\n", xor_result)
print("Complement Result:\n", complement_result)

Union Result:
 tensor([8779200116750787835, 8069323755752452854, 9221673701443105525,
         ..., 9213118953410246071, 7491509169858410319,
        3888717505835822718], device='cuda:0')
Intersection Result:
 tensor([2900329981963968520,  592654507737219776, 5206693427625853152,
         ..., 2022257627576931328, 2325266023330152969,
        2343588147121717250], device='cuda:0')
XOR Result:
 tensor([5878870134786819315, 7476669248015233078, 4014980273817252373,
         ..., 7190861325833314743, 5166243146528257350,
        1545129358714105468], device='cuda:0')
Complement Result:
 tensor([-4125750208646651994, -7817013255694848743, -6683892798729584886,
         ..., -9122762985539679394, -7333319403748549196,
        -2348411588737876031], device='cuda:0')


In [3]:
import torch
import mmh3
import random
import string

def generate_random_string(length=10):
    return ''.join(random.choices(string.ascii_letters + string.digits, k=length))

def murmurhash3_64(s):
    # Ensure the hash value is within the range of a 64-bit signed integer
    return mmh3.hash64(s)[0] & 0x7FFFFFFFFFFFFFFF

def reverse_bits(tensor):
    reversed_tensor = torch.zeros_like(tensor).cuda()
    for i in range(64):
        reversed_tensor |= ((tensor >> i) & 1) << (63 - i)
    return reversed_tensor

def process_hashes(hashes, result_tensor, p):
    # Calculate index from leading bits
    mask = (1 << p) - 1  # Create a mask with the lower p bits set to 1
    indices = (hashes >> (64 - p)) & mask
    
    # Calculate number of trailing zeros
    trailing_zeros = (hashes & -hashes).log2().long()
    
    # Update tensor
    for i in range(hashes.size(0)):
        idx = indices[i]
        z = trailing_zeros[i]
        
        # Extract the integer from the tensor at index idx
        value = result_tensor[idx]
        
        # Reverse the bits of the integer
        reversed_value = reverse_bits(value)
        
        # Update the bitmap with 1 at position z
        updated_value = reversed_value | (1 << z)
        
        # Convert the bitmap back to an integer
        final_value = reverse_bits(updated_value)
        
        # Save it back in the tensor at index idx
        result_tensor[idx] = final_value

In [4]:
# Parameters
num_batches = 10000
batch_size = 8192  # Increased batch size
p = 10  # Number of leading bits to use for index

# Initialize tensor to store results
result_tensor = torch.zeros(1 << p, dtype=torch.int64).cuda()

# Generate string batches and process
for batch_idx in range(num_batches):
    # Generate random strings
    strings = [generate_random_string() for _ in range(batch_size)]
    print(f"strins length: {strings.count}")
    # Remove duplicates by converting to a set
    unique_strings = list(set(strings))    
    print(f"unique_strins length: {unique_strings.count}")
    # Apply MurmurHash3 to convert strings to unsigned integers
    hashes = torch.tensor([murmurhash3_64(s) for s in unique_strings], dtype=torch.int64).cuda()
    print(f"hashes length: {hashes.size()}")
    # Process hashes and update result tensor
    process_hashes(hashes, result_tensor, p)
    
    # Print results for the current batch
    print(f"Batch {batch_idx + 1}/{num_batches}")
    # print("Result Tensor:\n", result_tensor)

strins length: <built-in method count of list object at 0x7fc2640ae480>
unique_strins length: <built-in method count of list object at 0x7fc2641477c0>
hashes length: torch.Size([8192])
Batch 1/10000
strins length: <built-in method count of list object at 0x7fc264a73b00>
unique_strins length: <built-in method count of list object at 0x7fc258fb0340>
hashes length: torch.Size([8192])
Batch 2/10000
strins length: <built-in method count of list object at 0x7fc258f4c940>
unique_strins length: <built-in method count of list object at 0x7fc258f03e80>
hashes length: torch.Size([8192])
Batch 3/10000
strins length: <built-in method count of list object at 0x7fc258fb0280>
unique_strins length: <built-in method count of list object at 0x7fc258f46980>
hashes length: torch.Size([8192])
Batch 4/10000
strins length: <built-in method count of list object at 0x7fc258f4c980>
unique_strins length: <built-in method count of list object at 0x7fc258f1a3c0>
hashes length: torch.Size([8192])
Batch 5/10000
strin

In [6]:
from transformers import AutoTokenizer
from torch.utils.data import DataLoader, Dataset

class GenericTokenizer:
    def __init__(self, model_name):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    def tokenize(self, text):
        # Tokenize the text and return tokenized tokens as a string collection
        tokens = self.tokenizer.tokenize(text)
        return tokens

In [7]:
class BertTokenizerWrapper(GenericTokenizer):
    def __init__(self):
        super().__init__('bert-base-uncased')

class RobertaTokenizerWrapper(GenericTokenizer):
    def __init__(self):
        super().__init__('roberta-base')

class GPT2TokenizerWrapper(GenericTokenizer):
    def __init__(self):
        super().__init__('gpt2')
        
class TextDataset(Dataset):
    def __init__(self, texts):
        self.texts = texts
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        return self.texts[idx]

In [8]:
# Example text data
texts = [
    "Hello, how are you?",
    "Transformers are amazing!",
    "Let's tokenize this text."
]

# Create dataset and dataloader
dataset = TextDataset(texts)
dataloader = DataLoader(dataset, batch_size=2, shuffle=False)

# Instantiate a tokenizer wrapper (e.g., BERT)
tokenizer = BertTokenizerWrapper()

# Tokenize text data using the dataloader
for batch in dataloader:
    for text in batch:
        tokens = tokenizer.tokenize(text)
        print(f"Original Text: {text}")
        print(f"Tokenized Tokens: {tokens}")

Original Text: Hello, how are you?
Tokenized Tokens: ['hello', ',', 'how', 'are', 'you', '?']
Original Text: Transformers are amazing!
Tokenized Tokens: ['transformers', 'are', 'amazing', '!']
Original Text: Let's tokenize this text.
Tokenized Tokens: ['let', "'", 's', 'token', '##ize', 'this', 'text', '.']


