In [1]:
import cdfsampler
import natjoin
import random

In [2]:
def isPrime(n):
    """Return True when ``n`` is a prime number, False otherwise.

    Uses trial division by every integer from 2 up to ``int(n ** 0.5)``.

    Parameters
    ----------
    n : int
        Candidate value to test.

    Returns
    -------
    bool
        True if ``n`` has no divisor in ``[2, sqrt(n)]`` and ``n >= 2``.
    """
    # 0, 1 and negative numbers are not prime. Without this guard the loop
    # below is empty for 0 and 1 (wrongly reporting True), and a negative n
    # makes n ** 0.5 complex so int() raises TypeError.
    if n < 2:
        return False

    for i in range(2, int(n**0.5) + 1):
        if n % i == 0:
            return False

    return True

In [3]:
def normalize(lower, upper, value):
    """Rescale ``value`` linearly so that ``lower`` maps to 0 and ``upper`` to 1.

    When the two bounds compare equal the interval is degenerate, so the
    midpoint 0.5 is returned without looking at ``value``.

    Parameters
    ----------
    lower, upper : values accepted by ``float()``
        Ends of the source interval.
    value : value accepted by ``float()``
        Quantity to rescale.

    Returns
    -------
    float
        ``(value - lower) / (upper - lower)``, or 0.5 if ``lower == upper``.
    """
    if lower != upper:
        offset = float(value) - float(lower)
        span = float(upper) - float(lower)
        return offset / span
    return .5

In [4]:
def hash_function(entry_value, prime):
    """Hash a value by summing its characters' code points modulo ``prime``.

    ``entry_value`` is first converted with ``str()``. The modulo is applied
    to each character's code point individually, not to the running total,
    so the result is not bounded by ``prime``.

    Parameters
    ----------
    entry_value : object
        Value to hash; its ``str()`` form is what gets hashed.
    prime : int
        Modulus applied to every character code.

    Returns
    -------
    int
        Sum of ``ord(ch) % prime`` over the characters (0 for an empty string).
    """
    return sum(ord(ch) % prime for ch in str(entry_value))

In [5]:
def build_attrs_hash_dict(join_attrs):
    """Assign a distinct, randomly chosen prime to every join attribute.

    Candidate primes are collected by scanning consecutive, non-overlapping
    windows of integers starting at 126 until at least one prime per attribute
    is available. The first window is ``range(126, 157)``, which contains six
    primes, so small inputs draw from the same pool as before.

    Parameters
    ----------
    join_attrs : dict
        Mapping whose keys are the join attribute names. Only the keys are
        used here.

    Returns
    -------
    dict
        ``{attribute: prime}`` with no prime used twice. Empty when
        ``join_attrs`` is empty.
    """
    primes_needed = len(join_attrs)

    primes = []
    window_start = 126
    window_size = 31
    while len(primes) < primes_needed:
        window_end = window_start + window_size
        primes.extend(c for c in range(window_start, window_end) if isPrime(c))
        # Continue exactly where this window stopped. Overlapping windows
        # would add the same prime twice and let two attributes share it.
        window_start = window_end

    # sample() draws without replacement, so every attribute gets its own prime.
    chosen = random.sample(primes, primes_needed)
    return dict(zip(join_attrs, chosen))

In [6]:
def generate_random_indices(table_length, filtered_table_length):
    """Pick ``filtered_table_length`` distinct random indices into a table.

    Parameters
    ----------
    table_length : int
        Number of rows in the table; valid indices are ``0 .. table_length - 1``.
    filtered_table_length : int
        How many distinct indices to draw. Zero or a negative number yields
        an empty list.

    Returns
    -------
    list of int
        Distinct indices in random order.

    Raises
    ------
    ValueError
        If more indices are requested than the table has rows. Rejection
        sampling could never finish in that case, so fail loudly instead of
        looping forever.
    """
    if filtered_table_length <= 0:
        return []
    if filtered_table_length > table_length:
        raise ValueError(
            "cannot draw %d distinct indices from a table of length %d"
            % (filtered_table_length, table_length)
        )
    # sample() draws without replacement in one pass, avoiding the quadratic
    # "retry until unseen" loop over a list.
    return random.sample(range(table_length), filtered_table_length)

In [11]:
def cdfjoin(tables, sampling_threshold):
    """Sample rows from each table by hashed join-attribute score, then join.

    Each table is a list of row dicts. Attributes that appear in more than one
    table (judged from the first row of each table) are the join attributes.
    For every join attribute, each row's value is hashed with that attribute's
    prime, the hashes are min-max normalized across all tables sharing the
    attribute, and the normalized scores are summed per row. A row is kept
    when ``cdfsampler.cdf(n_join_attrs, hash_sum) <= sampling_threshold``.
    The kept rows are natural-joined and returned. For comparison, an equally
    sized uniformly random selection from each table is also joined, and both
    results are printed.

    The caller's row dicts are never modified: the per-row hash sums are kept
    in a parallel structure instead of being written into the rows, so no
    bookkeeping column reaches the join or survives into a later call.

    Parameters
    ----------
    tables : list of list of dict
        Input tables. Rows within one table are expected to share the keys of
        that table's first row.
    sampling_threshold : float
        Upper bound on the value returned by ``cdfsampler.cdf`` for a row to
        be kept.

    Returns
    -------
    list
        ``tables`` itself when it holds at most one table; ``[]`` when any
        table is empty or loses all its rows to filtering; otherwise whatever
        ``natjoin.natural_join`` returns for the filtered tables.
    """
    if len(tables) <= 1:
        return tables

    # A join involving an empty table has no rows; returning here also avoids
    # indexing table[0] on an empty list below.
    if any(len(table) == 0 for table in tables):
        return []

    # find join attributes: attribute -> indices of the tables that carry it
    attrs = {}
    for i, table in enumerate(tables):
        for table_key in table[0].keys():
            attrs.setdefault(table_key, []).append(i)
    join_attrs = {k: v for k, v in attrs.items() if len(v) > 1}

    # build dictionary of prime numbers to use when hashing each join attribute
    attrs_hash_dict = build_attrs_hash_dict(join_attrs)

    # hash_sums[i][j] accumulates the normalized scores of row j of table i
    hash_sums = [[0] * len(table) for table in tables]

    # hash every join attribute, normalize across the tables sharing it, and
    # add the normalized score to each row's running sum
    for k, prime in attrs_hash_dict.items():
        raw_scores = {
            index: [hash_function(entry[k], prime) for entry in tables[index]]
            for index in join_attrs[k]
        }
        all_scores = [score for scores in raw_scores.values() for score in scores]
        hash_min = min(all_scores)
        hash_max = max(all_scores)
        for index, scores in raw_scores.items():
            sums = hash_sums[index]
            for j, score in enumerate(scores):
                sums[j] += normalize(hash_min, hash_max, score)

    # filter for all entries whose cdf <= sampling probability
    filtered_tables = []
    for i, table in enumerate(tables):
        # number of join attributes this particular table takes part in
        n_join_attrs = sum(1 for v in join_attrs.values() if i in v)
        filtered_table = [
            entry
            for entry, hash_sum in zip(table, hash_sums[i])
            if cdfsampler.cdf(n_join_attrs, hash_sum) <= sampling_threshold
        ]
        if not filtered_table:
            return []
        filtered_tables.append(filtered_table)

    # select random entries from each table, as many as survived the filter
    random_tables = []
    for table, filtered_table in zip(tables, filtered_tables):
        random_table_indices = generate_random_indices(len(table), len(filtered_table))
        random_tables.append([table[j] for j in random_table_indices])

    # Every filtered table is non-empty at this point, so every random table
    # is non-empty too and can be joined directly.
    joined_random_tables = natjoin.natural_join(random_tables.copy())

    joined_filtered_tables = natjoin.natural_join(filtered_tables.copy())
    print("filtered tables")
    print(joined_filtered_tables)

    print("\nrandom tables")
    print(joined_random_tables)
    return joined_filtered_tables

In [17]:
# Example inputs: each table is a list of row dicts sharing the same keys.
# Tables one, two and three all carry "name"; one and two also share "age".
example_table_one = [
    {"name": name, "age": 21, "year": year}
    for name, year in [
        ("barbara lewis", 1957),
        ("mitski", 1938),
        ("julian casablancas", 2512),
        ("kali uchis", 1914),
        ("angel olsen", 1932),
        ("elvis presley", 1957),
    ]
]

example_table_two = [
    {"name": name, "age": age, "g": g}
    for name, age, g in [
        ("harry styles", 21, "a"),
        ("mitski", 21, "b"),
        ("ravyn lenae", 21, "c"),
        ("paul simon", 42, "d"),
        ("janis joplin", 27, "e"),
        ("james morrison", 27, "f"),
    ]
]

example_table_three = [
    {"name": name, "song": song}
    for name, song in [
        ("the breeders", "off you"),
        ("mitski", "nobody"),
    ]
]

# Single-row table with blank values; not passed to cdfjoin below.
example_table_four = [dict.fromkeys(("id", "birth", "death"), " ")]

cdfjoin([example_table_one, example_table_two, example_table_three], .5)

IndexError: list index out of range