# LSH Example

With the IMDB data and minhash signatures, use LSH to find similar pairs

In [1]:
%matplotlib inline

In [15]:
import json
import time
import random
import pandas as pd

from scipy.sparse import lil_matrix

from sklearn.metrics import jaccard_score

In [3]:
actor_id_to_name_map = {}     # Map Actor IDs to actor names
actor_id_to_index_map = {}    # Map actor IDs to a unique index of known actors
index_to_actor_ids = []       # Array mapping unique index back to actor ID (invert of actor_id_to_index_map)

index_counter = 0    # Unique actor index; increment for each new actor
known_actors = set()

movie_actor_list = [] # List of all our movies and their actors

test_count = 0
with open("../data/imdb_recent_movies.json", "r") as in_file:
    for line in in_file:
        
        this_movie = json.loads(line)
            
        # Keep track of all the actors in this movie
        for actor_id,actor_name in zip(this_movie['actor_ids'],this_movie['actor_names']):
            
            # Keep names and IDs
            actor_id_to_name_map[actor_id] = actor_name
            
            # If we've seen this actor before, skip...
            if actor_id in known_actors:
                continue
                
            # ... Otherwise, add to known actor set and create new index for them
            known_actors.add(actor_id)
            actor_id_to_index_map[actor_id] = index_counter
            index_to_actor_ids.append(actor_id)
            index_counter += 1
            
        # Finished with this film
        movie_actor_list.append({
            "movie": this_movie["title_name"],
            "actors": set(this_movie['actor_ids'])
        })


In [4]:
print("Known Actors:", len(known_actors))

Known Actors: 258059


## Convert to MinHash Reduced Dimension

For min-hashing, we first generate some number of permutations that will drive our "min hash" of the first non-zero element in the permuted column set (i.e., after shuffling columns, find the first permuted column idex with a non-zero element).

Note that, here, we _never_ store the full matrix of Movies X Actors.

In [5]:
permutations = []

# How many hashing functions to use
#  What happens as you change this number?
#  Larger values make things slower but provide better Jaccard estimates
num_minhashes = 16

# Generate permutations for hashing d-dimensional features into k minhash dimensions
for i in range(num_minhashes):
    column_indices = list(range(len(known_actors)))
    
    # Shuffle the original indices
    random.shuffle(column_indices)
    
    # Generate a map of the actual index to its permuted index
    permutations.append({j:i for i,j in enumerate(column_indices)})

In [7]:
minhash_rows = []

test_count = 0
with open("../data/imdb_recent_movies.json", "r") as in_file:
    for line in in_file:
        
        this_movie = json.loads(line)
            
        this_movie_actor_indices = []
        for actor_id in this_movie['actor_ids']:
            this_movie_actor_indices.append(actor_id_to_index_map[actor_id])
            
        minhash_sig = []
        for permutation in permutations:
            
            # Find the first index of actors in this movie using the permuted indices map
            permed_index = sorted([permutation[i] for i in this_movie_actor_indices])[0]
            minhash_sig.append(permed_index)
        
        minhash_rows.append(minhash_sig)
        


In [10]:
minhash_df = pd.DataFrame(
    minhash_rows,
    index=[m["movie"] for m in movie_actor_list],
)
minhash_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
The Other Side of the Wind,20638,46609,88108,9793,57208,128643,88263,41799,107264,33126,19180,45936,60673,208132,23414,33987
November 1828,26357,68003,22910,25096,41455,80703,89260,63058,106732,15965,58087,16957,44400,19402,12271,47627
The Drive to Win,2219,32209,85612,106572,50965,137835,61283,54709,46857,116312,130238,103291,206391,203283,52176,91507
The Naked Monster,104706,5090,3381,198684,37335,71463,60673,11116,95625,126820,128286,157969,71372,1727,104298,27547
El día de los albañiles 2,96794,145985,26773,119227,30861,75890,31597,77941,237014,174819,197444,33149,74228,90660,72816,86531
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
The Secret of China,28390,29556,53989,14947,25392,37586,77044,21078,9064,14005,64434,6762,128459,35569,79517,33012
Kuambil Lagi Hatiku,170850,200769,201300,69362,36243,6006,83371,63360,232814,25450,44413,118610,190494,101752,247589,69449
Rodolpho Teóphilo - O Legado de um Pioneiro,93091,228108,93132,21451,27476,29842,113138,44833,66294,168889,241508,53892,205893,139925,38480,17276
Dankyavar Danka,56832,70277,7930,158030,30967,96238,78689,73659,61063,167661,22030,186495,36090,53862,32192,14728


In [11]:
query_row = minhash_df.loc["Star Wars: Episode II - Attack of the Clones"]
minh_similarities = []

for idx,row in minhash_df.iterrows():
    score = sum(query_row == row) / len(row)
    minh_similarities.append((idx, score))
    

In [12]:
most_sim_mh = sorted(minh_similarities, key=lambda d: d[1], reverse=True)[:20]
most_sim_mh

[('Star Wars: Episode II - Attack of the Clones', 1.0),
 ('Star Wars: Episode III - Revenge of the Sith', 0.5),
 ('Whales of Atlantis: In Search of Moby Dick', 0.375),
 ('Necessary Evil: Super-Villains of DC Comics', 0.375),
 ('Crimson Rivers 2: Angels of the Apocalypse', 0.3125),
 ('Faster', 0.3125),
 ("Troy's Story", 0.3125),
 ('Miss Potter', 0.3125),
 ('The Final Fix', 0.3125),
 ('The Resident', 0.3125),
 ('Perfect Sense', 0.3125),
 ('Vanishing on 7th Street', 0.3125),
 ('Fastest', 0.3125),
 ('Charge', 0.3125),
 ('Ice Bear', 0.3125),
 ('Extraordinary Tales', 0.3125),
 ('Angels in Notting Hill', 0.3125),
 ('Life as a House', 0.25),
 ('Quantum Quest: A Cassini Space Odyssey', 0.25),
 ('Shattered Glass', 0.25)]

In [13]:
lsh_buckets = {}

rows = 4
bands = int(minhash_df.shape[1] / rows)
print("Bands:", bands, "Rows:", rows)

Bands: 4 Rows: 4


In [16]:
counter = 0
for idx,row in minhash_df.iterrows():
    
    for b in range(bands):
        segment = ",".join(["%d" % d for d in row[b*rows:(b+1)*rows]])
        bucket = hash(segment) % 2**20
        
        collision_set = lsh_buckets.get(bucket, set())
        collision_set.add(idx)
        
        lsh_buckets[bucket] = collision_set
    
    counter += 1
    
    if counter % 1000 == 0:
        print(time.asctime(), counter)

Tue Mar  8 15:01:26 2022 1000
Tue Mar  8 15:01:26 2022 2000
Tue Mar  8 15:01:26 2022 3000
Tue Mar  8 15:01:26 2022 4000
Tue Mar  8 15:01:26 2022 5000
Tue Mar  8 15:01:27 2022 6000
Tue Mar  8 15:01:27 2022 7000
Tue Mar  8 15:01:27 2022 8000
Tue Mar  8 15:01:27 2022 9000
Tue Mar  8 15:01:27 2022 10000
Tue Mar  8 15:01:28 2022 11000
Tue Mar  8 15:01:28 2022 12000
Tue Mar  8 15:01:28 2022 13000
Tue Mar  8 15:01:28 2022 14000
Tue Mar  8 15:01:28 2022 15000
Tue Mar  8 15:01:29 2022 16000
Tue Mar  8 15:01:29 2022 17000
Tue Mar  8 15:01:29 2022 18000
Tue Mar  8 15:01:29 2022 19000
Tue Mar  8 15:01:29 2022 20000
Tue Mar  8 15:01:30 2022 21000
Tue Mar  8 15:01:30 2022 22000
Tue Mar  8 15:01:30 2022 23000
Tue Mar  8 15:01:30 2022 24000
Tue Mar  8 15:01:31 2022 25000
Tue Mar  8 15:01:31 2022 26000
Tue Mar  8 15:01:31 2022 27000
Tue Mar  8 15:01:32 2022 28000
Tue Mar  8 15:01:32 2022 29000
Tue Mar  8 15:01:32 2022 30000
Tue Mar  8 15:01:32 2022 31000
Tue Mar  8 15:01:33 2022 32000
Tue Mar  8 15:01:

In [17]:
target_movie = "The Matrix Reloaded"

query_row = minhash_df.loc[target_movie]

candidates = set()
for bucket in lsh_buckets.values():
    if target_movie in bucket:
        candidates = candidates.union(bucket)

In [18]:
len(candidates)

16

In [19]:
for c in candidates:

    right_q = minhash_df.loc[c]
    
    score = 0.0
    if len(right_q.shape) > 1:
        for subidx,subrow in right_q.iterrows():
            score = sum(query_row == subrow) / len(subrow)
            if score > 0:
                print(c)
                print("\t", score)
    else:
        score = sum(query_row == right_q) / len(right_q)
        if score > 0:
            print(c)
            print("\t", score)


The Matrix Reloaded
	 1.0
Days of Wrath
	 0.5
Bobby Z
	 0.4375
Osmosis Jones
	 0.5
The Colony
	 0.375
The Autobiography of Malcolm X: As Told to Alex Haley (Audible Original)
	 0.5625
The Matrix Revolutions
	 1.0
Standoff
	 0.5
John Wick: Chapter 3 - Parabellum
	 0.625
The Signal
	 0.375
Five Fingers
	 0.5625


In [20]:
with open("lsh_serial.json", "w") as out_file:
    json.dump([list(bucket) for bucket in lsh_buckets.values()], out_file)