# LSH Example

Using the IMDB data and MinHash signatures, we use locality-sensitive hashing (LSH) to find similar pairs of movies.

In [1]:
%matplotlib inline

In [16]:
import json
import time
import random
import pandas as pd

from scipy.sparse import lil_matrix

from sklearn.metrics import jaccard_score

In [4]:
# Build the actor lookup tables and the per-movie actor sets in a single pass
# over the movie file (one JSON object per line).
actor_id_to_name_map = {}     # Map Actor IDs to actor names
actor_id_to_index_map = {}    # Map actor IDs to a unique index of known actors
index_to_actor_ids = []       # Array mapping unique index back to actor ID (invert of actor_id_to_index_map)

index_counter = 0    # Unique actor index; increment for each new actor
known_actors = set()

movie_actor_list = [] # List of all our movies and their actors

test_count = 0  # Not read by any visible cell; kept so the name stays defined

# The data contains non-ASCII titles, so name the encoding explicitly instead
# of relying on the platform default.
with open("../data/imdb_recent_movies.json", "r", encoding="utf-8") as in_file:
    for line in in_file:

        this_movie = json.loads(line)

        # Keep track of all the actors in this movie
        for actor_id, actor_name in zip(this_movie['actor_ids'], this_movie['actor_names']):

            # Keep names and IDs (the latest name seen for an ID wins)
            actor_id_to_name_map[actor_id] = actor_name

            # If we've seen this actor before, skip...
            if actor_id in known_actors:
                continue

            # ... Otherwise, add to known actor set and create new index for them
            known_actors.add(actor_id)
            actor_id_to_index_map[actor_id] = index_counter
            index_to_actor_ids.append(actor_id)
            index_counter += 1

        # Finished with this film; a set drops repeated actor IDs within one movie
        movie_actor_list.append({
            "movie": this_movie["title_name"],
            "actors": set(this_movie['actor_ids'])
        })


In [5]:
# Report how many distinct actor IDs were indexed
num_known_actors = len(known_actors)
print("Known Actors:", num_known_actors)

Known Actors: 258059


## Convert to MinHash Reduced Dimension

For min-hashing, we first generate some number of random permutations of the column (actor) indices. Each permutation drives one "min hash": after shuffling the columns, the hash value is the first (i.e., smallest) permuted column index that holds a non-zero element.

Note that, here, we _never_ store the full matrix of Movies X Actors.

In [8]:
permutations = []

# How many hashing functions to use
#  What happens as you change this number?
#  Larger values make things slower but provide better Jaccard estimates
num_minhashes = 16

# Use a dedicated, seeded generator so a fresh "Restart & Run All" produces the
# same permutations (and therefore the same signatures) every time, without
# touching the global `random` state. Outputs saved from an earlier unseeded
# run will not match these values.
minhash_seed = 42
permutation_rng = random.Random(minhash_seed)

# The number of columns is the same for every permutation, so compute it once
num_actors = len(known_actors)

# Generate permutations for hashing d-dimensional features into k minhash dimensions
for _ in range(num_minhashes):
    column_indices = list(range(num_actors))

    # Shuffle the original indices
    permutation_rng.shuffle(column_indices)

    # Generate a map of the actual index to its permuted index
    permutations.append({
        actual_index: permuted_index
        for permuted_index, actual_index in enumerate(column_indices)
    })

In [10]:
# One minhash signature per movie, in file order, so the rows line up with the
# movies collected while building the actor index.
minhash_rows = []

# The data contains non-ASCII titles, so name the encoding explicitly instead
# of relying on the platform default.
with open("../data/imdb_recent_movies.json", "r", encoding="utf-8") as in_file:
    for line in in_file:

        this_movie = json.loads(line)

        # Column indices of the actors appearing in this movie
        this_movie_actor_indices = [
            actor_id_to_index_map[actor_id] for actor_id in this_movie['actor_ids']
        ]

        # For each permutation, the minhash is the smallest permuted index among
        # this movie's actors. min() finds it in one pass; no sort is needed.
        # A movie with no actors has no minhash, and min() raises ValueError.
        minhash_sig = [
            min(permutation[i] for i in this_movie_actor_indices)
            for permutation in permutations
        ]

        minhash_rows.append(minhash_sig)
        


In [11]:
# Label each signature row with its movie title. Titles come from
# movie_actor_list, which is assumed to be in the same order as minhash_rows.
movie_titles = [entry["movie"] for entry in movie_actor_list]
minhash_df = pd.DataFrame(minhash_rows, index=movie_titles)
minhash_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
The Other Side of the Wind,64063,213413,51756,31238,16668,108436,24046,168172,58127,146828,27495,96999,163909,64312,60444,59233
November 1828,45332,47998,51595,35681,21,67069,34183,38204,11222,58566,35478,701,14818,37920,29699,2719
The Drive to Win,102805,46818,104378,63696,114424,232973,137885,52116,10508,94942,58653,30333,70723,57428,9763,220997
The Naked Monster,92937,15574,113219,162130,10418,54017,70303,72520,155452,37712,85426,163802,36324,214145,48090,17136
El día de los albañiles 2,34600,240137,22990,77982,38064,115900,234972,74578,129473,25958,142589,5514,25597,6666,52415,43184
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
The Secret of China,54895,12658,28993,23757,91912,6571,20939,25588,66368,5293,6402,18888,115912,55632,27621,67947
Kuambil Lagi Hatiku,43987,121924,179921,67471,30523,102785,112033,9315,12417,128539,212117,251681,66727,170977,117760,257539
Rodolpho Teóphilo - O Legado de um Pioneiro,179931,122225,98779,217441,200417,131108,25581,169482,219240,252974,148701,142194,139595,204941,225992,138995
Dankyavar Danka,11720,69910,9874,32253,87024,139477,7807,21012,13804,15807,26024,55813,55352,137508,102149,126133


In [12]:
# Estimate the Jaccard similarity between one query movie and every movie as
# the fraction of minhash positions on which their signatures agree.
query_row = minhash_df.loc["Star Wars: Episode II - Attack of the Clones"]

# Titles are not guaranteed to be unique in the index; when several movies
# share the title, .loc returns a DataFrame, so use the first matching row.
if isinstance(query_row, pd.DataFrame):
    query_row = query_row.iloc[0]

# Compare the query with all signatures at once (the query's labels align with
# the DataFrame's columns), then average the matches across each row.
match_fraction = minhash_df.eq(query_row, axis="columns").mean(axis=1)

# Same shape as before: a list of (movie title, score) pairs in row order
minh_similarities = list(match_fraction.items())
    

In [13]:
# Order the (title, score) pairs from highest to lowest score and keep the top 20
ranked_by_score = sorted(minh_similarities, key=lambda pair: pair[1], reverse=True)
most_sim_mh = ranked_by_score[:20]
most_sim_mh

[('Star Wars: Episode II - Attack of the Clones', 1.0),
 ('Whales of Atlantis: In Search of Moby Dick', 0.6875),
 ('Necessary Evil: Super-Villains of DC Comics', 0.6875),
 ('Crimson Rivers 2: Angels of the Apocalypse', 0.5),
 ('Hugo', 0.4375),
 ('The Resident', 0.4375),
 ('Extraordinary Tales', 0.4375),
 ('Angels in Notting Hill', 0.375),
 ('Faster', 0.25),
 ('The Island', 0.25),
 ("Troy's Story", 0.25),
 ('Miss Potter', 0.25),
 ('The Final Fix', 0.25),
 ('Perfect Sense', 0.25),
 ('Salmon Fishing in the Yemen', 0.25),
 ('Fastest', 0.25),
 ('Charge', 0.25),
 ('Ice Bear', 0.25),
 ('Star Wars: Episode III - Revenge of the Sith', 0.1875),
 ('Young Adam', 0.1875)]

In [14]:
# Split each signature into bands of `rows` consecutive minhash values.
# Integer division: any leftover columns that do not fill a band are not used.
rows = 4
num_signature_columns = minhash_df.shape[1]
bands = num_signature_columns // rows

# Bucket id -> set of movie titles that landed in that bucket
lsh_buckets = {}

print("Bands:", bands, "Rows:", rows)

Bands: 4 Rows: 4


In [17]:
# Place every movie into one bucket per band. Two movies become candidates for
# comparison when they share a bucket in at least one band.
counter = 0
for idx, row in minhash_df.iterrows():

    for b in range(bands):
        # The slice of this signature that belongs to band b (positional slice)
        band_values = row.iloc[b * rows:(b + 1) * rows]
        segment = ",".join("%d" % d for d in band_values)

        # Each band gets its own bucket space: the band number is part of the
        # key, so identical values in *different* bands of two movies do not
        # count as a collision. Consumers only read lsh_buckets.values().
        bucket = (b, hash(segment) % 2**20)

        lsh_buckets.setdefault(bucket, set()).add(idx)

    counter += 1

    # Progress report every 1000 movies
    if counter % 1000 == 0:
        print(time.asctime(), counter)

Thu Mar  3 15:17:05 2022 1000
Thu Mar  3 15:17:06 2022 2000
Thu Mar  3 15:17:06 2022 3000
Thu Mar  3 15:17:06 2022 4000
Thu Mar  3 15:17:06 2022 5000
Thu Mar  3 15:17:07 2022 6000
Thu Mar  3 15:17:07 2022 7000
Thu Mar  3 15:17:07 2022 8000
Thu Mar  3 15:17:07 2022 9000
Thu Mar  3 15:17:08 2022 10000
Thu Mar  3 15:17:08 2022 11000
Thu Mar  3 15:17:08 2022 12000
Thu Mar  3 15:17:08 2022 13000
Thu Mar  3 15:17:08 2022 14000
Thu Mar  3 15:17:09 2022 15000
Thu Mar  3 15:17:09 2022 16000
Thu Mar  3 15:17:09 2022 17000
Thu Mar  3 15:17:09 2022 18000
Thu Mar  3 15:17:10 2022 19000
Thu Mar  3 15:17:10 2022 20000
Thu Mar  3 15:17:10 2022 21000
Thu Mar  3 15:17:10 2022 22000
Thu Mar  3 15:17:10 2022 23000
Thu Mar  3 15:17:11 2022 24000
Thu Mar  3 15:17:11 2022 25000
Thu Mar  3 15:17:11 2022 26000
Thu Mar  3 15:17:11 2022 27000
Thu Mar  3 15:17:12 2022 28000
Thu Mar  3 15:17:12 2022 29000
Thu Mar  3 15:17:13 2022 30000
Thu Mar  3 15:17:13 2022 31000
Thu Mar  3 15:17:13 2022 32000
Thu Mar  3 15:17:

In [18]:
target_movie = "The Matrix Reloaded"

# Signature of the movie we are searching for
query_row = minhash_df.loc[target_movie]

# Candidates are all titles that share at least one bucket with the target
buckets_with_target = [
    members for members in lsh_buckets.values() if target_movie in members
]
candidates = set().union(*buckets_with_target)

In [19]:
# Number of distinct titles that share at least one LSH bucket with the target movie
len(candidates)

9

In [20]:
# Compare the query signature with each candidate's signature and report the
# candidates that agree with it in at least one minhash position.
for c in candidates:

    # A list indexer always yields a DataFrame, so a title that occurs once and
    # a title shared by several movies go through the same loop.
    candidate_rows = minhash_df.loc[[c]]

    score = 0.0
    for _, signature in candidate_rows.iterrows():
        # Fraction of signature positions that match the query
        score = sum(query_row == signature) / len(signature)
        if score <= 0:
            continue
        print(c)
        print("\t", score)


The Matrix Revolutions
	 1.0
The Matrix Reloaded
	 1.0


In [21]:
# Persist only the bucket memberships (bucket ids are dropped); sets are not
# JSON-serializable, so each one is converted to a list first.
bucket_members = [list(members) for members in lsh_buckets.values()]
with open("lsh_serial.json", "w") as out_file:
    json.dump(bucket_members, out_file)