# MinHashing Example

Using the IMDB data, create different feature matrices and demonstrate performance improvement when using minhash signatures as features rather than the full set of actors.

In [1]:
%matplotlib inline

In [2]:
import json
import random
import pandas as pd

from scipy.sparse import lil_matrix

from sklearn.metrics import jaccard_score

In [3]:
# For quick testing, set `abridge` to True and pick a small `dataset_limit`.
# MinHash can also be run without abridging your data (set `abridge` to False).
# Note: the data-loading code stops reading only once *more than*
# `dataset_limit` movies have been processed.
abridge = True
dataset_limit = 10000

In [4]:
actor_id_to_name_map = {}     # Map actor IDs to actor names
actor_id_to_index_map = {}    # Map actor IDs to a unique index of known actors
index_to_actor_ids = []       # Array mapping unique index back to actor ID (inverse of actor_id_to_index_map)

index_counter = 0    # Unique actor index; increment for each new actor
known_actors = set()

movie_actor_map = {} # dict of all our movies and their actors, keyed by IMDB ID

test_count = 0

# The input is read as JSON lines: one movie object per line. The encoding is
# pinned to UTF-8 so that non-ASCII actor names and titles decode the same way
# on every platform instead of depending on the locale's default encoding.
with open("../data/imdb_movies_2000to2022.prolific.json", "r", encoding="utf-8") as in_file:
    for line in in_file:

        # Tolerate blank lines (e.g. a trailing newline at the end of the file)
        # rather than letting json.loads() raise on them.
        if not line.strip():
            continue

        this_movie = json.loads(line)

        # Keep track of all the actors in this movie
        for actor_id,actor_name in this_movie['actors']:

            # Keep names and IDs
            actor_id_to_name_map[actor_id] = actor_name

            # If we've seen this actor before, skip...
            if actor_id in known_actors:
                continue

            # ... Otherwise, add to known actor set and create new index for them
            known_actors.add(actor_id)
            actor_id_to_index_map[actor_id] = index_counter
            index_to_actor_ids.append(actor_id)
            index_counter += 1


        # Finished with this film: store its title, the set of its actor IDs,
        # and its genres under the movie's IMDB ID
        movie_actor_map[this_movie["imdb_id"]] = ({
            "movie": this_movie["title"],
            "actors": {a[0] for a in this_movie['actors']},
            "genres": this_movie["genres"]
        })

        # If abridged, test for limit.
        # The check runs after the movie has been stored and uses `>`, so an
        # abridged run keeps dataset_limit + 1 movies.
        test_count += 1
        if abridge and test_count > dataset_limit:
            break

In [5]:
# Report how many distinct actors were indexed while loading the movies
known_actor_count = len(known_actors)
print("Known Actors:", known_actor_count)

Known Actors: 18324


## Convert to MinHash Reduced Dimension

For min-hashing, we first generate a number of random permutations of the actor columns. Each permutation defines one "min hash": the position of the first non-zero element in the permuted column order (i.e., after shuffling the columns, find the smallest permuted column index that has a non-zero element for the movie).

Note that, here, we _never_ store the full matrix of Movies X Actors.

In [17]:
permutations = []

# How many hashing functions to use
#  What happens as you change this number?
#  Larger values make things slower but provide better Jaccard estimates
num_minhashes = 10

# Use a dedicated, seeded generator so that the permutations (and therefore
# every signature and similarity score computed from them) are identical on
# each run, without touching the state of the global `random` module.
minhash_seed = 42
permutation_rng = random.Random(minhash_seed)

# Generate permutations for hashing d-dimensional features into k minhash dimensions
num_columns = len(known_actors)
for _ in range(num_minhashes):
    column_indices = list(range(num_columns))

    # Shuffle the original indices
    permutation_rng.shuffle(column_indices)

    # Generate a map of the actual index to its permuted index:
    # after shuffling, the value at position p is the original column index
    # that now sits at permuted position p.
    permutations.append({
        original_index: permuted_position
        for permuted_position, original_index in enumerate(column_indices)
    })

In [18]:
minhash_rows = []

# Signature value used for a movie with an empty actor set. Real permuted
# indices run from 0 to (number of known actors - 1), so this value can never
# collide with a genuine min-hash. Two actor-less movies therefore agree on
# every hash, and an actor-less movie never matches a movie that has actors.
empty_set_signature = len(actor_id_to_index_map)

# Build one signature per movie directly from the movies already loaded into
# `movie_actor_map`, in insertion order. This keeps the rows aligned
# one-to-one with any index derived from `movie_actor_map` and avoids parsing
# the input file (and re-applying the abridge limit) a second time.
for this_movie in movie_actor_map.values():

    # Column indices of the actors appearing in this movie
    this_movie_actor_indices = [
        actor_id_to_index_map[actor_id] for actor_id in this_movie["actors"]
    ]

    # For each permutation, the min-hash is the smallest permuted index among
    # this movie's actors, i.e. the first non-zero column after permuting.
    minhash_sig = [
        min(
            (permutation[i] for i in this_movie_actor_indices),
            default=empty_set_signature,
        )
        for permutation in permutations
    ]

    minhash_rows.append(minhash_sig)

# Keep `test_count` meaningful for any later use: number of movies processed
test_count = len(minhash_rows)

In [20]:
# Label each signature row with the title of the corresponding movie
# (titles are taken from movie_actor_map in its insertion order).
movie_titles = [entry["movie"] for entry in movie_actor_map.values()]

# One row per movie, one column per min-hash function
minhash_df = pd.DataFrame(minhash_rows, index=movie_titles)
minhash_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Kate & Leopold,9370,7061,3145,6489,911,1694,303,11983,1585,4074
The Naked Monster,450,3900,9417,8065,3608,741,3508,298,1343,7908
Crime and Punishment,8211,202,10156,2491,2356,226,5411,948,686,555
For the Cause,7987,1275,1801,640,7440,6371,4608,269,3363,3074
Gang,1273,149,6866,1037,1219,108,1525,3314,5652,5519
...,...,...,...,...,...,...,...,...,...,...
H20: Tremont's Deathmatch Tournament,380,2362,1653,2618,4846,902,3186,2455,5392,1368
Daydream Nation,7916,4413,3352,7284,9477,6564,344,2164,2091,910
American Boy,5003,9044,8959,5479,2319,1971,4313,2155,1422,6869
Big Fat Gypsy Gangster,690,1386,1004,5776,8446,763,4841,1941,5451,2461


In [21]:
# Movie whose signature every other movie is compared against
query_title = "Star Wars: Episode II - Attack of the Clones"
query_row = minhash_df.loc[query_title]

# The frame is indexed by title, and titles are not guaranteed to be unique.
# When several movies share the query title, `.loc` returns a DataFrame; in
# that case use the first matching movie as the query.
if isinstance(query_row, pd.DataFrame):
    query_row = query_row.iloc[0]

# Estimated Jaccard similarity = fraction of min-hash columns on which a movie
# agrees with the query. The comparison is done for all rows at once instead
# of looping over the frame row by row.
match_fraction = minhash_df.eq(query_row, axis="columns").mean(axis=1)

# List of (movie title, similarity score) pairs, in the frame's row order
minh_similarities = list(match_fraction.items())
    

In [22]:
# Rank all (title, score) pairs from most to least similar and keep the top 20.
# The sort is stable, so movies with equal scores keep their original order.
ranked_similarities = list(minh_similarities)
ranked_similarities.sort(key=lambda pair: pair[1], reverse=True)

most_sim_mh = ranked_similarities[:20]
most_sim_mh

[('Star Wars: Episode II - Attack of the Clones', 1.0),
 ('Star Wars: Episode III - Revenge of the Sith', 0.7),
 ('Awake', 0.3),
 ('Quantum Quest: A Cassini Space Odyssey', 0.3),
 ('New York, I Love You', 0.3),
 ('Vanishing on 7th Street', 0.3),
 ('Outcast', 0.3),
 ('Nora', 0.2),
 ('Black Hawk Down', 0.2),
 ('Big Fish', 0.2),
 ('Garden State', 0.2),
 ('Robots', 0.2),
 ('Faster', 0.2),
 ('Factory Girl', 0.2),
 ('V for Vendetta', 0.2),
 ('Virgin Territory', 0.2),
 ('Free Zone', 0.2),
 ('Jumper', 0.2),
 ('Miles Ahead', 0.2),
 ("Cassandra's Dream", 0.2)]