In [1]:
import numpy as np
import pandas as pd
import time
from datasketch import MinHash, MinHashLSHForest

In [13]:
def get_lsh_forest(data, num_permutations, seed):
    start_time = time.time()

    # Compute MinHash signatures
    minhash = []
    for movie_names in data['title']:
        # Ensured the same permutation for all users
        m = MinHash(num_perm=num_permutations, seed=seed)
        # MinHash all the movie names (shingles) for this user 
        for movie_name in movie_names:
            m.update(movie_name.encode('utf8'))
        minhash.append(m)

    # Build a forest of all the MinHashed strings
    forest = MinHashLSHForest(num_perm=num_permutations)
 
    for i, minmash in enumerate(minhash):
        forest.add(i, minmash)

    # Index the forest to make it searchable      
    forest.index()

    print('It took %s seconds to build forest.' %(time.time()-start_time))
    
    return forest

In [14]:
def predict(rated_movies, database, num_permutations, seed, num_results, forest):
    start_time = time.time()
    
    m = MinHash(num_perm=num_permutations, seed=seed)
    for movie_name in rated_movies:
        m.update(movie_name.encode('utf8'))
        
    idx_array = np.array(forest.query(m, num_results))
    if len(idx_array) == 0:
        print('\n We couldn not find similar users to your favourite movies. \n')
        return None
    
    similar_users = database.iloc[idx_array].reset_index(drop=True)[['userId', 'title']]
    
    print('It took %s seconds to query forest.' % (time.time() - start_time))
    
    return similar_users

In [4]:
import pandas as pd
from datasketch import MinHash, MinHashLSHForest

# Load ratings data
ratings_data = pd.read_csv("/Users/yushiyang/desktop/RecSys-Materials/ml-latest-small/ratings.csv")  # Replace with your file path

# Load movies data
movies_data = pd.read_csv("/Users/yushiyang/desktop/RecSys-Materials/ml-latest-small/movies.csv")  # Replace with your file path

# Merge ratings and movies data to get movie names for each user
data = ratings_data.merge(movies_data, on='movieId')

# Group movie names by user
grouped_data = data.groupby('userId')['title'].apply(list).reset_index()

print(grouped_data)

     userId                                              title
0         1  [Dangerous Minds (1995), Dumbo (1941), Sleeper...
1         2  [GoldenEye (1995), Sense and Sensibility (1995...
2         3  [Braveheart (1995), Pulp Fiction (1994), Forre...
3         4  [Star Trek: The Motion Picture (1979), French ...
4         5  [Antz (1998), Clueless (1995), Apollo 13 (1995...
..      ...                                                ...
666     667  [Sense and Sensibility (1995), Braveheart (199...
667     668  [Pulp Fiction (1994), Silence of the Lambs, Th...
668     669  [French Connection, The (1971), Clerks (1994),...
669     670  [Seven (a.k.a. Se7en) (1995), Usual Suspects, ...
670     671  [Blazing Saddles (1974), Usual Suspects, The (...

[671 rows x 2 columns]


In [15]:
num_permutations = 256
num_recommendations = 10
my_seed = 1000

In [20]:
forest = get_lsh_forest(grouped_data, num_permutations, my_seed)

It took 0.8607230186462402 seconds to build forest.


In [21]:
test_user = ['Titanic (1997)'
    'Toy Story (1995)',
    'Inception (2010)',
    'The Hunger Games (2012)',
    'Ice Age 4: Continental Drift (2012)',
    'Gone Girl (2014)',
    'Harry Potter and the Deathly Hallows: Part 1 (2010)',
    'Winnie the Pooh (2011)',
    'Frozen (2013)']
similar_users = predict(test_user, grouped_data, num_permutations, my_seed, num_recommendations, forest)
print('\n Most similar User(s) are \n', similar_users)

It took 0.007055997848510742 seconds to query forest.

 Most similar User(s) are 
    userId                                              title
0     578  [Sense and Sensibility (1995), Usual Suspects,...


In [23]:
# Test of validity of returned users by using an existing user as test user
first_user = ['Dangerous Minds (1995)', 'Dumbo (1941)', 'Sleepers (1996)', 'Escape from New York (1981)', 'Cinema Paradiso (Nuovo cinema Paradiso) (1989)', 'Deer Hunter, The (1978)', 'Ben-Hur (1959)', 'Gandhi (1982)', "Dracula (Bram Stoker's Dracula) (1992)", 'Cape Fear (1991)', 'Star Trek: The Motion Picture (1979)', 'Beavis and Butt-Head Do America (1996)', 'French Connection, The (1971)', 'Tron (1982)', 'Gods Must Be Crazy, The (1980)', 'Willow (1988)', 'Antz (1998)', 'Fly, The (1986)', 'Time Bandits (1981)', 'Blazing Saddles (1974)']
similar_users = predict(first_user, grouped_data, num_permutations, my_seed, num_recommendations, forest)
print('\n Most similar User(s) are \n', similar_users)

It took 0.00601506233215332 seconds to query forest.

 Most similar User(s) are 
    userId                                              title
0       1  [Dangerous Minds (1995), Dumbo (1941), Sleeper...
1     290  [Dracula (Bram Stoker's Dracula) (1992), Star ...
2      35  [Dumbo (1941), Tron (1982), Time Bandits (1981...
3     325  [Dangerous Minds (1995), Sleepers (1996), Esca...
4       9  [Antz (1998), Sense and Sensibility (1995), Se...
5     618  [Dracula (Bram Stoker's Dracula) (1992), Usual...
6     207  [Ben-Hur (1959), Dracula (Bram Stoker's Dracul...
7     337  [Dracula (Bram Stoker's Dracula) (1992), Termi...
8     310  [Dangerous Minds (1995), Dracula (Bram Stoker'...
9     634  [Escape from New York (1981), Gandhi (1982), S...
