In [1]:
import numpy as np
import pandas as pd
import time
from datasketch import MinHash, MinHashLSHForest
from collections import Counter

#### Minhash + LSH to find similar users

In [2]:
# minhash each user's data separatedly and then aggregate

def minhash_for_one_user(movie_names, num_permutations, seed):
    # Set seed for the permutations to ensure consistent comparison across users
    m = MinHash(num_perm=num_permutations, seed = seed)
    for movie_name in movie_names:
            m.update(movie_name.encode('utf8'))
    return m

def aggregate_minhashes(data, num_permutations, seed):
     start_time = time.time()
     minhashes = []
     # Aggregate all users' minhashes
     for movie_names in data['title']:
         one_user_minhash = minhash_for_one_user(movie_names, num_permutations, seed)
         minhashes.append(one_user_minhash)

     print('It took %s seconds to create the minhashes for all users and aggregate them.' %(time.time()-start_time))
    
     return minhashes
     
def get_lsh_forest(data, num_permutations, seed):
    start_time = time.time()
    
    minhashes = aggregate_minhashes(data, num_permutations, seed)
    # Use the LSH Forest algorithm for efficient search of neighbours
    # Build a forest of all the MinHashed strings
    forest = MinHashLSHForest(num_perm=num_permutations)
 
    for i, minmash in enumerate(minhashes):
        forest.add(i, minmash)

    # Index the forest to make it searchable      
    forest.index()

    print('It took %s seconds to build forest.' %(time.time()-start_time))
    
    return forest

In [30]:
def predict(rated_movies, database, num_permutations, seed, num_results, forest):
    start_time = time.time()
    
    # The test user 
    m = MinHash(num_perm=num_permutations, seed=seed)
    for movie_name in rated_movies:
        m.update(movie_name.encode('utf8'))
        
    idx_array = np.array(forest.query(m, num_results))
    if len(idx_array) == 0:
        print('\n We couldn not find similar users to your favourite movies. \n')
        return None
    
    similar_users = database.iloc[idx_array].reset_index(drop=True)[['userId', 'title']]
    
    print('It took %s seconds to query forest.' % (time.time() - start_time))
    
    return similar_users

In [1]:
import pandas as pd
from datasketch import MinHash, MinHashLSHForest

# Load ratings data
ratings_data = pd.read_csv("/Users/yushiyang/desktop/RecSys-Materials/ml-latest-small/ratings.csv")  # Replace with your file path

# Load movies data
movies_data = pd.read_csv("/Users/yushiyang/desktop/RecSys-Materials/ml-latest-small/movies.csv")  # Replace with your file path

# Merge ratings and movies data to get movie names for each user
data = ratings_data.merge(movies_data, on='movieId')

# Group movie names by user
grouped_data = data.groupby('userId')['title'].apply(list).reset_index()

print(grouped_data)

     userId                                              title
0         1  [Dangerous Minds (1995), Dumbo (1941), Sleeper...
1         2  [GoldenEye (1995), Sense and Sensibility (1995...
2         3  [Braveheart (1995), Pulp Fiction (1994), Forre...
3         4  [Star Trek: The Motion Picture (1979), French ...
4         5  [Antz (1998), Clueless (1995), Apollo 13 (1995...
..      ...                                                ...
666     667  [Sense and Sensibility (1995), Braveheart (199...
667     668  [Pulp Fiction (1994), Silence of the Lambs, Th...
668     669  [French Connection, The (1971), Clerks (1994),...
669     670  [Seven (a.k.a. Se7en) (1995), Usual Suspects, ...
670     671  [Blazing Saddles (1974), Usual Suspects, The (...

[671 rows x 2 columns]


In [4]:
num_permutations = 256
num_recommendations = 10
my_seed = 1000

In [5]:
forest = get_lsh_forest(grouped_data, num_permutations, my_seed)

It took 0.863814115524292 seconds to create the minhashes for all users and aggregate them.
[<datasketch.minhash.MinHash object at 0x132674050>, <datasketch.minhash.MinHash object at 0x10ebe7e10>, <datasketch.minhash.MinHash object at 0x10d637350>, <datasketch.minhash.MinHash object at 0x132676550>, <datasketch.minhash.MinHash object at 0x132750c10>, <datasketch.minhash.MinHash object at 0x132824350>, <datasketch.minhash.MinHash object at 0x132824390>, <datasketch.minhash.MinHash object at 0x1328242d0>, <datasketch.minhash.MinHash object at 0x132824410>, <datasketch.minhash.MinHash object at 0x10e7388d0>, <datasketch.minhash.MinHash object at 0x132824110>, <datasketch.minhash.MinHash object at 0x132824450>, <datasketch.minhash.MinHash object at 0x1328244d0>, <datasketch.minhash.MinHash object at 0x132824510>, <datasketch.minhash.MinHash object at 0x1328243d0>, <datasketch.minhash.MinHash object at 0x10dd64f90>, <datasketch.minhash.MinHash object at 0x132824550>, <datasketch.minhash.Min

In [31]:
test_user = ['Titanic (1997)'
    'Toy Story (1995)',
    'Inception (2010)',
    'The Hunger Games (2012)',
    'Ice Age 4: Continental Drift (2012)',
    'Gone Girl (2014)',
    'Harry Potter and the Deathly Hallows: Part 1 (2010)',
    'Winnie the Pooh (2011)',
    'Frozen (2013)']
similar_users = predict(test_user, grouped_data, num_permutations, my_seed, num_recommendations, forest)
print('\n Most similar User(s) are \n', similar_users)

It took 0.0074880123138427734 seconds to query forest.

 Most similar User(s) are 
    userId                                              title
0     578  [Sense and Sensibility (1995), Usual Suspects,...


In [43]:
# Test of validity of returned users by using an existing user as the basis for a test user
first_user = ['Dangerous Minds (1995)', 'Dumbo (1941)', 'Sleepers (1996)', 'Escape from New York (1981)', 'Cinema Paradiso (Nuovo cinema Paradiso) (1989)', 'Deer Hunter, The (1978)', 'Ben-Hur (1959)', 'Gandhi (1982)', "Dracula (Bram Stoker's Dracula) (1992)", 'Cape Fear (1991)', 'Star Trek: The Motion Picture (1979)', 'Beavis and Butt-Head Do America (1996)', 'Tron (1982)', 'Willow (1988)', 'Antz (1998)', 'Fly, The (1986)']
similar_users = predict(first_user, grouped_data, num_permutations, my_seed, num_recommendations, forest)
print('\n Most similar User(s) are \n', similar_users)

It took 0.003210306167602539 seconds to query forest.

 Most similar User(s) are 
    userId                                              title
0       1  [Dangerous Minds (1995), Dumbo (1941), Sleeper...
1     290  [Dracula (Bram Stoker's Dracula) (1992), Star ...
2      35  [Dumbo (1941), Tron (1982), Time Bandits (1981...
3     325  [Dangerous Minds (1995), Sleepers (1996), Esca...
4       9  [Antz (1998), Sense and Sensibility (1995), Se...
5     618  [Dracula (Bram Stoker's Dracula) (1992), Usual...
6     207  [Ben-Hur (1959), Dracula (Bram Stoker's Dracul...
7     337  [Dracula (Bram Stoker's Dracula) (1992), Termi...
8     310  [Dangerous Minds (1995), Dracula (Bram Stoker'...
9     634  [Escape from New York (1981), Gandhi (1982), S...


#### Return Top 10 popular movies based on similar users

In [47]:
# Flatten the list of titles excluding movies seen by test user
remaining_movie_titles = [movie for movies_list in similar_users['title'] for movie in movies_list if movie not in first_user]

# Count the occurrences of each remaining movie title
remaining_movie_counts = Counter(remaining_movie_titles)

# Sort remaining movies based on counts in descending order
sorted_remaining_movies = sorted(remaining_movie_counts.items(), key=lambda x: x[1], reverse=True)

# Print the ranked top 10 remaining movies
print("Top 10 Ranked Remaining Movies:")
for rank, (movie, count) in enumerate(sorted_remaining_movies[:10], start=1):
    print(f"Rank {rank}: {movie} - Count: {count}")

Top 10 Ranked Remaining Movies:
Rank 1: Ronin (1998) - Count: 4
Rank 2: My Fair Lady (1964) - Count: 3
Rank 3: Beverly Hills Cop (1984) - Count: 3
Rank 4: Junior (1994) - Count: 3
Rank 5: Seven (a.k.a. Se7en) (1995) - Count: 2
Rank 6: Usual Suspects, The (1995) - Count: 2
Rank 7: Star Wars: Episode IV - A New Hope (1977) - Count: 2
Rank 8: Star Trek: Generations (1994) - Count: 2
Rank 9: Godfather, The (1972) - Count: 2
Rank 10: Star Trek VI: The Undiscovered Country (1991) - Count: 2


#### Encrypt movie names locally, then aggregate to compute ranking - diffferent results due to encryption/decryption

In [62]:
# Create a symmetric key for encryption and decryption
key = Fernet.generate_key()
cipher_suite = Fernet(key)

# Encrypt the movie titles
encrypted_movie_titles = [cipher_suite.encrypt(movie.encode()) for movies_list in similar_users['title'] for movie in movies_list if movie not in first_user]

# Count the occurrences of each encrypted remaining movie title
encrypted_movie_counts = Counter(encrypted_movie_titles)

# Sort remaining movies based on counts in descending order
sorted_remaining_movies = sorted(encrypted_movie_counts.items(), key=lambda x: x[1], reverse=True)

# Decrypt the top 10 remaining movies and print the decrypted results
print("Top 10 Ranked Remaining Movies:")
for rank, (encrypted_movie, count) in enumerate(sorted_remaining_movies[:10], start=1):
    decrypted_movie = cipher_suite.decrypt(encrypted_movie).decode()
    print(f"Rank {rank}: {decrypted_movie} - Count: {count}")

Top 10 Ranked Remaining Movies:
Rank 1: GoldenEye (1995) - Count: 1
Rank 2: Seven (a.k.a. Se7en) (1995) - Count: 1
Rank 3: Usual Suspects, The (1995) - Count: 1
Rank 4: Apollo 13 (1995) - Count: 1
Rank 5: Die Hard: With a Vengeance (1995) - Count: 1
Rank 6: Fugitive, The (1993) - Count: 1
Rank 7: Aristocats, The (1970) - Count: 1
Rank 8: Fahrenheit 9/11 (2004) - Count: 1
Rank 9: Star Wars: Episode IV - A New Hope (1977) - Count: 1
Rank 10: Star Trek: Generations (1994) - Count: 1


In [1]:
SYFT_VERSION = ">=0.8.2.b0,<0.9"
import syft as sy
sy.requires(SYFT_VERSION)



✅ The installed version of syft==0.8.2b4 matches the requirement >=0.8.2b0 and the requirement <0.9


In [2]:
node = sy.orchestra.launch(name="my-domain", port=8080, dev_mode=True, reset=True)

NameError: name 'NodeType' is not defined

In [3]:
domain_client = sy.login(port=8080, email="info@openmined.org", password="changethis")



ConnectionError: HTTPConnectionPool(host='localhost', port=8080): Max retries exceeded with url: /api/v2/metadata (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x164f12a50>: Failed to establish a new connection: [Errno 61] Connection refused'))