In [2]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances

# Load MovieLens small dataset.
# The dataset folder is named once so there is a single place to edit when the
# files live somewhere else; both CSV paths below are derived from it.
DATA_DIR = "/Users/yushiyang/desktop/RecSys-Materials/ml-latest-small"  # Replace with your dataset folder
ratings_data = pd.read_csv(f"{DATA_DIR}/ratings.csv")
movies_data = pd.read_csv(f"{DATA_DIR}/movies.csv")

# Create a user-movie matrix: one row per userId, one column per movieId.
# (user, movie) pairs with no rating are filled with 0 so every row is a
# complete numeric vector.
user_movie_matrix = ratings_data.pivot(index='userId', columns='movieId', values='rating').fillna(0)

# Choose a user for whom you want to find similar users
target_user_id = 1  # Change this to the desired user ID

# Get the movie ratings vector for the target user, reshaped to a single-row
# 2-D array. `.loc` looks the user up by userId label, so an ID that is not in
# the ratings file raises KeyError here.
target_user_vector = user_movie_matrix.loc[target_user_id].values.reshape(1, -1)

#### K-means Clustering to find similar users

In [4]:
# Perform K-Means clustering
num_clusters = 50  # You can adjust the number of clusters
# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it keeps the clustering the same across versions.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
cluster_assignments = kmeans.fit_predict(user_movie_matrix)

# Find users in the same cluster as the target user.
# cluster_assignments holds one label per matrix ROW, so the userId label is
# translated to its row position instead of assuming userId == position + 1.
target_user_position = user_movie_matrix.index.get_loc(target_user_id)
target_user_cluster = cluster_assignments[target_user_position]
similar_users = np.where(cluster_assignments == target_user_cluster)[0]  # row positions, not userIds

# Calculate pairwise distances between the target user and similar users
pairwise_dist = pairwise_distances(target_user_vector, user_movie_matrix.iloc[similar_users], metric='cosine')

# Sort similar users by distance (smallest cosine distance = most similar first).
# pairwise_dist has shape (1, n_similar); it is flattened so that argsort yields
# a 1-D ordering and sorted_similar_users stays a 1-D array of row positions.
# The target user belongs to its own cluster, so it is part of this list too.
sorted_similar_users = similar_users[np.argsort(pairwise_dist.ravel())]

# Translate row positions back to the real userIds taken from the matrix index
similar_user_ids = user_movie_matrix.index[sorted_similar_users].to_numpy()

# Print similar user IDs
print(f"Similar users to user {target_user_id} are: {similar_user_ids}")

# You can also get the movie recommendations for the target user using the similar users
# For example, find the top-rated movie among the similar users
# similar_users_ratings = user_movie_matrix.iloc[sorted_similar_users]
# top_movie = similar_users_ratings.mean(axis=0).idxmax()
# recommended_movie_title = movies_data[movies_data['movieId'] == top_movie]['title'].values[0]
# print(f"Recommended movie for user {target_user_id}: {recommended_movie_title}")



Similar users to user 1 are: [[  1 325 634 341 310 207  35 485 229 403 539 391 276 290 280 470 604 335
   49 477 202 669 545 546 315 320 521 601 167  76 321 618 330 617 424 372
  398 360 198 103 436 141 233  54  90 613 404  27 576 511 196 252 661  70
  551 614 318 419 326 211 257 395 186 467 507 611 606 337 488 193 222 351
  226 154 656 190  25  96 626 482   9 386  98 557 308 579 474  87  81  33
  361  79 453 476 414 429 435 411 469 465 454 397 437 438 406 462 443 444
  445 459 455 446 448 451 399 464 413 526 489 609 610 612 616 622 625 628
  630 631 632 635 636 637 638 640 642 643 644 645 650 651 653 657 663 668
  600 484 591 581 490 491 492 493 498 499 503 504 506 512 515 524 538 540
  541 543 549 552 554 565 566 567 571 573 578 583 393 334 383 112 113 115
  116 117 122 123 127 129 131 132 135 109 140 147 153 156 158 162 166 170
  171 172 173 174 179 142 180 107 100   3   6  10  11  12  13  14  16  18
   24  28  29 106  37  45  46  51  52  53  55  58  60  62  65  71  80  44
  181 183

#### K-means Clustering from Scratch

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load MovieLens small dataset.
# The dataset folder is named once in this cell because the cell loads its own
# copy of the data; both CSV paths below are derived from it.
DATA_DIR = "/Users/yushiyang/desktop/RecSys-Materials/ml-latest-small"  # Replace with your dataset folder
ratings_data = pd.read_csv(f"{DATA_DIR}/ratings.csv")
movies_data = pd.read_csv(f"{DATA_DIR}/movies.csv")

# Create a user-movie interaction matrix (rows: userId, columns: movieId).
# Missing values are filled with 0 (no interaction). A single fillna is enough:
# after it the frame contains no NaN, so a second pass would change nothing.
interaction_matrix = ratings_data.pivot(index='userId', columns='movieId', values='rating').fillna(0)

# Convert the interaction matrix to a numpy array
user_item_matrix = interaction_matrix.values

# K-Means implementation from scratch
def _nearest_centroid(data, centroids):
    """Return, for every row of ``data``, the index of its closest centroid (Euclidean)."""
    # Working one centroid at a time keeps each temporary at
    # (n_samples, n_features) instead of broadcasting to a
    # (n_samples, k, n_features) array.
    distances = np.stack(
        [np.linalg.norm(data - centroid, axis=1) for centroid in centroids],
        axis=1,
    )
    # Ties resolve to the lowest centroid index (np.argmin returns the first minimum).
    return np.argmin(distances, axis=1)


def kmeans(data, k, max_iters=100, random_state=None):
    """Cluster the rows of ``data`` into ``k`` groups with Lloyd's algorithm.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Points to cluster; converted to a float ndarray.
    k : int
        Number of clusters, ``1 <= k <= n_samples``.
    max_iters : int, default 100
        Maximum number of centroid updates.
    random_state : int or None, default None
        Seed for choosing the initial centroids. With ``None`` the initial
        rows are drawn from NumPy's global random state (``np.random``), so an
        earlier ``np.random.seed(...)`` call still controls the result.

    Returns
    -------
    centroids : ndarray of shape (k, n_features)
        Final cluster centres.
    labels : ndarray of shape (n_samples,)
        Index of the nearest returned centroid for every row of ``data``.

    Raises
    ------
    ValueError
        If ``data`` is not 2-D or ``k`` is outside ``[1, n_samples]``.
    """
    data = np.asarray(data, dtype=float)
    if data.ndim != 2:
        raise ValueError("data must be a 2-D array of shape (n_samples, n_features)")
    n_samples = data.shape[0]
    if not 1 <= k <= n_samples:
        raise ValueError(f"k must be between 1 and n_samples={n_samples}, got {k}")

    # Initial centroids are k distinct rows of the data.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    centroids = data[rng.choice(n_samples, k, replace=False)]

    # Assign before the loop so labels exist even when max_iters == 0.
    labels = _nearest_centroid(data, centroids)

    for _ in range(max_iters):
        # Update centroids. A cluster that lost all of its points keeps its
        # previous centroid; averaging an empty slice would produce NaN and
        # the equality-based convergence check could never succeed afterwards.
        new_centroids = centroids.copy()
        for cluster_id in range(k):
            members = data[labels == cluster_id]
            if len(members) > 0:
                new_centroids[cluster_id] = members.mean(axis=0)

        # Check for convergence: stop once an update leaves every centroid unchanged.
        if np.array_equal(new_centroids, centroids):
            break

        centroids = new_centroids
        # Re-assign right after the update so the returned labels always
        # correspond to the returned centroids, even when max_iters runs out.
        labels = _nearest_centroid(data, centroids)

    return centroids, labels

# Set the number of clusters (K)
k = 5

# kmeans picks its starting centroids at random from NumPy's global random
# state; seeding it first makes the clustering repeat on every run.
np.random.seed(42)

# Run K-Means
centroids, labels = kmeans(user_item_matrix, k)

# Display cluster sizes.
# minlength=k keeps one entry per cluster, so a cluster with no users is
# reported as 0 instead of being dropped from the end of the count array.
cluster_sizes = np.bincount(labels, minlength=k)
for cluster_id, size in enumerate(cluster_sizes):
    print(f"Cluster {cluster_id}: {size} users")