In [6]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from concurrent.futures import ThreadPoolExecutor, as_completed

### Get a percentage of movies from dataset. (Get movies with most reviews first.)

In [7]:
# Program uses {percentageDatasetUsed} * total movies.
percentageDatasetUsed = 0.01

# Load the datasets
ratings = pd.read_csv('dataset/ratings.csv')
movies = pd.read_csv('dataset/movies.csv')

# Get a percentage of movie ids. Favor movies with more reviews.
movie_rating_counts = ratings['movieId'].value_counts()
sorted_movies = movie_rating_counts.sort_values(ascending=False)
top_movies = sorted_movies.head(int(len(sorted_movies) * percentageDatasetUsed)).index

print("Number of movies:", len(top_movies))

# Ratings of top movies
filtered_ratings = ratings[ratings['movieId'].isin(top_movies)]

Number of movies: 844


In [8]:
# Create sparse matrix of user,movie to rating
row = filtered_ratings['movieId'].astype('category').cat.codes
col = filtered_ratings['userId'].astype('category').cat.codes
data = filtered_ratings['rating']
movie_user_sparse = csr_matrix((data, (row, col)))

# Using movie to movie as cited by the paper. Compute movie similarities
movie_similarity = cosine_similarity(movie_user_sparse)
movie_similarity_df = pd.DataFrame(movie_similarity, index=filtered_ratings['movieId'].unique(), columns=filtered_ratings['movieId'].unique())

# Use the movie similarity matrix to create a sparse matrix for training the KNN model
row, col = np.tril_indices(movie_similarity_df.shape[0], -1)
data = movie_similarity_df.values[row, col]
movie_movie_sparse = csr_matrix((data, (row, col)), shape=movie_similarity_df.shape)

# Split data
train_data, test_data = train_test_split(movie_movie_sparse, test_size=0.2)

In [9]:
nearestNeighbors = 5  # Nearest neighbors to use
knn = NearestNeighbors(n_neighbors=nearestNeighbors, metric='cosine')
knn.fit(train_data)

# Precompute all nearest neighbors
distances, indices = knn.kneighbors(test_data, n_neighbors=nearestNeighbors)
print("Computed all nearest neighbors.")

# Function to compute predicted ratings
def compute_rating(movie_idx, user_idx, test_data, train_data, indices, distances):
    actual_rating = test_data[movie_idx, user_idx]
    neighbor_ratings = train_data[indices[movie_idx], user_idx].toarray().flatten()
    
    weights = 1 - distances[movie_idx]
    weighted_sum = np.dot(weights, neighbor_ratings)
    weight_sum = np.sum(weights)
    
    predicted_rating = weighted_sum / weight_sum if weight_sum > 0 else np.nan
    return actual_rating, predicted_rating

Computed all nearest neighbors.


In [10]:
# Prepare data
test_ratings = []
predicted_ratings = []

# Total number of non-zero entries to process
total_entries = len(test_data.nonzero()[0])

print(f"Starting {total_entries} jobs")
# Using ThreadPoolExecutor to parallelize the loop
with ThreadPoolExecutor() as executor:
    # Submit all jobs to the executor
    futures = [
        executor.submit(compute_rating, idx[0], idx[1], test_data, train_data, indices, distances)
        for idx in zip(*test_data.nonzero())
    ]
    
    # Track progress as the jobs complete
    for i, future in enumerate(as_completed(futures)):
        actual_rating, predicted_rating = future.result()
        test_ratings.append(actual_rating)
        predicted_ratings.append(predicted_rating)
        
        # Print progress every 10%
        if i % (total_entries // 10) == 0:
            print(f"Progress: {i / total_entries * 100:.1f}%")

print("Finished all jobs")

Starting 67414 jobs
Progress: 0.0%
Progress: 10.0%
Progress: 20.0%
Progress: 30.0%
Progress: 40.0%
Progress: 50.0%
Progress: 60.0%
Progress: 70.0%
Progress: 80.0%
Progress: 90.0%
Progress: 100.0%
Finished all jobs


In [None]:
# Denormalize the ratings by multiplying by 10 and rounding to 2 decimal places
denormalized_test_ratings = [round(rating * 10, 2) for rating in test_ratings]
denormalized_predicted_ratings = [round(rating * 10, 2) for rating in predicted_ratings]

firstX = 10
print("\nExample ratings:")
print(f"Test Ratings (first {firstX}):     ", [f"{rating:.2f}" for rating in denormalized_test_ratings[:firstX]])
print(f"Predicted Ratings (first {firstX}):", [f"{rating:.2f}" for rating in denormalized_predicted_ratings[:firstX]])

# Calculate the MSE
mse = mean_squared_error(denormalized_test_ratings, denormalized_predicted_ratings)
print(f"\nMean Squared Error: {mse:.5f}")


Example ratings:
Test Ratings (first 10):      ['1.77', '1.14', '1.42', '2.02', '2.97', '4.03', '1.60', '1.39', '2.12', '2.60']
Predicted Ratings (first 10): ['1.55', '1.38', '1.53', '2.06', '3.28', '2.87', '1.34', '1.48', '2.23', '2.23']

Mean Squared Error on the test set: 0.18604
