In [1]:
import pandas as pd
from scipy import sparse
import json

In [2]:
# Load ratings from JSON file instead of CSV
with open('users_data.json', 'r') as f:
    users_data = json.load(f)

# Load movies data to get movie titles
movies = pd.read_csv('dataset/movies.csv')

# Convert JSON ratings to DataFrame format similar to original CSV
ratings_data = []
for username, user_info in users_data['users'].items():
    for movie_id, rating in user_info['ratings'].items():
        ratings_data.append({
            'userId': username,  # Using username instead of numeric ID
            'movieId': int(movie_id),
            'rating': rating
        })

ratings = pd.DataFrame(ratings_data)

# Merge with movies to get titles (note: using 'id' column from movies.csv)
ratings = pd.merge(movies, ratings, left_on='id', right_on='movieId').drop(['genre','movieId'], axis=1)

print(ratings.shape)
ratings.head()

(393, 10)


Unnamed: 0,id,title,original_language,overview,popularity,release_date,vote_average,vote_count,userId,rating
0,278,The Shawshank Redemption,en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862,op,10
1,278,The Shawshank Redemption,en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862,jin,10
2,278,The Shawshank Redemption,en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862,juan,1
3,278,The Shawshank Redemption,en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862,dude,1
4,19404,Dilwale Dulhania Le Jayenge,hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731,yy,1


In [3]:
userRatings = ratings.pivot_table(index=['userId'],columns=['title'],values='rating')
userRatings.head()
print("Before: ",userRatings.shape)
# For JSON data, we don't filter by threshold since we have fewer but more meaningful ratings
# userRatings = userRatings.dropna(thresh=10, axis=1).fillna(0,axis=1)
userRatings.fillna(0, inplace=True)
print("After: ",userRatings.shape)

Before:  (54, 162)
After:  (54, 162)


In [4]:
corrMatrix = userRatings.corr(method='pearson')
corrMatrix.head()

title,(500) Days of Summer,10 Things I Hate About You,28 Days Later,A Quiet Place Part II,A Silent Voice: The Movie,A Whisker Away,Alien,Anatomy of a Murder,Avatar,Avengers: Age of Ultron,...,Us Again,Violet Evergarden: The Movie,Weathering with You,What Ever Happened to Baby Jane?,Wild Tales,Wolf Children,X-Men: Days of Future Past,Your Eyes Tell,Your Name.,Zack Snyder's Justice League
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(500) Days of Summer,1.0,1.0,-0.03881,-0.038851,-0.039721,-0.018868,-0.03881,-0.0269,-0.018868,-0.026939,...,-0.018868,-0.057098,-0.033171,-0.026768,-0.018868,-0.018868,-0.03327,0.700404,-0.110479,-0.032927
10 Things I Hate About You,1.0,1.0,-0.03881,-0.038851,-0.039721,-0.018868,-0.03881,-0.0269,-0.018868,-0.026939,...,-0.018868,-0.057098,-0.033171,-0.026768,-0.018868,-0.018868,-0.03327,0.700404,-0.110479,-0.032927
28 Days Later,-0.03881,-0.03881,1.0,0.998937,-0.081704,-0.03881,1.0,-0.055332,-0.03881,-0.055411,...,-0.03881,-0.117447,-0.06823,0.673417,-0.03881,-0.03881,-0.068434,-0.055411,-0.211436,-0.067728
A Quiet Place Part II,-0.038851,-0.038851,0.998937,1.0,-0.081791,-0.038851,0.998937,-0.05539,-0.038851,-0.05547,...,-0.038851,-0.117572,-0.068302,0.688973,-0.038851,-0.038851,-0.068507,-0.05547,-0.212057,-0.0678
A Silent Voice: The Movie,-0.039721,-0.039721,-0.081704,-0.081791,1.0,0.510265,-0.081704,-0.05663,-0.039721,-0.056712,...,0.510265,0.453563,0.546118,-0.056352,-0.039721,-0.039721,-0.070041,-0.056712,0.366181,-0.069318


In [5]:
def get_similar(movie_name,rating):
    similar_ratings = corrMatrix[movie_name]*(rating-5.5)  # Adjusted center for 1-10 scale
    similar_ratings = similar_ratings.sort_values(ascending=False)
    return similar_ratings

In [6]:
# Test with anime/animation lover profile using our actual popular movies
anime_lover = [("Your Name.",10),("Spider-Man: Into the Spider-Verse",9),("Spirited Away",10),("Violet Evergarden: The Movie",8)]
similar_movies = pd.DataFrame()
for movie,rating in anime_lover:
    if movie in userRatings.columns:  # Check if movie exists
        similar_movies = pd.concat([similar_movies, get_similar(movie,rating).to_frame().T], ignore_index=True)

similar_movies.head(10)

title,Your Name.,Violet Evergarden: The Movie,Spirited Away,Steins;Gate: The Movie - Load Region of Déjà Vu,Rascal Does Not Dream of a Dreaming Girl,I Want to Eat Your Pancreas,"Josee, the Tiger and the Fish",A Silent Voice: The Movie,Weathering with You,Sword Art Online: The Movie – Ordinal Scale,...,Nobody,28 Days Later,Alien,A Quiet Place Part II,Justice League Dark: Apokolips War,The Long Goodbye,Pulp Fiction,Spider-Man: Far From Home,Se7en,Parasite
0,4.5,2.529692,2.344998,2.336864,1.918406,1.68474,1.681169,1.647812,1.40985,1.199331,...,-0.944537,-0.951464,-0.951464,-0.954257,-0.98513,-1.091128,-1.094761,-1.155096,-1.214305,-1.216817
1,0.671031,-0.177851,0.67798,-0.159719,0.098774,0.11285,0.128989,0.132017,-0.445613,0.605548,...,-0.519679,-0.521374,-0.521374,-0.521929,0.168995,-0.588919,-0.587124,1.580303,-0.650052,-0.651397
2,2.344998,1.367097,4.5,2.205635,0.233189,-0.536833,-0.536262,-0.548851,-0.458338,0.897078,...,-0.534519,-0.536262,-0.536262,-0.536833,-0.599889,-0.605735,-0.603889,-0.605735,-0.668615,-0.669998
3,1.405384,2.5,0.759498,1.621381,1.932355,1.167984,1.156043,1.133907,0.87833,0.509727,...,-0.292663,-0.293617,-0.293617,-0.29393,-0.328455,-0.331655,-0.330645,-0.331655,-0.366083,-0.366841


In [7]:
similar_movies.sum().sort_values(ascending=False).head(20)

title
Your Name.                                         8.921414
Spirited Away                                      8.282477
Violet Evergarden: The Movie                       6.218937
Steins;Gate: The Movie - Load Region of Déjà Vu    6.004161
Spider-Man: Into the Spider-Verse                  5.107407
Rascal Does Not Dream of a Dreaming Girl           4.182724
Sword Art Online: The Movie – Ordinal Scale        3.211684
Josee, the Tiger and the Fish                      2.429939
I Want to Eat Your Pancreas                        2.428740
A Silent Voice: The Movie                          2.364886
Coco                                               1.890503
Spider-Man: Homecoming                             1.753637
Weathering with You                                1.384229
A Whisker Away                                     0.972893
Ride Your Wave                                     0.972893
The Garden of Words                                0.972893
Millennium Actress                

In [8]:
# Test with superhero/action lover profile using our actual popular movies
action_lover = [("Spider-Man: No Way Home",10),("Avengers: Endgame",9),("The Dark Knight",10),("Avengers: Infinity War",8)]
similar_movies = pd.DataFrame()
for movie,rating in action_lover:
    if movie in userRatings.columns:  # Check if movie exists
        similar_movies = pd.concat([similar_movies, get_similar(movie,rating).to_frame().T], ignore_index=True)

similar_movies.head(10)
similar_movies.sum().sort_values(ascending=False).head(20)

title
Avengers: Endgame                         7.964257
Avengers: Infinity War                    7.381418
Captain America: Civil War                7.305559
X-Men: Days of Future Past                7.195013
Captain America: The Winter Soldier       7.169166
Avengers: Age of Ultron                   6.481438
The Suicide Squad                         6.474189
Thor: Ragnarok                            6.468229
Guardians of the Galaxy                   6.432079
Spider-Man: No Way Home                   6.139969
The Empire Strikes Back                   6.068254
Star Wars                                 6.068254
The Avengers                              5.991861
The Dark Knight                           5.982149
Justice League Dark: Apokolips War        5.823627
Avatar                                    4.591689
Iron Man                                  4.591689
The Tomorrow War                          4.487563
Everything Everywhere All at Once         4.487563
Justice League: The Flash

In [9]:
# Alternative: Use movies that definitely exist in our dataset
print("Top 10 most rated movies in our data that we can use for testing:")
movie_counts = (userRatings > 0).sum().sort_values(ascending=False).head(10)
for i, (movie, count) in enumerate(movie_counts.items(), 1):
    avg_rating = userRatings[movie][userRatings[movie] > 0].mean()
    print(f"{i}. '{movie}' ({count} users, avg: {avg_rating:.1f}/10)")

print(f"\nDataset stats:")
print(f"Users: {userRatings.shape[0]}")
print(f"Movies: {userRatings.shape[1]}")
print(f"Total ratings: {(userRatings > 0).sum().sum()}")
print(f"Sparsity: {((userRatings == 0).sum().sum() / (userRatings.shape[0] * userRatings.shape[1]) * 100):.1f}%")

Top 10 most rated movies in our data that we can use for testing:
1. 'Your Name.' (26 users, avg: 7.7/10)
2. 'Spider-Man: Into the Spider-Verse' (14 users, avg: 7.9/10)
3. 'Spider-Man: No Way Home' (10 users, avg: 8.4/10)
4. 'Spirited Away' (9 users, avg: 8.7/10)
5. 'Violet Evergarden: The Movie' (8 users, avg: 9.5/10)
6. 'The Dark Knight' (8 users, avg: 8.8/10)
7. 'Avengers: Endgame' (8 users, avg: 9.8/10)
8. 'Spider-Man' (7 users, avg: 9.4/10)
9. 'Steins;Gate: The Movie - Load Region of Déjà Vu' (7 users, avg: 9.9/10)
10. 'Parasite' (6 users, avg: 9.5/10)

Dataset stats:
Users: 54
Movies: 162
Total ratings: 393
Sparsity: 95.5%


In [11]:
# User-based Collaborative Filtering Implementation
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def standardize(row):
    """Standardize ratings by normalizing with mean and range"""
    # Handle cases where all values are the same or all zeros
    if row.max() == row.min():
        # If all ratings are the same, return zeros (no preference)
        return pd.Series(np.zeros(len(row)), index=row.index)
    
    new_row = (row - row.mean()) / (row.max() - row.min())
    return new_row

# Apply standardization to user ratings (row-wise for users)
ratings_std = userRatings.apply(standardize, axis=1)

# Fill any remaining NaN values with 0
ratings_std = ratings_std.fillna(0)

# We are doing user similarity instead of item similarity
# Users are already in rows, so we can directly calculate cosine similarity
user_similarity = cosine_similarity(ratings_std)
print("User similarity matrix shape:", user_similarity.shape)
print("\nUser similarity matrix:")
print(user_similarity)

User similarity matrix shape: (54, 54)

User similarity matrix:
[[ 1.          0.49531971  0.27319549 ...  0.55606104 -0.03199828
  -0.03477275]
 [ 0.49531971  1.          0.57375304 ...  0.52799027 -0.01584938
  -0.01722363]
 [ 0.27319549  0.57375304  1.         ...  0.29418683 -0.02762404
  -0.00750481]
 ...
 [ 0.55606104  0.52799027  0.29418683 ...  1.         -0.0275825
  -0.02997409]
 [-0.03199828 -0.01584938 -0.02762404 ... -0.0275825   1.
  -0.04395039]
 [-0.03477275 -0.01722363 -0.00750481 ... -0.02997409 -0.04395039
   1.        ]]


In [12]:
# Rating Prediction and Evaluation
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

def predict_user_rating(user_idx, movie_idx, user_similarity_matrix, ratings_std_matrix, original_ratings):
    """
    Predict rating for a user-movie pair using user-based collaborative filtering
    """
    # Get similarities for the target user
    user_similarities = user_similarity_matrix[user_idx]
    
    # Get users who have rated this movie (excluding the target user)
    movie_ratings = original_ratings.iloc[:, movie_idx]
    users_who_rated = movie_ratings[movie_ratings > 0].index
    users_who_rated = [i for i, user in enumerate(original_ratings.index) if user in users_who_rated and i != user_idx]
    
    if len(users_who_rated) == 0:
        # No similar users found, return global average
        return original_ratings[original_ratings > 0].mean().mean()
    
    # Calculate weighted average of similar users' ratings
    weighted_sum = 0
    similarity_sum = 0
    
    for similar_user_idx in users_who_rated:
        similarity = user_similarities[similar_user_idx]
        if similarity > 0:  # Only consider positive similarities
            rating = original_ratings.iloc[similar_user_idx, movie_idx]
            weighted_sum += similarity * rating
            similarity_sum += abs(similarity)
    
    if similarity_sum == 0:
        # No positive similarities, return global average
        return original_ratings[original_ratings > 0].mean().mean()
    
    predicted_rating = weighted_sum / similarity_sum
    return max(1, min(10, predicted_rating))  # Clip to valid range

# Split data for evaluation - use some ratings for training, some for testing
def create_train_test_split(ratings_matrix, test_ratio=0.2):
    """Create train/test split by hiding some ratings"""
    train_matrix = ratings_matrix.copy()
    test_data = []
    
    for user_idx in range(len(ratings_matrix)):
        user_ratings = ratings_matrix.iloc[user_idx]
        rated_movies = user_ratings[user_ratings > 0].index.tolist()
        
        if len(rated_movies) > 1:  # Only if user has more than 1 rating
            # Randomly select movies to hide for testing
            n_test = max(1, int(len(rated_movies) * test_ratio))
            test_movies = np.random.choice(rated_movies, size=n_test, replace=False)
            
            for movie in test_movies:
                movie_idx = ratings_matrix.columns.get_loc(movie)
                actual_rating = ratings_matrix.iloc[user_idx, movie_idx]
                test_data.append((user_idx, movie_idx, actual_rating))
                # Hide the rating in training matrix
                train_matrix.iloc[user_idx, movie_idx] = 0
    
    return train_matrix, test_data

# Create train/test split
np.random.seed(42)  # For reproducibility
train_ratings, test_data = create_train_test_split(userRatings, test_ratio=0.2)

print(f"Original ratings: {(userRatings > 0).sum().sum()}")
print(f"Training ratings: {(train_ratings > 0).sum().sum()}")
print(f"Test predictions to make: {len(test_data)}")

Original ratings: 393
Training ratings: 315
Test predictions to make: 78


In [16]:
# Create user similarity DataFrame with user names as index and columns
user_similarity_df = pd.DataFrame(user_similarity, index=userRatings.index, columns=userRatings.index)
user_similarity_df

userId,11,22,33,44,at,but,cj,dd,deez,df,...,test7,th,uu,wee,wx,yx,yy,zenu,zz,zzz
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11,1.0,0.49532,0.273195,-0.017789,0.568247,0.007895,0.357452,-0.045696,0.292753,-0.017789,...,0.293889,0.49532,-0.028293,0.698323,-0.021827,0.365399,0.277345,0.556061,-0.031998,-0.034773
22,0.49532,1.0,0.573753,-0.008811,0.573753,0.049159,0.370855,-0.022634,0.612334,-0.008811,...,0.61471,1.0,-0.014014,0.779354,-0.010811,0.399486,0.186018,0.52799,-0.015849,-0.017224
33,0.273195,0.573753,1.0,-0.015357,0.320755,0.015578,0.646367,-0.03945,0.343234,-0.015357,...,0.344566,0.573753,-0.024426,0.442606,-0.018843,0.623731,0.352474,0.294187,-0.027624,-0.007505
44,-0.017789,-0.008811,-0.015357,1.0,-0.015357,-0.015849,-0.02376,-0.03211,-0.015278,-0.0125,...,-0.015337,-0.008811,-0.019881,-0.012423,-0.015337,-0.023689,-0.046644,-0.015334,0.627955,-0.024434
at,0.568247,0.573753,0.320755,-0.015357,1.0,0.015578,0.421181,-0.03945,0.343234,-0.015357,...,0.671634,0.573753,-0.024426,0.442606,0.308225,0.212687,0.069867,0.646342,-0.027624,-0.030019
but,0.007895,0.049159,0.015578,-0.015849,0.015578,1.0,-0.006886,-0.040714,0.017988,-0.015849,...,0.018058,0.049159,-0.025208,0.031503,-0.019447,-0.005083,-0.046018,0.012862,-0.028509,-0.030981
cj,0.357452,0.370855,0.646367,-0.02376,0.421181,-0.006886,1.0,-0.061033,0.451012,-0.02376,...,0.428666,0.370855,-0.037789,0.522847,0.187709,0.852758,0.454748,0.178404,-0.042737,-0.031515
dd,-0.045696,-0.022634,-0.03945,-0.03211,-0.03945,-0.040714,-0.061033,1.0,-0.039246,-0.03211,...,-0.039398,-0.022634,-0.05107,-0.031911,-0.039398,-0.060852,-0.119818,-0.03939,-0.057757,-0.062765
deez,0.292753,0.612334,0.343234,-0.015278,0.343234,0.017988,0.451012,-0.039246,1.0,-0.015278,...,0.368613,0.612334,-0.0243,0.472859,-0.018746,0.672059,0.349652,0.314913,-0.027482,-0.029864
df,-0.017789,-0.008811,-0.015357,-0.0125,-0.015357,-0.015849,-0.02376,-0.03211,-0.015278,1.0,...,-0.015337,-0.008811,-0.019881,-0.012423,0.383984,-0.023689,-0.046644,-0.015334,-0.022484,-0.024434


In [15]:
# Recalculate user similarity matrix using training data only
train_ratings_std = train_ratings.apply(standardize, axis=1).fillna(0)
train_user_similarity = cosine_similarity(train_ratings_std)

# Make predictions for test data
predictions = []
actual_ratings = []

print("Making predictions...")
for user_idx, movie_idx, actual_rating in test_data:
    predicted_rating = predict_user_rating(user_idx, movie_idx, train_user_similarity, train_ratings_std, train_ratings)
    predictions.append(predicted_rating)
    actual_ratings.append(actual_rating)

# Calculate evaluation metrics
predictions = np.array(predictions)
actual_ratings = np.array(actual_ratings)

# Calculate MAE
mae = mean_absolute_error(actual_ratings, predictions)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(actual_ratings, predictions))

# Count total predictions
total_predictions = len(predictions)

# Display results in the same format as reference
print("Rating Prediction Results:")
print(f"• Overall MAE: {mae:.3f}")
print(f"• Overall RMSE: {rmse:.3f}")  
print(f"• Total predictions: {total_predictions}")

# # Show some example predictions vs actual
# print("Sample predictions:")
# for i in range(min(10, len(predictions))):
#     user_name = train_ratings.index[test_data[i][0]]
#     movie_name = train_ratings.columns[test_data[i][1]]
#     print(f"  {user_name} → '{movie_name}': Predicted={predictions[i]:.1f}, Actual={actual_ratings[i]}")

Making predictions...
Rating Prediction Results:
• Overall MAE: 1.434
• Overall RMSE: 2.556
• Total predictions: 78
