In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re
import matplotlib.pyplot as plt

In [None]:
# Load the ratings table from the ./ml-latest-small folder (path is relative to the notebook).
# Later cells read the 'userId', 'movieId' and 'rating' columns of this frame.
ratings_path = './ml-latest-small/ratings.csv'
ratings_df = pd.read_csv(ratings_path)

# Display ratings dataframe (the bare last expression is rendered as the cell output)
print("Ratings DataFrame:")
ratings_df

In [None]:
# Report how many distinct users appear in the ratings table
print("Number of users: ", len(pd.unique(ratings_df['userId'])))

# Number of ratings submitted by each user (index: userId, value: count)
user_rating_counts = ratings_df['userId'].value_counts()

# Summarise the distribution of the per-user rating counts
quantile_levels = [0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 1.0]
print("\nQuantiles of user ratings:\n", user_rating_counts.quantile(quantile_levels))

# Minimum number of ratings a user needs in order to be kept by the next cell
# NOTE(review): the original comment described this value as the 50th percentile - confirm against the quantiles printed above
rating_cap = 98

# Count the users that fall below rating_cap
below_cap = user_rating_counts < rating_cap
print(f'\nNumber of users with less than {rating_cap} ratings: {len(user_rating_counts[below_cap])}')

In [None]:
# Keep only the ratings made by users who have at least rating_cap ratings.
# user_rating_counts and rating_cap are defined in the previous cell.
ratings_df = ratings_df[ratings_df['userId'].isin(user_rating_counts[user_rating_counts >= rating_cap].index)]

ratings_df

In [None]:
# Load the movies table from the ./ml-latest-small folder (path is relative to the notebook).
# Later cells read the 'movieId', 'title' and 'genres' columns of this frame.
movies_path = './ml-latest-small/movies.csv'
movies_df = pd.read_csv(movies_path)

# Display movies dataframe (the bare last expression is rendered as the cell output)
print("Movies DataFrame:")
movies_df

In [None]:
# Drop movies that have no rating left in the (already user-filtered) ratings_df.
# reset_index(drop=True) renumbers the rows 0..n-1; later cells look movies up in
# movies_df by that row number, so the renumbering is required.
movies_df = movies_df[movies_df.movieId.isin(ratings_df.movieId)].reset_index(drop=True)
movies_df

In [None]:
# Per-movie statistics: how many ratings each movie received and their mean.
# Named aggregation produces the final column names directly.
movieProperties_df = (
    ratings_df
    .groupby('movieId')['rating']
    .agg(numRatings='size', meanRating='mean')
)

# Attach title and genres from movies_df, then fix the column order
movieProperties_df = movieProperties_df.merge(movies_df, on='movieId')
column_order = ['movieId', 'title', 'numRatings', 'meanRating', 'genres']
movieProperties_df = movieProperties_df[column_order]

# Show the resulting table
print("Movie Properties DataFrame:")
movieProperties_df

In [None]:
# Scatter each movie's mean rating against the number of ratings it received
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(movieProperties_df['numRatings'], movieProperties_df['meanRating'], alpha=0.5)
ax.set_title('Mean rating vs. Number of ratings')
ax.set_xlabel('Number of ratings')
ax.set_ylabel('Mean rating')
plt.show()

In [None]:
# Movies with only a handful of ratings have unreliable mean ratings. One remedy is a
# weighted mean such as the Bayesian average computed here; another would be to adjust
# the cost function so that it accounts for the number of ratings.

# Bayesian average = (v * r + C * M) / (v + C), where
#   v = number of ratings of the movie
#   r = mean rating of the movie
#   M = mean of all individual ratings in ratings_df
#   C = 65th percentile of the per-movie rating counts (weight given to M)
itemRatings = movieProperties_df['numRatings']
itemRatingsAverage = movieProperties_df['meanRating']
M = ratings_df['rating'].mean()
C = itemRatings.quantile(0.65)
print(C)

# Blend every movie's own mean with M, then store the result as a new column
weighted_sum = itemRatings * itemRatingsAverage + C * M
movieProperties_df['bayesianAverage'] = weighted_sum / (itemRatings + C)

# Scatter the Bayesian average against the number of ratings
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(movieProperties_df['numRatings'], movieProperties_df['bayesianAverage'], alpha=0.5)
ax.set_title('Bayesian Average vs. Number of ratings')
ax.set_xlabel('Number of ratings')
ax.set_ylabel('Bayesian Average')
plt.show()

In [None]:
# Show the 20 movies with the highest Bayesian average.
# The numRatings > 0 filter keeps every movie that has at least one rating.
movieProperties_df[movieProperties_df['numRatings'] > 0].sort_values(by='bayesianAverage', ascending=False).head(20)

In [None]:
# Build the rating matrix: one row per movieId, one column per userId.
# Movie/user pairs without a rating become 0; the indicator matrix R built in the
# next cell treats 0 as "not rated".
user_item_matrix = ratings_df.pivot(index='movieId', columns='userId', values='rating').fillna(0)
Y = user_item_matrix

# Display the rating matrix (rows: movies, columns: users)
print("User-Item Matrix:")
Y

In [None]:
# Create a binary indicator matrix for Y: 1 where Y holds a non-zero rating, 0 elsewhere.
# np.where returns a plain ndarray with the same shape as Y.
R = np.where(Y != 0, 1, 0)

# Display the binary indicator matrix and its shape
print("Binary Indicator:\n", R)
print("\nBinary Indicator shape:", R.shape)

In [None]:
def cost_func(X, W, b, Y, R, lambda_):
    """
    Regularized squared-error cost of the matrix-factorization model.

    Built only from TensorFlow operations so it can be differentiated inside a
    tf.GradientTape in the custom training loop.

    Args:
      X (ndarray (num_movies,num_features)): matrix of item features
      W (ndarray (num_users,num_features)) : matrix of user parameters
      b (ndarray (1, num_users))           : vector of user biases
      Y (ndarray (num_movies,num_users))   : matrix of user ratings of movies
      R (ndarray (num_movies,num_users))   : indicator matrix, R(i, j) = 1 if movie i was rated by user j
      lambda_ (float): regularization parameter

    Returns:
      J (float) : Cost
    """
    # Predicted rating for every (movie, user) pair
    predicted = tf.linalg.matmul(X, tf.transpose(W)) + b

    # Multiplying by R zeroes the error of pairs that were never rated
    masked_error = (predicted - Y) * R
    squared_error_term = 0.5 * tf.reduce_sum(masked_error**2)

    # L2 penalty on X and W only; the bias b is not regularized
    regularization_term = (lambda_/2) * (tf.reduce_sum(X**2) + tf.reduce_sum(W**2))

    return squared_error_term + regularization_term

In [None]:
def movie_search(query, movies_df):
    """
    Returns the movies that match a keyword or a movie ID.

    Args:
      query (str or int): keyword to search for, or a movie ID. A string is used as a
                          case-insensitive regular expression against 'title' and 'genres'.
                          Python ints and numpy integers are both accepted as IDs.
      movies_df (DataFrame): DataFrame of movies. Needs 'title' and 'genres' columns for
                          keyword search and a 'movieId' column for ID search.

    Returns:
      results (DataFrame): rows of movies_df that match the query

    Raises:
      ValueError: if query is neither a string nor an integer
    """

    if isinstance(query, str):
        # na=False makes rows with a missing title/genres count as "no match" instead of
        # putting NaN into the boolean mask
        title_match = movies_df['title'].str.contains(query, case=False, regex=True, na=False)
        genre_match = movies_df['genres'].str.contains(query, case=False, regex=True, na=False)
        results = movies_df.loc[title_match | genre_match]
    elif isinstance(query, (int, np.integer)):
        # IDs read back from a DataFrame are numpy integers, so accept those as well
        results = movies_df.loc[movies_df['movieId'] == query]
    else:
        raise ValueError("Invalid query type. Please provide a string (keyword) or an integer (movie ID).")

    return results

In [None]:
# Example keyword search: 'lord of' is matched case-insensitively against title and genres
movie_search('lord of', movies_df)

In [None]:
# Declare num_movies
num_movies = movies_df.shape[0]

# One slot per movie in movies_df; 0 means "not rated by the new user"
my_ratings = np.zeros(num_movies)
# Print my_ratings.shape
print("my_ratings shape:", my_ratings.shape)

# Create a mapping between original movie IDs and their row positions in movies_df
unique_movie_ids = movies_df["movieId"].unique()
movie_id_to_index = {movie_id: index for index, movie_id in enumerate(unique_movie_ids)}

# my_ratings is later stacked next to the rating matrix, whose rows follow
# user_item_matrix.index. Fail fast if the two row orders differ, because the new
# user's ratings would otherwise be attached to the wrong movies without any error.
if not np.array_equal(unique_movie_ids, user_item_matrix.index.to_numpy()):
    raise ValueError("movies_df rows are not in the same movieId order as the rows of user_item_matrix.")

# Ratings of the new user, keyed by movieId
new_user_ratings = {
    2571: 4.5,   # The Matrix (1999)
    32: 4.0,     # Twelve Monkeys (1995)
    260: 4.0,    # Star Wars: Episode IV - A New Hope (1977)
    1196: 4.0,   # Star Wars: Episode V - The Empire Strikes Back (1980)
    296: 4.5,    # Pulp Fiction (1994)
    480: 3.5,    # Jurassic Park (1993)
    356: 4.5,    # Forrest Gump (1994)
    1: 4.0,      # Toy Story (1995)
    527: 4.0,    # Schindler's List (1993)
    4993: 5.0,   # Lord of the Rings: The Fellowship of the Ring, The (2001)
    5952: 5.0,   # Lord of the Rings: The Two Towers, The (2002)
    7153: 5.0,   # Lord of the Rings: The Return of the King, The (2003)
    6539: 4.5,   # Pirates of the Caribbean: The Curse of the Black Pearl (2003)
    40815: 4.5,  # Harry Potter and the Goblet of Fire (2005)
    1704: 4.5,   # Good Will Hunting (1997)
}

# A movie can be missing if all of its ratings were removed by the user filter;
# report every missing ID at once instead of stopping at the first one
missing_ids = [movie_id for movie_id in new_user_ratings if movie_id not in movie_id_to_index]
if missing_ids:
    raise KeyError(f"Movie IDs not present in movies_df: {missing_ids}")

# Assign ratings at the correct positions using the mapping
for movie_id, rating in new_user_ratings.items():
    my_ratings[movie_id_to_index[movie_id]] = rating

print('\n\nNew user ratings:\n')
for i in np.flatnonzero(my_ratings > 0):
    print(f'Rated {my_ratings[i]} for   {movies_df.loc[i,"title"]}')

In [None]:
# Always start from the untouched rating matrix so that re-running this cell does not
# prepend the new user a second time (Y and R are rebuilt, not grown in place).
base_Y = user_item_matrix.to_numpy()
base_R = np.where(base_Y != 0, 1, 0)

print("Y shape:")
# Print Y shape before adding new user ratings
print("before:", base_Y.shape)

# Add the new user ratings as the first column of the Y matrix
Y = np.c_[my_ratings, base_Y]
# Print Y shape after adding new user ratings
print("after:", Y.shape)

print("\n\nR shape:")
# Print R shape before adding new user ratings
print("before:", base_R.shape)

# Add the new user's indicator column (1 where a rating was given) as the first column of R
R = np.c_[np.where(my_ratings != 0, 1, 0), base_R]
# Print R shape after adding new user ratings
print("after:", R.shape)

In [None]:
def normalizeRatings(Y, R):
    """
    Mean-normalizes the ratings per movie.

    The mean of each movie is taken over its rated entries only and is subtracted
    from those rated entries; unrated entries are left unchanged.

    Args:
      Y (ndarray (num_movies,num_users)): matrix of user ratings of movies
      R (ndarray (num_movies,num_users)): indicator matrix, R(i, j) = 1 if movie i was rated by user j
    Returns:
      Ynorm (ndarray (num_movies,num_users)): Y with each movie's mean removed from its rated entries
      Ymean (ndarray (num_movies,1)): column vector of mean ratings, one per movie
    """
    rating_totals = np.sum(Y*R, axis=1)
    # The small constant keeps the division defined for movies without any rating
    rating_counts = np.sum(R, axis=1) + 1e-12
    Ymean = (rating_totals/rating_counts).reshape(-1, 1)

    # Broadcasting Ymean against R subtracts the mean only where a rating exists
    mean_on_rated = np.multiply(Ymean, R)
    Ynorm = Y - mean_on_rated
    return Ynorm, Ymean

In [None]:
# Normalize ratings: Ynorm is the training target, Ymean is added back to the predictions later
Ynorm, Ymean = normalizeRatings(Y, R)

num_movies, num_users = Y.shape 
num_features = 100 # number of features to learn

# Initialize parameters W, X, b with random normal values (float64)
# NOTE(review): name= is passed to tf.random.normal here, not to tf.Variable - confirm this is intended
tf.random.set_seed(1234) # set the random seed so this always produces the same results
W = tf.Variable(tf.random.normal([num_users, num_features], dtype=tf.float64, name='W'))
X = tf.Variable(tf.random.normal([num_movies, num_features], dtype=tf.float64, name='X'))
b = tf.Variable(tf.random.normal([1, num_users], dtype=tf.float64, name='b'))

# Adam optimizer with learning rate 0.1, used by the custom training loop
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-1)

In [None]:
def train_model(X, W, b, Y, R, lambda_, optimizer, iterations=200, print_every=20):
    """
    Trains the collaborative filtering model using a custom training loop.

    X, W and b are updated in place by the optimizer; the same objects are returned.

    Args:
      X (tf.Variable (num_movies,num_features)): Matrix of item features
      W (tf.Variable (num_users,num_features)) : Matrix of user parameters
      b (tf.Variable (1, num_users))           : Vector of user parameters
      Y (ndarray (num_movies,num_users))       : Matrix of user ratings of movies
      R (ndarray (num_movies,num_users))       : Binary indicator matrix, where R(i, j) = 1 if the i-th movie was rated by the j-th user
      lambda_ (float): Regularization parameter
      optimizer (tf.keras.optimizers): Optimizer to use for training
      iterations (int): Number of optimizer steps to run
      print_every (int): Print the cost every `print_every` iterations; 0 or None disables printing

    Returns:
      X (tf.Variable (num_movies,num_features)): matrix of item features
      W (tf.Variable (num_users,num_features)) : matrix of user parameters
      b (tf.Variable (1, num_users))           : vector of user parameters
      cost_history (list): cost recorded at every iteration, measured before that iteration's update
    """
    # Initialize the cost history
    cost_history = []
    trainable = [X, W, b]

    # Train using a custom training loop
    for i in range(iterations):

        # Record the operations to compute the cost
        with tf.GradientTape() as tape:
            J = cost_func(X, W, b, Y, R, lambda_)

        # Compute the gradients using automatic differentiation
        grads = tape.gradient(J, trainable)

        # Update the parameters using the supplied optimizer
        optimizer.apply_gradients(zip(grads, trainable))

        # Convert the cost once and reuse it for the history and the log line
        cost = J.numpy()
        cost_history.append(cost)

        # Guard against print_every == 0, which would otherwise raise ZeroDivisionError
        if print_every and i % print_every == 0:
            print(f'Cost at iteration {i} is {cost}')

    return X, W, b, cost_history

In [None]:
# Train on the mean-normalized ratings for 200 iterations with lambda_ = 1.
# X, W and b are passed in and reassigned, so re-running this cell continues from
# their current values rather than from a fresh initialisation.
X, W, b, cost_history = train_model(X, W, b, Ynorm, R, lambda_=1, optimizer=optimizer, iterations=200, print_every=20)

# Print the full per-iteration cost history
print("\nCost History:", cost_history)

In [None]:
# Predict a rating for every (movie, user) pair using the trained weights and biases
p = np.matmul(X.numpy(), np.transpose(W.numpy())) + b.numpy()

# Restore the per-movie mean that was removed before training
pm = p + Ymean

# Column 0 belongs to the new user, whose ratings were prepended as the first column of Y
my_predictions = pm[:,0]

# Save my predictions to a dataframe, and sort the predictions from highest to lowest.
# The 'prediction' column is rounded to 2 decimals; my_predictions itself stays unrounded.
my_predictions_df = pd.DataFrame({'title': movieProperties_df['title'], 'number of ratings': movieProperties_df["numRatings"], 'mean rating': movieProperties_df["meanRating"], 'prediction': my_predictions.round(2), 'genres': movieProperties_df['genres']})
my_predictions_df = my_predictions_df.sort_values(by='prediction', ascending=False)

# Row positions of the predictions, ordered from highest to lowest predicted rating
ix = np.argsort(my_predictions)[::-1]

# Maximum number of movies to print
print_cap = 50
# Only movies released after this year are printed
release_year_cap = 1970
# Only movies with more than this many ratings are printed
num_ratings_cap = 20

# Initialize a counter for printed movies
printed_count = 0

for i in range(my_predictions.shape[0]):
    j = ix[i]

    try:
        # Extract the release year (four digits in parentheses) from the title
        match = re.search(r'\((\d{4})\)', movies_df.loc[j, "title"])

        # Titles without a "(YYYY)" part are skipped silently
        if match:
            release_year = int(match.group(1))

            # Keep only movies above both the rating-count and the release-year thresholds
            if movieProperties_df.loc[j, "numRatings"] > num_ratings_cap and release_year > release_year_cap:
                # Print this recommendation
                print(f'numRatings: {movieProperties_df.loc[j, "numRatings"]}, Predicted {my_predictions[j]:0.2f} for    {movies_df.loc[j, "title"]}')
                
                # Increment the counter
                printed_count += 1

                # Stop once print_cap movies have been printed
                if printed_count == print_cap:
                    break
    except (ValueError, IndexError):
      # Handle errors in extracting the release year or accessing the list
      # NOTE(review): a missing row label in .loc would presumably raise KeyError, which is not caught here - confirm
      print(f"Error processing movie at index {j}. Skipping...")
      continue

# Show original ratings and compare with predicted ratings for the movies the new user rated
print('\n\nOriginal vs Predicted ratings:\n')
for i in range(len(my_ratings)):
    if my_ratings[i] > 0:
        print(f'Original {my_ratings[i]}, Predicted {my_predictions[i]:0.2f} for    {movies_df.iloc[i]["title"]}')

In [None]:
# Mean predicted rating of every movie across all users (one value per row of pm)
mean_rating_pm = np.mean(pm, axis=1)

# DataFrame.insert raises if the column already exists, so drop a stale copy first;
# this keeps the cell re-runnable without restarting the kernel.
if 'predictionsMeanRating' in my_predictions_df.columns:
    my_predictions_df = my_predictions_df.drop(columns='predictionsMeanRating')

# my_predictions_df is sorted by prediction, so index with its row labels to line the
# per-movie means up with the right rows
my_predictions_df.insert(4, 'predictionsMeanRating', mean_rating_pm[my_predictions_df.index])
# Display the updated dataframe sorted by index
print("My Predictions DataFrame:")
my_predictions_df.sort_index()

In [None]:
# Scatter the new user's predicted rating (x) against each movie's observed mean rating (y)
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(my_predictions_df['prediction'], my_predictions_df['mean rating'], alpha=0.5)
ax.set_title('My predictions rating vs. Mean rating')
ax.set_xlabel('My predictions rating')
ax.set_ylabel('Mean rating')
plt.show()

In [None]:
# Scatter the new user's predicted rating (x) against the mean predicted rating over all users (y)
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(my_predictions_df['prediction'], my_predictions_df['predictionsMeanRating'], alpha=0.5)
ax.set_title('My predictions rating vs. Predictions mean rating')
ax.set_xlabel('My predictions rating')
ax.set_ylabel('Predictions mean rating')
plt.show()

In [None]:
# Scatter each movie's observed mean rating (x) against its mean predicted rating over all users (y)
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(my_predictions_df['mean rating'], my_predictions_df['predictionsMeanRating'], alpha=0.5)
ax.set_title('Mean rating vs. Predictions mean rating')
ax.set_xlabel('Mean rating')
ax.set_ylabel('Predictions mean rating')
plt.show()

In [None]:
def beysian_average(itemRatings, itemRatingsAverage, m, c):
    """
    Computes the Bayesian average (v * r + c * m) / (v + c) element-wise.

    Args:
      itemRatings (ndarray): v, number of ratings of each movie
      itemRatingsAverage (ndarray): r, average rating of each movie
      m (float): value every average is pulled towards (e.g. the overall mean rating)
      c (float): weight given to m, expressed as a number of ratings

    Returns:
        bayesianAverage (ndarray): Bayesian average for the ratings
    """
    # Sum of the movie's own ratings plus c pseudo-ratings that all have the value m
    weighted_total = itemRatings * itemRatingsAverage + c * m
    effective_count = itemRatings + c
    return weighted_total / effective_count

In [None]:
# Apply the Bayesian average to the prediction columns and to the observed mean rating

# c is the weight of the prior: the 65th percentile of the per-movie rating counts
c = movieProperties_df['numRatings'].quantile(0.65)

# bayesianPrediction: the new user's prediction, pulled towards the mean of the user's predictions
my_predictions_df['bayesianPrediction'] = beysian_average(my_predictions_df['number of ratings'], my_predictions_df['prediction'], my_predictions_df['prediction'].mean(), c)

# bayesianPredictionMeanRating: the mean prediction over all users, pulled towards the mean of all of pm
my_predictions_df['bayesianPredictionMeanRating'] = beysian_average(my_predictions_df['number of ratings'], my_predictions_df['predictionsMeanRating'], pm.mean(), c)

# bayesianMeanRating: the observed mean rating, pulled towards the mean of all ratings in ratings_df
my_predictions_df['bayesianMeanRating'] = beysian_average(my_predictions_df['number of ratings'], my_predictions_df['mean rating'], ratings_df['rating'].mean(), c)

# Order the columns so that each Bayesian column directly follows the column it is derived from
my_predictions_df = my_predictions_df[['title', 'number of ratings', 'mean rating', 'bayesianMeanRating', 'prediction', 'bayesianPrediction', 'predictionsMeanRating', 'bayesianPredictionMeanRating', 'genres']]

In [None]:
# Display the 60 rows of my_predictions_df with the highest bayesianPrediction
my_predictions_df.sort_values(by='bayesianPrediction', ascending=False).head(60)

In [None]:
# Scatter the new user's Bayesian prediction (x) against each movie's Bayesian mean rating (y)
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(my_predictions_df['bayesianPrediction'], my_predictions_df['bayesianMeanRating'], alpha=0.5)
ax.set_title('My bayesian prediction rating vs. bayesian mean rating')
ax.set_xlabel('My bayesian prediction rating')
ax.set_ylabel('Bayesian mean rating')
plt.show()

In [None]:
def calculate_rmse_for_my_rated(predictions, targets, rated_indices):
    """
    Root Mean Squared Error restricted to the entries selected by rated_indices.

    Args:
      predictions (ndarray): predicted ratings
      targets (ndarray): actual ratings, same layout as predictions
      rated_indices (list): positions of the rated items

    Returns:
      rmse (float): Root Mean Squared Error over the selected items
    """
    # Error on the selected entries only
    errors = predictions[rated_indices] - targets[rated_indices]
    return np.sqrt(np.mean(errors ** 2))

In [None]:
def calculate_rmse_for_all_users(predictions, targets, mask):
    """
    Root Mean Squared Error over every entry flagged in mask.

    Args:
      predictions (ndarray): Matrix of predicted ratings
      targets (ndarray): Matrix of actual ratings
      mask (ndarray): Binary indicator matrix (1 for rated items, 0 for unrated items)

    Returns:
      rmse (float): Root Mean Squared Error over the rated entries of all users
    """
    # Masking both operands makes unrated entries contribute an error of 0
    masked_error = predictions * mask - targets * mask

    # Average over the number of rated entries rather than over the matrix size
    rated_count = np.sum(mask)
    return np.sqrt(np.sum(masked_error ** 2) / rated_count)

In [None]:
def calculate_mae_for_my_rated(predictions, targets, rated_indices):
    """
    Mean Absolute Error restricted to the entries selected by rated_indices.

    Args:
      predictions (ndarray): predicted ratings
      targets (ndarray): actual ratings, same layout as predictions
      rated_indices (list): positions of the rated items

    Returns:
      mae (float): Mean Absolute Error over the selected items
    """
    # Error on the selected entries only
    errors = predictions[rated_indices] - targets[rated_indices]
    return np.mean(np.abs(errors))

In [None]:
def calculate_mae_for_all_users(predictions, targets, mask):
    """
    Mean Absolute Error over every entry flagged in mask.

    Args:
      predictions (ndarray): Matrix of predicted ratings
      targets (ndarray): Matrix of actual ratings
      mask (ndarray): Binary indicator matrix (1 for rated items, 0 for unrated items)

    Returns:
      mae (float): Mean Absolute Error over the rated entries of all users
    """
    # Masking both operands makes unrated entries contribute an error of 0
    masked_error = predictions * mask - targets * mask

    # Average over the number of rated entries rather than over the matrix size
    rated_count = np.sum(mask)
    return np.sum(np.abs(masked_error)) / rated_count

In [None]:
# Inputs for the all-user metrics: mean-restored predictions, raw ratings and the rated mask
predictions = pm
targets = Y
mask = R

# Row positions of the movies the new user rated
rated_indices = [position for position, rating in enumerate(my_ratings) if rating > 0]

# Compute the four error metrics first ...
rmse_for_rated = calculate_rmse_for_my_rated(my_predictions, my_ratings, rated_indices)
rmse_for_all_users = calculate_rmse_for_all_users(predictions, targets, mask)
mae_for_rated = calculate_mae_for_my_rated(my_predictions, my_ratings, rated_indices)
mae_for_all_users = calculate_mae_for_all_users(predictions, targets, mask)

# ... then report them: RMSE (new user, all users) followed by MAE (new user, all users)
print(f'RMSE for my rated items: {rmse_for_rated}')
print(f'\nRMSE for all users: {rmse_for_all_users}')
print(f'\nMAE for my rated items: {mae_for_rated}')
print(f'\nMAE for all users: {mae_for_all_users}')

In [None]:
# Keyword search on the predictions table (it has 'title' and 'genres' columns),
# showing the 50 matches with the highest bayesianPrediction
movie_search('Adventure', my_predictions_df).sort_values(by='bayesianPrediction', ascending=False).head(50)

In [None]:
# Create a search sort function to filter predictions by genre, release year, or popularity
def search_sort(df, genre=None, release_year=None, num_ratings=None):
    """
    Returns the movies that satisfy every supplied filter, sorted by 'bayesianPrediction'
    in descending order.

    Args:
      df (DataFrame): DataFrame of movies; needs the columns 'genres', 'title',
                      'number of ratings' and 'bayesianPrediction'
      genre (str): case-insensitive regular expression matched against 'genres'
      release_year (int): minimum release year (inclusive), read from the "(YYYY)" part of the title;
                      titles without a year never match
      num_ratings (int): keep movies with strictly more ratings than this value

    Returns:
      results (DataFrame): rows of df that match all given filters

    Raises:
      ValueError: if no filter is supplied
    """
    conditions = []

    if genre:
        # na=False: a movie with missing genres is treated as "no match"
        conditions.append(df['genres'].str.contains(genre, case=False, regex=True, na=False))
    # Compare against None so that a threshold of 0 is still applied as a filter
    if release_year is not None:
        # Extract release year from the title; titles without one become NaN and fail the comparison
        years = df['title'].str.extract(r'\((\d{4})\)', expand=False).astype(float)
        conditions.append(years >= release_year)
    if num_ratings is not None:
        conditions.append(df['number of ratings'] > num_ratings)

    if not conditions:
        raise ValueError("At least one search term is required.")

    # AND the boolean masks together row by row
    combined_condition = conditions[0]
    for condition in conditions[1:]:
        combined_condition = combined_condition & condition

    results = df.loc[combined_condition]
    return results.sort_values(by='bayesianPrediction', ascending=False)

In [None]:
# Example: action movies released in 2017 or later, top 50 by bayesianPrediction
search_sort(my_predictions_df, genre='action', release_year=2017).head(50)