In [1]:
import numpy as np
import pandas as pd
import matrix_factorization_utilities

## Factor Review Matrix

In [6]:
# Load the running list of user ratings (user_id / movie_id / rating columns).
raw_dataset_df = pd.read_csv('movie_ratings_data_set.csv')

# Pivot the list into a matrix with one row per user and one column per movie.
# If a user rated the same movie more than once, the highest rating is kept.
# The aggregation is given by name ('max') instead of np.max: pandas >= 2.1
# emits a FutureWarning for the NumPy callable and recommends the string alias,
# which produces the same result.
# NOTE: no `values=` is passed, so pandas aggregates every remaining column and
# the resulting column labels are hierarchical (value column, movie_id). That
# layout is what ends up in the CSV header written below.
ratings_df = pd.pivot_table(raw_dataset_df, index='user_id', columns='movie_id', aggfunc='max')

# Factor the ratings matrix into two low-rank matrices of latent features.
# U and M are whatever the project helper returns; the code below relies on
# U x M having the same shape as ratings_df (users x movies).
U, M = matrix_factorization_utilities.low_rank_matrix_factorization(ratings_df.to_numpy(),
                                                                    num_features=15,
                                                                    regularization_amount=0.1)

# Multiply the factors back together to get a predicted rating for every
# user/movie pair, including the pairs the user never rated.
predicted_ratings = np.matmul(U, M)

# Re-attach the user and movie labels and save the full prediction matrix.
predicted_ratings_df = pd.DataFrame(index=ratings_df.index,
                                    columns=ratings_df.columns,
                                    data=predicted_ratings)
predicted_ratings_df.to_csv("predicted_ratings.csv")

         Current function value: 32.504379
         Iterations: 3000
         Function evaluations: 4533
         Gradient evaluations: 4533


## Find Similar Products

In [11]:

# Load the running list of user ratings
df = pd.read_csv('movie_ratings_data_set.csv')

# Load movie titles, keyed by movie_id
movies_df = pd.read_csv('movies.csv', index_col='movie_id')

# Pivot the list into a matrix with one row per user and one column per movie.
# 'max' (by name) replaces np.max: pandas >= 2.1 warns about the NumPy callable
# and recommends the string alias, which gives the same result.
ratings_df = pd.pivot_table(df, index='user_id', columns='movie_id', aggfunc='max')

# Apply matrix factorization to find the latent features
U, M = matrix_factorization_utilities.low_rank_matrix_factorization(ratings_df.to_numpy(),
                                                                    num_features=15,
                                                                    regularization_amount=1.0)

# Transpose the movie factor matrix so that each ROW is one movie's feature
# vector; that makes the row-wise arithmetic below straightforward.
M = np.transpose(M)

# The movie_id that each row of M belongs to, taken from the pivot's column
# labels. Looking rows up through these labels avoids assuming that movie ids
# are exactly 1..N and stored in that order.
movie_ids = ratings_df.columns.get_level_values('movie_id')

# Choose a movie to find similar movies to. Let's find movies similar to movie #5:
movie_id = 5

# Get the selected movie's name and genre
movie_information = movies_df.loc[movie_id]

print("We are finding movies similar to this movie:")
print("Movie title: {}".format(movie_information.title))
print("Genre: {}".format(movie_information.genre))

# Get the features we found via matrix factorization for the selected movie.
# The row is located by label (previously `M[movie_id - 1]`, which is only
# correct when ids are contiguous and start at 1).
current_movie_features = M[movie_ids.get_loc(movie_id)]

print("The attributes for this movie are:")
print(current_movie_features)

# The main logic for finding similar movies:

# 1. Subtract the selected movie's features from every movie's features
difference = M - current_movie_features

# 2. Take the absolute value of that difference (so all numbers are positive)
absolute_difference = np.abs(difference)

# 3. Sum each movie's per-feature differences (one per latent feature) to get
#    a single 'difference score' per movie (an L1 / Manhattan distance)
total_difference = np.sum(absolute_difference, axis=1)

# 4. Add the difference score to the movie list. Wrapping the scores in a
#    Series indexed by movie_id makes pandas align each score with the right
#    movie instead of relying on movies.csv being in the same order as the
#    ratings matrix; movies with no ratings at all simply get NaN.
movies_df['difference_score'] = pd.Series(total_difference, index=movie_ids)

# 5. Sort the movie list by difference score, from least different to most different
sorted_movie_list = movies_df.sort_values('difference_score')

# 6. Print the five closest movies. The selected movie itself is always first
#    because its distance to itself is 0.
print("The five most similar movies are:")
print(sorted_movie_list[['title', 'difference_score']][0:5])

Optimization terminated successfully.
         Current function value: 312.762757
         Iterations: 1468
         Function evaluations: 2191
         Gradient evaluations: 2191
We are finding movies similar to this movie:
Movie title: The Big City Judge 2
Genre: legal drama
The attributes for this movie are:
[ 0.66523693 -0.82921935 -0.72683696  0.52196015 -0.84881208 -1.84118768
 -0.78741868  0.25951176 -0.11970143  0.11416591 -0.1510272  -0.17742577
 -0.23366574 -0.81280514  1.0830647 ]
The five most similar movies are:
                            title  difference_score
movie_id                                           
5            The Big City Judge 2          0.000000
10        Surrounded by Zombies 1          1.872522
9                     Biker Gangs          2.599746
3                   The Sheriff 2          2.695354
24           The Big City Judge 3          2.788348


## Make Recommendations

In [13]:

# Load the running list of user ratings
raw_dataset_df = pd.read_csv('movie_ratings_data_set.csv')

# Load movie titles, keyed by movie_id
movies_df = pd.read_csv('movies.csv', index_col='movie_id')

# Pivot the list into a matrix with one row per user and one column per movie.
# 'max' (by name) replaces np.max: pandas >= 2.1 warns about the NumPy callable
# and recommends the string alias, which gives the same result.
ratings_df = pd.pivot_table(raw_dataset_df, index='user_id',
                            columns='movie_id',
                            aggfunc='max')

# Apply matrix factorization to find the latent features
U, M = matrix_factorization_utilities.low_rank_matrix_factorization(ratings_df.to_numpy(),
                                                                    num_features=15,
                                                                    regularization_amount=0.1)

# Find all predicted ratings by multiplying U and M matrices.
# Rows follow ratings_df.index (users); columns follow ratings_df.columns (movies).
predicted_ratings = np.matmul(U, M)

print("Enter a user_id to get recommendations (Between 1 and 100):")
user_id_to_search = int(input())

# Reject ids that are not in the ratings matrix. The old positional lookup
# (`predicted_ratings[user_id - 1]`) silently returned the LAST user's
# predictions for an input of 0 and raised a bare IndexError for large ids.
if user_id_to_search not in ratings_df.index:
    raise ValueError("user_id {} has no ratings in the data set".format(user_id_to_search))

print("Movies previously reviewed by user_id {}:".format(user_id_to_search))

# Pick out this user's own ratings and attach title/genre for display
reviewed_movies_df = raw_dataset_df[raw_dataset_df['user_id'] == user_id_to_search]
reviewed_movies_df = reviewed_movies_df.join(movies_df, on = 'movie_id')

print(reviewed_movies_df[['title', 'genre', 'value']])

input("Press enter to continue.")

print("Movies we will recommend:")

# Locate the user's row of predictions by label, not by `user_id - 1`, so the
# lookup stays correct even if user ids are not contiguous from 1.
user_ratings = predicted_ratings[ratings_df.index.get_loc(user_id_to_search)]

# Attach the predictions to the movie list, aligned on movie_id rather than on
# row position, so the ordering of movies.csv does not matter.
movies_df['rating'] = pd.Series(user_ratings,
                                index=ratings_df.columns.get_level_values('movie_id'))

# Drop the movies the user has already reviewed, then rank what is left by
# predicted rating, highest first.
already_reviewed = reviewed_movies_df['movie_id']
recommended_df = movies_df[~movies_df.index.isin(already_reviewed)]
recommended_df = recommended_df.sort_values(by=['rating'], ascending=False)

print(recommended_df[['title', 'genre', 'rating']].head(5))

         Current function value: 32.504379
         Iterations: 3000
         Function evaluations: 4533
         Gradient evaluations: 4533
Enter a user_id to get recommendations (Between 1 and 100):
14
Movies previously reviewed by user_id 14:
                   title                 genre  value
87  The Big City Judge 1           legal drama      4
88         The Sheriff 2  crime drama, western      5
89  The Big City Judge 3           legal drama      5
90  The Big City Judge 2           legal drama      5
91         The Sheriff 3  crime drama, western      5
92         The Sheriff 1  crime drama, western      5
Press enter to continue.
Movies we will recommend:
                             title                     genre    rating
movie_id                                                              
33                    Sports Nerds                    comedy  5.199235
21                 Political Gaffs  comedy, political satire  4.785242
10         Surrounded by Zombies 1    hor

## Measuring Accuracy

In [24]:
# Load the training and testing splits of the user ratings
raw_training_dataset_df = pd.read_csv('movie_ratings_data_set_training.csv')
raw_testing_dataset_df = pd.read_csv('movie_ratings_data_set_testing.csv')

# Pivot each running list of ratings into a user x movie matrix.
# 'max' (by name) replaces np.max: pandas >= 2.1 warns about the NumPy callable
# and recommends the string alias, which gives the same result.
ratings_training_df = pd.pivot_table(raw_training_dataset_df, index='user_id', columns='movie_id', aggfunc='max')
ratings_testing_df = pd.pivot_table(raw_testing_dataset_df, index='user_id', columns='movie_id', aggfunc='max')

# The two pivots are built independently, so the testing matrix may be missing
# users or movies (or order them differently) relative to the training matrix.
# Re-label it onto the training matrix's rows and columns so that every cell is
# compared against the prediction for the same user/movie pair. Cells with no
# test rating become NaN; users or movies that only appear in the testing file
# have no prediction and are dropped. When the labels already match this is a
# no-op.
ratings_testing_df = ratings_testing_df.reindex(index=ratings_training_df.index,
                                                columns=ratings_training_df.columns)

# Apply matrix factorization to the TRAINING data only to find the latent features
U, M = matrix_factorization_utilities.low_rank_matrix_factorization(ratings_training_df.to_numpy(),
                                                                    num_features=11,
                                                                    regularization_amount=1)

# Find all predicted ratings by multiplying U and M
predicted_ratings = np.matmul(U, M)

# Measure RMSE of the same predictions against the ratings the model was
# fitted on (training) and against the held-out ratings (testing).
# NOTE(review): this relies on the project's RMSE helper skipping NaN cells —
# confirm in matrix_factorization_utilities.
rmse_training = matrix_factorization_utilities.RMSE(ratings_training_df.to_numpy(), predicted_ratings)
rmse_testing = matrix_factorization_utilities.RMSE(ratings_testing_df.to_numpy(), predicted_ratings)

print("Training RMSE: {}".format(rmse_training))
print("Testing RMSE: {}".format(rmse_testing))


Optimization terminated successfully.
         Current function value: 288.083398
         Iterations: 735
         Function evaluations: 1107
         Gradient evaluations: 1107
Training RMSE: 0.22834276996301733
Testing RMSE: 1.2098648707121624
