In [1]:
# Initialization
import os
import time

import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")

# data science imports
import numpy as np
import pandas as pd

# ALS Imports
import implicit
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random
from sklearn.preprocessing import MinMaxScaler

# Trees Imports
import pickle
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

In [2]:
# Set data location
# `system_path` is the prefix placed in front of the project folder; change it to
# point at the directory that contains the Movie-Recommendation-System folder.
system_path = ''
# Folder with the ratings file and the movie metadata CSVs read below.
data_path = system_path + '/Movie-Recommendation-System/Data/Reviews-1M/'
# Folder with the pickled content-based model ('rfr_model.sav'). This must be
# defined: later cells and new_user_input() read `model_path` as a global.
model_path = system_path + '/Movie-Recommendation-System/Without-Spark/Models/'

In [3]:
%%time
# Import movies data
movies_df = pd.read_csv(data_path + 'movies_metadata_ohe_subset.csv')
movies_df = movies_df.set_index(movies_df.itemId)

# Import movies genre & people metadata
movies_gp_df = pd.read_csv(data_path + 'movies_genre_and_people_metadata_ohe_subset.csv')
movies_gp_df = movies_gp_df.set_index(movies_gp_df.itemId)

# Import ratings data
ratings_df = pd.read_csv(data_path + 'ratings.dat',
                        sep = '::', header = None)
ratings_df.columns = ['userId', 'itemId', 'label', 'timestamp']
ratings_df.drop(['timestamp'], axis = 1, inplace = True)

  # This is added back by InteractiveShellApp.init_path()


Wall time: 15.3 s


In [4]:
# User input function - takes user input data, strips it down, and calls other functions on that data
# Takes in user age, gender, list of favorite movies
# All movies in the list of favorite movies are given the highest rating found in the ratings data
def new_user_input(fav_movies, all_ratings, movies_df, num_recs = 10, 
                   age = None, gender = None, movies_gp = None):
    """
    Print ALS and content-based recommendations for a brand-new user.

    The new user is appended to the ratings data with every matched favorite
    movie rated at the maximum rating, an implicit ALS model is trained on the
    result, and the top ALS candidates are then re-scored by the pickled
    content-based model.

    Parameters
    ----------
    fav_movies: list of str, e.g. ["Iron Man", "The Shawshank Redemption", "Robin Hood"].
        A title may carry a year in parentheses, like "Robin Hood (1993)", to
        restrict the match to that release year.

    all_ratings: DataFrame with userId, itemId and label columns (not modified)

    movies_df: DataFrame of movie metadata, indexed by itemId and also holding
        itemId, title, year and imdb_id columns

    num_recs: int, number of recommendations to print from each model

    age: number, used to pick the user's age-group indicator column

    gender: 'M' or 'F'

    movies_gp: DataFrame of genre/people indicator columns, indexed by itemId

    Return
    ------
    None; the favorite movies found and both recommendation lists are printed.

    Raises
    ------
    ValueError: if none of the favorite movies can be found in movies_df
    """
    # collect favorite movie ids
    print('Collecting favorite movie IDs')
    movieIds = get_movieId(movies_df, fav_movies)
    print('Favorite movies in the available set')
    print(movies_df[['itemId', 'title', 'year']].loc[movieIds])

    # Fail fast: without at least one matched movie the new user has no
    # ratings, and the code below would only fail after training the model.
    if len(movieIds) == 0:
        raise ValueError('None of the favorite movies were found in movies_df')

    print('Adding ratings to full set')
    # add new user movie ratings to all ratings dataframe
    all_ratings_updated, new_user_ratings = add_new_user_to_data(all_ratings, movieIds)
    userId = new_user_ratings.userId.unique()[0]

    print('Creating prediction set')
    # candidate (userId, itemId) rows for the content-based filtering model
    all_user_unrated = get_inference_data(all_ratings_updated, movieIds, movies_df)

    print('ALS Data formatting')
    all_ratings_updated = all_ratings_updated.dropna()
    # The implicit library expects data as a item-user matrix so we
    # create two matrices, one for fitting the model (item-user) 
    # and one for recommendations (user-item)
    labels = all_ratings_updated['label'].astype(float)
    item_ids = all_ratings_updated['itemId']
    user_ids = all_ratings_updated['userId']
    sparse_item_user = sparse.csr_matrix((labels, (item_ids, user_ids)))
    sparse_user_item = sparse.csr_matrix((labels, (user_ids, item_ids)))
    # the sparse matrices hold their own copy of the data
    del all_ratings_updated

    print('Training ALS model')
    # Build the recommendation model using Alternating Least Squares from the Implicit package
    num_factors = 10 # Latent factors
    num_iters = 10 # Model iterations (like NN epochs)
    reg_param = 0.1 # Regularization parameter to reduce overfitting

    # initialize a model
    model = implicit.als.AlternatingLeastSquares(factors=num_factors, iterations = num_iters, 
                                                 regularization = reg_param, 
                                                 use_cg = True, # cg is a faster optimization method
                                                 calculate_training_loss = True) # keep loss values (for plotting if needed)

    # alpha scales the ratings before they are used as confidence weights;
    # a value of 1 leaves the ratings unchanged.
    alpha_val = 1
    data_conf = (sparse_item_user * alpha_val).astype('double')

    # Fit the model
    model.fit(data_conf)

    print('Making ALS Predictions')
    # keep the top num_recs predictions from the implicit recommender
    recommended = model.recommend(userId, sparse_user_item, N = num_recs)

    movie_recs = []
    scores = []
    # Get movie titles from ids
    for idx, score in recommended:
        movie_recs.append(movies_df.title.loc[movies_df.itemId == idx].iloc[0])
        scores.append(score)

    # Create a dataframe of movie titles and scores
    recommendations = pd.DataFrame({'movies': movie_recs, 'score': scores})
    print('')
    print('User', userId, 'Movie Recommendations')
    print(recommendations)

    # Build the content-based model's input: one row of user features
    # (average rating per genre/people column plus one-hot age group and gender).
    print('CBF Data Formatting')
    user_summary = get_user_preferences(user_ratings = new_user_ratings, movieIds = movieIds, 
                                        movies_gp = movies_gp, age = age, gender = gender)

    # Take the top 3*num_recs ALS candidates; the content-based model re-ranks them
    recommended = model.recommend(userId, sparse_user_item, N = num_recs*3)

    als_top_3xn_ids = [id_val[0] for id_val in recommended]
    all_user_unrated_top_3xn = all_user_unrated.loc[als_top_3xn_ids].reset_index(drop = True)
    top_3xn_movies_metadata = movies_df.loc[als_top_3xn_ids].reset_index(drop = True)

    # Join each candidate with its movie metadata (on the itemId column), then
    # with the user's feature row (on the userId column).
    # (MAKE SURE ALL COLUMNS ARE ORDERED AND NAMED CORRECTLY)
    unrated_with_movie_metadata = all_user_unrated_top_3xn \
                                    .merge(top_3xn_movies_metadata, 
                                           left_on = 'itemId', right_on = 'itemId',
                                          how = 'left')
    unrated_with_full_metadata = unrated_with_movie_metadata \
                                    .merge(user_summary, 
                                           left_on = 'userId', right_on = 'userId', 
                                           how = 'left') \
                                    .set_index('itemId') \
                                    .drop(columns = ['userId', 'title', 'imdb_id']) \
                                    .fillna(0)

    # load the content-based filtering model (cbf_model) from disk
    # NOTE: relies on the notebook-level `model_path` variable being defined
    # before this function is called.
    # NOTE: pickle.load can execute arbitrary code; only load trusted model files.
    with open(model_path + 'rfr_model.sav', 'rb') as model_file:
        cbf_model = pickle.load(model_file)

    print('Making CBF Predictions')
    unrated_with_full_metadata['predictions'] = cbf_model.predict(unrated_with_full_metadata)
    unrated_with_full_metadata.reset_index(inplace = True)
    cbf_model_preds = unrated_with_full_metadata[['itemId', 'predictions']]

    # sort by predicted rating and keep top [num_recs] recommendations
    cbf_top_n_predictions = cbf_model_preds.sort_values(by = 'predictions', ascending=False).head(num_recs)

    cbf_movie_recs = movies_df[['title', 'year']].loc[cbf_top_n_predictions.itemId]
    print('Content-Based Filtering Recommendations')
    print(cbf_movie_recs)

In [5]:
def get_movieId(movies_df, fav_movie_list):
    """
    return all movieId(s) of user's favorite movies
    
    A leading article ('The ', 'An ', 'La ', 'A ') is removed from each title
    before matching, because the catalogue stores it at the end of the title
    (e.g. 'Shawshank Redemption, The'). Matching rules, applied per title:

    * 'Title (YYYY)': title must contain the text and year must equal YYYY
    * a single word: title must equal the word exactly
    * anything else: title must contain the text

    Matching is case-sensitive and treats the text literally (not as a regex).

    Parameters
    ----------
    movies_df: DataFrame, all movieIds and info; needs itemId, title and
        year columns
    
    fav_movie_list: list, user's list of favorite movies
    
    Return
    ------
    movieId_list: list of movieId(s), without duplicates, in first-match order
    """
    leading_articles = ('The ', 'An ', 'La ', 'A ')

    movieId_list = []
    for movie in fav_movie_list:
        # Strip exactly the article that was matched (the first match wins).
        for article in leading_articles:
            if movie.startswith(article):
                movie = movie[len(article):]
                break

        if movie[-6:-5] == '(':
            # Trailing '(YYYY)': split it off together with the space before it.
            year = int(movie[-5:-1])
            movie = movie[0:-7]
            title_hit = movies_df.title.str.contains(movie, regex=False, na=False)
            movieIds = movies_df.itemId[title_hit & (movies_df.year == year)]
        elif len(movie.split(' ')) == 1:
            # A single word would match far too many titles as a substring.
            movieIds = movies_df.itemId[movies_df.title == movie]
        else:
            title_hit = movies_df.title.str.contains(movie, regex=False, na=False)
            movieIds = movies_df.itemId[title_hit]
        movieId_list.extend(movieIds)

    # Overlapping favorites (e.g. 'Toy Story' and 'Toy Story 2') can match the
    # same movie twice; keep each id once, preserving the original order.
    return list(dict.fromkeys(movieId_list))

In [6]:
def add_new_user_to_data(train_data, movieIds):
    """
    Append a new user who gives the top rating to every movie in movieIds.

    The new user's id is one more than the largest userId in train_data, and
    the rating used is the largest label found in train_data.

    Parameters
    ----------
    train_data: DataFrame, ratings data with userId, itemId and label columns

    movieIds: list, the movieId(s) the new user rates

    Return
    ------
    (combined, new_rows): combined is train_data followed by the new user's
    rows (original index labels are kept); new_rows holds only the new rows
    """
    next_user_id = train_data.userId.max() + 1
    top_rating = train_data.label.max()

    # one (userId, itemId, label) record per favorite movie
    records = []
    for item_id in movieIds:
        records.append((next_user_id, item_id, top_rating))
    new_rows = pd.DataFrame(records, columns=['userId', 'itemId', 'label'])

    combined = pd.concat([train_data, new_rows], axis = 0)
    return combined, new_rows

In [7]:
def get_inference_data(train_data, movieIds, movies_df):
    """
    return a DataFrame pairing the new user with every movie they have not rated

    The new user is taken to be the largest userId in train_data, so
    train_data must already contain the new user's rows.

    Parameters
    ----------
    train_data: DataFrame, ratings data (including the new user's rows)

    movieIds: list, movieId(s) the new user has already rated

    movies_df: DataFrame, movies data indexed by itemId with an itemId column

    Return
    ------
    user_unrated: DataFrame with userId and itemId columns, one row per movie
        in movies_df that is not in movieIds; keeps the movies_df index
    """
    # get new user id
    new_id = np.max(train_data.userId)

    unrated_mask = ~movies_df.index.isin(movieIds)
    # .assign builds a new frame, so no column is written onto a slice of
    # movies_df (which triggered SettingWithCopyWarning).
    user_unrated = movies_df.loc[unrated_mask, ['itemId']] \
                            .assign(userId = new_id)[['userId', 'itemId']]

    return user_unrated

In [8]:
def get_user_preferences(user_ratings, movieIds, movies_gp, age, gender):
    """
    Build the single-row feature frame describing a user for the CBF model.

    Parameters
    ----------
    user_ratings: DataFrame, the user's ratings (userId, itemId, label)

    movieIds: list, itemIds of the movies the user rated

    movies_gp: DataFrame, indexed by itemId, with an itemId column plus one
        indicator column per genre/person

    age: number or None; None leaves every age_group_* column at 0

    gender: 'M' sets gender_M, None sets neither column, any other value
        sets gender_F

    Return
    ------
    user_summary: DataFrame with userId, one '<column>_avg_rating' column per
        movies_gp indicator column (the user's average rating over the rated
        movies having that indicator, 0 if none), and the one-hot age-group
        and gender columns; columns are sorted by name
    """
    demog_columns = ['gender_M', 'gender_F', 'age_group_1', 'age_group_18', 
                     'age_group_25', 'age_group_35', 'age_group_45', 
                     'age_group_50', 'age_group_56']
    user_demog = pd.DataFrame({column: 0 for column in demog_columns}, index = [0])

    # Bin user by age: (exclusive upper bound, column); 56 and over falls through.
    # Values are set with .loc because chained assignment (df.col[0] = 1) is not
    # guaranteed to write through to the frame.
    age_bins = [(18, 'age_group_1'), (25, 'age_group_18'), (35, 'age_group_25'), 
                (45, 'age_group_35'), (50, 'age_group_45'), (56, 'age_group_50')]
    if age is not None:
        age_column = 'age_group_56'
        for upper_bound, column in age_bins:
            if age < upper_bound:
                age_column = column
                break
        user_demog.loc[0, age_column] = 1

    if gender == 'M':
        user_demog.loc[0, 'gender_M'] = 1
    elif gender is not None:
        user_demog.loc[0, 'gender_F'] = 1

    # one row per user, one column per rated item
    pivoted_user_ratings_df = user_ratings.pivot(index='userId', columns='itemId', values='label').fillna(0)
    # 1 where the user rated the item, 0 elsewhere (x / x would give NaN for 0)
    pivoted_user_ratings_df_binary = pivoted_user_ratings_df.ne(0).astype(float)

    movies_gp_filtered_df = movies_gp.loc[movieIds].drop(columns = ['itemId'])

    # sum of ratings / number of ratings per indicator column; 0/0 -> NaN -> 0
    user_summary_total = pivoted_user_ratings_df.dot(movies_gp_filtered_df)
    user_summary_count = pivoted_user_ratings_df_binary.dot(movies_gp_filtered_df)
    user_summary_avg = (user_summary_total / user_summary_count).fillna(0)
    user_summary_avg = user_summary_avg.add_suffix('_avg_rating').reset_index()

    user_summary = pd.concat([user_summary_avg, user_demog], axis = 1)
    sorted_columns = list(user_summary.columns.sort_values())
    return user_summary[sorted_columns]

### Step by Step Walkthrough of Main Function (to show runtime)

In [9]:
%%time
fav_movies = ['Tinker Tailor Soldier Spy', 'Shawshank Redemption', 'Lord of the Rings']
# collect favorite movie ids
print('Collecting favorite movie IDs')
movieIds = get_movieId(movies_df, fav_movies)
print('Favorite movies in the available set')
print(movies_df[['itemId', 'title', 'year']].loc[movieIds])

Collecting favorite movie IDs
Favorite movies in the available set
        itemId                      title  year
itemId                                         
318        318  Shawshank Redemption, The  1994
2116      2116     Lord of the Rings, The  1978
Wall time: 102 ms


In [10]:
%%time
print('Adding ratings to full set')
# Step 2: append the new user's favorites to the ratings data.
# all_ratings_updated = ratings_df plus the new user's rows;
# user_ratings = only the new user's rows (each favorite at the max rating).
all_ratings_updated, user_ratings = add_new_user_to_data(ratings_df, movieIds)

Adding ratings to full set
Wall time: 170 ms


In [11]:
%%time
print('Creating prediction set')
# Step 3: pair the new user with every movie in movies_df they have not rated.
# used for content-based filtering models
all_user_unrated = get_inference_data(all_ratings_updated, movieIds, movies_df)

Creating prediction set
Wall time: 128 ms


In [12]:
%%time
print('Data formatting')
all_ratings_updated.dropna(inplace = True)
# The implicit library expects data as a item-user matrix so we
# create two matricies, one for fitting the model (item-user) 
# and one for recommendations (user-item)
sparse_item_user = sparse.csr_matrix((all_ratings_updated['label'].astype(float), 
                                      (all_ratings_updated['itemId'], all_ratings_updated['userId'])))
sparse_user_item = sparse.csr_matrix((all_ratings_updated['label'].astype(float), 
                                      (all_ratings_updated['userId'], all_ratings_updated['itemId'])))

print('Training ALS model')
# Build the recommendation model using Alternating Least Squares from the Implicit package
num_factors = 10 # Latent factors
num_iters = 10 # Model iterations (like NN epochs)
reg_param = 0.1 # Regularization parameter to reduce overfitting

# initialize a model
model = implicit.als.AlternatingLeastSquares(factors=num_factors, iterations = num_iters, 
                                             regularization = reg_param, 
                                             use_cg = True, # cg is a faster optimization method
                                             calculate_training_loss = True) # keep loss values (for plotting if needed)

# Calculate the confidence by multiplying it by our alpha value.
alpha_val = 1 # Still not entirely sure what this parameter does...
data_conf = (sparse_item_user * alpha_val).astype('double')

#Fit the model
model.fit(data_conf)
del all_ratings_updated

Data formatting
Training ALS model


100%|███████████████████████████████████████████████████████████████████| 10.0/10 [00:09<00:00,  1.15it/s, loss=0.0548]


Wall time: 9.78 s


In [13]:
%%time
print('Making Predictions')
# keep top 10 predictions
num_recs = 10
# Use the implicit recommender.
userId = user_ratings.userId.unique()[0]
recommended = model.recommend(userId, sparse_user_item, N = num_recs)

movie_recs = []
scores = []
# Get movie titles from ids
for item in recommended:
    idx, score = item
    movie_recs.append(movies_df.title.loc[movies_df.itemId == idx].iloc[0])
    scores.append(score)

# Create a dataframe of movie titles and scores
recommendations = pd.DataFrame({'movies': movie_recs, 'score': scores})
print('')
print('User', userId, 'Movie Recommendations')
print(recommendations)

Making Predictions

User 6041 Movie Recommendations
                                           movies     score
0                                Schindler's List  0.069036
1                       Silence of the Lambs, The  0.068612
2                      E.T. the Extra-Terrestrial  0.065599
3              Star Wars: Episode IV - A New Hope  0.063483
4  Star Wars: Episode V - The Empire Strikes Back  0.063276
5                                    Pulp Fiction  0.061077
6                                           Fargo  0.060845
7                             Saving Private Ryan  0.060103
8                               Wizard of Oz, The  0.057101
9                                  Lion King, The  0.056933
Wall time: 42.5 ms


In [14]:
%%time
# import GBT model input data format
# Create user_id x item_id matrix need to get data in the form of user_id, item_id, label, then pivot
# filter movies_gp dataframe by the movieIds. pivot new_user_ratings into a vector, 
# then multiply by the filtered movies_gp dataframe; divide by binarized user ratings; 
# this should now be a vector of user preferences. 
# join a OHE age, gender, and possibly occupation, to the user preferences
user_summary_df = get_user_preferences(user_ratings = user_ratings, movieIds = movieIds,
                                       movies_gp = movies_gp_df, age = 26, gender = 'M')

Wall time: 27.3 ms


In [15]:
%%time
userId = user_ratings.userId.unique()[0]
recommended = model.recommend(userId, sparse_user_item, N = 30)

als_top_3xn_ids = [id_val[0] for id_val in recommended]
all_user_unrated_top_3xn = all_user_unrated.loc[als_top_3xn_ids]
all_user_unrated_top_3xn.reset_index(drop = True, inplace = True)

top_3xn_movies_metadata = movies_df.loc[als_top_3xn_ids]
top_3xn_movies_metadata.reset_index(drop = True, inplace = True)

Wall time: 5.62 ms


In [16]:
%%time
# lastly, replicate the user pref rows for each rated movieId, then join with the filtered movies dataframe
# (MAKE SURE ALL COLUMNS ARE ORDERED AND NAMED CORRECTLY)

# Index join - both indexes are itemId
unrated_with_movie_metadata = all_user_unrated_top_3xn \
                                .merge(top_3xn_movies_metadata, 
                                       left_on = 'itemId', right_on = 'itemId',
                                      how = 'left')
unrated_with_full_metadata = unrated_with_movie_metadata \
                                .merge(user_summary_df, 
                                       left_on = 'userId', right_on = 'userId', 
                                       how = 'left') \
                                .set_index('itemId') \
                                .drop(columns = ['userId', 'title', 'imdb_id']) \
                                .fillna(0)

Wall time: 20.8 ms


In [17]:
%%time
# load the content-based filtering model (cbf_model) from disk
cbf_model = pickle.load(open(model_path + 'rfr_model.sav', 'rb'))

unrated_with_full_metadata['predictions'] = cbf_model.predict(unrated_with_full_metadata)
unrated_with_full_metadata.reset_index(inplace = True)
cbf_model_preds = unrated_with_full_metadata[['itemId', 'predictions']]

Wall time: 414 ms


In [18]:
%%time
cbf_top_n_predictions = cbf_model_preds.sort_values(by = 'predictions', ascending=False).head(10)

cbf_movie_recs = movies_df[['title', 'year']].loc[cbf_top_n_predictions.itemId]
print('Content-Based Filtering Recommendations')
print(cbf_movie_recs)

Content-Based Filtering Recommendations
                                     title  year
itemId                                          
1097            E.T. the Extra-Terrestrial  1982
1225                               Amadeus  1984
3471    Close Encounters of the Third Kind  1977
1704                     Good Will Hunting  1997
924                  2001: A Space Odyssey  1968
150                              Apollo 13  1995
1193       One Flew Over the Cuckoo's Nest  1975
1584                               Contact  1997
1240                       Terminator, The  1984
32                          Twelve Monkeys  1995
Wall time: 53.5 ms


### Full Function Recommendation Examples

In [19]:
%%time
# End-to-end example: nine favorite titles for a 26-year-old male user.
# Only titles that get_movieId finds in movies_df are used; they are listed in
# the 'Favorite movies in the available set' printout.
fav_movies = ['Iron Man', 'Tinker Tailor Soldier Spy', 'Shawshank Redemption', 'Lord of the Rings', 'Harry Potter',
             'The Family Stone', 'Shaun of the Dead', 'Up', 'A View to a Kill']
new_user_input(fav_movies = fav_movies, all_ratings = ratings_df, 
               movies_df = movies_df, num_recs = 10, 
               age = 26, gender = 'M', movies_gp = movies_gp_df)

Collecting favorite movie IDs
Favorite movies in the available set
        itemId                      title  year
itemId                                         
318        318  Shawshank Redemption, The  1994
2116      2116     Lord of the Rings, The  1978
2376      2376          View to a Kill, A  1985
Adding ratings to full set
Creating prediction set
ALS Data formatting
Training ALS model


100%|███████████████████████████████████████████████████████████████████| 10.0/10 [00:09<00:00,  1.13it/s, loss=0.0548]


Making ALS Predictions

User 6041 Movie Recommendations
                                           movies     score
0                      E.T. the Extra-Terrestrial  0.061443
1                       Silence of the Lambs, The  0.061046
2                                Schindler's List  0.057942
3  Star Wars: Episode V - The Empire Strikes Back  0.057218
4              Star Wars: Episode IV - A New Hope  0.055770
5                                         Contact  0.055141
6                                     Matrix, The  0.054857
7                                           Fargo  0.054047
8                                    Pulp Fiction  0.053924
9                             Saving Private Ryan  0.053806
CBF Data Formatting
Making CBF Predictions
Content-Based Filtering Recommendations
                                     title  year
itemId                                          
1097            E.T. the Extra-Terrestrial  1982
3471    Close Encounters of the Third Kind  1977
150  

In [20]:
%%time
# End-to-end example using the same three favorites and demographics as the
# step-by-step walkthrough above.
fav_movies = ['Tinker Tailor Soldier Spy', 'Shawshank Redemption', 'Lord of the Rings']
new_user_input(fav_movies = fav_movies, all_ratings = ratings_df, 
               movies_df = movies_df, num_recs = 10, 
               age = 26, gender = 'M', movies_gp = movies_gp_df)

Collecting favorite movie IDs
Favorite movies in the available set
        itemId                      title  year
itemId                                         
318        318  Shawshank Redemption, The  1994
2116      2116     Lord of the Rings, The  1978
Adding ratings to full set
Creating prediction set
ALS Data formatting
Training ALS model


100%|███████████████████████████████████████████████████████████████████| 10.0/10 [00:09<00:00,  1.14it/s, loss=0.0548]


Making ALS Predictions

User 6041 Movie Recommendations
                                           movies     score
0                       Silence of the Lambs, The  0.071588
1                                Schindler's List  0.068174
2                                           Fargo  0.064769
3              Star Wars: Episode IV - A New Hope  0.064167
4                      E.T. the Extra-Terrestrial  0.062314
5  Star Wars: Episode V - The Empire Strikes Back  0.062287
6                                    Pulp Fiction  0.061708
7                             Saving Private Ryan  0.060981
8                                     Matrix, The  0.060200
9                      Terminator 2: Judgment Day  0.058360
CBF Data Formatting
Making CBF Predictions
Content-Based Filtering Recommendations
                             title  year
itemId                                  
1097    E.T. the Extra-Terrestrial  1982
1704             Good Will Hunting  1997
924          2001: A Space Odyssey  1

In [21]:
%%time
# End-to-end example: a different set of favorites for a 26-year-old male user.
fav_movies = ['Frozen', 'Tangled', 'Oceans Eleven', 'Toy Story', 'The Princess Bride',  
              'The Incredibles', 'Castle in the Sky', 'Monsters, Inc']
new_user_input(fav_movies = fav_movies, all_ratings = ratings_df, 
               movies_df = movies_df, num_recs = 10, 
               age = 26, gender = 'M', movies_gp = movies_gp_df)

Collecting favorite movie IDs
Favorite movies in the available set
        itemId                title  year
itemId                                   
3114      3114          Toy Story 2  1999
1            1            Toy Story  1995
1197      1197  Princess Bride, The  1987
Adding ratings to full set
Creating prediction set
ALS Data formatting
Training ALS model


100%|███████████████████████████████████████████████████████████████████| 10.0/10 [00:09<00:00,  1.13it/s, loss=0.0549]


Making ALS Predictions

User 6041 Movie Recommendations
                                           movies     score
0                                 American Beauty  0.167533
1              Star Wars: Episode IV - A New Hope  0.149947
2                             Shakespeare in Love  0.148860
3                              Back to the Future  0.142713
4      Star Wars: Episode VI - Return of the Jedi  0.142558
5  Star Wars: Episode V - The Empire Strikes Back  0.140051
6                                   Groundhog Day  0.136137
7                                   Bug's Life, A  0.135006
8                            Being John Malkovich  0.131713
9                         Raiders of the Lost Ark  0.129880
CBF Data Formatting
Making CBF Predictions
Content-Based Filtering Recommendations
                                            title  year
itemId                                                 
2716                                 Ghostbusters  1984
318                     Shawshank

In [22]:
%%time
# Same favorites as the previous example but with age 8 and gender 'F'.
# Age and gender are only passed to get_user_preferences, i.e. they feed the
# content-based model's input and not the ALS model.
fav_movies = ['Frozen', 'Tangled', 'Oceans Eleven', 'Toy Story', 'The Princess Bride',  
              'The Incredibles', 'Castle in the Sky', 'Monsters, Inc']
new_user_input(fav_movies = fav_movies, all_ratings = ratings_df, 
               movies_df = movies_df, num_recs = 10, 
               age = 8, gender = 'F', movies_gp = movies_gp_df)

Collecting favorite movie IDs
Favorite movies in the available set
        itemId                title  year
itemId                                   
3114      3114          Toy Story 2  1999
1            1            Toy Story  1995
1197      1197  Princess Bride, The  1987
Adding ratings to full set
Creating prediction set
ALS Data formatting
Training ALS model


100%|███████████████████████████████████████████████████████████████████| 10.0/10 [00:09<00:00,  1.08it/s, loss=0.0548]


Making ALS Predictions

User 6041 Movie Recommendations
                                           movies     score
0                                 American Beauty  0.165054
1                             Shakespeare in Love  0.147280
2              Star Wars: Episode IV - A New Hope  0.147050
3      Star Wars: Episode VI - Return of the Jedi  0.139444
4                              Back to the Future  0.138852
5  Star Wars: Episode V - The Empire Strikes Back  0.137411
6                                   Groundhog Day  0.135921
7                                   Bug's Life, A  0.135677
8                            Being John Malkovich  0.131261
9                         Raiders of the Lost Ark  0.130023
CBF Data Formatting
Making CBF Predictions
Content-Based Filtering Recommendations
                             title  year
itemId                                  
2716                  Ghostbusters  1984
318      Shawshank Redemption, The  1994
1097    E.T. the Extra-Terrestrial  1

In [23]:
%%time
# End-to-end example with a longer favorites list for a 40-year-old male user.
# 'Robin Hood (1993)' uses the optional '(year)' suffix, which restricts the
# title match to movies whose year column equals 1993.
fav_movies = ['The Sound of Music', 'Blackhawk Down', 'Pearl Harbor', 'Toy Story', 'The Princess Bride',  
              'Foreign Student', 'Star Wars', 'The Shining', 'Rear Window', 'Groundhog Day', 'Ghostbusters', 
              'Robin Hood (1993)', 'Die Hard']
new_user_input(fav_movies = fav_movies, all_ratings = ratings_df, 
               movies_df = movies_df, num_recs = 10, 
               age = 40, gender = 'M', movies_gp = movies_gp_df)

Collecting favorite movie IDs
Favorite movies in the available set
        itemId                                           title  year
itemId                                                              
1035      1035                             Sound of Music, The  1965
3114      3114                                     Toy Story 2  1999
1            1                                       Toy Story  1995
1197      1197                             Princess Bride, The  1987
572        572                                 Foreign Student  1994
1196      1196  Star Wars: Episode V - The Empire Strikes Back  1980
1210      1210      Star Wars: Episode VI - Return of the Jedi  1983
2628      2628       Star Wars: Episode I - The Phantom Menace  1999
260        260              Star Wars: Episode IV - A New Hope  1977
904        904                                     Rear Window  1954
1265      1265                                   Groundhog Day  1993
2716      2716                      

100%|███████████████████████████████████████████████████████████████████| 10.0/10 [00:09<00:00,  1.12it/s, loss=0.0549]


Making ALS Predictions

User 6041 Movie Recommendations
                       movies     score
0             American Beauty  0.441349
1     Raiders of the Lost Ark  0.421321
2          Back to the Future  0.404817
3         Saving Private Ryan  0.404373
4               Jurassic Park  0.398972
5                  Braveheart  0.393934
6                 Matrix, The  0.387971
7  Terminator 2: Judgment Day  0.387509
8         Shakespeare in Love  0.386203
9            Sixth Sense, The  0.381529
CBF Data Formatting
Making CBF Predictions
Content-Based Filtering Recommendations
                             title  year
itemId                                  
3175                  Galaxy Quest  1999
1097    E.T. the Extra-Terrestrial  1982
1580                  Men in Black  1997
608                          Fargo  1996
2355                 Bug's Life, A  1998
318      Shawshank Redemption, The  1994
2571                   Matrix, The  1999
356                   Forrest Gump  1994
2762       