Recommender Systems

In [None]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

### implementation

In [None]:
# Load the product catalogue; the code below reads its 'id' and 'description' columns.
ds = pd.read_csv("lab10_text_data.csv")

# Uni-, bi- and tri-gram TF-IDF features over the descriptions.
# min_df is given as 1 rather than 0: recent scikit-learn releases reject an
# integer min_df below 1, and 1 still keeps every term that occurs at all.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(ds['description'])

# TfidfVectorizer L2-normalises rows by default, so the plain dot product
# computed by linear_kernel is the cosine similarity between descriptions.
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

# Maps each product id to a list of (score, other_id) tuples, best first.
results = {}

for idx, row in ds.iterrows():
    # [:-100:-1] walks the ascending argsort backwards: the 99 best-scoring rows.
    similar_indices = cosine_similarities[idx].argsort()[:-100:-1]
    similar_items = [(cosine_similarities[idx][i], ds['id'][i]) for i in similar_indices]

    # The top hit is expected to be the product itself, so it is dropped.
    results[row['id']] = similar_items[1:]
    
# print('done!')

In [None]:
def item(id):
    """Return the short name of product ``id``.

    The name is the part of the product's description that precedes the
    first ' - ' separator. Raises IndexError when no row of ``ds`` has
    this id.
    """
    matching_rows = ds.loc[ds['id'] == id]
    description = matching_rows['description'].tolist()[0]
    name, _, _ = description.partition(' - ')
    return name

# Pure lookup: the similarity lists were already computed into `results`.
def recommend(item_id, num):
    """Print the ``num`` best precomputed matches for product ``item_id``.

    Each entry of ``results[item_id]`` is a ``(score, product_id)`` tuple;
    the first ``num`` of them are printed together with their scores.
    """
    print(f"Recommending {num!s} products similar to {item(item_id)}...")
    print("-------")
    for score, other_id in results[item_id][:num]:
        print(f"Recommended: {item(other_id)} (score:{score!s})")

In [None]:
# Print the 10 best precomputed matches for the product whose id is 110.
recommend(item_id=110, num=10)

## collaborative filtering with knn

### libraries

In [None]:
import os
import pandas as pd

from scipy.sparse import csr_matrix

# pip install fuzzywuzzy
from fuzzywuzzy import fuzz # for quick similarity check in large data

from sklearn.neighbors import NearestNeighbors

### data preprocessing

In [None]:
# Directory holding the CSV files; set to e.g. 'ml-1m/' if they are not in the
# working directory.
data_path = ''
movies_filename = 'movies.csv'
ratings_filename = 'ratings.csv'

# Only the columns used later are read, with compact dtypes.
df_movies = pd.read_csv(
    os.path.join(data_path, movies_filename),
    dtype={'movieId': 'int32', 'title': 'str'},
    usecols=['movieId', 'title'],
)

df_ratings = pd.read_csv(
    os.path.join(data_path, ratings_filename),
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'},
    usecols=['userId', 'movieId', 'rating'],
)

In [None]:
# Sanity check: (rows, columns) of the two frames that were just loaded.
print("df_movies.shape", df_movies.shape)
print("df_ratings.shape", df_ratings.shape)

In [None]:
# Distinct users and distinct movies that appear in the ratings table.
num_users = df_ratings['userId'].nunique()
num_items = df_ratings['movieId'].nunique()
print('There are %s unique users and %s unique movies in this data set'%(num_users, num_items))

In [None]:
# How many ratings were recorded for each distinct rating value.
df_ratings_cnt_tmp = df_ratings.groupby('rating').size().to_frame(name='count')
df_ratings_cnt_tmp

In [None]:
# Every (user, movie) pair without a recorded rating is treated as a rating
# of zero; these pairs far outnumber the recorded ratings.
total_cnt = num_users * num_items
rating_zero_cnt = total_cnt - df_ratings.shape[0]

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent. verify_integrity=True still raises if a 0.0 row already exists.
df_ratings_cnt = pd.concat(
    [df_ratings_cnt_tmp, pd.DataFrame({'count': rating_zero_cnt}, index=[0.0])],
    verify_integrity=True,
).sort_index()
df_ratings_cnt

In [None]:
import numpy as np
# Add the natural logarithm of each count as a new column.
# Note: this mutates df_ratings_cnt in place.
df_ratings_cnt['log_count'] = np.log(df_ratings_cnt['count'])
df_ratings_cnt

In [None]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# Programmatic form of the `%matplotlib inline` magic (render figures in the notebook).
get_ipython().run_line_magic('matplotlib', 'inline')
# Bar chart of the number of ratings per rating value. The rating values live
# in the index, so they are moved into a column named 'rating score' first.
# logy=True puts the y axis on a log scale.
ax = df_ratings_cnt[['count']].reset_index().rename(columns={'index': 'rating score'}).plot(
    x='rating score',
    y='count',
    kind='bar',
    figsize=(12, 8),
    title='Count for Each Rating Score (in Log Scale)',
    logy=True,
    fontsize=12,
)
ax.set_xlabel("movie rating score")
ax.set_ylabel("number of ratings")

In [None]:
# Rating frequency: how many ratings each movie received.
df_movies_cnt = df_ratings.groupby('movieId').size().to_frame(name='count')
df_movies_cnt.head()

### Pivot Ratings into Movie-Features

In [None]:
# Reshape the long ratings table into a movie x user matrix; a missing
# (movie, user) pair becomes 0.
df_movie_features = (
    df_ratings
    .pivot(index='movieId', columns='userId', values='rating')
    .fillna(0)
)

# Sparse copy of the same matrix.
mat_movie_features = csr_matrix(df_movie_features.to_numpy())

In [None]:
# Shape is (number of movies, number of users); then preview the first rows.
print("df_movie_features.shape", df_movie_features.shape)
df_movie_features.head()

In [None]:
# Keep only movies that were rated at least `popularity_thres` times, so that
# each remaining movie has enough ratings to reflect how users react to it.
popularity_thres = 50
popular_movies = list(set(df_movies_cnt[df_movies_cnt['count'] >= popularity_thres].index))
df_ratings_drop_movies = df_ratings[df_ratings['movieId'].isin(popular_movies)]
print('shape of original ratings data: ', df_ratings.shape)
print('shape of ratings data after dropping unpopular movies: ', df_ratings_drop_movies.shape)

In [None]:
# Number of ratings each user gave, counted over the popular movies only.
df_users_cnt = df_ratings_drop_movies.groupby('userId').size().to_frame(name='count')
df_users_cnt.head()

In [None]:
# Keep only users with at least `ratings_thres` ratings, so that each remaining
# user's tastes are reasonably well represented.
ratings_thres = 50
active_users = list(set(df_users_cnt[df_users_cnt['count'] >= ratings_thres].index))
df_ratings_drop_users = df_ratings_drop_movies[df_ratings_drop_movies['userId'].isin(active_users)]
print('shape of original ratings data: ', df_ratings.shape)
print('shape of ratings data after dropping both unpopular movies and inactive users: ', df_ratings_drop_users.shape)

In [None]:
# Movie x user rating matrix of the filtered data; unrated pairs become 0.
movie_user_mat = (
    df_ratings_drop_users
    .pivot(index='movieId', columns='userId', values='rating')
    .fillna(0)
)
# Map each movie title to the row position of that movie in movie_user_mat.
movie_to_idx = {
    title: row_pos
    for row_pos, title in enumerate(
        df_movies.set_index('movieId').loc[movie_user_mat.index, 'title']
    )
}
# Sparse representation, for efficient handling of the mostly-zero matrix.
movie_user_mat_sparse = csr_matrix(movie_user_mat.to_numpy())

In [None]:
# Display the sparse matrix's repr as the cell output.
movie_user_mat_sparse

In [None]:
# Nearest-neighbour model over the movie rows: brute-force search with cosine
# distance. n_neighbors=20 is only the default; make_recommendation passes its
# own n_neighbors to kneighbors().
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
# fit the dataset
model_knn.fit(movie_user_mat_sparse)

In [None]:
def fuzzy_matching(mapper, fav_movie, verbose=True):
    """
    Find the title in ``mapper`` that best matches ``fav_movie``.

    Titles are compared case-insensitively with ``fuzz.ratio``; only titles
    scoring at least 60 count as matches.

    Parameters
    ----------
    mapper: dict, map movie title name to index of the movie in data
    fav_movie: str, name of user input movie
    verbose: bool, print the list of matching titles if True

    Return
    ------
    index of the best-scoring match, or None (after printing a message)
    when no title reaches the threshold
    """
    scored = (
        (title, idx, fuzz.ratio(title.lower(), fav_movie.lower()))
        for title, idx in mapper.items()
    )
    candidates = [entry for entry in scored if entry[2] >= 60]
    # Ascending stable sort followed by a reversal: best score first.
    candidates.sort(key=lambda entry: entry[2])
    candidates.reverse()

    if len(candidates) == 0:
        print('Oops! No match is found')
        return
    if verbose:
        print('Found possible matches in our database: {0}\n'.format([entry[0] for entry in candidates]))
    best_match = candidates[0]
    return best_match[1]

In [None]:
def make_recommendation(model_knn, data, mapper, fav_movie, n_recommendations):
    """
    Print the top n movies most similar to the user's input movie.

    The input title is resolved to a row of ``data`` with ``fuzzy_matching``.
    The model is then asked for ``n_recommendations + 1`` neighbours, and the
    closest one (expected to be the input movie itself) is discarded.

    Parameters
    ----------
    model_knn: sklearn model, knn model (it is re-fitted on ``data`` here)
    data: movie-user matrix
    mapper: dict, map movie title name to index of the movie in data
    fav_movie: str, name of user input movie
    n_recommendations: int, top n recommendations

    Return
    ------
    None. Recommendations are printed, ordered from the farthest to the
    nearest of the selected neighbours. If no title matches ``fav_movie``,
    nothing is recommended.
    """
    # fit
    model_knn.fit(data)
    # get input movie index
    print('You have input movie:', fav_movie)
    idx = fuzzy_matching(mapper, fav_movie, verbose=True)
    if idx is None:
        # fuzzy_matching has already reported the miss; indexing data with
        # None would fail with an unrelated error, so stop here instead.
        return

    print('Recommendation system start to make inference')
    print('......\n')
    distances, indices = model_knn.kneighbors(data[idx], n_neighbors=n_recommendations+1)

    # ravel() (rather than squeeze()) still yields a list when only one
    # neighbour was requested.
    neighbours = zip(indices.ravel().tolist(), distances.ravel().tolist())
    # Sort ascending by distance, then [:0:-1] reverses the list while
    # dropping element 0, the nearest neighbour (the query movie itself).
    raw_recommends = sorted(neighbours, key=lambda pair: pair[1])[:0:-1]
    # get reverse mapper
    reverse_mapper = {v: k for k, v in mapper.items()}
    # print recommendations
    print('Recommendations for {}:'.format(fav_movie))
    for i, (neighbour_idx, dist) in enumerate(raw_recommends):
        print('{0}: {1}, with distance of {2}'.format(i+1, reverse_mapper[neighbour_idx], dist))

In [None]:
# favorite_movie = 'Star Wars: Episode V - The Empire Strikes Back (1980)'
# favorite_movie = 'Matrix' # will raise error, no match is found

favorite_movie = "Jumanji (1995)"

# Print the 10 nearest movies to the chosen title, using the fitted kNN model
# and the title -> row-index mapping built above.
make_recommendation(
    model_knn=model_knn,
    data=movie_user_mat_sparse,
    fav_movie=favorite_movie,
    mapper=movie_to_idx,
    n_recommendations=10)

## SVD

In [1]:
import os
import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

### data preprocessing

In [2]:
# Directory holding the CSV files; set to e.g. 'ml-1m/' if they are not in the
# working directory.
data_path = ''
movies_filename = 'movies.csv'
ratings_filename = 'ratings.csv'

# Only the columns used later are read, with compact dtypes.
df_movies = pd.read_csv(
    os.path.join(data_path, movies_filename),
    dtype={'movieId': 'int32', 'title': 'str'},
    usecols=['movieId', 'title'],
)

df_ratings = pd.read_csv(
    os.path.join(data_path, ratings_filename),
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'},
    usecols=['userId', 'movieId', 'rating'],
)

In [26]:
# Preview the first rows of the ratings table.
df_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1193,5.0
1,1,661,3.0
2,1,914,3.0
3,1,3408,4.0
4,1,2355,5.0


In [71]:
# Build a small subset: ratings by users with id <= 500 on movies with id <= 700.
# Filtering first and copying afterwards gives an independent frame, so the
# column assignments below do not trigger pandas' SettingWithCopyWarning.
df_ratings_new = df_ratings[(df_ratings["userId"] <= 500) & (df_ratings["movieId"] <= 700)].copy()
# Turn the numeric ids into string labels such as 'U1' / 'M1193'.
df_ratings_new['userId'] = 'U' + df_ratings_new['userId'].astype(str)
df_ratings_new['movieId'] = 'M' + df_ratings_new['movieId'].astype(str)

df_ratings_new = df_ratings_new.rename({"rating": "overall_rating"}, axis=1) # rename the rating column

# df_ratings_new["acting_rating"] = np.amax(np.vstack((np.ones(len(df_ratings_new["overall_rating"])),(df_ratings_new["overall_rating"]+np.random.randint(low=-2, high=2)).to_numpy())).T, 1)

# User x movie matrix of the subset, only used here to report its shape.
df_movie_features_new = df_ratings_new.pivot(
    index='userId',
    columns='movieId',
    values='overall_rating'
).fillna(0)

print("df_movie_features_new.shape: %d %d" %(df_movie_features_new.shape))

# Shuffle the rows before writing. No random_state is given, so the row order
# of the CSV differs from run to run.
df_ratings_new = df_ratings_new.sample(frac=1)
df_ratings_new.to_csv("movie_ratings_new.csv", index=False)

df_movie_features_new.shape: 495 584


In [69]:
# Scratch cell: an array of 5.0 with one entry per row of df_ratings_new.
# np.random.randint(low=2, high=5, size=10)
# np.random.randint(low=-2, high=2)
np.ones(len(df_ratings_new["overall_rating"]))*5

array([5., 5., 5., ..., 5., 5., 5.])

In [60]:
# len(df_ratings_new["overall_rating"])
# np.amax(np.array(np.zeros(df_ratings_new["overall_rating"]), df_ratings_new["overall_rating"]-2),1)
# np.array(np.zeros(df_ratings_new["overall_rating"]), (df_ratings_new["overall_rating"]-2).to_numpy())
#  
# np.vstack((np.zeros(len(df_ratings_new["overall_rating"])),(df_ratings_new["overall_rating"]-2).to_numpy())).T

array([[0., 2.],
       [0., 0.],
       [0., 1.],
       ...,
       [0., 3.],
       [0., 1.],
       [0., 3.]])

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [23]:
# Reshape the long ratings table into a user x movie matrix (userId as rows,
# movieId as columns); a pair without a rating is filled with 0.0.
df_movie_features = (
    df_ratings
    .pivot(index='userId', columns='movieId', values='rating')
    .fillna(0)
)

print("df_movie_features.shape: %d %d" %(df_movie_features.shape))

df_movie_features.shape: 6040 3706


In [5]:
# Preview the first rows of the user x movie matrix.
df_movie_features.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### scipy library to implement SVD

In [12]:
# Dense user x movie array, centred per user. Each user's mean is taken over
# the whole row, including the 0 entries that stand for "not rated".
R = df_movie_features.to_numpy()
user_ratings_mean = R.mean(axis=1)
R_demeaned = R - user_ratings_mean[:, np.newaxis]

In [15]:
# Truncated SVD keeping the 50 largest singular triplets.
U, sigma, Vt = svds(R_demeaned, k=50)

# svds returns the singular values as a 1-D array rather than as a diagonal
# matrix. The predictions are computed by matrix multiplication, so the values
# are converted to diagonal matrix form here.
sigma = np.diag(sigma)

In [17]:
# Rebuild the low-rank rating matrix and add each user's mean back in.
all_user_predicted_ratings = U.dot(sigma).dot(Vt) + user_ratings_mean.reshape(-1, 1)

In [18]:
# print("df_movie_features.columns", df_movie_features.columns)
# Label the predicted ratings with the movieId columns. No index is passed, so
# rows get a default 0-based index: row i holds the predictions for the i-th
# row (user) of df_movie_features.
preds_df = pd.DataFrame(all_user_predicted_ratings, columns = df_movie_features.columns)
preds_df.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
0,4.28886,0.143056,-0.19508,-0.018843,0.012233,-0.176602,-0.07412,0.141358,-0.059553,-0.195951,...,0.027807,0.00164,0.026395,-0.022024,-0.085416,0.403527,0.105577,0.031911,0.05045,0.088909
1,0.744713,0.16966,0.335419,0.000758,0.022475,1.353047,0.051425,0.071258,0.161601,1.567247,...,-0.056502,-0.013732,-0.01058,0.062576,-0.016248,0.155792,-0.418735,-0.101102,-0.054098,-0.140187
2,1.818823,0.456136,0.090978,-0.043037,-0.025695,-0.158617,-0.131778,0.098977,0.030552,0.735471,...,0.040481,-0.005301,0.012832,0.029349,0.020867,0.12153,0.076205,0.012344,0.015147,-0.109956
3,0.408055,-0.07296,0.039642,0.089363,0.04195,0.237753,-0.049426,0.009467,0.045469,-0.11137,...,0.00857,-0.005426,-0.0085,-0.003417,-0.083982,0.094513,0.057557,-0.02605,0.014841,-0.034224
4,1.574266,0.021241,-0.0513,0.246884,-0.032405,1.552281,-0.199629,-0.01492,-0.060498,0.450513,...,0.110151,0.04601,0.006934,-0.015941,-0.05008,-0.052538,0.507188,0.03383,0.125706,0.199244


In [19]:
def recommend_movies(preds_df, userID, movies_df, original_ratings_df, num_recommendations=5):

    '''
    Recommend movies to a user from the factorized (predicted) rating matrix.

    Parameters
    ----------
    preds_df: DataFrame of predicted ratings, one row per user and one column
        per movieId. Row position ``userID - 1`` is taken as the user's row,
        whatever the index labels are.
    userID: int, 1-based user id
    movies_df: DataFrame with at least a 'movieId' column (plus e.g. 'title')
    original_ratings_df: DataFrame with 'userId', 'movieId', 'rating' columns
    num_recommendations: int, number of movies to recommend

    Return
    ------
    (user_full, recommendations):
        user_full: the user's existing ratings joined with movies_df,
            highest rating first
        recommendations: rows of movies_df the user has not rated, ordered by
            predicted rating (highest first), limited to num_recommendations

    Raises
    ------
    IndexError if userID does not correspond to a row of preds_df
    '''

    user_row_number = userID - 1 # UserID starts at 1, not 0
    # Reject out-of-range ids explicitly: a negative position would otherwise
    # silently select a row counted from the end of preds_df.
    if not 0 <= user_row_number < len(preds_df):
        raise IndexError('userID {} is outside the range 1..{}'.format(userID, len(preds_df)))

    # Name the prediction column explicitly instead of renaming it from the
    # row label afterwards; the row label only equals user_row_number when
    # preds_df has a default RangeIndex.
    user_predictions = (
        preds_df.iloc[user_row_number]
        .sort_values(ascending=False)
        .rename_axis('movieId')
        .to_frame(name='Predictions')
        .reset_index()
    )

    # Movies the user has already rated, with their titles attached.
    user_data = original_ratings_df[original_ratings_df.userId == (userID)]
    user_full = (user_data.merge(movies_df, how = 'left', on = 'movieId').
                    sort_values(['rating'], ascending=False))

    # Remaining movies ranked by predicted rating. The helper 'Predictions'
    # column is dropped again so only movies_df's own columns are returned.
    unseen_movies = movies_df[~movies_df['movieId'].isin(user_full['movieId'])]
    recommendations = (
        unseen_movies
        .merge(user_predictions, how = 'left', on = 'movieId')
        .sort_values('Predictions', ascending = False)
        .iloc[:num_recommendations]
        .drop(columns = 'Predictions')
    )

    return user_full, recommendations

In [20]:
given_userID = 330

# Top-10 recommendations for the user, plus the movies they already rated.
already_rated, predictions = recommend_movies(preds_df, given_userID, df_movies, df_ratings, 10)
# Only the first 10 already-rated rows are printed.
print("already_rated\n", already_rated.head(10))
print("\n\n prediction\n", predictions)

already_rated
     userId  movieId  rating                                title
18     330      913     5.0           Maltese Falcon, The (1941)
69     330     1225     5.0                       Amadeus (1984)
63     330     3827     5.0                 Space Cowboys (2000)
23     330      954     5.0  Mr. Smith Goes to Washington (1939)
22     330      953     5.0         It's a Wonderful Life (1946)
62     330     3753     5.0                  Patriot, The (2000)
20     330      858     5.0                Godfather, The (1972)
19     330      919     5.0             Wizard of Oz, The (1939)
68     330     1221     5.0       Godfather: Part II, The (1974)
17     330      912     5.0                    Casablanca (1942)


 prediction
       movieId                                              title
1160     1196  Star Wars: Episode V - The Empire Strikes Back...
314       318                   Shawshank Redemption, The (1994)
897       923                                Citizen Kane (1

## SVD from python file

In [21]:
from svd_ml1m import svd_recommend_movies

In [22]:
given_userID = 330
num_recommendations = 10
# svd_recommend_movies comes from the local svd_ml1m module.
# NOTE(review): presumably it wraps the same SVD pipeline as the cells above,
# taking the two CSV paths directly; confirm against svd_ml1m.py.
svd_recommend_movies('movies.csv', 'ratings.csv', given_userID, num_recommendations)

(    userId  movieId  rating                                  title
 18     330      913     5.0             Maltese Falcon, The (1941)
 69     330     1225     5.0                         Amadeus (1984)
 63     330     3827     5.0                   Space Cowboys (2000)
 23     330      954     5.0    Mr. Smith Goes to Washington (1939)
 22     330      953     5.0           It's a Wonderful Life (1946)
 ..     ...      ...     ...                                    ...
 2      330     3793     3.0                           X-Men (2000)
 75     330     3785     3.0                     Scary Movie (2000)
 60     330     3821     2.0  Nutty Professor II: The Klumps (2000)
 43     330     3623     2.0           Mission: Impossible 2 (2000)
 72     330     2971     2.0                   All That Jazz (1979)
 
 [76 rows x 4 columns],
       movieId                                              title
 1160     1196  Star Wars: Episode V - The Empire Strikes Back...
 314       318            