In [25]:
import os
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import gc
import time
from fuzzywuzzy import fuzz

# Location of the MovieLens "latest small" CSV files, relative to the notebook.
data_path = '../datasets/ml-latest-small/'
movies_filename = 'movies.csv'
ratings_filename = 'ratings.csv'

# Movie catalogue: only the id and the title columns are loaded.
df_movies = pd.read_csv(
    os.path.join(data_path, movies_filename),
    usecols=['movieId', 'title'],
    dtype=dict(movieId='int32', title='str'),
)

# Ratings: one row per (user, movie) pair, loaded with compact 32-bit dtypes.
df_ratings = pd.read_csv(
    os.path.join(data_path, ratings_filename),
    usecols=['userId', 'movieId', 'rating'],
    dtype=dict(userId='int32', movieId='int32', rating='float32'),
)

### Data Preprocessing

In [19]:
# Minimum number of ratings a movie / a user must have to be kept.
movie_rating_thres = 50
user_rating_thres = 50

# Row mask: ratings that belong to sufficiently popular movies.
df_movies_cnt = pd.DataFrame(
    df_ratings.groupby('movieId').size(), columns=['count'])
popular_movies = list(set(
    df_movies_cnt[df_movies_cnt['count'] >= movie_rating_thres].index))
movies_filter = df_ratings['movieId'].isin(popular_movies).values

# Row mask: ratings that were given by sufficiently active users.
df_users_cnt = pd.DataFrame(
    df_ratings.groupby('userId').size(), columns=['count'])
active_users = list(set(
    df_users_cnt[df_users_cnt['count'] >= user_rating_thres].index))
users_filter = df_ratings['userId'].isin(active_users).values

# A rating survives only if it passes both masks.
df_ratings_filtered = df_ratings[movies_filter & users_filter]

# Movie-by-user rating table; (movie, user) pairs without a rating become 0.
movie_user_mat = df_ratings_filtered.pivot(
    index='movieId', columns='userId', values='rating')
movie_user_mat = movie_user_mat.fillna(0)

# Map every movie title to the position of its row in the matrix.
movie_titles = df_movies.set_index('movieId').loc[movie_user_mat.index, 'title']
hashmap = {title: row_pos for row_pos, title in enumerate(movie_titles)}

# Sparse (CSR) copy of the matrix, used by the model cells below.
movie_user_mat_sparse = csr_matrix(movie_user_mat.values)

# Release the intermediate frames; gc.collect() stays last so the cell still
# displays the number of collected objects.
del movie_titles
del df_movies, df_movies_cnt, df_users_cnt
del df_ratings, df_ratings_filtered, movie_user_mat
gc.collect()

94

### Building Model

In [22]:
# Brute-force nearest-neighbour search using cosine distance. The n_neighbors
# value given here is only a default: make_recommendations passes its own
# n_neighbors to kneighbors().
model_knn = NearestNeighbors(
    n_neighbors=20,
    algorithm='brute',
    metric='cosine',
    n_jobs=-1,
)

def make_recommendations(fav_movie, n_recommendations):
    """Print the movies whose rating vectors are closest to ``fav_movie``.

    The input title is fuzzy-matched (case-insensitively) against the keys of
    the module-level ``hashmap`` (title -> row index in
    ``movie_user_mat_sparse``). The best-scoring title is used as the query
    for a nearest-neighbour search with the module-level ``model_knn``, which
    is (re)fitted on ``movie_user_mat_sparse`` on every call.

    Parameters
    ----------
    fav_movie : str
        Movie title to look up.
    n_recommendations : int
        Number of recommendations to print.

    Returns
    -------
    None
        Everything is reported through ``print``. When no title reaches the
        fuzzy-match threshold, a message is printed and the function returns
        without running the neighbour search.
    """
    data = movie_user_mat_sparse
    model_knn.fit(data)
    # get input movie index
    print('You have input movie:', fav_movie)

    # Collect every known title whose fuzzy ratio with the input is >= 60.
    query = fav_movie.lower()
    match_tuple = []
    for title, idx in hashmap.items():
        ratio = fuzz.ratio(title.lower(), query)
        if ratio >= 60:
            match_tuple.append((title, idx, ratio))
    # Best match first (ascending sort, then reversed).
    match_tuple = sorted(match_tuple, key=lambda x: x[2])[::-1]
    if not match_tuple:
        # Nothing to query with: stop here instead of indexing an empty list.
        print('Oops! No match is found')
        return
    print('Found possible matches in our database: '
          '{0}\n'.format([x[0] for x in match_tuple]))

    idx = match_tuple[0][1]

    # inference
    print('Recommendation system start to make inference')
    print('......\n')
    t0 = time.time()
    # One extra neighbour is requested because the nearest one is dropped
    # below (presumably the query movie itself, at distance ~0).
    distances, indices = model_knn.kneighbors(
        data[idx], n_neighbors=n_recommendations + 1)
    # kneighbors returns one row per query; take row 0 explicitly so that a
    # single-neighbour result is still a list (squeeze() would collapse it to
    # a scalar and break zip()).
    # Sort ascending by distance, then reverse and drop the nearest entry:
    # results are listed from farthest to closest, so the closest neighbour
    # is printed last.
    raw_recommends = sorted(
        zip(indices[0].tolist(), distances[0].tolist()),
        key=lambda x: x[1])[:0:-1]
    print('It took my system {:.2f}s to make inference \n'.format(time.time() - t0))

    # Row index -> title, to turn neighbour indices back into movie titles.
    reverse_hashmap = {v: k for k, v in hashmap.items()}
    print('Recommendations for {}:'.format(fav_movie))
    for i, (rec_idx, dist) in enumerate(raw_recommends):
        print('{0}: {1}, with distance of {2}'.format(i + 1, reverse_hashmap[rec_idx], dist))

### Get recommendations

In [24]:
# Ask for ten recommendations based on "Return of the Jedi".
make_recommendations(
    fav_movie="Star Wars: Episode VI - Return of the Jedi (1983)",
    n_recommendations=10,
)

You have input movie: Star Wars: Episode VI - Return of the Jedi (1983)
Found possible matches in our database: ['Star Wars: Episode VI - Return of the Jedi (1983)', 'Star Wars: Episode III - Revenge of the Sith (2005)', 'Star Wars: Episode II - Attack of the Clones (2002)', 'Star Wars: Episode IV - A New Hope (1977)', 'Star Wars: Episode V - The Empire Strikes Back (1980)', 'Star Wars: Episode I - The Phantom Menace (1999)']

Recommendation system start to make inference
......

It took my system {:.2f}s to make inference 
'.format(time.time() - t0)
Recommendations for Star Wars: Episode VI - Return of the Jedi (1983):
1: Independence Day (a.k.a. ID4) (1996), with distance of 0.3401036858558655
2: Star Wars: Episode I - The Phantom Menace (1999), with distance of 0.32803618907928467
3: Saving Private Ryan (1998), with distance of 0.3274661898612976
4: Terminator, The (1984), with distance of 0.3177783489227295
5: Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) 