In [1]:
import os
import time
import gc
import argparse
import pandas as pd

from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
#
# Fuzzy string matching like a boss. It uses Levenshtein Distance to
# calculate the differences between sequences in a simple-to-use package.
from fuzzywuzzy import fuzz

# Here I just use a small dataset

In [2]:
# Location of the MovieLens data: a folder named 'ml-latest-small' inside the
# current working directory, plus the names of the two CSV files read from it.
movies_filename, ratings_filename = 'movies.csv', 'ratings.csv'
data_path = os.path.join(os.getcwd(), 'ml-latest-small')

In [3]:
# Load the two MovieLens tables, reading only the columns used later on and
# pinning compact dtypes for them.
movies_csv = os.path.join(data_path, movies_filename)
ratings_csv = os.path.join(data_path, ratings_filename)

df_movies = pd.read_csv(
    movies_csv,
    usecols=['movieId', 'title'],
    dtype={'movieId': 'int32', 'title': 'str'},
)
df_ratings = pd.read_csv(
    ratings_csv,
    usecols=['userId', 'movieId', 'rating'],
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'},
)

In [4]:
# Peek at 10 randomly chosen rows of the movies table (no seed is set, so the
# rows shown change on every run).
df_movies.sample(10)

Unnamed: 0,movieId,title
184,216,Billy Madison (1995)
3257,4403,"Fall of the House of Usher, The (House of Ushe..."
7736,90738,"Double, The (2011)"
4410,6506,Fulltime Killer (Chuen jik sat sau) (2001)
7042,69118,In the Electric Mist (2009)
6638,56367,Juno (2007)
9716,188797,Tag (2018)
9,10,GoldenEye (1995)
4783,7122,King of Hearts (1966)
9528,172215,Saved by the Bell: Hawaiian Style (1992)


In [5]:
# Peek at 10 randomly chosen rows of the ratings table (no seed is set, so the
# rows shown change on every run).
df_ratings.sample(10)

Unnamed: 0,userId,movieId,rating
47027,307,2023,3.5
84275,539,8641,3.5
2106,18,55765,4.5
46914,307,1207,3.0
80144,503,32914,3.0
51292,331,36529,3.0
21413,140,3798,3.0
95201,600,562,4.0
32446,221,2997,4.5
14633,91,6242,4.5


In [6]:
# (number of ratings, number of columns) of the ratings table
df_ratings.shape

(100836, 3)

In [7]:
# (number of movies, number of columns) of the movies table
df_movies.shape

(9742, 2)

In [8]:
# Summary statistics of the numeric columns of the ratings table
# (userId and movieId are identifiers, so only the `rating` column is meaningful here).
df_ratings.describe()

Unnamed: 0,userId,movieId,rating
count,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557
std,182.618491,35530.987199,1.042541
min,1.0,1.0,0.5
25%,177.0,1199.0,3.0
50%,325.0,2991.0,3.5
75%,477.0,8122.0,4.0
max,610.0,193609.0,5.0


In [9]:
# Summary statistics of the numeric columns of the movies table
# (only movieId is numeric; it is an identifier, so this mostly shows the id range).
df_movies.describe()

Unnamed: 0,movieId
count,9742.0
mean,42200.353623
std,52160.494854
min,1.0
25%,3248.25
50%,7300.0
75%,76232.0
max,193609.0


In [10]:
# How many ratings did movie 1 (movieId == 1) receive?
df_ratings.loc[df_ratings['movieId'] == 1, 'movieId'].count()

215

In [11]:
# How many ratings did movie 2 (movieId == 2) receive?
df_ratings.loc[df_ratings['movieId'] == 2, 'movieId'].count()

110

In [12]:
# Number of ratings received by every movie, as a one-column ('count')
# DataFrame indexed by movieId.
df_movies_count = df_ratings.groupby('movieId').size().to_frame(name='count')
df_movies_count.head(10)

Unnamed: 0_level_0,count
movieId,Unnamed: 1_level_1
1,215
2,110
3,52
4,7
5,49
6,102
7,54
8,8
9,16
10,132


In [13]:
# A movie counts as "popular" once it has at least this many ratings.
movie_rating_threshold = 60
popular_movies = list(set(
    df_movies_count[df_movies_count['count'] >= movie_rating_threshold].index
))
# Boolean mask over the rows of df_ratings: True where the rated movie is popular.
movies_filter = df_ratings['movieId'].isin(popular_movies).values

In [14]:
# IDs of the popular movies, i.e. those with at least `movie_rating_threshold`
# (60) ratings. The list comes from a set, so it is not sorted.
popular_movies

[1,
 2,
 6,
 2054,
 10,
 11,
 16,
 17,
 19,
 21,
 25,
 32,
 2081,
 34,
 36,
 39,
 47,
 48,
 50,
 51255,
 62,
 2115,
 30793,
 95,
 104,
 110,
 111,
 49272,
 2174,
 4226,
 141,
 2194,
 150,
 4246,
 153,
 158,
 161,
 163,
 165,
 4262,
 8360,
 173,
 8368,
 185,
 6333,
 208,
 4306,
 6365,
 223,
 6373,
 231,
 6377,
 235,
 2291,
 253,
 260,
 266,
 2321,
 2324,
 2329,
 288,
 292,
 293,
 296,
 300,
 2353,
 2355,
 316,
 317,
 318,
 329,
 337,
 339,
 344,
 2396,
 349,
 4447,
 353,
 356,
 357,
 364,
 367,
 368,
 377,
 380,
 6539,
 410,
 434,
 435,
 440,
 442,
 8636,
 8644,
 454,
 2502,
 457,
 51662,
 466,
 8665,
 474,
 480,
 2542,
 500,
 508,
 509,
 520,
 2571,
 55820,
 527,
 539,
 541,
 551,
 552,
 553,
 555,
 6711,
 2617,
 68157,
 2628,
 586,
 587,
 588,
 589,
 590,
 592,
 593,
 594,
 595,
 596,
 597,
 2640,
 41566,
 608,
 2657,
 2683,
 2692,
 648,
 2700,
 653,
 70286,
 2706,
 2710,
 45722,
 2716,
 60069,
 8874,
 2762,
 6863,
 33493,
 6874,
 74458,
 733,
 736,
 2791,
 2797,
 750,
 8961,
 778,
 7

In [15]:
# Boolean mask with one entry per row of df_ratings: True when the rated movie
# is in popular_movies, False otherwise.
movies_filter

array([ True, False,  True, ..., False, False, False])

In [16]:
# Same idea for users: a user counts as "active" once they have given at
# least this many ratings.
user_rating_threshold = 50
df_users_count = df_ratings.groupby('userId').size().to_frame(name='count')
active_users = list(set(
    df_users_count[df_users_count['count'] >= user_rating_threshold].index
))
# Boolean mask over the rows of df_ratings: True where the rating's author is active.
users_filter = df_ratings['userId'].isin(active_users).values

In [17]:
# Number of ratings given by each of the first 10 users
df_users_count.head(10)

Unnamed: 0_level_0,count
userId,Unnamed: 1_level_1
1,232
2,29
3,39
4,216
5,44
6,314
7,152
8,47
9,46
10,140


In [18]:
# Keep a rating only when BOTH masks agree: the movie is popular and the
# user who rated it is active.
df_ratings_filtered = df_ratings.loc[movies_filter & users_filter]


In [19]:
# Peek at 10 random rows of the filtered ratings (changes on every run)
df_ratings_filtered.sample(10)

Unnamed: 0,userId,movieId,rating
99657,610,1784,4.0
82491,524,32,3.0
26241,182,1080,2.0
78537,489,434,2.5
734,6,367,4.0
96961,603,3994,1.0
49257,318,4993,4.0
12959,83,318,5.0
66799,432,2,4.0
99654,610,1721,3.0


In [20]:
# Movie x user rating matrix: one row per movieId, one column per userId.
# A (movie, user) pair without a rating is stored as 0.
movie_user_mat = (
    df_ratings_filtered
    .pivot(index='movieId', columns='userId', values='rating')
    .fillna(0)
)

# Map every movie title to the position of its row in the matrix.
row_titles = df_movies.set_index('movieId').loc[movie_user_mat.index, 'title']
hashmap = {title: row for row, title in enumerate(row_titles)}

# Sparse copy of the matrix values (most entries are the 0 filler).
movie_user_mat_sparse = csr_matrix(movie_user_mat.values)

In [21]:
# Explanation of the hashmap construction: these are the titles of the movies
# kept in movie_user_mat, listed in the same order as the matrix rows.
df_movies.set_index('movieId').loc[movie_user_mat.index].title

movieId
1                       Toy Story (1995)
2                         Jumanji (1995)
6                            Heat (1995)
10                      GoldenEye (1995)
11        American President, The (1995)
                       ...              
79132                   Inception (2010)
89745               Avengers, The (2012)
91529      Dark Knight Rises, The (2012)
99114            Django Unchained (2012)
109487               Interstellar (2014)
Name: title, Length: 335, dtype: object

In [22]:
# Peek at 10 random rows (movies) of the movie x user matrix; 0.0 means "not rated"
movie_user_mat.sample(10)

userId,1,4,6,7,10,11,15,16,17,18,...,600,601,602,603,604,605,606,607,608,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
50872,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,3.5,...,0.0,5.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,5.0
858,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2.5,5.0,4.0,...,4.0,5.0,0.0,5.0,0.0,0.0,4.0,4.0,5.0,5.0
316,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,4.0,0.0,0.0,3.0,3.0,0.0
2174,4.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.5,0.0,0.0,4.0,0.0,3.5,4.0,0.0,3.0,0.0
300,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0
2011,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,3.5,...,4.5,0.0,0.0,1.0,0.0,4.0,3.5,0.0,2.5,0.0
551,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,4.0,0.0,3.0,0.0,3.0,0.0,3.5,0.0
318,0.0,0.0,5.0,0.0,0.0,4.0,5.0,4.0,5.0,5.0,...,3.5,5.0,5.0,0.0,0.0,0.0,3.5,5.0,4.5,3.0
2321,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,3.5,4.0,0.0,3.5,0.0
1198,5.0,3.0,0.0,0.0,0.0,0.0,4.0,3.5,4.5,4.0,...,4.0,4.0,0.0,4.0,0.0,0.0,3.5,0.0,0.0,5.0


In [23]:
# Show the first 10 "title : row index" pairs of the title-to-row mapping.
for title, row in list(hashmap.items())[:10]:
    print('{} : {}'.format(title, row))

Toy Story (1995) : 0
Jumanji (1995) : 1
Heat (1995) : 2
GoldenEye (1995) : 3
American President, The (1995) : 4
Casino (1995) : 5
Sense and Sensibility (1995) : 6
Ace Ventura: When Nature Calls (1995) : 7
Get Shorty (1995) : 8
Leaving Las Vegas (1995) : 9


In [24]:


# Pivot the full (unfiltered) ratings into movie features:
#   rows    -> movieId
#   columns -> userId
#   cells   -> the rating, with 0 for movie/user pairs that have no rating
df_movies_features = (
    df_ratings
    .pivot(index='movieId', columns='userId', values='rating')
    .fillna(0)
)

In [25]:
# Peek at 5 random rows of the movie-feature matrix.
# The conversion to a scipy sparse matrix is left disabled here:
#mat_movie_features = csr_matrix(df_movies_features.values)
df_movies_features.sample(5)

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1777,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0
8921,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26887,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
104760,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# from sklearn.neighbors import NearestNeighbors


# model_knn = NearestNeighbors(metric = 'cosine', 
#                              algorithm='brute',
#                             n_neighbors=20,
#                             n_jobs=-1)



In [27]:
class KnnRecommender:
    """
    Item-based collaborative filtering recommender built on sklearn's
    NearestNeighbors.

    Typical use:
        1. ``set_filter_params`` - choose how many ratings a movie / a user
           needs in order to be kept.
        2. ``set_models_params`` - configure the nearest-neighbours model.
        3. ``make_recommendations`` - read the data, fit the model and print
           the movies closest to a given title.
    """

    def __init__(self, path_movies, path_ratings):
        """
        Initialization

        params:
        -------
        path_movies: path to the movies CSV (needs columns movieId, title)
        path_ratings: path to the ratings CSV (needs columns userId,
                      movieId, rating)
        """
        self.path_movies = path_movies
        self.path_ratings = path_ratings
        # A threshold of 0 keeps every movie / user until
        # set_filter_params is called.
        self.movie_rating_threshold = 0
        self.user_rating_threshold = 0
        self.model = NearestNeighbors()

    def set_filter_params(self, movie_rating_threshold, user_rating_threshold):
        """
        set rating frequency thresholds to filter less-known movies and
        less active users

        params:
        -------
        movie_rating_threshold: int, min number of ratings a movie must
                                have received to be kept
        user_rating_threshold: int, min number of ratings a user must have
                               given to be kept
        """
        self.movie_rating_threshold = movie_rating_threshold
        self.user_rating_threshold = user_rating_threshold

    def set_models_params(self, n_neighbors, algo, metric, n_jobs=None):
        """
        set model params for the NearestNeighbors algorithm

        params:
        -------
        n_neighbors: int, default number of neighbours stored on the model
                     (note: _inference passes its own n_neighbors to
                     kneighbors, derived from n_recommendations)

        algo: {'auto', 'ball_tree', 'kd_tree', 'brute'}

        metric: string or callable, e.g. 'minkowski' or one of
                ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']

        n_jobs: int or None, optional (default=None)
        """
        # When more than one job is requested, point joblib's temp folder
        # at /tmp (side effect on the process environment).
        if n_jobs and (n_jobs > 1 or n_jobs == -1):
            os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp'
        self.model.set_params(
            n_neighbors=n_neighbors,
            algorithm=algo,
            metric=metric,
            n_jobs=n_jobs)

    def _prep_data(self):
        """
        prepare data for the recommender

        Reads both CSV files, drops ratings of movies / users below the
        configured thresholds and pivots the rest into a movie x user matrix
        (missing ratings become 0).

        return
        -------
        (movie_user_mat_sparse, hashmap):
        1. movie-user scipy sparse (CSR) matrix, one row per kept movie
        2. dict mapping movie title -> row index in that matrix. Titles are
           dict keys, so if two kept movies share a title only the later row
           stays reachable through the mapping.
        """
        # ------------------------------- read data
        df_movies = pd.read_csv(
            self.path_movies,
            usecols=['movieId', 'title'],
            dtype={'movieId': 'int32', 'title': 'str'})

        df_ratings = pd.read_csv(
            self.path_ratings,
            usecols=['userId', 'movieId', 'rating'],
            dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})

        # ------------------------------ filter data
        # movies with enough ratings
        movie_counts = df_ratings.groupby('movieId').size()
        popular_movies = movie_counts[
            movie_counts >= self.movie_rating_threshold].index
        movies_filter = df_ratings['movieId'].isin(popular_movies).values

        # users who rated enough movies
        user_counts = df_ratings.groupby('userId').size()
        active_users = user_counts[
            user_counts >= self.user_rating_threshold].index
        users_filter = df_ratings['userId'].isin(active_users).values

        # a rating survives only if both its movie and its user survive
        df_ratings_filtered = df_ratings[movies_filter & users_filter]

        # pivot and create movie-user matrix
        movie_user_mat = df_ratings_filtered.pivot(
            index='movieId', columns='userId', values='rating').fillna(0)

        # create mapper from movie title to row index
        titles = df_movies.set_index('movieId').loc[movie_user_mat.index, 'title']
        hashmap = {title: row for row, title in enumerate(titles)}

        # transform it now to a scipy sparse matrix
        movie_user_mat_sparse = csr_matrix(movie_user_mat.values)

        # release the large intermediates before returning
        del df_movies, movie_counts, user_counts, titles
        del df_ratings, df_ratings_filtered, movie_user_mat
        # garbage collection is not really needed for this small dataset
        gc.collect()
        return movie_user_mat_sparse, hashmap

    # this function relies on the Levenshtein (edit) distance via fuzzywuzzy
    def _fuzzy_matching(self, hashmap, fav_movie, min_ratio=60):
        """
        return the closest match via fuzzy ratio.
        If no match is found, return None

        Params:
        ----------
        hashmap: dict, map movie title name to index of the movie in data

        fav_movie: str, name of user input movie

        min_ratio: int, minimum fuzz.ratio score (0-100) a title needs to be
                   considered a match (default 60)

        return
        ----------
        index of the closest match, or None when no title reaches min_ratio
        """
        query = fav_movie.lower()
        match_tuple = []
        # collect every title that is similar enough to the query
        for title, index in hashmap.items():
            ratio = fuzz.ratio(title.lower(), query)
            if ratio >= min_ratio:
                match_tuple.append((title, index, ratio))
        # best score first (ascending sort, then reversed)
        match_tuple = sorted(match_tuple, key=lambda x: x[2])[::-1]
        if not match_tuple:
            print("Sorry there is no match!!")
            return None
        print("Found possible matches in db: " +
              "{0}\n".format([x[0] for x in match_tuple]))
        return match_tuple[0][1]

    def _inference(self, model, data, hashmap,
                   fav_movie, n_recommendations):
        """
        return top n similar movie recommendations based on user's input
        movie

        params:
        -------
        model: sklearn model, knn
        data: movie_user matrix
        hashmap: dict, map movie title name to index of the movie in data
        fav_movie: str, name of user input movie
        n_recommendations: int, top n recommendations

        return
        -------
        list of (row index, distance) tuples for the n closest movies,
        ordered from the FARTHEST to the CLOSEST of them (so the last entry
        is the best match). Empty list when fav_movie matches no title.
        """
        # fit data
        model.fit(data)
        # get input movie index
        print('Your input is: ', fav_movie)
        index = self._fuzzy_matching(hashmap, fav_movie)
        if index is None:
            # nothing to look up: indexing data with None would fail
            return []
        # inference
        print('Wait until recommendation system finish his job!')
        print(10 * '.', end='\n')
        # let's time it
        t_0 = time.time()
        # One extra neighbour is requested because the query movie is
        # normally its own nearest neighbour; never ask for more
        # neighbours than there are rows.
        n_neighbors = min(n_recommendations + 1, data.shape[0])
        distances, indices = model.kneighbors(
            data[index],
            n_neighbors=n_neighbors)
        # Single query row -> results live in row 0. Drop the query movie
        # itself by index rather than by position.
        neighbours = sorted(
            ((idx, dist)
             for idx, dist in zip(indices[0].tolist(), distances[0].tolist())
             if idx != index),
            key=lambda pair: pair[1])
        # keep the n closest, then reverse (farthest first, closest last)
        raw_recommends = neighbours[:n_recommendations][::-1]
        print('It took : {:.2f} seconds'.format(time.time() - t_0))

        return raw_recommends

    def make_recommendations(self, fav_movie, n_recommendations):
        """
        make top n recommendations and print them

        params:
        -------
        fav_movie: str, user's input movie

        n_recommendations: int, top n recommendations

        Nothing is returned; when fav_movie matches no title (or no
        neighbour is available) no recommendation list is printed.
        """
        # get data
        movie_user_mat_sparse, hashmap = self._prep_data()

        # get recommendations
        raw_recommends = self._inference(self.model,
                                         movie_user_mat_sparse,
                                         hashmap,
                                         fav_movie,
                                         n_recommendations)
        if not raw_recommends:
            return

        # print results
        reverse_hashmap = {row: title for title, row in hashmap.items()}
        print('recommendations for {}'.format(fav_movie))
        for i, (index, dist) in enumerate(raw_recommends):
            # .get: a row can be missing from the reverse map when two
            # movies share the same title (dict keys collapse in hashmap)
            title = reverse_hashmap.get(index, '<unknown title>')
            print('{0}: {1}, with distance of {2}'.format(i + 1, title, dist))

In [28]:
# To make the recommender easy to drive from a console, use an argument parser
def parse_args(argv=None):
    """
    Parse the command-line options of the movie recommender.

    argparse builds the help / usage messages automatically and reports an
    error (and exits) when invalid arguments are given.

    params:
    -------
    argv: list of str or None. Arguments to parse; None (the default) makes
          argparse read them from sys.argv. Passing an explicit list (e.g.
          []) lets the function be called where sys.argv holds unrelated
          arguments.

    return
    -------
    argparse.Namespace with the attributes path, movies_filename,
    ratings_filename, movie_name and top_n
    """
    parser = argparse.ArgumentParser(
        prog='Movie Recommender',
        description='Run KNN Movie Recommender')

    parser.add_argument('--path', nargs='?', default='ml-latest-small',
                        help='input path to the data')
    parser.add_argument('--movies_filename', nargs='?', default='movies.csv',
                        help='provide movies filename')
    parser.add_argument('--ratings_filename', nargs='?', default='ratings.csv',
                        help='provide ratings filename')
    parser.add_argument('--movie_name', nargs='?', default='Spider-Man',
                        help='please provide your favorite movie name')
    parser.add_argument('--top_n', type=int, default=10,
                        help='top n movie recommendations')

    return parser.parse_args(argv)

In [36]:
if __name__ == '__main__':
    # Command-line parsing is switched off in this cell; data_path,
    # movies_filename and ratings_filename come from the configuration cell
    # near the top instead. To drive it from the command line, restore:
    #     args = parse_args()
    #     data_path = args.path
    #     movies_filename = args.movies_filename
    #     ratings_filename = args.ratings_filename
    #     movie_name = args.movie_name
    #     top_n = args.top_n

    # initialize recommender system
    movies_csv_path = os.path.join(data_path, movies_filename)
    ratings_csv_path = os.path.join(data_path, ratings_filename)
    recommender = KnnRecommender(movies_csv_path, ratings_csv_path)

    # keep movies with >= 50 ratings and users with >= 50 ratings
    recommender.set_filter_params(movie_rating_threshold=50,
                                  user_rating_threshold=50)
    # brute-force search with cosine distance, using all CPU cores
    recommender.set_models_params(n_neighbors=20, algo='brute',
                                  metric='cosine', n_jobs=-1)

    # ask for the 10 movies closest to the chosen title
    my_fav_movie, top = 'Iron Man', 10
    recommender.make_recommendations(fav_movie=my_fav_movie,
                                     n_recommendations=top)
    

Your input is:  Iron Man
Found possible matches in db: ['Iron Man (2008)']

Wait until recommendation system finish his job!
..........
It took : 0.11 seconds
recommendations for Iron Man
1: Kung Fu Panda (2008), with distance of 0.37368708848953247
2: Inception (2010), with distance of 0.3691744804382324
3: Up (2009), with distance of 0.3688569664955139
4: Guardians of the Galaxy (2014), with distance of 0.36875778436660767
5: Star Trek (2009), with distance of 0.36602938175201416
6: Batman Begins (2005), with distance of 0.36275893449783325
7: Avatar (2009), with distance of 0.3108932375907898
8: WALL·E (2008), with distance of 0.2981378436088562
9: Dark Knight, The (2008), with distance of 0.287839412689209
10: Avengers, The (2012), with distance of 0.2853195071220398


# It seems to work ಠ_ಠಠ_ಠ