In [1]:
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import fuzz



In [2]:
# Function to ready data for recommendation
def filter_data(movies_path="Data/movies.csv", ratings_path="Data/ratings.csv",
                min_movie_ratings=50, min_user_ratings=50):
    """Load the ratings data and build the movie-user matrix used for KNN.

    Parameters
    ----------
    movies_path : str
        CSV file providing the ``movieId`` and ``title`` columns.
    ratings_path : str
        CSV file providing the ``userId``, ``movieId`` and ``rating`` columns.
    min_movie_ratings : int
        A movie is kept only if it has at least this many ratings (inclusive).
    min_user_ratings : int
        A user is kept only if they gave at least this many ratings (inclusive).

    Returns
    -------
    (scipy.sparse.csr_matrix, dict)
        The matrix has one row per kept movie and one column per kept user;
        a missing rating is stored as 0. The dict maps a movie title to the
        row position of that movie in the matrix.

    Notes
    -----
    If two kept movies share the same title, the dict keeps only the later
    row position for that title.
    """
    # Read data
    movies = pd.read_csv(
        movies_path,
        usecols=['movieId', 'title'],
        dtype={'movieId': 'int32', 'title': 'str'},
    )
    ratings = pd.read_csv(
        ratings_path,
        usecols=['userId', 'movieId', 'rating'],
        dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'},
    )

    # Number of ratings per movie / per user, counted on the unfiltered data
    ratings_per_movie = ratings.groupby('movieId').size()
    ratings_per_user = ratings.groupby('userId').size()

    # Ids that reach the (inclusive) thresholds
    popular_movies = ratings_per_movie[ratings_per_movie >= min_movie_ratings].index
    active_users = ratings_per_user[ratings_per_user >= min_user_ratings].index

    # Keep a rating only when both its movie and its user passed the filter
    keep = ratings.movieId.isin(popular_movies) & ratings.userId.isin(active_users)
    ratings_filtered = ratings[keep]

    # Movie-user matrix which will be needed for KNN, empty values are set to 0
    movie_user_matrix = ratings_filtered.pivot(
        index='movieId', columns='userId', values='rating'
    ).fillna(0)

    # Titles listed in the same order as the matrix rows
    titles = movies.set_index('movieId').loc[movie_user_matrix.index, 'title']

    # Hashmap that points from movie title to row index
    hashmap = {title: row for row, title in enumerate(titles)}

    # Transform matrix to scipy sparse matrix
    movie_user_matrix_sparse = csr_matrix(movie_user_matrix.values)

    return movie_user_matrix_sparse, hashmap

In [3]:
# Function that uses KNN and recommends 10 movies for the user based on a movie name they input
def recommend(model, data, movie, n_recommendations=10):
    """Return the nearest neighbours of one row of ``data``.

    Parameters
    ----------
    model : object
        Must provide ``fit(data)`` and ``kneighbors(row, n_neighbors=k)``,
        the latter returning ``(distances, indices)``.
    data : matrix
        Row-indexable 2-D matrix with a ``shape`` attribute; the model is
        fitted on it every time this function is called.
    movie : int
        Row index of the query movie in ``data``.
    n_recommendations : int
        Maximum number of neighbours to return (10 by default).

    Returns
    -------
    list of (int, float)
        ``(row_index, distance)`` pairs ordered from the largest distance to
        the smallest, i.e. the closest neighbour comes last. The query row
        itself is never part of the result.

    Raises
    ------
    ValueError
        If ``movie`` is None (no movie was matched).
    """
    if movie is None:
        raise ValueError("movie must be a row index, got None (no matching movie)")

    # Fit data to the model
    model.fit(data)

    # Ask for one extra neighbour because the query row is expected to be
    # among the results; never ask for more rows than the data holds.
    k = min(n_recommendations + 1, data.shape[0])
    distance, index = model.kneighbors(data[movie], n_neighbors=k)

    # ravel() keeps a single-neighbour result iterable, unlike squeeze()
    pairs = zip(index.ravel().tolist(), distance.ravel().tolist())

    # Drop the query by index rather than by position: with tied distances
    # it is not guaranteed to be the first entry.
    neighbours = sorted(
        (pair for pair in pairs if pair[0] != movie),
        key=lambda pair: pair[1],
    )

    # Keep the closest ones, then list them farthest-first
    return neighbours[:n_recommendations][::-1]
    


In [7]:
# Function that finds the movie name in dictionary
def find_movie(hashmap, name):
    """Look up the index of the title that best matches ``name``.

    Every key of ``hashmap`` is compared with ``name`` (both lower-cased)
    using ``fuzz.ratio``; only keys scoring 60 or more are considered.

    Returns the value stored under the best-scoring key. When several keys
    share the best score, the one that comes later in the dictionary wins.
    If no key reaches the threshold, "No such movie found" is printed and
    None is returned.
    """
    scored = [
        (fuzz.ratio(title.lower(), name.lower()), position)
        for title, position in hashmap.items()
    ]
    matches = [entry for entry in scored if entry[0] >= 60]

    if matches:
        # max() keeps the first maximum it meets, so scanning from the end
        # hands ties to the later dictionary entry.
        best = max(reversed(matches), key=lambda entry: entry[0])
        return best[1]

    print("No such movie found")
        
    
    

In [8]:
def main(name):
    """Print movie recommendations for the title that best matches ``name``.

    Loads the data with ``filter_data``, resolves ``name`` to a matrix row
    with ``find_movie``, and queries a brute-force cosine ``NearestNeighbors``
    model through ``recommend``. Nothing is returned; the results are
    printed one per line, numbered from 1 in the order ``recommend``
    returned them.

    If no title matches, ``find_movie`` prints its own message and this
    function returns without printing any recommendations.
    """
    data, hashmap = filter_data()
    movie = find_movie(hashmap, name)

    if movie is None:
        # No usable row index: stop here instead of querying the model with None
        return

    model = NearestNeighbors(n_neighbors=11, algorithm='brute', metric='cosine', n_jobs=-1)
    recommendations = recommend(model, data, movie)

    # print results
    reverse_hashmap = {v: k for k, v in hashmap.items()}
    print('Recommendations for {}:'.format(name))
    for rank, (idx, _dist) in enumerate(recommendations, start=1):
        # The distance is not shown, only the rank and the title
        print('{0}: {1} '.format(rank, reverse_hashmap[idx]))
        


In [9]:
# Example run: print the recommendations produced for the query "Batman"
main("Batman")

Recommendations for Batman:
1: Die Hard: With a Vengeance (1995) 
2: Aladdin (1992) 
3: Braveheart (1995) 
4: Speed (1994) 
5: Mask, The (1994) 
6: Fugitive, The (1993) 
7: Jurassic Park (1993) 
8: Terminator 2: Judgment Day (1991) 
9: Batman Forever (1995) 
10: True Lies (1994) 
