In [1]:
'''
This code was orginally pulled from https://www.geeksforgeeks.org/recommendation-system-in-python/ and modified to fit our
use case. I changed the system to use the data from our class project and added an interactive component to allow users to search
more than once and exit the simple CLI.
'''
# code
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
#Setting up data frames for our sysmtem.
ratings = pd.read_csv("ratings.csv")

movies = pd.read_csv("movies.csv")
# I setout to add tags to this system but with first inspection not all movies have tags attached. I am unsure how this could
# impact the outocme so tag support will be considered later.
tags = pd.read_csv('tags.csv')

n_ratings = len(ratings)
n_movies = len(ratings['movieId'].unique())
n_users = len(ratings['userId'].unique())
#Printing Basic information about the data sets.
print(f"Number of ratings: {n_ratings}")
print(f"Number of unique movieId's: {n_movies}")
print(f"Number of unique users: {n_users}")
print(f"Average ratings per user: {round(n_ratings/n_users, 2)}")
print(f"Average ratings per movie: {round(n_ratings/n_movies, 2)}")

user_freq = ratings[['userId', 'movieId']].groupby('userId').count().reset_index()
user_freq.columns = ['userId', 'n_ratings']
user_freq.head()


# Find Lowest and Highest rated movies:
mean_rating = ratings.groupby('movieId')[['rating']].mean()
# Lowest rated movies
lowest_rated = mean_rating['rating'].idxmin()
movies.loc[movies['movieId'] == lowest_rated]
# Highest rated movies
highest_rated = mean_rating['rating'].idxmax()
movies.loc[movies['movieId'] == highest_rated]
# show number of people who rated movies rated movie highest
ratings[ratings['movieId']==highest_rated]
# show number of people who rated movies rated movie lowest
ratings[ratings['movieId']==lowest_rated]

## the above movies has very low dataset. We will use bayesian average
movie_stats = ratings.groupby('movieId')[['rating']].agg(['count', 'mean'])
movie_stats.columns = movie_stats.columns.droplevel()

# Now, we create user-item matrix using scipy csr matrix
from scipy.sparse import csr_matrix

def create_matrix(df):
	#Using the UserId's and MovieId's to set the size of the array.
	N = len(df['userId'].unique())
	M = len(df['movieId'].unique())
	
	# Map Ids to indices
	user_mapper = dict(zip(np.unique(df["userId"]), list(range(N))))
	movie_mapper = dict(zip(np.unique(df["movieId"]), list(range(M))))
	
	# Map indices to IDs
	user_inv_mapper = dict(zip(list(range(N)), np.unique(df["userId"])))
	movie_inv_mapper = dict(zip(list(range(M)), np.unique(df["movieId"])))
	
	user_index = [user_mapper[i] for i in df['userId']]
	movie_index = [movie_mapper[i] for i in df['movieId']]
    

	X = csr_matrix((df["rating"], (movie_index, user_index)), shape=(M, N))
	
	return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_matrix(ratings)

from sklearn.neighbors import NearestNeighbors
"""
Function find_similar_movies
Input Movie Id from a list, Dataframe, neighborhood size.
Find similar movies using nearest neighbors. The idea is we will create a neighborhood around the target movie that the user
input. Then we will find the 10 closests movies to that object based on the ratings and movie information. Then we will output
those 10 movies as suggestions for the user.
"""


def find_similar_movies(movie_id, X, k, metric='cosine', show_distance=False):
	
	neighbour_ids = []
	
	movie_ind = movie_mapper[movie_id]
	movie_vec = X[movie_ind]
	k+=1
	kNN = NearestNeighbors(n_neighbors=k, algorithm="brute", metric=metric)
	kNN.fit(X)
	movie_vec = movie_vec.reshape(1,-1)
	neighbour = kNN.kneighbors(movie_vec, return_distance=show_distance)
	for i in range(0,k):
		n = neighbour.item(i)
		neighbour_ids.append(movie_inv_mapper[n])
	neighbour_ids.pop(0)
	return neighbour_ids


#Creating a dictionary to store matching movies and ID's for lookup.
movie_titles = dict(zip(movies['movieId'], movies['title']))

#hard code example to show the program does what it needs.
movie_id = 3
#Using k=10 to find 10 movies that share properties with the listed movie.
similar_ids = find_similar_movies(movie_id, X, k=10)
movie_title = movie_titles[movie_id]

#Printing the list of similar movies.
print(f"Since you watched {movie_title}")
for i in similar_ids:
	print(movie_titles[i])



FileNotFoundError: [Errno 2] No such file or directory: 'ratings.csv'

In [None]:
#Creating the functions for the CLI

'''
Geykey()
Input value for a dictionar
output matching key
'''
def GetKey(val):
   for key, value in movie_titles.items():
      if val == value:
         return key
    
'''
movie_lookup()
input n/a
Collect user input on call. Checks the dictionary for a movie mathing a value.
If the movie matches called GetKey to use the key to execute the seach for similar movies.
If the movie does not match thorws the information to the user.
'''
def movie_lookup():
    print("Please list a movie you watched in the form title (Year)... For example Balto (1995)")
    movie_title = input()
    if  movie_title in movie_titles.values():
        movie_id = GetKey(movie_title)
        similar_ids = find_similar_movies(movie_id, X, k=10)
        movie_title = movie_titles[movie_id]
        print(f"Since you watched {movie_title}")
        for i in similar_ids:
            print(movie_titles[i])
    else:
        print("Movie not in database")


In [6]:
endLoop = True
while endLoop:
    print("Welcome to my movie suggestion system")
    UserChoice = int(input('Input 1 if you want to lookup a movie, 2 to end the program'))
    if UserChoice == 1:
        movie_lookup()
    if UserChoice == 2:
        print("Good Bye")
        endLoop = False
    else:
        print('Choose a 1 or 2')

Welcome to my movie suggestion system
Input 1 if you want to lookup a movie, 2 to end the program1
Please list a movie you watched in the form title (Year)... For example Balto (1995)
Balto (1995)
Since you watched Balto (1995)
Cats Don't Dance (1997)
Madeline (1998)
Shiloh (1997)
Kiss Me Kate (1953)
Endurance: Shackleton's Legendary Antarctic Expedition, The (2000)
Adanggaman (2000)
Soft Fruit (1999)
Oliver & Company (1988)
Big Green, The (1995)
Kid in King Arthur's Court, A (1995)
Choose a 1 or 2
Welcome to my movie suggestion system
Input 1 if you want to lookup a movie, 2 to end the program2
Good Bye


<h1> Recommender System </h1>

<h2> Steps to this example </h2>

<ol>
    <li> First I executed some EDA to get a n idea of the data we are working with</li>
    <li>Then using the two data sets Movies and Ratings we created a joined set of data for our model to operate on. This data mapped ratings from every user to every movie. </li>
    <li> Then I created a KNN model to create groupings around the target movies</li>
    <li> For the model creation we set n_neighbors = 10 to ensure each neighborhood had 10 items around the target </li>
    <li> Lastly we created a simple CLI for users to access the functions and find lists of movies they may enjoy </li>
    </ol>