Aeon Williams 

CS397 DigiPen Spring 2020

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from ast import literal_eval
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from surprise import Reader
from surprise import Dataset
from surprise import KNNBasic
from surprise.model_selection import cross_validate
from collections import defaultdict
import heapq
from operator import itemgetter, attrgetter

# Data Wrangling

In [2]:
# Load the raw movie metadata; ids that cannot be parsed as numbers become NaN.
df = (
    pd.read_csv('./data/movies/movies_metadata.csv', low_memory=False)
    .assign(id=lambda frame: pd.to_numeric(frame['id'], errors='coerce'))
)

In [3]:
# Load the keywords table, coerce its ids to numbers, and merge it onto the
# movie metadata using the shared 'id' column.
df_keywords = df.merge(
    pd.read_csv('./data/movies/keywords.csv', low_memory=False)
    .assign(id=lambda frame: pd.to_numeric(frame['id'], errors='coerce')),
    on='id',
)

In [4]:
# Pick the director's name out of a movie's crew list.
def get_director(x):
    """Return the name of the first crew member whose job is 'Director'.

    x is an iterable of crew entries, each indexable by 'job' and 'name'.
    Returns '' when no entry has the job 'Director'.
    """
    directors = (member['name'] for member in x if member['job'] == 'Director')
    return next(directors, '')

In [5]:
# Read the credits data and merge it with the movies & keywords data.
df_credits = pd.read_csv('./data/movies/credits.csv', low_memory=False)
df_credits['id'] = pd.to_numeric(df_credits['id'], errors='coerce')
df_credits = df_keywords.merge(df_credits, on='id')

# Keep only the columns the recommenders use. The explicit .copy() makes the
# subset an independent frame, so adding/overwriting columns below (and in the
# parsing cell) does not trigger pandas' SettingWithCopyWarning.
df_credits = df_credits[['keywords', 'genres', 'id', 'original_language', 'overview', 'status', 'title', 'vote_average', 'vote_count', 'cast', 'crew']].copy()

# 'index' stores each movie's row position; get_index() returns this value and
# it is used to index rows of the similarity matrix.
df_credits['index'] = np.arange(len(df_credits))

In [6]:
# Parse the stringified lists for ease of analytics later.
# NOTE: this cell overwrites the columns with plain strings, so running it a
# second time without reloading df_credits will fail in literal_eval.
categories = ['genres', 'cast', 'keywords', 'crew']

for category in categories:
    # Missing values are replaced with an empty *list literal*: literal_eval('')
    # raises, whereas literal_eval('[]') yields an empty list.
    df_credits[category] = df_credits[category].fillna('[]').apply(literal_eval)
    if category == 'crew':
        # Only the director's name is kept from the crew list.
        df_credits[category] = df_credits[category].apply(get_director)
    else:
        # Reduce each list of dicts to a single space-separated string of names.
        df_credits[category] = df_credits[category].apply(
            lambda x: ' '.join(i['name'] for i in x) if isinstance(x, list) else ''
        )

In [7]:
# Copy of the credits table with lower-cased titles, used for title lookups.
df_titles = df_credits.assign(title=df_credits['title'].str.lower())

In [8]:
# Populate a list of a user's top 3 favorite movies, from the database.
def user_movies(movies):
    """Prompt for favourite movies and append them to ``movies`` in place.

    Keeps asking until ``movies`` holds three titles or ten entries have been
    rejected. An entry is rejected when it is not a (lower-cased) title in
    df_titles or when it is already in ``movies``. Returns None.
    """
    rejected = 0
    while len(movies) < 3 and rejected < 10:
        movie = input(str(len(movies) + 1) + ": ").lower()
        if movie not in df_titles['title'].values:
            print("Sorry, that wasn't found in our database. Please try another movie!")
            rejected += 1
        elif movie in movies:
            # Accepting the same title twice would produce duplicate favourites
            # and duplicate rating rows later on.
            print("You already picked that movie. Please try another movie!")
            rejected += 1
        else:
            movies.append(movie)

In [9]:
# Build the single text field that the similarity model is trained on.
def combine(row):
    """Join a row's keywords, genres, cast, crew and original_language with spaces."""
    fields = ('keywords', 'genres', 'cast', 'crew', 'original_language')
    return ' '.join([row[field] for field in fields])

In [10]:
# Build the movie-to-movie similarity matrix from keywords, genres, cast,
# director and original language.
# NOTE: a later cell defines another function that is also named
# knowledge_based; once that cell has run, this definition is shadowed.
def knowledge_based():
    """Return the pairwise cosine similarity of every movie in df_credits.

    Each movie is represented by the word counts of its combined text field
    (see combine()); missing values in the text columns are treated as ''.
    """
    text_columns = ['genres', 'cast', 'crew', 'original_language', 'keywords']

    df_combined = df_credits.copy()
    df_combined[text_columns] = df_combined[text_columns].fillna('')
    df_combined['combined'] = df_combined.apply(combine, axis=1)

    word_counts = CountVectorizer().fit_transform(df_combined['combined'])
    return cosine_similarity(word_counts)

In [None]:
#find the cosine similarity based on keywords, genres, cast, director, and original language
#(this must run before the later cell that redefines knowledge_based)
cosine = knowledge_based()

In [None]:
# Look up a movie title from its row index.
def get_title(index):
    """Return the title of the df_credits row whose index label equals ``index``.

    Raises IndexError when no row has that index label.
    """
    matching_titles = df_credits.loc[df_credits.index == index, 'title']
    return matching_titles.values[0]

In [None]:
# Look up a movie's row index from its title (case-insensitive).
def get_index(title):
    """Return the 'index' value of the first df_titles row matching ``title``.

    The title is lower-cased before comparison. Raises IndexError when the
    title is not present.
    """
    wanted = title.lower()
    matching_rows = df_titles.loc[df_titles['title'] == wanted, 'index']
    return matching_rows.values[0]

In [None]:
# Notebook-wide state shared by the recommenders and the user loop.
# NOTE(review): confirm that user id 6969 does not already exist in
# ratings.csv, otherwise this session's ratings are mixed with that user's.
userID = 6969
movies, movies_enjoyed = [], []  # titles picked so far / titles rated above 3

In [None]:
# Ratings used by the collaborative filter; the timestamp column is not needed.
df_ratings = (
    pd.read_csv('./data/movies/ratings.csv', low_memory=False)
    .drop(columns='timestamp')
)

# Recommenders

In [None]:
# Recommender based ONLY on user ratings.
def collab_filter():
    """Print up to 10 movies liked by the users most similar to ``userID``.

    Builds a user-user cosine similarity matrix from df_ratings, takes the 10
    most similar users, scores every movie they rated by
    (rating / 5) * similarity, and prints the best-scoring titles the current
    user has neither rated nor already picked in ``movies``.
    """
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df_ratings, reader)
    training = data.build_full_trainset()

    # sim_options has to be passed by keyword: the first positional parameter
    # of KNNBasic is k, so a positional dict never configures the similarity.
    knn = KNNBasic(sim_options={'name': 'cosine', 'user_based': True})
    knn.fit(training)
    # fit() already computes the similarity matrix and stores it in knn.sim,
    # so it is reused here instead of being computed a second time.
    matrix = knn.sim

    userInnerID = training.to_inner_uid(userID)
    similarUsers = [
        (innerID, score)
        for innerID, score in enumerate(matrix[userInnerID])
        if innerID != userInnerID
    ]
    kNeighbors = heapq.nlargest(10, similarUsers, key=lambda t: t[1])

    # Accumulate a similarity-weighted score for every item the neighbours rated.
    candidates = defaultdict(float)
    for innerID, userSimilarityScore in kNeighbors:
        for itemID, rating in training.ur[innerID]:
            candidates[itemID] += (rating / 5.0) * userSimilarityScore

    watched = {itemID for itemID, _ in training.ur[userInnerID]}
    # Titles in `movies` are stored lower-cased, so compare in lower case.
    picked = {str(title).lower() for title in movies}

    shown = 0
    print('\n\nThese are some other movies you might enjoy: \n')
    for itemID, _ in sorted(candidates.items(), key=itemgetter(1), reverse=True):
        if itemID in watched:
            continue
        movieID = training.to_raw_iid(itemID)
        try:
            title = get_title(movieID)
        except IndexError:
            # NOTE(review): get_title() looks movies up by row position in
            # df_credits; ids from the ratings file may not exist there (or may
            # refer to a different id space) - confirm the intended mapping.
            continue
        if str(title).lower() in picked:
            continue
        print(title)
        shown += 1
        if shown >= 10:
            break

In [None]:
# Rate a movie and add it to the ratings dataframe.
def rate(df, title, rating=0):
    """Return ``df`` with one extra rating row for the current user.

    df:     ratings frame with userId / movieId / rating columns.
    title:  movie title; matched case-insensitively via get_index().
    rating: pass 5 to record a 5 without prompting; any other value makes the
            function ask the user for a whole number between 1 and 5.

    Titles rated above 3 are also appended (lower-cased) to movies_enjoyed.
    """
    title = title.lower()

    if rating != 5:
        # Keep asking until the input is a whole number in range, instead of
        # crashing on non-numeric input or storing an out-of-range rating.
        while True:
            try:
                rating = int(input('Rate the movie between 1-5: '))
            except ValueError:
                print('Please enter a whole number between 1 and 5.')
                continue
            if 1 <= rating <= 5:
                break
            print('Please enter a whole number between 1 and 5.')

    if rating > 3:
        movies_enjoyed.append(title)

    # NOTE(review): get_index() returns the movie's row position in df_titles,
    # which is stored here as movieId - confirm this matches the id space of
    # the ratings file.
    df_new = pd.DataFrame({'userId': [userID],
                           'movieId': [get_index(title)],
                           'rating': [rating]})
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    return pd.concat([df, df_new], ignore_index=True)

In [None]:
# Recommender based ONLY on user's liked movies.
def knowledge_based(per_movie=10, limit=10):
    """Print the movies most similar to the ones in movies_enjoyed.

    per_movie: how many nearest neighbours to collect for each liked movie.
    limit:     maximum number of recommendations printed.

    For every liked movie the ``per_movie`` most similar rows of ``cosine`` are
    collected, skipping the movie itself and any title already in ``movies`` or
    ``movies_enjoyed``. A movie suggested by several liked movies is kept once,
    with its highest similarity, and the overall best ``limit`` are printed.
    """
    # Stored titles are lower-cased while get_title() returns the original
    # casing, so both sides are compared in lower case.
    known_titles = {str(title).lower() for title in movies}
    known_titles.update(str(title).lower() for title in movies_enjoyed)

    best_score = {}
    for liked_title in movies_enjoyed:
        movie_index = get_index(liked_title)
        ranked = sorted(enumerate(cosine[movie_index]), key=lambda pair: pair[1], reverse=True)

        collected = 0
        for candidate_index, score in ranked:
            if collected >= per_movie:
                break
            if candidate_index == movie_index:
                continue  # a movie is always most similar to itself
            if str(get_title(candidate_index)).lower() in known_titles:
                continue
            if score > best_score.get(candidate_index, float('-inf')):
                best_score[candidate_index] = score
            collected += 1

    # Rank the merged candidates once; the result stays sorted for printing.
    recommendations = sorted(best_score.items(), key=lambda pair: pair[1], reverse=True)[:limit]

    print("\n\nThese are some other movies you might enjoy:\n")
    for candidate_index, _ in recommendations:
        print('-' + get_title(candidate_index))

# User loop

In [None]:
## generate initial recommendation list
print("What are your top three favorite movies (in no specific order)?")

movies = []
while len(movies) < 1:
    user_movies(movies)

# rate() appends every title rated above 3 to movies_enjoyed, so start from an
# empty list here; copying `movies` first would store each favourite twice.
movies_enjoyed = []

for favourite in movies:
    df_ratings = rate(df_ratings, favourite, 5)

knowledge_based()

#let the user use the software to watch movies and rate them and get recommendations
choice = -1
while choice != 0:
    print('\n\nPress 0 to exit, 1 to watch, 2 to rate:')
    try:
        choice = int(input())
    except ValueError:
        # Non-numeric input: show the menu again instead of crashing.
        choice = -1
        continue
    if choice == 0:
        break
    flavour = 'What movie would you like to '
    if choice == 1:
        flavour += 'watch: '
    elif choice == 2:
        flavour += 'rate: '
    else:
        continue
    movie = input(flavour).lower()
    if movie not in df_titles['title'].values:
        print("Sorry, that wasn't found in our database. Please try another movie!")
        continue
    elif movie in movies:
        print("This movie has already been watched. Please try another movie!")
        continue
    else:
        movies.append(movie)
        df_ratings = rate(df_ratings, movie)
        # De-duplicate while keeping the order in which titles were added.
        movies = list(dict.fromkeys(movies))
        movies_enjoyed = list(dict.fromkeys(movies_enjoyed))
    if choice == 1:
        #this would realistically have a higher threshold for real-world use
        if len(movies) < 4:
            knowledge_based()
        else:
            collab_filter()