In [1]:
import pandas as pd
import numpy as np
import sklearn
import json
from scipy.spatial import distance
import collections
from gensim.models.doc2vec import Doc2Vec
from nltk.tokenize.casual import TweetTokenizer
from tqdm import tqdm

In [2]:
MOVIE_PATH = '/Users/aidanwhite/Desktop/tmdb-5000-movie-dataset/tmdb_5000_movies.csv'
CAST_PATH = '/Users/aidanwhite/Desktop/tmdb-5000-movie-dataset/tmdb_5000_credits.csv'
DOC2VEC_PATH = '/Users/aidanwhite/Desktop/doc2vec_25d.model'

tknz = TweetTokenizer()
d2v_model = Doc2Vec.load(DOC2VEC_PATH)

In [3]:
df = pd.read_csv(MOVIE_PATH)
# Filter out non-english movies (for now)
df = df[df['original_language'] == 'en']
df_credits = pd.read_csv(CAST_PATH)
df.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [17]:
# Generate a list of the top x actors/actresses

actors = collections.defaultdict(int)

for cast in list(df_credits['cast']):
    cast = json.loads(cast)
    for item in cast:
        actors[item['name']] += 1
        
sort = sorted(actors.items(), key=lambda a: a[1], reverse=True)

top_actors = [ tup[0] for tup in sort ][:200]
actor2idx = { a : i for i, a in enumerate(top_actors) }
id2cast = { tup[1] : tup[3] for tup in df_credits.itertuples() }

In [5]:
lst = df['genres']
genre_dict = {}
for genrelist in list(lst):
    glist = json.loads(genrelist)
    for gitem in glist:
        genre_dict[gitem['id']] = gitem['name']    

In [6]:
# Create a list of genres and mappings between genres and indices (for use in feature vector)

genres = [ item[1] for item in genre_dict.items() ]
idx2genre = { i : genre for i, genre in enumerate(genres) } 
genre2idx = { genre : i for i, genre in idx2genre.items() }

In [7]:
# Map id's to movies
movies = df['id']
id2movie = { row[4] : row[7] for row in df.itertuples() }
movie2id = { v : k for k, v in id2movie.items() }
movie2idx = { tup[7] : i for i, tup in enumerate(df.itertuples()) }

In [18]:
feat_vecs = []

for movie in df.itertuples():
    # one-hot encoding: 
    # first 1 index, next 20 indices = genres, next 200 = top 200 actors, 
    # next 5 = doc embedding of overview
    vec = np.zeros(1 + 20 + 200 + 25, int)
    
    # Review/Popularity
    vec[0] = int(movie[9]) / 100 # Seems helpful to scale down popularity a little bit
    
    # Genres
    genres = json.loads(movie[2])
    for genre in genres:
        idx = genre2idx[genre['name']]
        vec[idx + 1] = 1
    
    # Actors
    _id = movie[4]
    cast = json.loads(id2cast[_id])
    for item in cast:
        if item['name'] in top_actors:
            vec[1 + 20 + actor2idx[item['name']]] = 1
    
    # Paragraph embedding of overview
    emb = d2v_model.infer_vector(tknz.tokenize(str(movie[8])))
    for i, v in enumerate(emb):
        vec[len(vec) - 25 + i] = emb[i]
    
    feat_vecs.append(vec)

v = np.array(feat_vecs)
print(v.shape)

(4505, 246)


In [9]:
# Pass in a feature vector (could be a sum of liked movies, for example)
def get_recommendations(v):
    recs = []
    distances = distance.cdist([v], feat_vecs, "cosine")[0]
    min_index = np.argsort(distances)[:10]
    for idx in min_index:
        recs.append( list(df.itertuples())[idx][7] )
    return recs

In [10]:
# Get similar movies to a movie title
def get_recommendations_for_movie(m):
    target_vector = feat_vecs[movie2idx[m]]
    return get_recommendations(target_vector)

In [11]:
get_recommendations_for_movie("Iron Man")

['Iron Man',
 'The Avengers',
 'Iron Man 2',
 'Iron Man 3',
 'Avengers: Age of Ultron',
 'The Incredible Hulk',
 'Captain America: Civil War',
 'TRON: Legacy',
 'Star Wars',
 'Star Wars: Episode III - Revenge of the Sith']

In [12]:
get_recommendations_for_movie("Anchorman: The Legend of Ron Burgundy")

['Anchorman: The Legend of Ron Burgundy',
 'Old School',
 'The Goods: Live Hard, Sell Hard',
 'Starsky & Hutch',
 'Envy',
 'The Watch',
 'The Guilt Trip',
 "Gulliver's Travels",
 'Get Hard',
 'Neighbors 2: Sorority Rising']

In [13]:
get_recommendations_for_movie("The Dark Knight")

['The Dark Knight',
 'The Dark Knight Rises',
 'Batman Begins',
 'Harry Brown',
 'Harsh Times',
 'Get Carter',
 'Jimmy and Judy',
 'Need for Speed',
 'Blood and Wine',
 'Running Scared']

In [14]:
get_recommendations_for_movie("Paranormal Activity")

['The House of the Devil',
 'Final Destination 2',
 'Session 9',
 'Paranormal Activity',
 'The Final Destination',
 'Silent House',
 'The Witch',
 'The Blair Witch Project',
 "April Fool's Day",
 'Final Destination 3']

In [11]:
get_recommendations_for_movie("Dead Poets Society")

['Dead Poets Society',
 'The Night Listener',
 'House of D',
 'August Rush',
 'Brick Lane',
 'Animals',
 'The Brown Bunny',
 'Harvard Man',
 'Wah-Wah',
 'Rosewater']

### Idea for evaluation
Use movielens dataset (which has tmdb ids) to calculate the avg user rating for the predicted movies of users who had highly(?) rated the current movie, use that rating to evaluate the model

In [15]:
ML_LINKS = '/Users/aidanwhite/Desktop/ml-latest-small/links.csv'
ML_RATINGS = '/Users/aidanwhite/Desktop/ml-latest-small/ratings.csv'
df_link = pd.read_csv(ML_LINKS)
df_ratings = pd.read_csv(ML_RATINGS)
df_link.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [16]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [17]:
# TODO Account for movies that may be in the movielens data but not in training data
def movieid2tmdbid(movieid):
    return int(df_link[df_link['movieId'] == movieid]['tmdbId'])

def tmdbid2movieid(tmdbid):
    return int(df_link[df_link['tmdbId'] == tmdbid]['movieId'])

In [47]:
def users_who_liked(tmdbid):
    return list(df_ratings[(df_ratings['movieId'] == tmdbid2movieid(tmdbid)) & (df_ratings['rating'] >= 4)]['userId'])

def get_rating(tmdbid, user):
    try:
        res = df_ratings[(df_ratings['movieId'] == tmdbid2movieid(tmdbid)) & (df_ratings['userId'] == user) ]['rating']
    except:
#         print('Something has gone awry...', end="", flush=True)
        return None
    return float(res) if len(res) > 0 else None

In [52]:
def evaluate_model(limit=None):
    count = 0
    total = 0
    i = 0
    
    movies = list(df['id'])
    for movie in tqdm(movies):
        recommendations = get_recommendations_for_movie(id2movie[movie])
        likes = users_who_liked(movie)
        for rec in recommendations:
            rec = movie2id[rec]
            for user in likes:
                rating = get_rating(rec, user)
                if rating is None:
                    continue
                if rating >= 4:
                    count += 1
                total += 1
        i += 1
        if limit and i > limit:
            break
    print("Accuracy: " + str(count/total) if total is not 0 else 'Not enough information')

In [54]:
evaluate_model(limit=200)


  0%|          | 0/4505 [00:00<?, ?it/s][A
  0%|          | 1/4505 [00:01<1:23:25,  1.11s/it][A
  0%|          | 2/4505 [00:01<1:11:55,  1.04it/s][A
  0%|          | 3/4505 [00:02<58:11,  1.29it/s]  [A
  0%|          | 4/4505 [00:03<1:06:29,  1.13it/s][A
  0%|          | 5/4505 [00:03<51:19,  1.46it/s]  [A
  0%|          | 6/4505 [00:03<42:14,  1.78it/s][A
  0%|          | 7/4505 [00:04<40:29,  1.85it/s][A
  0%|          | 8/4505 [00:04<37:27,  2.00it/s][A
  0%|          | 9/4505 [00:05<47:07,  1.59it/s][A
  0%|          | 10/4505 [00:05<37:18,  2.01it/s][A
  0%|          | 11/4505 [00:06<32:39,  2.29it/s][A
  0%|          | 12/4505 [00:06<31:53,  2.35it/s][A
  0%|          | 13/4505 [00:07<36:36,  2.05it/s][A
  0%|          | 14/4505 [00:07<29:21,  2.55it/s][A
  0%|          | 15/4505 [00:07<26:58,  2.77it/s][A
  0%|          | 16/4505 [00:07<24:57,  3.00it/s][A
  0%|          | 17/4505 [00:08<40:15,  1.86it/s][A
  0%|          | 18/4505 [00:09<34:54,  2.14it/s][A


TypeError: cannot convert the series to <class 'int'>