# Determine Movie Genre by Neighboring Movies

Use the k-nearest neighbors method to predict a target movie's genre from the genres of the top-k movies most similar to it.

Use Jaccard similarity based on actors in each movie to rank movies and select the top-k most similar movies.

In [None]:
import json
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.sparse import lil_matrix


In [None]:
# Lookup tables built from the movie file:
#   actor_name_map:  actor id -> actor name
#   movie_actor_map: imdb id  -> {"movie", "actors", "genres", "rating"} record
#   actor_genre_map: actor id -> {genre: number of that actor's movies with it}
actor_name_map = {}
movie_actor_map = {}
actor_genre_map = {}


# The file holds one JSON-encoded movie per line. Name the encoding explicitly
# so that parsing does not depend on the platform's default text encoding.
with open("../data/imdb_movies_2000to2022.prolific.json", "r", encoding="utf-8") as in_file:
    # NOTE: the loop variable keeps the name `this_movie` because a later cell
    # reads it after this loop has finished.
    for line in in_file:

        # Read the movie on this line and parse its json
        this_movie = json.loads(line)

        # Skip movies with no ratings
        if not this_movie["rating"]:
            continue

        # A single pass over the cast fills both actor-keyed tables
        for actor_id, actor_name in this_movie["actors"]:
            # Add the actor to the id->name map
            actor_name_map[actor_id] = actor_name

            # Increment this actor's count for each of the movie's genres;
            # setdefault registers the actor even when the genre list is empty
            this_actors_genres = actor_genre_map.setdefault(actor_id, {})
            for g in this_movie["genres"]:
                this_actors_genres[g] = this_actors_genres.get(g, 0) + 1

        # Finished with this film: keep only the fields the later cells use.
        # Actors are stored as a set of ids so casts can be compared with
        # set operations.
        movie_actor_map[this_movie["imdb_id"]] = {
            "movie": this_movie["title"],
            "actors": {actor[0] for actor in this_movie["actors"]},
            "genres": this_movie["genres"],
            "rating": this_movie["rating"]["avg"],
        }

In [None]:
# Report how many distinct actors and movies were loaded
for label, table in (
    ("Known Actors:", actor_name_map),
    ("Known Movies:", movie_actor_map),
):
    print(label, len(table))

In [None]:
# Give every known actor a dense integer index, numbered in the order the
# actors were inserted into actor_name_map
actor_id_to_index = {
    known_actor_id: position
    for position, known_actor_id in enumerate(actor_name_map)
}


In [None]:
# IMDb ids of the movies tried as prediction targets, keyed by title
candidate_target_ids = {
    "The Incredibles": "tt0317705",
    "Interstellar": "tt0816692",
    "The Notebook": "tt0332280",
}

# Change the title here to run the prediction for a different movie
target_movie_id = candidate_target_ids["The Incredibles"]

In [None]:
# Look up the target movie's record (title, actor-id set, genres, avg rating)
target_movie_object = movie_actor_map[target_movie_id]

In [None]:
# Display the target movie's record for reference
target_movie_object

## Find the Most Similar Movies by Jaccard Similarity of Actors

In [None]:
# Score every other movie by how much of its cast it shares with the target.
movie_similarities = []

# The target's actor-id set does not change inside the loop, so fetch it once
target_actors = target_movie_object["actors"]

for this_movie_id,this_movie_obj in movie_actor_map.items():
    # Skip the target movie
    if this_movie_id == target_movie_id:
        continue

    # Jaccard similarity is the size of the intersection of the two actor sets
    # divided by the size of their union
    this_actors = this_movie_obj["actors"]
    shared_count = len(target_actors & this_actors)
    union_count = len(target_actors) + len(this_actors) - shared_count

    # Two movies with no listed actors have an empty union; score that pair 0
    # rather than dividing by zero
    jaccard = shared_count / union_count if union_count else 0.0

    # Add this movie and its Jaccard similarity to the list, so we can rank at the end
    movie_similarities.append({
        "movie": this_movie_id,
        "jaccard": jaccard,
    })

In [None]:
# Tabulate the per-movie scores so they can be sorted and sliced with pandas
similarity_columns = ["movie", "jaccard"]
similarity_df = pd.DataFrame(movie_similarities, columns=similarity_columns)

## Use the top-k similar movies to infer genre

In [None]:
# Number of nearest neighbours (most similar movies) used for the predictions
k_nn = 1

In [None]:
# Order all candidates from most to least similar, then show the k_nn best
ranked_similarity_df = similarity_df.sort_values(by="jaccard", ascending=False)
ranked_similarity_df.head(k_nn)

In [None]:
# Predict the target movie's genre by majority vote: count the genres of the
# top-k most similar movies and report the most common ones.
top_k_movie_ids = (
    similarity_df.sort_values(by="jaccard", ascending=False).head(k_nn)["movie"]
)

# Every genre listed on a neighbouring movie counts as one vote
neighbor_genre_counts = Counter(
    genre
    for neighbor_id in top_k_movie_ids
    for genre in movie_actor_map[neighbor_id]["genres"]
)

if neighbor_genre_counts:
    # Several genres can tie for first place; report all of them
    top_votes = neighbor_genre_counts.most_common(1)[0][1]
    predicted_genres = [
        genre
        for genre, votes in neighbor_genre_counts.most_common()
        if votes == top_votes
    ]
else:
    # No neighbours, or none of them list a genre
    predicted_genres = []

print("Genre votes:", neighbor_genre_counts.most_common())
print("Predicted genre(s):", predicted_genres)
print("Actual genre(s):", target_movie_object["genres"])

In [None]:
# NOTE(review): `this_movie` is not assigned anywhere in this section. As the
# file stands it is the loop variable left over from the data-loading cell,
# i.e. the last record parsed from the movie file, not the target movie.
# Presumably `target_movie_object` was intended here (to compare against the
# prediction) -- confirm.
this_movie

## Use the top-k similar movies to infer rating

In [None]:
# Predict the target movie's rating as the average rating of the top-k most
# similar movies.
top_k_movie_ids = (
    similarity_df.sort_values(by="jaccard", ascending=False).head(k_nn)["movie"]
)

# assumes each stored "rating" value is numeric -- TODO confirm against the data
neighbor_ratings = [
    movie_actor_map[neighbor_id]["rating"] for neighbor_id in top_k_movie_ids
]

# With no neighbours (k_nn == 0 or no other movies) there is nothing to
# average; report None rather than dividing by zero
if neighbor_ratings:
    predicted_rating = sum(neighbor_ratings) / len(neighbor_ratings)
else:
    predicted_rating = None

print("Predicted rating:", predicted_rating)
print("Actual rating:", target_movie_object["rating"])