In [1]:
import json
import os

import numpy as np
import pandas as pd
import requests
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

In [2]:
# TMDB ids of the movies to add, and their titles (same order as the ids).
movies_ids = ['475557', '530915', '466272', '398978', '496243', '515001', '331482', '492188']
movies_names = ['Joker', '1917', 'Once Upon a Time... in Hollywood', 'The Irishman', 'Parasite',
                'Jojo Rabbit', 'Little Women', 'Marriage Story']
assert len(movies_ids) == len(movies_names), "every TMDB id needs exactly one title"

# Columns that are filled for every new movie.
columns = ['id', 'title', 'cast', 'crew', 'keywords', 'genres', 'budget', 'popularity', 'revenue',
           'runtime', 'vote_average', 'vote_count']

# Start from an all-object frame so that JSON strings and numbers can later be
# written cell by cell without dtype conflicts.
new_movies = pd.DataFrame(columns=columns, dtype=object)
# Bracket assignment is used instead of attribute assignment: it always targets
# the column and can never silently create a plain Python attribute.
new_movies['id'] = movies_ids
new_movies['title'] = movies_names

In [3]:
# The TMDB API key is read from the TMDB_API_KEY environment variable.
# FIXME(security): the literal fallback below is the key that was previously
# hard-coded in this notebook; it is kept only so the notebook still runs
# unchanged. It should be rotated and removed from version control.
TMDB_API_KEY = os.environ.get('TMDB_API_KEY', 'c5ccf295da9c5fe71166df5ba9dd6bf5')
TMDB_MOVIE_URL = 'https://api.themoviedb.org/3/movie/'
TMDB_TIMEOUT = 30  # seconds; avoids hanging forever on a stalled connection


def _fetch_tmdb(path, **params):
    """
    GET a TMDB movie endpoint and return the decoded JSON body
    -------
    INPUTS
        |---- path : endpoint path appended to TMDB_MOVIE_URL (e.g. '475557/credits')
        |---- params : extra query-string parameters sent along with the API key
    OUTPUTS
        |---- body : the decoded JSON response
    RAISES
        |---- requests.HTTPError : if the API answers with a 4xx/5xx status
    """
    response = requests.get(TMDB_MOVIE_URL + path,
                            params={'api_key': TMDB_API_KEY, **params},
                            timeout=TMDB_TIMEOUT)
    # Fail loudly on an API error instead of hitting a KeyError further down
    response.raise_for_status()
    return response.json()


for i, movie in enumerate(movies_ids):
    # Cast and Crew
    body = _fetch_tmdb(movie + '/credits')
    new_movies.loc[i, 'cast'] = json.dumps(body['cast'])
    new_movies.loc[i, 'crew'] = json.dumps(body['crew'])

    # Keywords
    body = _fetch_tmdb(movie + '/keywords')
    new_movies.loc[i, 'keywords'] = json.dumps(body['keywords'])

    # Other: genres are stored as a JSON string, numeric fields are copied as-is
    body = _fetch_tmdb(movie, language='en-US')
    new_movies.loc[i, 'genres'] = json.dumps(body['genres'])
    for field in ('budget', 'popularity', 'revenue', 'runtime', 'vote_average', 'vote_count'):
        new_movies.loc[i, field] = body[field]

In [4]:
# Preview the first five fetched movies
new_movies.head(n=5)

Unnamed: 0,id,title,cast,crew,keywords,genres,budget,popularity,revenue,runtime,vote_average,vote_count
0,475557,Joker,"[{""cast_id"": 2, ""character"": ""Arthur Fleck / J...","[{""credit_id"": ""5c6dd8aa0e0a262c99a1aed3"", ""de...","[{""id"": 542, ""name"": ""street gang""}, {""id"": 84...","[{""id"": 80, ""name"": ""Crime""}, {""id"": 53, ""name...",55000000,226.101,1060753468,122,8.3,7975
1,530915,1917,"[{""cast_id"": 4, ""character"": ""Schofield"", ""cre...","[{""credit_id"": ""5d51ec030102c96f397647f2"", ""de...","[{""id"": 2504, ""name"": ""world war i""}, {""id"": 4...","[{""id"": 10752, ""name"": ""War""}, {""id"": 18, ""nam...",100000000,204.679,0,119,8.1,413
2,466272,Once Upon a Time... in Hollywood,"[{""cast_id"": 7, ""character"": ""Rick Dalton"", ""c...","[{""credit_id"": ""5de6206f0cd44600143cf4a5"", ""de...","[{""id"": 886, ""name"": ""movie business""}, {""id"":...","[{""id"": 18, ""name"": ""Drama""}, {""id"": 35, ""name...",95000000,85.312,372353736,162,7.5,4031
3,398978,The Irishman,"[{""cast_id"": 0, ""character"": ""Frank Sheeran"", ...","[{""credit_id"": ""58c568af9251411b7300a8b7"", ""de...","[{""id"": 520, ""name"": ""chicago, usa""}, {""id"": 8...","[{""id"": 80, ""name"": ""Crime""}, {""id"": 36, ""name...",159000000,53.571,607420,209,7.8,2126
4,496243,Parasite,"[{""cast_id"": 0, ""character"": ""Kim Ki-taek"", ""c...","[{""credit_id"": ""5ce304e9c3a36809ec200d36"", ""de...","[{""id"": 1353, ""name"": ""underground""}, {""id"": 5...","[{""id"": 35, ""name"": ""Comedy""}, {""id"": 53, ""nam...",11363000,119.761,132338654,132,8.6,2541


In [5]:
# Load the existing dataset and keep the shared columns plus the award columns
imdb_movies = pd.read_csv('../Data/merged_data.csv', index_col=0)
imdb_movies = imdb_movies[columns + ['Nominations', 'Awards']]

# Append the newly fetched movies and index the result by movie id.
# new_movies has no 'Nominations'/'Awards' columns, so those stay missing for its rows.
# NOTE(review): new_movies ids are strings; confirm the ids read from the CSV
# have a compatible type, otherwise the index mixes types.
all_movies = pd.concat([imdb_movies, new_movies], sort=False)
all_movies = all_movies.set_index('id')

In [6]:
def get_list(x):
    """
    Extract the 'name' of every entry of a JSON-encoded list of dicts
    -------
    INPUTS
        |---- x : JSON string encoding a list of dicts that each have a 'name' key
    OUTPUTS
        |---- names : list of the names with all spaces removed
                      (empty list if the decoded JSON is empty or null)
    """
    entries = json.loads(x)
    # Nothing decoded (empty list, null, ...) -> nothing to extract
    if not entries:
        return []
    # Spaces are stripped so that each name becomes a single word
    return [entry['name'].replace(' ', '') for entry in entries]

def join_list(x):
    """
    Concatenate the strings of x into one string, separated by single spaces
    """
    separator = ' '
    return separator.join(x)

def get_similarity_all(columns, dataset, dist_metric='cosine'):
    '''Compute a scaled similarity (adjacency) matrix between the rows of dataset
    ------
    INPUTS
        |---- columns: names of the columns to build the similarity from; each cell
                       must be a JSON string encoding a list of dicts with a 'name' key
        |---- dataset: DataFrame containing those columns and a 'title' column
        |---- dist_metric : the distance metric to use (passed to
                            sklearn.metrics.pairwise_distances)
    OUTPUT
        |---- csim: DataFrame of similarities scaled by their range, indexed and
                    labelled by 'title', with a zero diagonal
    '''
    vect = []
    for column in columns:
        # JSON string -> list of space-free names -> one space-separated document.
        # The input frame is only read, never modified.
        documents = dataset[column].apply(get_list).apply(join_list)

        # TF-IDF vectorization of the documents of this column.
        # NOTE(review): TfidfVectorizer is used with its default tokenizer, so names
        # containing punctuation (e.g. hyphens) are presumably split into several tokens.
        vectorizer = TfidfVectorizer()
        vect.append(vectorizer.fit_transform(documents).toarray())
    # Stack the per-column feature blocks side by side
    vectors = np.concatenate(vect, axis=1)

    csim = pairwise_distances(vectors, metric=dist_metric, n_jobs=-1)
    csim = csim.max() - csim  # get similarity from distance
    csim[np.diag_indices(csim.shape[0])] = 0  # remove self similarity

    # Scale by the value range. np.ptp is used because the ndarray.ptp method
    # was removed in NumPy 2.0.
    value_range = np.ptp(csim)
    # With a single row or all-identical rows the range is 0; skip the division
    # so the result stays all zeros instead of turning into NaN.
    if value_range > 0:
        csim = csim / value_range

    # transformation to DataFrame
    csim = pd.DataFrame(csim, index=dataset['title'], columns=dataset['title'])

    return csim

In [7]:
# Similarity between all movies, built from genres, keywords, cast and crew
csim_all = get_similarity_all(
    columns=['genres', 'keywords', 'cast', 'crew'],
    dataset=all_movies,
    dist_metric='cosine',
)

In [8]:
# Persist the similarity matrix (pickle) and the merged movie table (CSV)
csim_all.to_pickle(path='../Data/csim_all_2020')
all_movies.to_csv(path_or_buf='../Data/merged_data_2020.csv')