In [1]:
import pandas as pd 
import scipy.spatial 

In [2]:
#load the movie dataset into a dataframe 
df = pd.read_csv("movie_metadata.csv")

#remove N/A values from these columns 
df_filtered = df.dropna(subset=['imdb_score', 'duration', 'budget', 'gross'])

#make a copy of the selected columns
df_selected = df_filtered[['movie_title', 'imdb_score', 'duration', 'budget', 'gross']].copy()

In [3]:
#perform Min-Max normalization on the numerical columns so that scale doens't skew our results as much

df_selected['imdb_score_norm'] = (df_selected['imdb_score'] - df_selected['imdb_score'].min()) / \
                                  (df_selected['imdb_score'].max() - df_selected['imdb_score'].min())

df_selected['duration_norm'] = (df_selected['duration'] - df_selected['duration'].min()) / \
                                (df_selected['duration'].max() - df_selected['duration'].min())

df_selected['budget_norm'] = (df_selected['budget'] - df_selected['budget'].min()) / \
                              (df_selected['budget'].max() - df_selected['budget'].min())

df_selected['gross_norm'] = (df_selected['gross'] - df_selected['gross'].min()) / \
                             (df_selected['gross'].max() - df_selected['gross'].min())

In [26]:
#insert the name here of the target movie
target_movie_name = "Pulp Fiction"

In [27]:
#find the row that coresponds to the target movie selected above
#removing the movie name because it is not numerical
target_movie = df_selected.loc[df_selected['movie_title'].str.strip() == target_movie_name].drop('movie_title', axis=1)

#get the features of that movie 
target_movie_features = target_movie[['imdb_score_norm', 'duration_norm', 'budget_norm', 'gross_norm']].values

#compute the euclidean distances between the target movie and all others in the dataset
distances = scipy.spatial.distance.cdist(df_selected[['imdb_score_norm', 'duration_norm', 'budget_norm', 'gross_norm']].values,
                                         target_movie_features, 
                                         metric='euclidean').flatten()

#connect each movie with their calculated euclidean distances 
query_distances = list(zip(df_selected['movie_title'], distances))


In [28]:
#sorting movies by similarity (smallest euclidean distances) and take the 10 most popular 
for similar_movie_name, similar_score in sorted(query_distances, key=lambda x: x[1])[:11]:
    imdb_score = df_selected.loc[df_selected['movie_title'] == similar_movie_name, 'imdb_score'].values[0]
    duration = int(df_selected.loc[df_selected['movie_title'] == similar_movie_name, 'duration'].values[0])
    budget = int(df_selected.loc[df_selected['movie_title'] == similar_movie_name, 'budget'].values[0])
    gross = int(df_selected.loc[df_selected['movie_title'] == similar_movie_name, 'gross'].values[0])
    
    print(f"Movie: {similar_movie_name.strip()}")
    print(f"Similarity Score: {similar_score:.4f}")
    #print(f"IMDb Score: {imdb_score}")
    #print(f"Duration: {duration} min")
    #print(f"Budget: ${budget}")
    #print(f"Gross: ${gross}")
    print()

Movie: Pulp Fiction
Similarity Score: 0.0000

Movie: Schindler's List
Similarity Score: 0.0284

Movie: The Godfather
Similarity Score: 0.0536

Movie: The Green Mile
Similarity Score: 0.0744

Movie: Braveheart
Similarity Score: 0.0778

Movie: Django Unchained
Similarity Score: 0.0995

Movie: Amadeus
Similarity Score: 0.1077

Movie: Aliens
Similarity Score: 0.1081

Movie: The Departed
Similarity Score: 0.1100

Movie: Inglourious Basterds
Similarity Score: 0.1162

Movie: Interstellar
Similarity Score: 0.1170



IndexError: index 0 is out of bounds for axis 0 with size 0