In [2]:
#import libraries
%matplotlib inline
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

#Load relevant databases
df_movies = pd.read_csv("../data/movies.csv", index_col = 0)
df_genometags = pd.read_csv("../data/genome-tags.csv", index_col = 0)
df_genome_scores = pd.read_csv("../data/genome-scores.csv")

In [3]:
#Normalize genome scores 
df_genome_scores["relevance"] = df_genome_scores["relevance"] / df_genome_scores.groupby("movieId")["relevance"].transform('sum')
#Split up "genres" table
df_movies['genres'] = df_movies['genres'].str.split('|')
#Add movie titles for readability
df_genome_scores = pd.merge(df_movies, df_genome_scores, on = "movieId")
#Add genome titles for readabability
df_genome_scores = pd.merge(df_genometags, df_genome_scores, on = "tagId")

In [7]:
df_genome_scores = df_genome_scores[["movieId", "title", "genres", "tagId", "tag", "relevance"]]
df_genome_scores.sort_values("movieId")

Unnamed: 0,movieId,title,genres,tagId,tag,relevance
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1,007,0.000138
1183434,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",115,based on true story,0.001242
5875646,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",567,islam,0.000299
10993479,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1060,united nations,0.000053
8470896,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",817,prohibition,0.000172
...,...,...,...,...,...,...
7816892,131170,Parallels (2015),[Sci-Fi],753,oscar (best foreign language film),0.000376
7827273,131170,Parallels (2015),[Sci-Fi],754,oscar (best music - original score),0.000095
7837654,131170,Parallels (2015),[Sci-Fi],755,oscar (best music - original song),0.000290
7775368,131170,Parallels (2015),[Sci-Fi],749,oscar (best cinematography),0.000310


In [8]:
#First tests with cosine similiarity
def cosine_sim(title):
    score1 = df_genome_scores.loc[df_genome_scores["title"] == title]["relevance"].tolist()
    all_movies = df_genome_scores.groupby("movieId")["relevance"]
    cos_sims = []
    iterations = 0
    for movie in all_movies:
        score = movie[1]
        sim = np.dot(score1, score) / (np.linalg.norm(score1)) * np.linalg.norm(score) 
        cos_sims.append(sim)
        iterations += 1
    movie_scores = df_genome_scores.drop_duplicates("movieId")
    movie_scores["cosine_similiarity"] = cos_sims
    movie_scores.drop(["tagId", "tag", "relevance"], axis = 1, inplace = True)
    return movie_scores
cosine_sim("Star Wars: Episode IV - A New Hope (1977)").sort_values("cosine_similiarity", ascending = False).head(20)

Unnamed: 0,movieId,title,genres,cosine_similiarity
5136,5944,Star Trek: Nemesis (2002),"[Action, Drama, Sci-Fi, Thriller]",0.002172
4731,5378,Star Wars: Episode II - Attack of the Clones (...,"[Action, Adventure, Sci-Fi, IMAX]",0.002149
8478,61160,Star Wars: The Clone Wars (2008),"[Action, Adventure, Animation, Sci-Fi]",0.002146
2315,2628,Star Wars: Episode I - The Phantom Menace (1999),"[Action, Adventure, Sci-Fi]",0.002127
7408,33493,Star Wars: Episode III - Revenge of the Sith (...,"[Action, Adventure, Sci-Fi]",0.0021
1210,1356,Star Trek: First Contact (1996),"[Action, Adventure, Sci-Fi, Thriller]",0.002077
9971,106489,"Hobbit: The Desolation of Smaug, The (2013)","[Adventure, Fantasy, IMAX]",0.002075
2102,2393,Star Trek: Insurrection (1998),"[Action, Drama, Romance, Sci-Fi]",0.002067
9205,82169,Chronicles of Narnia: The Voyage of the Dawn T...,"[Adventure, Children, Fantasy]",0.002061
1072,1210,Star Wars: Episode VI - Return of the Jedi (1983),"[Action, Adventure, Sci-Fi]",0.002058


In [6]:
df_genome_scores[df_genome_scores['title'].str.contains("Star Wars")]

Unnamed: 0,tagId,tag,movieId,title,genres,relevance
245,1,007,260,Star Wars: Episode IV - A New Hope (1977),"[Action, Adventure, Sci-Fi]",0.000198
1059,1,007,1196,Star Wars: Episode V - The Empire Strikes Back...,"[Action, Adventure, Sci-Fi]",0.000174
1072,1,007,1210,Star Wars: Episode VI - Return of the Jedi (1983),"[Action, Adventure, Sci-Fi]",0.000239
2315,1,007,2628,Star Wars: Episode I - The Phantom Menace (1999),"[Action, Adventure, Sci-Fi]",0.000391
4731,1,007,5378,Star Wars: Episode II - Attack of the Clones (...,"[Action, Adventure, Sci-Fi, IMAX]",0.000451
...,...,...,...,...,...,...
11700459,1128,zombies,1210,Star Wars: Episode VI - Return of the Jedi (1983),"[Action, Adventure, Sci-Fi]",0.000082
11701702,1128,zombies,2628,Star Wars: Episode I - The Phantom Menace (1999),"[Action, Adventure, Sci-Fi]",0.000091
11704118,1128,zombies,5378,Star Wars: Episode II - Attack of the Clones (...,"[Action, Adventure, Sci-Fi, IMAX]",0.000094
11706795,1128,zombies,33493,Star Wars: Episode III - Revenge of the Sith (...,"[Action, Adventure, Sci-Fi]",0.000087
