In [1]:
import pandas as pd

In [2]:
# Read the raw tag data from CSV and preview its first five rows
tag_df = pd.read_csv(filepath_or_buffer='data/tag.csv')
tag_df.head(n=5)

Unnamed: 0,userId,movieId,tag,timestamp
0,18,4141,Mark Waters,2009-04-24 18:19:40
1,65,208,dark hero,2013-05-10 01:41:18
2,65,353,dark hero,2013-05-10 01:41:19
3,65,521,noir thriller,2013-05-10 01:39:43
4,65,592,dark hero,2013-05-10 01:41:18


In [3]:
# Remove unnecessary columns.
# Reassign the result instead of mutating with inplace=True, so the cell reads
# as "new frame from old frame" and can be chained with other transforms.
tag_df = tag_df.drop(columns=['userId', 'timestamp'])

In [4]:
# Lower-case every tag so tags that differ only in letter case compare equal
tag_df = tag_df.assign(tag=tag_df['tag'].str.lower())

In [5]:
# Collapse the per-row tags into one list of tags per movie id
tags_per_movie = tag_df.groupby('movieId').agg(lambda group_values: list(group_values))
# Bring movieId back from the index into a regular column
tag_df = tags_per_movie.reset_index()
tag_df.head()

Unnamed: 0,movieId,tag
0,1,"[watched, computer animation, disney animated ..."
1,2,"[time travel, adapted from:book, board game, c..."
2,3,"[old people that is actually funny, sequel fev..."
3,4,"[chick flick, revenge, characters, chick flick..."
4,5,"[diane keaton, family, sequel, steve martin, w..."


In [6]:
# Read the movie catalogue from CSV and preview its first five rows
movie_df = pd.read_csv(filepath_or_buffer='data/movie.csv')
movie_df.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
# Collect the distinct genre names: each row holds a '|'-separated string,
# so split every row and gather the pieces into a single set.
all_genres = {
    genre_name
    for genre_string in movie_df['genres']
    for genre_name in genre_string.split("|")
}
all_genres

{'(no genres listed)',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

In [8]:
# Remove '(no genres listed)' from genres list and replace in the dataframe.
# discard() keeps this cell re-runnable: remove() raises KeyError once the
# placeholder is already gone.
all_genres.discard('(no genres listed)')
# regex=False forces a literal match. The placeholder contains parentheses,
# which a regex-based replace treats as a capture group, leaving a stray '()'
# in the column (regex matching was the default before pandas 2.0).
movie_df['genres'] = movie_df['genres'].str.replace('(no genres listed)', '', regex=False)

In [9]:
# Lower-case the genre strings before they are turned into the tag list
movie_df = movie_df.assign(genres=movie_df['genres'].str.lower())

In [10]:
# Turn each '|'-separated genre string into a list of genre names
movie_df = movie_df.assign(genres=movie_df['genres'].str.split('|'))

In [11]:
# Join tags and genres in a dictionary keyed by movie id

# Build a movieId -> tag-list lookup in one pass over tag_df, so each movie
# costs a single dict access instead of boolean-filtering the whole tag
# dataframe for every row of movie_df.
tags_by_movie = dict(zip(tag_df['movieId'], tag_df['tag']))

# Each value is the movie's genre list followed by its tag list; movies with
# no entry in tag_df fall back to an empty tag list (genres only).
tags_and_genres = {
    movie_id: movie_genres + tags_by_movie.get(movie_id, [])
    for movie_id, movie_genres in zip(movie_df['movieId'], movie_df['genres'])
}

In [12]:
# Function to retrieve movie id
def getMovieId(name, regex=False):
    """Look up movies whose title contains ``name``.

    Parameters
    ----------
    name : str
        Text to search for in the ``title`` column of ``movie_df``.
    regex : bool, default False
        When False, ``name`` is matched literally, so titles containing regex
        metacharacters (for example "(1995)" or "M*A*S*H") can be searched
        exactly as typed. Pass True to treat ``name`` as a regular expression.

    Returns
    -------
    pandas.DataFrame
        The ``movieId`` and ``title`` columns of every matching row (empty if
        nothing matches).
    """
    # na=False: rows with a missing title count as "no match" instead of
    # putting NaN into the boolean mask.
    title_matches = movie_df['title'].str.contains(name, regex=regex, na=False)
    return movie_df.loc[title_matches, ['movieId', 'title']]

In [13]:
# Function to calculate similarity between two lists

from collections import Counter
import math

def counter_cosine_similarity(list1, list2):
    """Cosine similarity between two lists treated as bags of terms.

    Each list is converted to a term-frequency vector (how often every term
    occurs) and the cosine of the angle between the two vectors is returned.
    More info: https://en.wikipedia.org/wiki/Cosine_similarity

    Parameters
    ----------
    list1, list2 : iterable of hashable
        The terms (e.g. tags and genres) describing each item.

    Returns
    -------
    float
        A value between 0 and 1 (up to floating-point rounding). Returns 0.0
        when either list is empty, since an empty bag shares no terms with
        anything and its zero magnitude would otherwise divide by zero.
    """
    # Transform lists into Counter objects (term -> number of occurrences)
    c1 = Counter(list1)
    c2 = Counter(list2)

    # Only terms present in both counters contribute to the dot product
    dotprod = sum(count * c2[term] for term, count in c1.items() if term in c2)

    # Vector magnitudes; terms absent from a counter contribute zero
    magA = math.sqrt(sum(count ** 2 for count in c1.values()))
    magB = math.sqrt(sum(count ** 2 for count in c2.values()))

    # Guard against empty input: similarity with an empty bag is defined as 0
    if magA == 0 or magB == 0:
        return 0.0

    return dotprod / (magA * magB)

In [39]:
# Function to return similar movies

def getSimilarMovies(movieId1, threshold=0.5):
    """Print every movie whose tags/genres resemble those of ``movieId1``.

    The reference movie's combined tag/genre list is compared with every
    other movie in ``movie_df`` using ``counter_cosine_similarity``; each
    movie scoring strictly above ``threshold`` is printed with its similarity
    as a percentage. If none qualifies, a "not found" message is printed.

    Parameters
    ----------
    movieId1 : int
        Id of the reference movie; must be a key of ``tags_and_genres``.
    threshold : float, default 0.5
        Minimum cosine similarity (exclusive) for a movie to be reported.

    Raises
    ------
    KeyError
        If ``movieId1`` is not a key of ``tags_and_genres``.
    """
    tagsMovie1 = tags_and_genres[movieId1]
    similar_movies = []

    # Iterate over just the two columns needed; this avoids building a full
    # row Series per movie as iterrows() does.
    for movieId2, movie_title in zip(movie_df['movieId'], movie_df['title']):
        # A movie is trivially similar to itself, so skip it
        if movieId1 == movieId2:
            continue

        similarity = counter_cosine_similarity(tagsMovie1, tags_and_genres[movieId2])
        if similarity > threshold:
            rounded_similarity = round(similarity * 100, 1)
            similar_movies.append(movie_title)
            print(f'Similarity {rounded_similarity}% - {movie_title}')

    if not similar_movies:
        print('Similar movies not found')

In [44]:
getMovieId('Pulp Fiction')

Unnamed: 0,movieId,title
293,296,Pulp Fiction (1994)


In [45]:
getSimilarMovies(296)

Similarity 57.8% - Four Rooms (1995)
Similarity 74.3% - Reservoir Dogs (1992)
Similarity 66.7% - Jackie Brown (1997)
Similarity 61.1% - Kill Bill: Vol. 1 (2003)
Similarity 60.2% - Kill Bill: Vol. 2 (2004)
Similarity 57.1% - Sin City (2005)
