In [1]:
import pandas as pd

movies = pd.read_csv(r'data\movies.csv', low_memory=False)
ratings = pd.read_csv(r'data\ratings.csv', low_memory=False)
movie_ratings = pd.merge(movies, ratings)
tags = pd.read_csv(r"data\tags.csv", low_memory=False)
movie_ratings.head(5)

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483


In [2]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [3]:
movie_tags = pd.merge(movies, tags, left_on='movieId', right_on='movieId')

In [4]:
movie_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,pixar,1139045764
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,pixar,1137206825
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,fun,1525286013
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,fantasy,1528843929
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,magic board game,1528843932


In [5]:
smaller_selection = movie_tags.groupby(['title', 'movieId', 'genres']).agg({'tag': lambda x: x.tolist()}).reset_index()  # Implode tags into a single row inside a list
smaller_selection['tag'] = smaller_selection.tag.apply(lambda x: str(x).replace('[', ''))
smaller_selection['tag'] = smaller_selection.tag.apply(lambda x: str(x).replace(']', ''))

In [6]:
smaller_selection.head()

Unnamed: 0,title,movieId,genres,tag
0,(500) Days of Summer (2009),69757,Comedy|Drama|Romance,"'artistic', 'Funny', 'humorous', 'inspiring', ..."
1,...And Justice for All (1979),3420,Drama|Thriller,'lawyers'
2,10 Cloverfield Lane (2016),152077,Thriller,"'creepy', 'suspense'"
3,10 Things I Hate About You (1999),2572,Comedy|Romance,'Shakespeare sort of'
4,101 Dalmatians (1996),1367,Adventure|Children|Comedy,"'dogs', 'remake'"


In [7]:
# import TfidVectorize
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english')

smaller_selection['tag'] = smaller_selection['tag'].fillna('')

tfidf_matrix = tfidf.fit_transform(smaller_selection['tag'])

tfidf_matrix.shape

(1572, 1673)

In [8]:
from sklearn.metrics.pairwise import linear_kernel

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [9]:
indices = pd.Series(smaller_selection.index, index=smaller_selection.title)

In [10]:
# based on cosine similarity of tag words. find which movie titles are most similar
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return smaller_selection.iloc[movie_indices]

In [18]:
get_recommendations('Interstellar (2014)')

Unnamed: 0,title,movieId,genres,tag
106,Back to the Future (1985),1270,Adventure|Comedy|Sci-Fi,'time travel'
107,Back to the Future Part II (1989),2011,Adventure|Comedy|Sci-Fi,'time travel'
151,Bill & Ted's Bogus Journey (1991),4980,Adventure|Comedy|Fantasy|Sci-Fi,'time travel'
152,Bill & Ted's Excellent Adventure (1989),4571,Adventure|Comedy|Sci-Fi,'time travel'
1077,Primer (2004),8914,Drama|Sci-Fi,'time travel'
1318,Stargate (1994),316,Action|Adventure|Sci-Fi,'time travel'
1415,Time Bandits (1981),2968,Adventure|Comedy|Fantasy|Sci-Fi,'time travel'
446,Final Fantasy: The Spirits Within (2001),4446,Adventure|Animation|Fantasy|Sci-Fi,'sci-fi'
1373,The Butterfly Effect (2004),7254,Drama|Sci-Fi|Thriller,"'alternate reality', 'sci-fi', 'science fictio..."
356,District 9 (2009),70286,Mystery|Sci-Fi|Thriller,'intelligent sci-fi'


In [17]:
movie_tags[movie_tags.title=='Interstellar (2014)']['tag'].values

array(['black hole', 'sci-fi', 'time-travel', 'Christopher Nolan',
       'sci-fi', 'time-travel', 'bad dialogue', 'philosophical issues',
       'thought-provoking', 'visually appealing'], dtype=object)