In [1]:
#Content-based Recommender Systems
#Importing the libraries
import pandas as pd
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
#Reading in the data
movies_df= pd.read_csv('movies.csv')
ratings_df= pd.read_csv('ratings.csv')
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995.0
1,2,Jumanji,Adventure|Children|Fantasy,1995.0
2,3,Grumpier Old Men,Comedy|Romance,1995.0
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995.0
4,5,Father of the Bride Part II,Comedy,1995.0


In [2]:
#Every genre is separated by a | so we call the split function on |
movies_df['genres'] = movies_df.genres.str.split('|')
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995.0
4,5,Father of the Bride Part II,[Comedy],1995.0


In [3]:
#Copying the movie dataframe into a new one 
moviesWithGenres_df= movies_df.copy()
#For every row in the dataframe, iterate through the list of genres and place a 1 into the corresponding column
for index, row in movies_df.iterrows():
    for genre in row['genres']:
        moviesWithGenres_df.at[index, genre] = 1
#Filling in the NaNvalues with 0 to show that a movie doesn't have that column's genre
moviesWithGenres_df= moviesWithGenres_df.fillna(0)
moviesWithGenres_df.head()

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
#Ratings data frame
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


In [5]:
#Drop removes a specified row or column from a dataframe
ratings_df= ratings_df.drop('timestamp', 1)
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


In [6]:
#User Ratings
userInput = [
{'title':'BreakfastClub, The', 'rating':5},
{'title':'ToyStory', 'rating':3.5},
{'title':'Jumanji', 'rating':2},
{'title':"PulpFiction", 'rating':5},
{'title':'Akira', 'rating':4.5}
] 
inputMovies= pd.DataFrame(userInput)
inputMovies

Unnamed: 0,rating,title
0,5.0,"BreakfastClub, The"
1,3.5,ToyStory
2,2.0,Jumanji
3,5.0,PulpFiction
4,4.5,Akira


In [7]:
#Filtering out the movies by title
inputId= movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]
#Then merging it so we can get the movieId. It's implicitly merging it by title.
inputMovies= pd.merge(inputId, inputMovies)
#Dropping information we won't use from the input dataframe
inputMovies= inputMovies.drop('genres', 1).drop('year', 1)
#Final input dataframe
inputMovies

Unnamed: 0,movieId,title,rating
0,2,Jumanji,2.0
1,1274,Akira,4.5


In [8]:
#Filtering out the movies from the input
userMovies= moviesWithGenres_df[moviesWithGenres_df['movieId'].isin(inputMovies['movieId'].tolist())]
userMovies

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1246,1274,Akira,"[Action, Adventure, Animation, Sci-Fi]",1988.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
#Resetting the index
userMovies= userMovies.reset_index(drop=True)
#Dropping unnecessary columns
userGenreTable= userMovies.drop('movieId', 1).drop('title', 1).drop('genres', 1).drop('year', 1)
userGenreTable

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
inputMovies['rating']
#Dot product to get weights
userProfile= userGenreTable.transpose().dot(inputMovies['rating'])
#The user profile
userProfile

Adventure             6.5
Animation             4.5
Children              2.0
Comedy                0.0
Fantasy               2.0
Romance               0.0
Drama                 0.0
Action                4.5
Crime                 0.0
Thriller              0.0
Horror                0.0
Mystery               0.0
Sci-Fi                4.5
IMAX                  0.0
Documentary           0.0
War                   0.0
Musical               0.0
Western               0.0
Film-Noir             0.0
(no genres listed)    0.0
dtype: float64

In [23]:
#Now let's get the genres of every movie in our original dataframe
genreTable= moviesWithGenres_df.set_index(moviesWithGenres_df['movieId'])
#And drop the unnecessary information
genreTable= genreTable.drop('movieId', 1).drop('title', 1).drop('genres', 1).drop('year', 1)
genreTable.head()
genreTable.shape

(34208, 20)

In [24]:
#Multiply the genres by the weights and then take the weighted average
recommendationTable_df= ((genreTable*userProfile).sum(axis=1))/(userProfile.sum())
recommendationTable_df.head()
#Sort our recommendations in descending order
recommendationTable_df= recommendationTable_df.sort_values(ascending=False)
#Have a look
recommendationTable_df.head()

movieId
26590     1.000000
6350      1.000000
27155     1.000000
136618    1.000000
8537      0.916667
dtype: float64

In [25]:
#The final recommendation table
movies_df.loc[movies_df['movieId'].isin(recommendationTable_df.head(20).keys())]

Unnamed: 0,movieId,title,genres,year
3655,3745,Titan A.E.,"[Action, Adventure, Animation, Children, Sci-Fi]",2000.0
6252,6350,Laputa: Castle in the Sky (Tenkû no shiro Rapy...,"[Action, Adventure, Animation, Children, Fanta...",1986.0
7883,8537,Kaena: The Prophecy (Kaena: La prophétie),"[Action, Adventure, Animation, Children, Sci-Fi]",2003.0
8942,26590,G.I. Joe: The Movie,"[Action, Adventure, Animation, Children, Fanta...",1987.0
9218,27155,"Batman/Superman Movie, The","[Action, Adventure, Animation, Children, Fanta...",1998.0
9400,27608,Immortel (ad vitam) (Immortal),"[Action, Adventure, Animation, Fantasy, Sci-Fi]",2004.0
10448,37830,Final Fantasy VII: Advent Children,"[Action, Adventure, Animation, Fantasy, Sci-Fi]",2004.0
10575,40339,Chicken Little,"[Action, Adventure, Animation, Children, Comed...",2005.0
11785,52287,Meet the Robinsons,"[Action, Adventure, Animation, Children, Comed...",2007.0
11806,52462,Aqua Teen Hunger Force Colon Movie Film for Th...,"[Action, Adventure, Animation, Comedy, Fantasy...",2007.0
