In [None]:
# Recommender system - content-based filtering: recommends items to a user based on information taken from that user (here, the movies the user has rated).

# Advantage
# (1) Learns user's preferences
# (2) Highly personalized for the user

# Disadvantage

# (1) Doesn't take into account what others think of the item, so low quality item recommendations might happen
# (2) Extracting data is not always intuitive
# (3) Determining what characteristics of the item the user dislikes or likes is not always obvious
import pandas as pd
from math import sqrt # for math function
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Load the two CSV inputs, in order: movie information, then user ratings.
movies_df, ratings_df = (
    pd.read_csv(csv_name) for csv_name in ('movies.csv', 'ratings.csv')
)

# ADDING YEAR COLUMN
# Titles carry the release year in parentheses, e.g. "(1995)". Capture only the
# four digits between the parentheses into a new 'year' column; a title with no
# "(dddd)" part gets NaN.
# The pattern is a raw string so that "\(" and "\d" reach the regex engine
# instead of being treated as (invalid) Python string escapes.
movies_df['year'] = movies_df['title'].str.extract(r'\((\d{4})\)', expand=False)

# TITLE COLUMN
# Remove the "(dddd)" year from the title. regex=True is required: on
# pandas >= 2.0 str.replace matches the pattern as a literal string by default,
# which would leave every title unchanged and break the title lookup later on.
movies_df['title'] = (
    movies_df['title']
    .str.replace(r'\(\d{4}\)', '', regex=True)
    # Drop the whitespace left behind where the year used to be. Unlike
    # apply(lambda x: x.strip()), .str.strip() passes missing titles through.
    .str.strip()
)

# SPLIT THE GENRES DATA
# Break each 'genres' string on the '|' separator so every cell becomes a list
# of individual genre names.
movies_df['genres'] = movies_df['genres'].str.split(pat='|')

# ENCODE THE GENRES: 1.0 when a movie has the genre, 0.0 when it does not.
# Build one indicator column per genre without a Python-level row loop:
#   * explode() emits one row per (movie, genre) pair, keeping the movie's
#     index label on each of them;
#   * get_dummies() turns every pair into a row of indicator values;
#   * groupby(level=0).max() folds those rows back into one row per movie.
# The indicator columns come out in alphabetical order; later steps align on
# column labels, so the order does not affect the result.
genre_flags = (
    pd.get_dummies(movies_df['genres'].explode(), dtype=float)
    .groupby(level=0)
    .max()
)

# join() returns a new frame, so movies_df keeps its original columns and the
# genre indicators live only in moviesWithGenres_df.
moviesWithGenres_df = movies_df.join(genre_flags)

# Replace every remaining NaN in the frame with 0 (this also covers non-genre
# columns such as 'year' when no year could be extracted).
moviesWithGenres_df = moviesWithGenres_df.fillna(0)

# Rating
# Remove the 'timestamp' column from the ratings frame.
# The column is named via 'columns=' because passing the axis positionally
# (drop('timestamp', 1)) raises a TypeError on pandas >= 2.0.
ratings_df = ratings_df.drop(columns='timestamp')


# build recommender system
# Titles the user has rated, paired position-by-position with the ratings.
_rated_titles = (
    'Breakfast Club, The',
    'Toy Story',
    'Jumanji',
    "Pulp Fiction",
    'Akira',
)
_given_ratings = (5, 3.5, 2, 5, 4.5)

# One {'title', 'rating'} record per rated movie.
userInput = [
    {'title': movie_title, 'rating': movie_rating}
    for movie_title, movie_rating in zip(_rated_titles, _given_ratings)
]

# Tabular form of the user's input: one row per rated movie.
inputMovies = pd.DataFrame(data=userInput)

# Add movieId to the user's input.
# Keep only the catalogue rows whose title appears in the user's input.
inputId = movies_df[movies_df['title'].isin(inputMovies['title'])]

# Attach each rating to its catalogue row. 'title' is the only column the two
# frames share, so naming it explicitly matches the implicit merge key.
inputMovies = pd.merge(inputId, inputMovies, on='title')

# Drop the columns that are not needed from the input frame.
# Columns are named via 'columns=' because passing the axis positionally
# (drop('genres', 1)) raises a TypeError on pandas >= 2.0.
inputMovies = inputMovies.drop(columns=['genres', 'year'])

# inputMovies is now the final input frame: movieId, title and rating.
# If a movie you added above is missing here, it is either not in the movies
# data or spelled differently there - please check capitalisation.

# Select the genre-encoded rows for the movies the user rated.
userMovies = moviesWithGenres_df[
    moviesWithGenres_df['movieId'].isin(inputMovies['movieId'])
]

# Renumber the rows 0..n-1 so they line up with inputMovies' row labels when
# the ratings are multiplied in.
userMovies = userMovies.reset_index(drop=True)

# Keep only the genre indicator columns.
# Columns are named via 'columns=' because passing the axis positionally
# (drop('movieId', 1)) raises a TypeError on pandas >= 2.0.
userGenreTable = userMovies.drop(columns=['movieId', 'title', 'genres', 'year'])

# Learn the user's preferences by turning each genre into a weight: multiply
# the user's ratings into the user's genre table and sum the result per genre.
user_ratings = inputMovies['rating']

# Genres as rows, rated movies as columns, ready for the dot product.
genres_by_movie = userGenreTable.T

# Dot product to get the weights: one value per genre.
userProfile = genres_by_movie.dot(user_ratings)

# The user profile (bare expression kept from the notebook; no effect here).
userProfile

# We now have a weight for each of the user's genre preferences - the User
# Profile. With it we can recommend movies that match those preferences.

# Get the genre indicators of every movie, keyed by movieId.
# set_index('movieId') moves the column into the index in one step.
genreTable = moviesWithGenres_df.set_index('movieId')

# Drop the remaining non-genre columns.
# Columns are named via 'columns=' because passing the axis positionally
# (drop('title', 1)) raises a TypeError on pandas >= 2.0.
genreTable = genreTable.drop(columns=['title', 'genres', 'year'])
print(genreTable.head())

# With the user's profile and every movie's genre indicators in hand, score
# each movie as the weighted average of its genres under the profile.

# Normalising constant: the sum of all genre weights in the profile.
profile_total = userProfile.sum()

# Scale each genre column by the matching profile weight (aligned by label).
weighted_genres = genreTable.mul(userProfile, axis='columns')

# Sum the weighted genres per movie, then divide by the total weight.
recommendationTable_df = weighted_genres.sum(axis=1).div(profile_total)


# Sort the recommendation scores in descending order (best match first).
recommendationTable_df = recommendationTable_df.sort_values(ascending=False)

# Ids of the twenty best-scoring movies, best first.
top_movie_ids = pd.DataFrame({'movieId': recommendationTable_df.head(20).index})

# The final recommendation table.
# Filtering movies_df with isin() would return these movies in catalogue order
# and lose the ranking; a left merge keeps the order of the left-hand keys, so
# the rows below stay sorted from best to worst match.
top_movie_ids.merge(movies_df, on='movieId', how='left')
