# Movie Recommendation System based on Genre

#### Importing the Libraries and Loading Data

In [1]:
# Importing Pandas and Numpy Libraries
import pandas as pd
import numpy as np

In [2]:
# Read the movie catalogue from disk, keeping only the title and genres columns
movies = pd.read_csv(
    'movies.csv',
    sep=',',
    encoding='latin-1',
    usecols=['title', 'genres'],
)

In [3]:
# Preview the first five rows of the movies table
movies.head(n=5)

Unnamed: 0,title,genres
0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy
1,Jumanji,Adventure|Children|Fantasy
2,Grumpier Old Men,Comedy|Romance
3,Waiting to Exhale,Comedy|Drama|Romance
4,Father of the Bride Part II,Comedy


#### Breaking up the genres into a string array and converting them to a string data type

In [4]:
# Split each pipe-delimited genre string into a list of genre names, replace
# missing entries with an empty string, then cast every value (the lists
# included) to its string form so the column holds plain strings.
movies['genres'] = (
    movies['genres']
    .str.split('|')
    .fillna("")
    .astype('str')
)

#### Now the algorithm for the recommendation part of the program is set up. We import the `TfidfVectorizer` class from the `sklearn.feature_extraction.text` module. It converts the genres into 1-gram and 2-gram word features, excluding English stop words (e.g. ‘the’, ‘and’, etc.). The vectorizer is then fitted on the `genres` column of the dataset.
###### The concepts of Term Frequency (TF) and Inverse Document Frequency (IDF) are used in content-based filtering mechanisms (such as a content-based recommender). Term Frequency-Inverse Document Frequency (TF-IDF) feature extraction is used to evaluate how important a word is to a document within a collection of documents (the corpus).

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Define a TF-IDF vectorizer over word unigrams and bigrams that drops English
# stop words such as 'the', 'a'.
# min_df=1 keeps every term that occurs in at least one document. An integer
# min_df must be >= 1: recent scikit-learn releases reject min_df=0 during
# parameter validation, and min_df=1 selects exactly the same vocabulary.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
# fit_transform returns a sparse document-term matrix:
# one row per movie (document), one column per term
tfidf_matrix = tf.fit_transform(movies['genres'])
# Output the shape of tfidf_matrix as (number of movies, number of terms)
tfidf_matrix.shape

(9742, 177)

#### We now import the `cosine_similarity` function from the `sklearn.metrics.pairwise` module. Cosine similarity is used to compute the similarity between two movies: $\cos(\theta) = \dfrac{A \cdot B}{\|A\| \, \|B\|}$

In [6]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between all movies; with a single argument the
# rows of tfidf_matrix are compared against themselves
cosine_sim = cosine_similarity(tfidf_matrix)
# Show the top-left 4 x 4 corner of the similarity matrix
cosine_sim[:4, :4]

array([[1.        , 0.31379419, 0.0611029 , 0.05271111],
       [0.31379419, 1.        , 0.        , 0.        ],
       [0.0611029 , 0.        , 1.        , 0.35172407],
       [0.05271111, 0.        , 0.35172407, 1.        ]])

#### We now define the main function to get movie recommendations based on the Cosine Similarity.

In [7]:
# Series holding every movie title
titles = movies['title']
# Reverse lookup as a Series: movie title -> row label in `movies`
indices = pd.Series(data=movies.index, index=titles)

# Define Function that gets movie recommendations based on the cosine similarity score of movie genres
def genre_recommendations(title, top_n=10):
    """Recommend the movies whose genres are most similar to a given movie.

    Reads the notebook-level objects ``indices`` (title -> row), ``cosine_sim``
    (pairwise similarity matrix) and ``titles`` (movie titles).

    Parameters
    ----------
    title : str
        Exact title to look up in ``indices``.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` entries of ``titles``, ordered from most to least
        similar. The queried movie itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")
    # Get the index of the movie that matches the title
    idx = indices[title]
    # A title that occurs more than once yields a Series of rows; use the first
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    # Pairwise similarity scores of all movies with that movie
    scores = np.asarray(cosine_sim[idx])
    # Rank rows by descending score; the stable sort keeps ties in row order
    ranked = np.argsort(-scores, kind='stable')
    # Drop the queried movie explicitly instead of assuming it is ranked first:
    # movies with identical genres also score 1.0 and may sort ahead of it
    ranked = ranked[ranked != idx][:top_n]
    # Return the titles of the most similar movies
    return titles.iloc[ranked]

#### Running the 'genre_recommendations' function to get our recommendation based on the genre.

In [8]:
# Ask for the movies whose genres are closest to a given title, e.g. Waiting to Exhale.
# NOTE(review): the trailing space is part of the lookup key; the titles printed
# in the output also appear to end with a space - confirm against movies.csv.
genre_recommendations(title='Waiting to Exhale ')

10                  American President, The 
47                         Mighty Aphrodite 
52               Postman, The (Postino, Il) 
83                          Beautiful Girls 
165                 Something to Talk About 
191                        Don Juan DeMarco 
198    Eat Drink Man Woman (Yin shi nan nu) 
243                           Nobody's Fool 
309                        Corrina, Corrina 
317                     I Like It Like That 
Name: title, dtype: object