In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the movie catalogue and discard exact duplicate rows in one pass
movies_data = pd.read_csv('../Data/movies.csv').drop_duplicates()
movies_data

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [3]:
# Load the ratings, discard the timestamp column, then remove exact duplicate rows
ratings_data = (
    pd.read_csv('../Data/ratings.csv')
    .drop(columns='timestamp')
    .drop_duplicates()
)
ratings_data

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5
...,...,...,...
25000090,162541,50872,4.5
25000091,162541,55768,2.5
25000092,162541,56176,2.0
25000093,162541,58559,4.0


In [4]:
# Load the tags. Duplicates are removed while the timestamp is still present,
# so only rows identical in every column (timestamp included) are dropped.
tags_data = (
    pd.read_csv('../Data/tags.csv')
    .drop_duplicates()
    .drop(columns='timestamp')
)
tags_data

Unnamed: 0,userId,movieId,tag
0,3,260,classic
1,3,260,sci-fi
2,4,1732,dark comedy
3,4,1732,great dialogue
4,4,7569,so bad it's good
...,...,...,...
1093355,162521,66934,Neil Patrick Harris
1093356,162521,103341,cornetto trilogy
1093357,162534,189169,comedy
1093358,162534,189169,disabled


In [5]:
# Collapse the per-user tag rows into one row per movie:
#   1. the tagging user is irrelevant here, so 'userId' is dropped;
#   2. every tag is cast to str so that ', '.join accepts each value;
#   3. all tags of a movie are concatenated into one comma-separated string.
# NOTE(review): astype(str) turns a missing tag into the literal text 'nan' -
# confirm that this is acceptable for the similarity computation.
tags_data = (
    tags_data
    .drop(columns='userId')
    .assign(tag=lambda frame: frame['tag'].astype(str))
    .groupby('movieId')['tag']
    .apply(', '.join)
    .reset_index()
)

In [6]:
# Average rating per movie: a Series named 'rating' indexed by 'movieId'
mean_ratings = ratings_data['rating'].groupby(ratings_data['movieId']).mean()

In [7]:
# Combine movies with their mean rating and their concatenated tags.
# Both joins use the default inner join on 'movieId', so only movies that
# have at least one rating and at least one tag remain.
merged_data = (
    movies_data
    .merge(mean_ratings, on='movieId')
    .merge(tags_data, on='movieId')
)
merged_data

Unnamed: 0,movieId,title,genres,rating,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.893708,"Owned, imdb top 250, Pixar, Pixar, time travel..."
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.251527,"Robin Williams, time travel, fantasy, based on..."
2,3,Grumpier Old Men (1995),Comedy|Romance,3.142028,"funny, best friend, duringcreditsstinger, fish..."
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.853547,"based on novel or book, chick flick, divorce, ..."
4,5,Father of the Bride Part II (1995),Comedy,3.058434,"aging, baby, confidence, contraception, daught..."
...,...,...,...,...,...
41870,208813,Noelle (2019),Children,3.000000,might like
41871,208933,The Devil's Partner (1961),Horror,2.500000,"black and white, deal with the devil"
41872,209035,time for sushi (2017),Animation|Comedy,3.500000,"computer animation, Japan, mass behavior, mass..."
41873,209037,Our Wonderful Nature - The Common Chameleon (2...,(no genres listed),4.000000,"chameleon, computer animation, gluttony, humor..."


In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Vectorizer for the free-text tags; common English stop words are ignored
count = CountVectorizer(stop_words='english')

# Token-count matrix for the tags (one row per movie, one column per token)
count_matrix = count.fit_transform(merged_data['tag'])

# Cosine similarity between every pair of movies, based on their tags.
# NOTE(review): cosine_similarity returns a dense (n_movies x n_movies) array
# by default, which is very large for tens of thousands of movies - confirm
# the available memory is sufficient before running on the full data set.
cosine_sim_tags = cosine_similarity(count_matrix, count_matrix)

# Reset the index so that its labels match the row positions of the similarity
# matrices, and build the reverse mapping title -> row position.
# drop=True keeps the old index from being inserted as a stray 'index' column,
# which also keeps this cell re-runnable: a second plain reset_index() would
# raise because the 'index' column already exists.
merged_data = merged_data.reset_index(drop=True)
indices = pd.Series(merged_data.index, index=merged_data['title'])

# Genres are a '|'-separated list rather than free text, so they get their own
# vectorizer instead of refitting `count` (which would discard the tag
# vocabulary). Each genre is kept as one token ('Sci-Fi' is not split into
# 'sci' and 'fi'), and the placeholder '(no genres listed)' is ignored so that
# movies without genre information are not treated as sharing a genre.
count_genres = CountVectorizer(
    token_pattern=r'[^|]+',
    stop_words=['(no genres listed)'],
)
count_matrix_genres = count_genres.fit_transform(merged_data['genres'])

# Cosine similarity between every pair of movies, based on their genres
cosine_sim_genres = cosine_similarity(count_matrix_genres, count_matrix_genres)

# Function that takes in movie titles as input and outputs most similar movies
def get_recommendations(titles, genre=None, min_rating=0, cosine_sim_tags=cosine_sim_tags, cosine_sim_genres=cosine_sim_genres, top_n=5):
    """Recommend the movies most similar to one or more given titles.

    For every given title the tag-based and genre-based cosine similarities
    to all movies are averaged; these per-title scores are then summed over
    all given titles. The given movies themselves are excluded, the optional
    filters are applied, and the best-scoring titles are returned.

    Relies on the module-level ``merged_data`` (columns 'title', 'genres',
    'rating') and ``indices`` (title -> row position in ``merged_data``).

    Parameters
    ----------
    titles : str or iterable of str
        Title(s) exactly as they appear in ``merged_data['title']``. A single
        string is treated as a one-element list. A title listed twice is
        counted twice.
    genre : str, optional
        If given, only movies whose 'genres' string matches it are kept. The
        value is passed to ``Series.str.contains`` and is therefore
        interpreted as a regular expression.
    min_rating : float, default 0
        Only movies whose 'rating' is at least this value are kept.
    cosine_sim_tags, cosine_sim_genres : 2-D array-like
        Pairwise similarity matrices whose rows and columns follow the row
        order of ``merged_data``.
    top_n : int or None, default 5
        Maximum number of titles to return; ``None`` returns every match.

    Returns
    -------
    pandas.Series
        Titles of the recommended movies, best match first, indexed by their
        row label in ``merged_data``. Empty if ``titles`` is empty.

    Raises
    ------
    KeyError
        If any given title is not present in ``indices``.
    ValueError
        If ``top_n`` is negative.
    """
    # A bare string would otherwise be iterated character by character
    if isinstance(titles, str):
        titles = [titles]
    else:
        titles = list(titles)

    if top_n is not None and top_n < 0:
        raise ValueError("top_n must be non-negative or None")

    # Without seed titles every score would be zero and the "ranking" would
    # be meaningless, so return an empty result explicitly
    if not titles:
        return merged_data['title'].iloc[:0]

    # Report every unknown title at once instead of failing on the first one
    unknown_titles = [title for title in titles if title not in indices.index]
    if unknown_titles:
        raise KeyError(f"Title(s) not found in merged_data: {unknown_titles}")

    total_scores = np.zeros(len(merged_data))
    given_indices = []

    for title in titles:
        # indices[title] is a scalar for a unique title but a Series when
        # several movies share the same title; normalise both to a 1-D array
        idx = np.atleast_1d(indices[title])
        given_indices.extend(idx.tolist())

        # Pairwise similarity of all movies with this title. Taking the mean
        # over the matched rows makes a title that maps to several movies
        # count once; for a unique title it is simply that movie's row.
        sim_tags = np.asarray(cosine_sim_tags[idx]).mean(axis=0)
        sim_genres = np.asarray(cosine_sim_genres[idx]).mean(axis=0)

        # Average the tag and genre similarity and add it to the running sum
        total_scores += (sim_tags + sim_genres) / 2

    # Sort by descending score; the stable sort keeps ties in row order so the
    # result is deterministic
    ranked = np.argsort(-total_scores, kind='stable')

    # Exclude the given movies themselves
    is_candidate = np.ones(len(merged_data), dtype=bool)
    is_candidate[given_indices] = False
    ranked = ranked[is_candidate[ranked]]

    recommended_movies = merged_data.iloc[ranked]

    # Filter movies based on genre
    if genre:
        recommended_movies = recommended_movies[recommended_movies['genres'].str.contains(genre)]

    # Filter movies based on minimum rating
    recommended_movies = recommended_movies[recommended_movies['rating'] >= min_rating]

    # Keep only the best matches after all filtering
    return recommended_movies['title'].iloc[:top_n]

In the code block above, the function to recommend movies is written. The function works as follows:  
1. Import the necessary tools "CountVectorizer" and "cosine_similarity" from scikit-learn. CountVectorizer is used to convert a collection of text documents to a matrix of token counts.  
cosine_similarity computes the cosine similarity between two samples.
2. Secondly, a CountVectorizer object is created. This object transforms the text data into a matrix of token counts. The parameter "stop_words='english'" tells the  
vectorizer to ignore common English words like 'the', 'is', 'in', etc.
3. After that a count matrix is created for the 'tag' column. This line of code transforms the values from the 'tag' column into a matrix of token counts.
4. Next, the cosine similarity is computed based on the count matrix from before.
5. Then the same two steps above are done for the 'genres' column.
6. Following up the step above, a function is created to recommend movies. In this function the following steps are done:
    - For each given title, it computes the average similarity scores of all movies with that movie, based on both tags and genres.
    - It then sums up the similarity scores for each movie and sorts the movies based on the similarity scores.
    - After that it excludes the given movies from the list.
    - Then it filters the movies by the given genre (if one is provided) and by the given minimum average rating.
    - Finally it returns the titles of the top 5 most similar movies.

In [9]:
# Example: movies similar to both seed titles with a mean rating of at least 3.5
get_recommendations(
    titles=['Social Network, The (2010)', 'Love Actually (2003)'],
    min_rating=3.5,
)

37678                                   Stone Years (1985)
34058                                     Past Life (2016)
32573                                      The King (2002)
10549                                     The Queen (2006)
11426    Diving Bell and the Butterfly, The (Scaphandre...
Name: title, dtype: object