In [61]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [62]:
# Load the dataset
movies_df = pd.read_csv('resources/movies_metadata.csv')

In [63]:
# Feature Engineering: TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english')

In [64]:
movies_df['overview'] = movies_df['overview'].fillna('')

In [65]:
# Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(movies_df['overview'])

In [66]:
# Feature Engineering: Release Year"
movies_df['year'] = pd.to_datetime(movies_df['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [67]:
# Content-Based Model
from sklearn.metrics.pairwise import cosine_similarity

In [68]:
# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)


In [69]:
# Construct a reverse map of indices and movie titles
indices = pd.Series(movies_df.index, index=movies_df['title']).drop_duplicates()


In [76]:
def get_recommendations_based_on_content(title):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return movies_df.loc[movie_indices].nlargest(10, 'similarity_score')['title']


In [77]:
# Collaborative Filtering Model
def weighted_rating(x, m, C):
    v = x['vote_count']
    R = x['vote_average']
    # Calculation based on the IMDB formula
    return (v / (v + m) * R) + (m / (m + v) * C)

# Define a minimum threshold for the number of votes required to be considered
vote_count_threshold = 1000
movies_df = movies_df[movies_df['vote_count'] >= vote_count_threshold]

# Calculate the mean vote average and vote count
C = movies_df['vote_average'].mean()
m = movies_df['vote_count'].quantile(0.9)

# Add a new column for the weighted rating
movies_df['weighted_rating'] = movies_df.apply(weighted_rating, args=(m, C), axis=1)


In [78]:
#Collaborative Filtering Model 
def get_recommendations_based_on_collaborative(user_ratings):
    # Calculate the weighted rating of each movie
    movies_df['weighted_rating'] = movies_df.apply(weighted_rating, args=(m, C), axis=1)
    # Sort the movies based on their weighted rating in descending order
    movies_df = movies_df.sort_values('weighted_rating', ascending=False)

    # Return the top 10 movies
    return movies_df['title'].head(10)

In [79]:
def get_recommendations_based_on_mood(mood):
    # Construct the required TF-IDF matrix by fitting and transforming the data
    tfidf_matrix = tfidf.fit_transform(movies_df['overview'])

    # Compute the cosine similarity matrix
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # Get the indices of the movies that contain the mood keyword in their overview
    indices = movies_df[movies_df['overview'].str.contains(mood, case=False)].index

    # Get the pairwise similarity scores of all movies with those movies
    sim_scores = []
    for idx in indices:
        sim_scores.extend(list(enumerate(cosine_sim[idx])))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return movies_df['title'].iloc[movie_indices]

In [80]:
def get_recommendations_based_on_keywords(keywords):
    # Construct the required TF-IDF matrix by fitting and transforming the data
    tfidf_matrix = tfidf.fit_transform(movies_df['overview'])

    # Compute the cosine similarity matrix
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # Get the indices of the movies that contain the input keywords in their overview
    indices = movies_df[movies_df['overview'].str.contains(keywords, case=False)].index

    # Get the pairwise similarity scores of all movies with those movies
    sim_scores = []
    for idx in indices:
        sim_scores.extend(list(enumerate(cosine_sim[idx])))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    
    # Return the top 10 most similar movies
    return movies_df['title'].iloc[movie_indices]

In [83]:
# Test the models
print(get_recommendations_based_on_content('Jumanji'))

KeyError: "None of [Int64Index([16650, 29621, 25977, 7543, 5405, 13490, 23261, 7986, 28574, 11041], dtype='int64')] are in the [index]"

In [85]:
# User Interface
def recommend_movies():
    print('Welcome to the movie recommendation system!')
    print('What would you like to do?')
    print('1. Get movie recommendations based on your mood')
    print('2. Get movie recommendations based on keywords')
    choice = input('Enter your choice (1 or 2): ')

    if choice == '1':
        mood = input('Enter your mood: ')
        recommendations = get_recommendations_based_on_mood(mood)
        print('Here are your recommended movies:')
        print(recommendations)
    elif choice == '2':
        keywords = input('Enter some keywords (separated by commas): ')
        recommendations = get_recommendations_based_on_keywords(keywords)
        print('Here are your recommended movies:')
        print(recommendations)
    else:
        print('Invalid choice')

recommend_movies()


Welcome to the movie recommendation system!
What would you like to do?
1. Get movie recommendations based on your mood
2. Get movie recommendations based on keywords


Enter your choice (1 or 2):  2
Enter some keywords (separated by commas):  comedy, action 


Here are your recommended movies:
Series([], Name: title, dtype: object)
