In [12]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Read the movie metadata into the DataFrame used by every later step
movies = pd.read_csv('movies.csv')

# Column groups declared for each kind of similarity lookup
selected_features_title = ['title', 'keywords', 'tagline', 'cast', 'director']
selected_features_genres = ['title', 'genres', 'tagline', 'cast', 'director']
selected_features_keywords = ['title', 'keywords', 'tagline', 'cast', 'director']
selected_features = ['title', 'genres', 'tagline', 'keywords', 'director']
selected_features_cast = ['title', 'genres', 'tagline', 'keywords', 'cast']
selected_features_director = ['title', 'genres', 'tagline', 'keywords', 'director']

# The groups overlap heavily, so collect each column name once (first-seen order)
# rather than re-filling the same column for every group it appears in.
columns_to_clean = list(dict.fromkeys(
    selected_features_title
    + selected_features_genres
    + selected_features_keywords
    + selected_features_cast
    + selected_features_director
))

# Replace null values with empty strings in every selected column
for feature in columns_to_clean:
    movies[feature] = movies[feature].fillna('')

# Build one combined text column per lookup type, vectorize it with TF-IDF and
# compute the pairwise cosine-similarity matrix used for recommendations.


def _join_columns(frame, columns):
    """Return the listed columns of *frame* concatenated row-wise, separated by single spaces."""
    joined = frame[columns[0]]
    for column in columns[1:]:
        joined = joined + ' ' + frame[column]
    return joined


# Source columns, in concatenation order, for each combined text column.
# NOTE(review): the 'cast' and 'director' entries below use 'director'/'cast'
# where selected_features_cast / selected_features_director list 'genres';
# the original concatenation is kept as-is -- confirm which one is intended.
combined_columns = {
    'title': ['title', 'keywords', 'tagline', 'cast', 'director'],
    'genres': ['title', 'genres', 'tagline', 'cast', 'director'],
    'keywords': ['title', 'keywords', 'tagline', 'cast', 'director'],
    'cast': ['title', 'cast', 'tagline', 'keywords', 'director'],
    'director': ['title', 'director', 'tagline', 'cast', 'keywords'],
}

# Combine selected features into single text columns for vectorization
for field, columns in combined_columns.items():
    movies[f'combined_{field}'] = _join_columns(movies, columns)

# TfidfVectorizer's default analyzer is a unigram bag of words, so the order in
# which columns are concatenated does not affect the vectors. Texts built from
# the same set of columns therefore share one fitted vectorizer and one
# similarity matrix instead of recomputing an identical dense n x n matrix.
fitted_by_columns = {}
models = {}
for field, columns in combined_columns.items():
    key = tuple(sorted(columns))
    if key not in fitted_by_columns:
        vectorizer = TfidfVectorizer()
        vectors = vectorizer.fit_transform(movies[f'combined_{field}'])
        fitted_by_columns[key] = (vectorizer, vectors, cosine_similarity(vectors))
    models[field] = fitted_by_columns[key]

# Expose the results under the names the rest of the script uses; names whose
# texts share a column set refer to the same objects.
vectorizer_title, feature_vectors_title, similarity_title = models['title']
vectorizer_genres, feature_vectors_genres, similarity_genres = models['genres']
vectorizer_keywords, feature_vectors_keywords, similarity_keywords = models['keywords']
vectorizer_cast, feature_vectors_cast, similarity_cast = models['cast']
vectorizer_director, feature_vectors_director, similarity_director = models['director']

# Ask the user for a genre, title, keywords, director or cast string, resolve it
# to one movie row, and choose the similarity matrix to rank with.
# Produces: user_input, index_of_movie (row position), similarity_matrix.

# Prompt user to input their favorite movie genre, title, keywords, director or cast name.
# Surrounding whitespace is stripped so a stray space cannot defeat the exact-match checks.
user_input = input('Enter your favorite movie genre, title, keywords, director or cast name: ').strip()

# Similarity matrix used for each searchable column
similarity_by_field = {
    'genres': similarity_genres,
    'title': similarity_title,
    'keywords': similarity_keywords,
    'cast': similarity_cast,
    'director': similarity_director,
}

if not user_input:
    # An empty string would otherwise "exactly match" any movie whose field was
    # blanked by the null cleanup, yielding an arbitrary recommendation.
    print(f"No close match found for '{user_input}'. Please try different keywords.")
    raise SystemExit

# Exact match first, checked in the order genres, title, keywords, cast, director
matched_field = next(
    (
        field
        for field in ('genres', 'title', 'keywords', 'cast', 'director')
        if user_input in movies[field].values
    ),
    None,
)
matched_value = user_input

if matched_field is None:
    # No exact hit: take the single closest candidate from each column and keep
    # the one with the highest similarity ratio across all columns. On a tie the
    # earlier column in this order (title first) wins.
    best_match_type = None
    best_match_value = None
    best_ratio = 0.0
    for field in ('title', 'genres', 'keywords', 'cast', 'director'):
        candidates = difflib.get_close_matches(
            user_input, movies[field].astype(str).tolist(), n=1
        )
        if not candidates:
            continue
        # Same argument order get_close_matches uses internally (candidate first)
        ratio = difflib.SequenceMatcher(None, candidates[0], user_input).ratio()
        if ratio > best_ratio:
            best_match_type = field
            best_match_value = candidates[0]
            best_ratio = ratio

    if best_match_type is None:
        print(f"No close match found for '{user_input}'. Please try different keywords.")
        # SystemExit stops the script without relying on the interactive exit() helper
        raise SystemExit

    print(f"Close match found for {best_match_type}: {best_match_value}")
    matched_field = best_match_type
    matched_value = best_match_value

# Row position (not index label) of the first movie carrying the matched value;
# the similarity matrices are indexed by row position.
is_match = (movies[matched_field].astype(str) == matched_value).to_numpy()
index_of_movie = int(np.flatnonzero(is_match)[0])
similarity_matrix = similarity_by_field[matched_field]

# Pair every movie row with its similarity to the matched movie
scores_for_match = similarity_matrix[index_of_movie]
similarity_scores = [(position, score) for position, score in enumerate(scores_for_match)]

# Rank best-first; sorted() is stable, so equal scores keep their row order
sorted_similar = sorted(similarity_scores, key=lambda pair: pair[1], reverse=True)

# Print the 15 highest-ranked titles, numbered from 1
print('Movies suggested for you:')
rank = 0
for position, _score in sorted_similar[:15]:
    rank += 1
    suggested_title = movies.loc[position, 'title']
    print(f"{rank}. {suggested_title}")


Enter your favorite movie genre, title, keywords, director or cast name: Johnny Depp Orlando Bloom Keira Knightle
Close match found for cast: Johnny Depp Geoffrey Rush Orlando Bloom Keira Knightley Jack Davenport
Movies suggested for you:
1. Pirates of the Caribbean: The Curse of the Black Pearl
2. Pirates of the Caribbean: At World's End
3. Pirates of the Caribbean: Dead Man's Chest
4. Pirates of the Caribbean: On Stranger Tides
5. The Lord of the Rings: The Return of the King
6. Anna and the King
7. Dark Shadows
8. Kingdom of Heaven
9. The Lone Ranger
10. The Lord of the Rings: The Fellowship of the Ring
11. Seeking a Friend for the End of the World
12. The Mexican
13. Cutthroat Island
14. Shine
15. Rango
