In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Read the two '::'-delimited .dat files. Column names are supplied
# explicitly through `names`, and the multi-character separator is why the
# python parsing engine is requested.
ratings = pd.read_csv(
    'Resources/ratings.dat',
    sep='::',
    engine='python',
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
    encoding='ISO-8859-1',
)
movies = pd.read_csv(
    'Resources/movies.dat',
    sep='::',
    engine='python',
    names=['movie_id', 'title', 'genres'],
    encoding='ISO-8859-1',
)


In [2]:
# Build one text field per movie ("<genres> <title>") to feed the vectorizer.
movies['combined_features'] = movies['genres'].str.cat(movies['title'], sep=' ')

# Fit a TF-IDF model (English stop words removed) on that text and keep both
# the fitted vectorizer and the resulting document-term matrix.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['combined_features'])

In [3]:
# Pairwise similarity between every pair of movies. linear_kernel is a plain
# dot product; it equals cosine similarity here because TfidfVectorizer
# L2-normalizes each row by default. With a single argument the matrix is
# compared against itself.
cosine_sim = linear_kernel(tfidf_matrix)

In [6]:
def get_content_based_recommendations(title, cosine_sim=cosine_sim):
    """Return the 10 movies most similar to the first title matching `title`.

    Parameters
    ----------
    title : str
        Text to look for in ``movies['title']``. It is matched as a literal,
        case-insensitive substring, so titles containing regex
        metacharacters (e.g. ``"Godfather, The (1972)"``) work as typed.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix whose row/column ``i`` is expected to
        correspond to the ``i``-th row of ``movies``. The default is bound
        when this ``def`` is executed, so re-run this cell after recomputing
        the global ``cosine_sim``.

    Returns
    -------
    pandas.Series or str
        Titles of up to 10 most similar movies (most similar first), or an
        explanatory message string when no title matches.
    """
    # regex=False: treat the query literally; the default regex mode would
    # interpret characters such as '(', ')' or '+' as pattern syntax.
    match_mask = movies['title'].str.contains(title, case=False, na=False, regex=False)
    if not match_mask.any():
        return f"No matches found for {title}. Please check the title and try again."

    # Positional row of the first match. cosine_sim is indexed by position,
    # so an index label would only be correct for a default RangeIndex.
    idx = int(match_mask.to_numpy().argmax())

    # Drop the query movie explicitly instead of assuming it sorts first:
    # another movie with identical features ties at the top score and may
    # precede it in the (stable) sort.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    movie_indices = [position for position, _ in sim_scores[:10]]
    return movies['title'].iloc[movie_indices]


In [9]:
# Example: recommend movies similar to the first title containing "Godfather".
recommendations = get_content_based_recommendations(title='Godfather')
print(recommendations)

1954       Godfather: Part III, The (1990)
1203        Godfather: Part II, The (1974)
3713             Shaft's Big Score! (1972)
1066                 Candidate, The (1972)
3519    King of Marvin Gardens, The (1972)
1944        Poseidon Adventure, The (1972)
2109                         Frenzy (1972)
3688                         Asylum (1972)
3694                            F/X (1986)
3695                          F/X 2 (1992)
Name: title, dtype: object
