In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 1. Load Data
def load_data():
    """
    Load a sample dataset of movies with their titles and genres.
    """
    data = {
        'movie_id': [1, 2, 3, 4, 5],
        'movie_title': ['The Matrix', 'The Lion King', 'The Dark Knight', 'Toy Story', 'Inception'],
        'genres': ['Action Sci-Fi', 'Animation Family', 'Action Crime', 'Animation Family', 'Action Sci-Fi']
    }
    return pd.DataFrame(data)

# 2. Create TF-IDF Matrix for Genres
def create_tfidf_matrix(df):
    """
    Create a TF-IDF matrix based on the genres of the movies.
    """
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(df['genres'])
    return tfidf_matrix

# 3. Compute Cosine Similarity Between Movies
def compute_cosine_similarity(tfidf_matrix):
    """
    Compute the cosine similarity between all movie pairs based on their genre features.
    """
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
    return cosine_sim

# 4. Generate Recommendations
def generate_recommendations(df, movie_title, cosine_sim):
    """
    Generate movie recommendations based on cosine similarity for a given movie.
    """
    # Get the index of the movie that matches the movie_title
    idx = df.index[df['movie_title'] == movie_title].tolist()[0]
    
    # Get the pairwise similarity scores for the movie
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the movies based on similarity scores (in descending order)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the top 3 most similar movies (excluding the movie itself)
    sim_scores = sim_scores[1:4]
    
    # Get the movie indices and their similarity scores
    movie_indices = [i[0] for i in sim_scores]
    movie_scores = [i[1] for i in sim_scores]
    
    # Return the top 3 recommended movies and their similarity scores
    recommended_movies = df.iloc[movie_indices].copy()  # Create a copy to avoid SettingWithCopyWarning
    
    # Use .loc to assign the similarity_score column properly
    recommended_movies.loc[:, 'similarity_score'] = movie_scores
    
    return recommended_movies[['movie_title', 'similarity_score']]

# 5. Main function to run the recommendation system
def main(movie_title):
    df = load_data()  # Load the movie data
    tfidf_matrix = create_tfidf_matrix(df)  # Create the TF-IDF matrix
    print(f"tfidf_matrix={tfidf_matrix}"); print()
    
    cosine_sim = compute_cosine_similarity(tfidf_matrix)  # Calculate cosine similarity
    print(f"cosine_sim={cosine_sim}"); print()
    
    recommendations = generate_recommendations(df, movie_title, cosine_sim)  # Get movie recommendations
    return recommendations


In [7]:

if __name__ == "__main__":
    movie_title = 'The Matrix'  # Example movie to get recommendations for
    recommendations = main(movie_title)
    print(f"Recommendations for '{movie_title}':")
    print(recommendations)


tfidf_matrix=<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 12 stored elements and shape (5, 6)>
  Coords	Values
  (0, 0)	0.5062044059286201
  (0, 5)	0.6098184563533858
  (0, 4)	0.6098184563533858
  (1, 1)	0.7071067811865476
  (1, 3)	0.7071067811865476
  (2, 0)	0.5564505207186616
  (2, 2)	0.830880748357988
  (3, 1)	0.7071067811865476
  (3, 3)	0.7071067811865476
  (4, 0)	0.5062044059286201
  (4, 5)	0.6098184563533858
  (4, 4)	0.6098184563533858

cosine_sim=[[1.         0.         0.28167771 0.         1.        ]
 [0.         1.         0.         1.         0.        ]
 [0.28167771 0.         1.         0.         0.28167771]
 [0.         1.         0.         1.         0.        ]
 [1.         0.         0.28167771 0.         1.        ]]

Recommendations for 'The Matrix':
       movie_title  similarity_score
4        Inception          1.000000
2  The Dark Knight          0.281678
1    The Lion King          0.000000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recommended_movies.loc[:, 'similarity_score'] = movie_scores
