In [1]:
# Core libraries
import pandas as pd
import numpy as np

# For content-based filtering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# For collaborative filtering (replacing surprise)
from sklearn.decomposition import TruncatedSVD

# For sentiment analysis
from textblob import TextBlob

In [6]:
# --- Data Loading ---
print("Loading datasets...")
movies_df = pd.read_csv('ml-latest-small/movies.csv')
ratings_df = pd.read_csv('ml-latest-small/ratings.csv')

Loading datasets...


In [7]:
print("Movies DataFrame:")
print(movies_df.head())
print("\nRatings DataFrame:")
ratings_df.head()

Movies DataFrame:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  

Ratings DataFrame:


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [8]:
# Merge movies and ratings data
movie_ratings_df = pd.merge(ratings_df, movies_df, on='movieId')

print("Merged DataFrame:")
print(movie_ratings_df.head())

# Check for any missing values
print("\nMissing values:")
print(movie_ratings_df.isnull().sum())

Merged DataFrame:
   userId  movieId  rating  timestamp                        title  \
0       1        1     4.0  964982703             Toy Story (1995)   
1       1        3     4.0  964981247      Grumpier Old Men (1995)   
2       1        6     4.0  964982224                  Heat (1995)   
3       1       47     5.0  964983815  Seven (a.k.a. Se7en) (1995)   
4       1       50     5.0  964982931   Usual Suspects, The (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                               Comedy|Romance  
2                        Action|Crime|Thriller  
3                             Mystery|Thriller  
4                       Crime|Mystery|Thriller  

Missing values:
userId       0
movieId      0
rating       0
timestamp    0
title        0
genres       0
dtype: int64


In [9]:
# 1. Create the user-item matrix
print("Creating user-item matrix...")
user_item_matrix = movie_ratings_df.pivot_table(index='userId', columns='title', values='rating')

# 2. Fill missing values with 0 (important for the model)
# This assumes that a non-rating is a neutral score
user_item_matrix_filled = user_item_matrix.fillna(0)

print("Shape of user-item matrix:", user_item_matrix_filled.shape)
print(user_item_matrix_filled.head())

# 3. Apply TruncatedSVD
# This decomposes the matrix into user features and item features
print("\nApplying TruncatedSVD...")
svd = TruncatedSVD(n_components=50, random_state=42) # n_components = number of latent factors
matrix_decomposed = svd.fit_transform(user_item_matrix_filled)

# 4. Reconstruct the matrix with predicted ratings
# This gives us a dense matrix with predicted ratings for all movies by all users
predicted_ratings = np.dot(matrix_decomposed, svd.components_)

# 5. Convert back to a pandas DataFrame for easy lookup
preds_df = pd.DataFrame(predicted_ratings, 
                        index=user_item_matrix_filled.index, 
                        columns=user_item_matrix_filled.columns)

print("Reconstructed predictions matrix:")
print(preds_df.head())

Creating user-item matrix...
Shape of user-item matrix: (610, 9719)
title   '71 (2014)  'Hellboy': The Seeds of Creation (2004)  \
userId                                                        
1              0.0                                      0.0   
2              0.0                                      0.0   
3              0.0                                      0.0   
4              0.0                                      0.0   
5              0.0                                      0.0   

title   'Round Midnight (1986)  'Salem's Lot (2004)  \
userId                                                
1                          0.0                  0.0   
2                          0.0                  0.0   
3                          0.0                  0.0   
4                          0.0                  0.0   
5                          0.0                  0.0   

title   'Til There Was You (1997)  'Tis the Season for Love (2015)  \
userId                            

In [14]:
# --- FINAL CORRECTED Section 1.3.2 ---

# 1. First, handle the special case of movies with no genres.
movies_df['genres_cleaned'] = movies_df['genres'].replace('(no genres listed)', '')

# 2. Now, replace the '|' separator, telling pandas it's NOT a regular expression.
#    This is the key fix.
movies_df['genres_processed'] = movies_df['genres_cleaned'].str.replace('|', ' ', regex=False)

# 3. Create a TF-IDF Vectorizer (without stop_words)
#    It will now see full words like "Adventure" and "Animation".
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(movies_df['genres_processed'])

# 4. Calculate the cosine similarity matrix
print("\nCalculating Content-Based Similarity Matrix...")
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# 5. Create a mapping from movie title to index
title_to_index = pd.Series(movies_df.index, index=movies_df['title'])

print("Cosine similarity matrix calculation complete!")


Calculating Content-Based Similarity Matrix...
Cosine similarity matrix calculation complete!


In [15]:
# Simulate Reviews (for demonstration purposes)
dummy_reviews = [
    "Absolutely fantastic, a must-watch!", "A bit slow, but worth it.",
    "Not my cup of tea.", "Loved every minute of it.", "Quite boring."
]
np.random.seed(42)
sample_size = 1000
review_indices = np.random.choice(movie_ratings_df.index, sample_size, replace=False)
reviews_df = movie_ratings_df.loc[review_indices].copy()
reviews_df['review'] = [dummy_reviews[i] for i in np.random.randint(0, len(dummy_reviews), sample_size)]

# Calculate sentiment
reviews_df['sentiment'] = reviews_df['review'].apply(lambda x: TextBlob(x).sentiment.polarity)
movie_sentiment = reviews_df.groupby('title')['sentiment'].mean().reset_index()

print("\nAverage sentiment per movie (sample):")
print(movie_sentiment.head())


Average sentiment per movie (sample):
                          title     sentiment
0           12 Angry Men (1957)  0.000000e+00
1      13th Warrior, The (1999)  5.000000e-01
2  2001: A Space Odyssey (1968) -2.775558e-17
3                   2012 (2009) -2.775558e-17
4                     21 (2008)  7.000000e-01


In [24]:
def get_hybrid_recommendations(user_id, movie_title, top_n=5):
    """
    A robust hybrid recommender that uses a content-based model to generate
    candidates and a collaborative model to re-rank them for the user.
    """
    # 1. --- Candidate Generation (Content-Based) ---
    # Widen the pool to get enough potential matches.
    CANDIDATE_POOL_SIZE = 200
    
    if movie_title not in title_to_index:
        return pd.DataFrame() # Return empty if the seed movie isn't found
        
    movie_idx = title_to_index[movie_title]
    sim_scores = list(enumerate(cosine_sim[movie_idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:CANDIDATE_POOL_SIZE + 1]
    similar_movie_indices = [i[0] for i in sim_scores]
    
    # Get the titles of the candidate movies
    candidate_movies = movies_df.iloc[similar_movie_indices]

    # 2. --- Re-Ranking (Collaborative Filtering) ---
    # Filter the candidate list to only include movies our SVD model knows about.
    # Also remove movies the user has already seen.
    
    recommendations = []
    already_rated_titles = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] > 0].index

    for title in candidate_movies['title']:
        if title in preds_df.columns and title not in already_rated_titles:
            # Get the predicted rating for this movie and this user
            predicted_rating = preds_df.loc[user_id, title]
            recommendations.append((title, predicted_rating))

    # 3. --- Final Output ---
    if not recommendations:
        return pd.DataFrame() # Still possible to be empty if no candidates are valid

    # Create a DataFrame from the successful recommendations
    rec_df = pd.DataFrame(recommendations, columns=['title', 'predicted_rating'])
    
    # Sort the final list by the predicted rating
    rec_df = rec_df.sort_values('predicted_rating', ascending=False)
    
    return rec_df.head(top_n)

# --- This will now give you a valid result ---
recommendations = get_hybrid_recommendations(10, 'Jumanji (1995)')
print("\n--- Final Recommendations ---")
print(recommendations)


--- Final Recommendations ---
                                                 title  predicted_rating
71                               Monsters, Inc. (2001)          1.745377
8    Harry Potter and the Sorcerer's Stone (a.k.a. ...          1.694957
141     Harry Potter and the Chamber of Secrets (2002)          1.662235
9    Chronicles of Narnia: The Lion, the Witch and ...          0.945452
130                                  Inside Out (2015)          0.707324
