Load and Preprocess Data

In [40]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from pathlib import Path

In [41]:

path=Path("Resources/movies_cleaned.json")
df = pd.read_json(path)

Feature Engineering for Genres
Convert the cleaned_genres column into a feature set using TF-IDF or one-hot encoding. For TF-IDF:

In [42]:
# Prepare the TF-IDF matrix for movie genres, where each row is a movie
tfidf = TfidfVectorizer(stop_words='english')
# Important: Apply TF-IDF transformation for each movie, not just unique genres
tfidf_matrix = tfidf.fit_transform(df['cleaned_genres'])

Create User Profile

In [43]:
# Map movieIds to their index in tfidf_matrix for easy access
movie_id_to_index = pd.Series(df.index, index=df['movieId']).to_dict()

# Initialize user_profiles DataFrame
user_profiles = pd.DataFrame(0, index=df['userId'].unique(), columns=tfidf.get_feature_names_out())


In [44]:
# Calculate user profiles
for user in df['userId'].unique():
    user_movies = df[df['userId'] == user]
    for _, row in user_movies.iterrows():
        idx = movie_id_to_index[row['movieId']]
        genre_vector = tfidf_matrix[idx].toarray()
        rating = row['rating']
        user_profiles.loc[user] += rating * genre_vector.flatten()

    # Normalize the user profile
    user_profiles.loc[user] = user_profiles.loc[user] / user_movies['rating'].sum()



Generate Recommendation

In [45]:
# Select a specific user to generate recommendations for
example_user_id = 23  # For example, choosing user 23 explicitly for clarity
user_profile = user_profiles.loc[example_user_id].values.reshape(1, -1)


In [46]:
# Calculate similarity scores between the user profile and all movie genre vectors
similarity_scores = cosine_similarity(user_profile, tfidf_matrix)

In [47]:
# Get top 5 recommended movie indices
top_indices = similarity_scores.argsort()[0][-6:-1]  # Exclude the last one as it will be the user's own highest match

In [48]:
# Fetching recommended movie titles
recommended_movies = df.iloc[top_indices]['title'].unique()

print(f"Recommended Movies for User {example_user_id}: {recommended_movies}")

Recommended Movies for User 23: ['Confessions of a Dangerous Mind']
