In [1]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [2]:
# Load MovieLens data
movies = pd.read_csv("/Users/starun/projects/movie_rec/ml-latest-small/movies.csv")

In [3]:
# Split genres and prepare genre list
movies['genres'] = movies['genres'].apply(lambda x: x.split('|') if isinstance(x, str) else [])
mlb = MultiLabelBinarizer()
genre_features = mlb.fit_transform(movies['genres'])

In [4]:
# Create final movie feature DataFrame
genre_df = pd.DataFrame(genre_features, columns=mlb.classes_)
movie_features = pd.concat([movies[['movieId', 'title']], genre_df], axis=1)

In [5]:
# Sample user vector from questionnaire (assume selected genres: Comedy + Drama)
user_vector = np.zeros(len(mlb.classes_))
for genre in ['Comedy', 'Drama']:
    if genre in mlb.classes_:
        idx = list(mlb.classes_).index(genre)
        user_vector[idx] = 1

In [6]:
# Compute cosine similarity between user vector and movie vectors
movie_vectors = genre_features
similarities = cosine_similarity([user_vector], movie_vectors)[0]


In [7]:
# Add similarity to DataFrame and get top 10
movie_features['similarity'] = similarities
top_recommendations = movie_features.sort_values(by='similarity', ascending=False).head(10)

In [11]:
top_recommendations.head()

Unnamed: 0,movieId,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,similarity
3011,4029,State and Main (2000),0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1.0
3813,5339,Husbands and Wives (1992),0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1.0
7959,96121,Hope Springs (2012),0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1.0
6153,44195,Thank You for Smoking (2006),0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1.0
329,371,"Paper, The (1994)",0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1.0
