In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse

In [2]:
# Load movies data (MovieLens 100k)
movies = pd.read_csv('https://files.grouplens.org/datasets/movielens/ml-100k/u.item', sep='|', header=None, encoding='latin-1',
                     names=["movie_id", "title", "release_date", "video_release_date", "IMDb_URL",
                            "unknown", "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime",
                            "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery",
                            "Romance", "Sci-Fi", "Thriller", "War", "Western"])

genre_cols = ["Action", "Adventure", "Animation", "Children's", "Comedy", "Crime", "Documentary",
              "Drama", "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi",
              "Thriller", "War", "Western"]

movies['genres'] = movies[genre_cols].apply(lambda x: ' '.join([genre_cols[i] for i in range(len(x)) if x[i] == 1]), axis=1)

# TF-IDF on genres
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(movies['genres'])
cos_sim = cosine_similarity(tfidf_matrix)

# Recommend similar movies
def recommend_movies(movie_title, top_n=5):
    idx = movies[movies['title'].str.contains(movie_title, case=False, na=False)].index[0]
    sim_scores = list(enumerate(cos_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]
    movie_indices = [i[0] for i in sim_scores]
    return movies['title'].iloc[movie_indices]

# Example
recommend_movies("Toy Story")

  movies['genres'] = movies[genre_cols].apply(lambda x: ' '.join([genre_cols[i] for i in range(len(x)) if x[i] == 1]), axis=1)


421    Aladdin and the King of Thieves (1996)
101                    Aristocats, The (1970)
403                          Pinocchio (1940)
624            Sword in the Stone, The (1963)
945             Fox and the Hound, The (1981)
Name: title, dtype: object

In [3]:
# Load MovieLens ratings data
data = Dataset.load_builtin('ml-100k')
trainset, testset = train_test_split(data, test_size=0.2)

# SVD algorithm
model = SVD()
model.fit(trainset)
predictions = model.test(testset)

# Evaluation
print("Collaborative Filtering RMSE:")
rmse(predictions)

Collaborative Filtering RMSE:
RMSE: 0.9432


0.9432041641468383