In [30]:
import pandas as pd
import numpy as np
import re

In [31]:

df1 = pd.read_csv(r'C:\Users\Asus\Downloads\ml-25m\ml-25m\movies.csv')

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from scipy.sparse import lil_matrix

In [33]:
df1.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [34]:
df1.loc[df1['genres'] == 'no genres listed', 'genres'] = ''
df1['genres'] = df1['genres'].str.replace('|', ' ')
df1['title'] = df1['title'].apply(lambda x: re.sub(r'\s*\([^)]*\)', '', x))


In [35]:
df1['title'].head(100)

0                         Toy Story
1                           Jumanji
2                  Grumpier Old Men
3                 Waiting to Exhale
4       Father of the Bride Part II
                  ...              
95                             Hate
96                         Shopping
97    Heidi Fleiss: Hollywood Madam
98                        City Hall
99                    Bottle Rocket
Name: title, Length: 100, dtype: object

In [36]:
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df1['genres'])
tfidf_matrix_dense = tfidf_matrix.toarray()

In [37]:
pca = PCA(n_components=23, svd_solver="arpack")
tfidf_matrix_reduced_pca = pca.fit_transform(tfidf_matrix_dense)

In [13]:
cosine_sim = lil_matrix((tfidf_matrix_reduced_pca.shape[0], tfidf_matrix_reduced_pca.shape[0]))

# Compute cosine similarity in chunks
chunk_size = 100
for i in range(0, tfidf_matrix_reduced_pca.shape[0], chunk_size):
    for j in range(i, min(tfidf_matrix_reduced_pca.shape[0], i+chunk_size)):
        if tfidf_matrix_reduced_pca[i:j, :].shape[0] > 0:
            cosine_sim[i:j, i:j] = cosine_similarity(tfidf_matrix_reduced_pca[i:j], tfidf_matrix_reduced_pca[i:j]) 

In [14]:
def content_based_recommendation(title, cosine_sim=cosine_sim):
    idx = df1[df1['title'] == title].index[0] if len(df1[df1['title'] == title]) > 0 else -1
    if idx >= 0:
        lst = list(enumerate(cosine_sim[idx].toarray()))
        sim_scores=np.array(lst[0][1])
        movie_indices = np.argsort((sim_scores))
        movie_indices=movie_indices[1:11]
        return df1['title'].iloc[movie_indices].tolist()
    else:
        raise ValueError("Error: The movie was not found in theÂ dataframe.")

In [52]:
user_favorite_movies = ['Jumanji', 'Toy Story']  # Assuming these are the user's favorite movies
content_based_recommendations = []
for movie in user_favorite_movies:
    recommendations = content_based_recommendation(movie)
    content_based_recommendations.extend(recommendations)

print("Content-Based Recommendations:", content_based_recommendations[:10])

Content-Based Recommendations: ['Big Bully', 'In the Bleak Midwinter', 'Kicking and Screaming', 'Last Summer in the Hamptons', 'To Die For', 'American President, The', 'Waiting to Exhale', 'Mighty Aphrodite', 'Postman, The', 'Beautiful Girls']


In [16]:
df2=pd.read_csv(r'C:\Users\Asus\Downloads\ml-25m\ml-25m\ratings.csv')

In [56]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse

In [18]:
reader=Reader()
data = Dataset.load_from_df(df2[['userId', 'movieId', 'rating']], reader)

In [67]:
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [70]:
algo = SVD(n_factors=24, reg_all=0.001, lr_all=0.005, n_epochs=20)
algo.fit(trainset)
predictions = algo.test(testset)

In [71]:
accuracy = rmse(predictions)
print("RMSE:", accuracy)

RMSE: 0.7876
RMSE: 0.7876319158302448


In [46]:
def collaborative_filtering_recommendations(user_id):
    # Get predicted ratings for all movies for the user
    user_ratings = []
    for movie_id in df2['movieId'].unique():
        user_ratings.append((movie_id, algo.predict(user_id, movie_id).est))
    # Sort movies by predicted rating
    user_ratings.sort(key=lambda x: x[1], reverse=True)
    # Extract top recommendations
    top_recommendations = [movie_id for movie_id, _ in user_ratings[:10]]
    return df1[df1['movieId'].isin(top_recommendations)]


In [51]:
user_id = 24
collab_filtering_recommendations_result = collaborative_filtering_recommendations(user_id)
collab_filtering_recommendations_result

Unnamed: 0,movieId,title,genres
351,356,Forrest Gump,Comedy Drama Romance War
359,364,"Lion King, The",Adventure Animation Children Drama Musical IMAX
4201,4306,Shrek,Adventure Animation Children Comedy Fantasy Ro...
4887,4993,"Lord of the Rings: The Fellowship of the Ring,...",Adventure Fantasy
5840,5952,"Lord of the Rings: The Two Towers, The",Adventure Fantasy
6258,6377,Finding Nemo,Adventure Animation Children Comedy
7028,7153,"Lord of the Rings: The Return of the King, The",Action Adventure Drama Fantasy
40597,159817,Planet Earth,Documentary
45593,170705,Band of Brothers,Action Drama War
45741,171011,Planet Earth II,Documentary


In [79]:
from surprise import accuracy

def evaluate_collaborative_filtering(predictions):
    rmse = accuracy.rmse(predictions, verbose=False)
    return rmse
# Evaluation Metrics for Content-Based (Diversity)
def evaluate_content_based_recommendations(recommendations):
    # Create a DataFrame containing recommendations
    recommended_movies = df1[df1['title'].isin(recommendations)]
    # Calculate diversity metric based on genre distribution
    genre_counts = recommended_movies['genres'].str.split().explode().value_counts()
    total_recommendations = len(recommended_movies)
    diversity = len(genre_counts) / total_recommendations
    return diversity



In [76]:
# Evaluate collaborative filtering system
predictions = algo.test(testset)
collaborative_filtering_rmse = evaluate_collaborative_filtering(predictions)
print("RMSE for Collaborative Filtering:", collaborative_filtering_rmse)


RMSE for Collaborative Filtering: 0.7876319158302448


In [80]:
# Evaluate content-based system
content_based_recommendations = content_based_recommendation("Jumanji")  # Assuming Jumanji is the user's favorite movie
content_based_diversity = evaluate_content_based_recommendations(content_based_recommendations)
print("Diversity for Content-Based Recommendations:", content_based_diversity)

Diversity for Content-Based Recommendations: 0.6363636363636364
