Cosine Similarity

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
import re

In [2]:
# Load and prepare dataset
movies_df = pd.read_csv('/content/movies.csv')
ratings_df = pd.read_csv('/content/ratings.csv')

In [19]:
movies_df.isna().sum()

Unnamed: 0,0
movieId,0
title,0
genres,0
genres_list,0
clean_title,0


In [20]:
ratings_df.isna().sum()

Unnamed: 0,0
userId,0
movieId,0
rating,0


In [3]:
# Data Cleaning
def clean_title(title):
    return re.sub("[^a-zA-Z0-9 ]", "", title)

In [4]:
movies_df['genres_list'] = movies_df['genres'].str.replace('|', ' ')
movies_df['clean_title'] = movies_df['title'].apply(clean_title)

In [5]:
# Selecting necessary columns for the recommendation
movies_filtered_df = movies_df[['movieId', 'clean_title', 'genres_list']]
ratings_df.drop(['timestamp'], axis=1, inplace=True)

In [6]:
# Merging movies and ratings data
combined_data = ratings_df.merge(movies_filtered_df, on='movieId')
print("Combined Data:")
print(combined_data.head())

Combined Data:
   userId  movieId  rating                                 clean_title  \
0       1      296     5.0                           Pulp Fiction 1994   
1       1      306     3.5  Three Colors Red Trois couleurs Rouge 1994   
2       1      307     5.0  Three Colors Blue Trois couleurs Bleu 1993   
3       1      665     5.0                            Underground 1995   
4       1      899     3.5                     Singin in the Rain 1952   

                   genres_list  
0  Comedy Crime Drama Thriller  
1                        Drama  
2                        Drama  
3             Comedy Drama War  
4       Comedy Musical Romance  


In [7]:
# Cosine Similarity based on Titles
title_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
tfidf_title_matrix = title_vectorizer.fit_transform(movies_filtered_df['clean_title'])

In [8]:
# Cosine Similarity based on Genres
genres_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
tfidf_genres_matrix = genres_vectorizer.fit_transform(movies_filtered_df['genres_list'])


In [9]:
# Function to search by title using Cosine Similarity
def search_by_title(title):
    title = clean_title(title)
    query_vec = title_vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf_title_matrix).flatten()
    indices = np.argpartition(similarity, -5)[-5:]
    results = movies_filtered_df.iloc[indices][::-1]
    return results

In [10]:
# Function to search by genre similarity
def search_similar_genres(genres):
    query_vec = genres_vectorizer.transform([genres])
    similarity = cosine_similarity(query_vec, tfidf_genres_matrix).flatten()
    indices = np.argpartition(similarity, -10)[-10:]
    results = movies_filtered_df.iloc[indices][::-1]
    return results

In [11]:
def scores_calculator(movie_id):
    similar_users = combined_data[(combined_data['movieId'] == movie_id) & (combined_data['rating'] >= 4)]['userId'].unique()
    similar_user_recs = combined_data[(combined_data['userId'].isin(similar_users)) & (combined_data['rating'] >= 4)]['movieId']
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

    all_users = combined_data[(combined_data['movieId'].isin(similar_user_recs.index)) & (combined_data['rating'] >= 4)]
    all_users_recs = all_users['movieId'].value_counts() / all_users['userId'].nunique()

    genres_of_selected_movie = combined_data[combined_data['movieId'] == movie_id]['genres_list'].unique()
    genres_of_selected_movie = np.array2string(genres_of_selected_movie)
    movies_with_similar_genres = search_similar_genres(genres_of_selected_movie)

    indices = movies_with_similar_genres[movies_with_similar_genres['movieId'].isin(similar_user_recs.index)]['movieId']
    similar_user_recs.loc[indices] *= 1.5

    indices = movies_with_similar_genres[movies_with_similar_genres['movieId'].isin(all_users_recs.index)]['movieId']
    all_users_recs.loc[indices] *= 0.9

    rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
    rec_percentages.columns = ['similar', 'all']
    rec_percentages['score'] = rec_percentages['similar'] / rec_percentages['all']

    rec_percentages = rec_percentages.sort_values('score', ascending=False)
    return rec_percentages

In [12]:
# Final recommendation function
def recommendation_results(user_input, title=0):
    title_candidates = search_by_title(user_input)
    movie_id = title_candidates.iloc[title]['movieId']
    scores = scores_calculator(movie_id)
    results = scores.head(10).merge(movies_filtered_df, left_index=True, right_on='movieId')[['clean_title', 'score', 'genres_list']]
    results.rename(columns={'clean_title': 'title', 'genres_list': 'genres'}, inplace=True)
    return results

# Display user options for title search
user_input = "Toy Story"
print("Are you looking for (please choose a number): ")
for i in range(5):
    print(i, ": ", search_by_title(user_input)['clean_title'].iloc[i])

# User selects a title
title_choice = 0
if title_choice in range(5):
    print("We have the following recommendations: ")
    print(recommendation_results(user_input, title_choice))
else:
    print("Sorry! please try again!")

Are you looking for (please choose a number): 
0 :  Toy Story 2 1999
1 :  Toy Story 3 2010
2 :  Toy Story 1995
3 :  Toy Story 4 2019
4 :  Toy Story of Terror 2013
We have the following recommendations: 
                                        title      score  \
22633  Toy Story Toons Hawaiian Vacation 2011  15.455353   
22634          Toy Story Toons Small Fry 2011  14.043053   
22286                            Aladdin 1992  12.390929   
7144            Hangin with the Homeboys 1991   9.912743   
13140                       Frontrunners 2008   9.912743   
14093                   My Sister Eileen 1942   9.912743   
3021                         Toy Story 2 1999   9.912743   
29142                   The Killing Kind 1973   9.912743   
27828                      HalfBreed The 1916   9.912743   
27968                         The Corpse 1970   9.912743   

                                            genres  
22633  Adventure Animation Children Comedy Fantasy  
22634  Adventure Animation Chi

K nearest neighbor

In [13]:
# KNN-based Recommendations using User-Item Matrix
# Adding rating count per movie title for filtering
user_movie_ratings = combined_data.groupby('clean_title')['rating'].count().reset_index().rename(columns={'rating': 'rating_count'})
popular_movies_df = combined_data.merge(user_movie_ratings, on='clean_title', how='left')
print(f"10% of count= {popular_movies_df.rating_count.quantile(.1)}")
print(f"25% of count= {popular_movies_df.rating_count.quantile(.25)}")
print(f"50% of count= {popular_movies_df.rating_count.quantile(.5)}")
print(f"75% of count= {popular_movies_df.rating_count.quantile(.75)}")
print(f"85% of count= {popular_movies_df.rating_count.quantile(.85)}")
print(f"100% of count= {popular_movies_df.rating_count.quantile(1)}")

10% of count= 825.0
25% of count= 2986.0
50% of count= 9152.0
75% of count= 20757.0
85% of count= 30910.0
100% of count= 81491.0


In [14]:
# Filtering movies with rating count threshold
popular_movies_df = popular_movies_df[popular_movies_df['rating_count'] >= 10000]
print("Popular Movies DataFrame Shape:", popular_movies_df.shape)


Popular Movies DataFrame Shape: (11878413, 6)


In [15]:
# Creating a pivot table for the KNN algorithm
movie_user_matrix = popular_movies_df.pivot_table(index='clean_title', columns='userId', values='rating').fillna(0)
print("Movie-User Pivot Table:")
print(movie_user_matrix.head())

Movie-User Pivot Table:
userId                           1       2       3       4       5       \
clean_title                                                               
10 Things I Hate About You 1999     0.0     0.0     0.0     0.0     0.0   
12 Angry Men 1957                   0.0     0.0     0.0     0.0     0.0   
2001 A Space Odyssey 1968           0.0     0.0     5.0     4.0     0.0   
28 Days Later 2002                  0.0     0.0     3.0     0.0     0.0   
300 2007                            0.0     0.0     4.0     0.0     0.0   

userId                           6       7       8       9       10      ...  \
clean_title                                                              ...   
10 Things I Hate About You 1999     0.0     0.0     0.0     0.0     3.0  ...   
12 Angry Men 1957                   5.0     0.0     4.0     0.0     0.0  ...   
2001 A Space Odyssey 1968           4.0     0.0     5.0     3.0     4.5  ...   
28 Days Later 2002                  0.0     0.0   

In [16]:
# Converting pivot table to a sparse matrix
movie_user_sparse_matrix = csr_matrix(movie_user_matrix.values)

In [17]:
# Initializing and train KNN model
knn_model = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=11)
knn_model.fit(movie_user_sparse_matrix)
print("KNN Model trained successfully.")

KNN Model trained successfully.


In [18]:
# KNN recommendation function
def knn_recommendations(random_movie_index):
    distances, indices = knn_model.kneighbors(movie_user_matrix.iloc[random_movie_index, :].values.reshape(1, -1), n_neighbors=11)
    for i, (distance, idx) in enumerate(zip(distances.flatten(), indices.flatten())):
        if i == 0:
            print(f'Recommendation for the movie: {movie_user_matrix.index[idx]}')
        else:
            print(f'{i}: {movie_user_matrix.index[idx]} with a distance of {distance}')

# Example of KNN recommendation
random_movie_index = np.random.choice(movie_user_sparse_matrix.shape[0])
knn_recommendations(random_movie_index)

Recommendation for the movie: Pans Labyrinth Laberinto del fauno El 2006
1: Children of Men 2006 with a distance of 0.5018557109340256
2: Donnie Darko 2001 with a distance of 0.5062206873212357
3: Eternal Sunshine of the Spotless Mind 2004 with a distance of 0.510537159834269
4: V for Vendetta 2006 with a distance of 0.511195143489295
5: No Country for Old Men 2007 with a distance of 0.5152410894698674
6: Little Miss Sunshine 2006 with a distance of 0.5225744422027347
7: Sin City 2005 with a distance of 0.5316591190316157
8: WALLE 2008 with a distance of 0.532382315840166
9: Prestige The 2006 with a distance of 0.5345969664668239
10: Dark Knight The 2008 with a distance of 0.5352003355651167
