
https://umair-iftikhar.medium.com/building-a-simple-recommendation-system-with-item-item-collaborative-filtering-in-python-3baae5179c52

In [1]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd


USER_ID = 240567         # user_id for which we are going to recommend movies
TOP = 20                 # number of movies to recommend

In [2]:
# Load the books dataset
df_books_name = pd.read_csv("BX-Books.csv", on_bad_lines='skip', sep=';',low_memory=False)
df_books_name = df_books_name.iloc[:, :-3]
df_books_name = df_books_name.set_index('ISBN')
df_books_name = df_books_name.rename_axis(None)

# Load the ratings dataset
df = pd.read_csv("BX-Book-Ratings.csv", on_bad_lines='skip', sep=';')
df = df[df['Book-Rating'] != 0]

#Remove the books that are not in the books_name dataset
df = df[df['ISBN'].isin(df_books_name.index)]


# Cut the dataframe
# cut = 200000
# df = df[:cut]

# Remove the books with less than 10 ratings
df_filter = df.groupby('ISBN').filter(lambda x: len(x) >= 2)


# Unique list of all the users and books
users = df_filter['User-ID'].unique()
books = df_filter['ISBN'].unique()
print("Number of books: ", len(books))
# Create an empty dataframe
df_books = pd.DataFrame(index=users, columns=books)
df_books = df_books.fillna(0)

# fill the dataframe with the ratings
for index, row in df_filter.iterrows():
    df_books.at[row['User-ID'], row['ISBN']] = row['Book-Rating']




Number of books:  50423


MemoryError: Unable to allocate 22.2 GiB for an array with shape (59190, 50423) and data type object

In [3]:
df_books

Unnamed: 0,038550120X,0060517794,0671537458,0679776818,0684867621,0451166892,0380711524,0345443683,043935806X,055310666X,...,1551666561,0553273906,0875421318,0380973820,0449242072,0452283795,067102731X,0060294671,1853262102,0441005667
276744,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
276747,0,9,9,8,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
276754,0,0,0,0,8,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
276755,0,0,0,0,0,5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
276762,0,0,0,0,0,0,5,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
276683,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
276685,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
276688,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
276704,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Top 10 books by Rating

In [4]:
from tabulate import tabulate
print("Top most rated books in the dataset in this fomrat (ISBN, book title, authors, average rating, number of ratings)\n")
top = df_filter['ISBN'].value_counts().head(10)

table_data = []
for i in range(10):
    book = df_books.loc[:, top.index[i]].values.T
    mean = np.round(np.mean(book[book != 0]), 2)
    table_data.append([top.index[i], df_books_name.loc[top.index[i]]['Book-Title'], df_books_name.loc[top.index[i]]['Book-Author'], mean, top.values[i]])

table_headers = ["ISBN", "Book Title", "Authors", "Average Rating", "Number of Ratings"]
print(tabulate(table_data, headers=table_headers))

#top users with most ratings
top = df_filter['User-ID'].value_counts().head(30)
print("\nTop users with most ratings in this format (User-ID, number of ratings)\n")
for i in range(10):
    print(top.index[i], top.values[i])

Top most rated books in the dataset in this fomrat (ISBN, book title, authors, average rating, number of ratings)

ISBN        Book Title                                                        Authors            Average Rating    Number of Ratings
----------  ----------------------------------------------------------------  ---------------  ----------------  -------------------
0316666343  The Lovely Bones: A Novel                                         Alice Sebold                 8.19                  707
0971880107  Wild Animus                                                       Rich Shapero                 4.39                  581
0385504209  The Da Vinci Code                                                 Dan Brown                    8.44                  487
0312195516  The Red Tent (Bestselling Backlist)                               Anita Diamant                8.18                  383
0060928336  Divine Secrets of the Ya-Ya Sisterhood: A Novel                   Rebecca W

In [5]:
# Compute item-item similarity
item_similarity = cosine_similarity(df_books.T)

# Example user's interactions
user_interactions = df_books.loc[USER_ID]
# Calculate item scores based on user's interactions and item similarity
item_scores = user_interactions.dot(item_similarity)

# Set the scores of the items that the user has already interacted with to 0
item_scores[user_interactions > 0] = user_interactions[user_interactions > 0]

# Normalize the scores between 0 and 10
item_scores = (item_scores - item_scores.min()) / (item_scores.max() - item_scores.min()) * 10

#set to 0 the items that the user has already interacted with
item_scores[user_interactions > 0] = 0

# Sort items by score and recommend the top-n
recommended_items = np.argsort(item_scores)[::-1][:TOP]
print(item_scores)


KeyboardInterrupt



In [None]:
from tabulate import tabulate
print("Top " + str(TOP) + " recommended books for user " + str(USER_ID) + ":")

table_data = []
for item in recommended_items:
    Isnb = df_books.columns[item]
    title = df_books_name.loc[Isnb]
    Pred = item_scores[item]
    table_data.append([title['Book-Title'], title['Book-Author'], Isnb, Pred])



table_headers = ["Book Title", "Book Author", "ISBN","Predicted Rating"]
print(tabulate(table_data, headers=table_headers))


Top 20 recommended books for user 240567:
Book Title                                                             Book Author           ISBN          Predicted Rating
---------------------------------------------------------------------  --------------------  ----------  ------------------
Four To Score (A Stephanie Plum Novel)                                 Janet Evanovich       0312966970            10
Three To Get Deadly : A Stephanie Plum Novel (A Stephanie Plum Novel)  Janet Evanovich       0312966091             9.56228
When the Bough Breaks (Alex Delaware Novels (Paperback))               Jonathan Kellerman    0553569619             8.47101
Hawaii                                                                 James A. Michener     0449213358             8.12772
Loving                                                                 Danielle Steel        0440146577             8.07582
Turbulence                                                             John J. Nance         051

In [None]:
# Predictions User ratings for a book
#pick random book id from the list of books

# book_id = books[np.random.randint(0, len(books))]

# book_title = df_books_name.loc[book_id]['Book-Title']
# book_ratings = df_books[book_id]
# book_similarity = item_similarity[books == book_id]
# user_ratings = df_books.loc[USER_ID]
# predicted_rating = user_ratings.dot(book_similarity.T)
# print("Predicted rating for book '" + book_title + "' for user " + str(USER_ID) + ": " + str(predicted_rating[0]))
# print("Actual rating for book '" + book_title + "' for user " + str(USER_ID) + ": " + str(book_ratings[USER_ID]))


# User-Based Collaborative Filtering

In user-based collaborative filtering, we recommend items to a user based on the preferences of similar users.

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Sample user-item interaction matrix (rows are users, columns are items)
ratings = np.array([
    [5, 3, 0, 1],
    [4, 0, 0, 1],
    [1, 1, 0, 5],
    [0, 0, 5, 4],
    [0, 3, 4, 5]
])

# Compute user-user similarity matrix
user_similarity = cosine_similarity(ratings)

def predict_user_based(user, item, ratings, user_similarity):
    # Mean rating for the target user
    mean_user_rating = ratings[user].mean()
    # Compute the weighted sum of ratings
    weighted_sum = np.sum(user_similarity[user] * (ratings[:, item] - ratings.mean(axis=1)))
    similarity_sum = np.sum(user_similarity[user])

    if similarity_sum == 0:
        return mean_user_rating

    # Predicted rating
    return mean_user_rating + (weighted_sum / similarity_sum)

# Predict rating for user 0 on item 2
predicted_rating = predict_user_based(0, 2, ratings, user_similarity)
print(f"Predicted rating for user 0 on item 2: {predicted_rating:.2f}")


Predicted rating for user 0 on item 2: 0.99


 # Item-Based Collaborative Filtering
In item-based collaborative filtering, we recommend items similar to those the user has liked in the past.

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Sample user-item interaction matrix (rows are users, columns are items)
ratings = np.array([
    [5, 3, 0, 1],
    [4, 0, 0, 1],
    [1, 1, 0, 5],
    [0, 0, 5, 4],
    [0, 3, 4, 5]
])

# Compute item-item similarity matrix
item_similarity = cosine_similarity(ratings.T)

def predict_item_based(user, item, ratings, item_similarity):
    # Mean rating for the target item
    mean_item_rating = ratings[:, item].mean()
    # Compute the weighted sum of ratings
    weighted_sum = np.sum(item_similarity[item] * (ratings[user] - ratings.mean(axis=1)))
    similarity_sum = np.sum(item_similarity[item])

    if similarity_sum == 0:
        return mean_item_rating

    # Predicted rating
    return mean_item_rating + (weighted_sum / similarity_sum)

# Predict rating for user 0 on item 2
predicted_rating = predict_item_based(0, 2, ratings, item_similarity)
print(f"Predicted rating for user 0 on item 2: {predicted_rating:.2f}")


ValueError: operands could not be broadcast together with shapes (4,) (5,) 

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tabulate import tabulate

# Setup Spark session
spark = SparkSession.builder.appName("BookRecommendation").getOrCreate()

# Load the data
df_books_name = pd.read_csv("BX-Books.csv", on_bad_lines='skip', sep=';', low_memory=False)
df_books_name = df_books_name.iloc[:, :-3]
df_books_name = df_books_name.set_index('ISBN')
df_books_name = df_books_name.rename_axis(None)

df_books_name_spark = spark.createDataFrame(df_books_name.reset_index())

df = pd.read_csv("BX-Book-Ratings.csv", on_bad_lines='skip', sep=';')
df = df[df['Book-Rating'] != 0]
df_spark = spark.createDataFrame(df)
df_spark = df_spark.filter(df_spark['ISBN'].isin([row['index'] for row in df_books_name_spark.select('index').collect()]))

# Filter and prepare the data
df_filter_spark = df_spark.groupBy('ISBN').count().filter('count >= 10').select('ISBN')
df_filter_spark = df_spark.join(df_filter_spark, on='ISBN')

users = df_filter_spark.select('User-ID').distinct().rdd.flatMap(lambda x: x).collect()
books = df_filter_spark.select('ISBN').distinct().rdd.flatMap(lambda x: x).collect()

schema = StructType([StructField('User-ID', IntegerType(), True)] + [StructField(book, FloatType(), True) for book in books])
df_books_spark = spark.createDataFrame(spark.sparkContext.emptyRDD(), schema)

for row in df_filter_spark.collect():
    df_books_spark = df_books_spark.withColumn(row['ISBN'], F.when(col('User-ID') == row['User-ID'], row['Book-Rating']).otherwise(col(row['ISBN'])))

# Compute item-item similarity
df_books_pandas = df_books_spark.toPandas().fillna(0)
item_similarity = cosine_similarity(df_books_pandas.set_index('User-ID').T)

# User interactions and item scores
USER_ID = 104636
TOP = 3
user_interactions = df_books_pandas[df_books_pandas['User-ID'] == USER_ID].drop('User-ID', axis=1).values.flatten()
item_scores = user_interactions.dot(item_similarity)

item_scores[user_interactions > 0] = 0
item_scores = (item_scores - item_scores.min()) / (item_scores.max() - item_scores.min()) * 10
item_scores[user_interactions > 0] = 0

recommended_items = np.argsort(item_scores)[::-1][:TOP]

# Display recommended items
table_data = []
for item in recommended_items:
    Isnb = df_books_pandas.columns[item + 1]  # +1 because the first column is 'User-ID'
    title = df_books_name.loc[Isnb]
    Pred = item_scores[item]
    table_data.append([title['Book-Title'], title['Book-Author'], Isnb, Pred])

table_headers = ["Book Title", "Book Author", "ISBN", "Predicted Rating"]
print(tabulate(table_data, headers=table_headers))


KeyboardInterrupt: 

: 