# Collaborative Filtering

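In collaborative filtering, we make recommendations for a user based on the ratings of similar users (user-based filtering) or on items that are rated similarly to the ones they already rated (item-based filtering).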


In [1]:
import numpy as np
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## Preparing Dataset

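Below we have a rating matrix. The rows are users (movie critics), and the columns are movie titles; a missing value (`NaN`) means that the user has not rated that movie. We want to predict the ratings a user would give to the movies they have not rated yet, e.g. for `Toby`.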

In [2]:
# Raw ratings, keyed by critic and then by movie title.
# A critic who has not rated a movie simply has no entry for it.
critics = {
    "Lisa Rose": {
        "Lady in the Water": 2.5,
        "Snakes on a Plane": 3.5,
        "Just My Luck": 3.0,
        "Superman Returns": 3.5,
        "You, Me and Dupree": 2.5,
        "The Night Listener": 3.0,
    },
    "Gene Seymour": {
        "Lady in the Water": 3.0,
        "Snakes on a Plane": 3.5,
        "Just My Luck": 1.5,
        "Superman Returns": 5.0,
        "The Night Listener": 3.0,
        "You, Me and Dupree": 3.5,
    },
    "Michael Phillips": {
        "Lady in the Water": 2.5,
        "Snakes on a Plane": 3.0,
        "Superman Returns": 3.5,
        "The Night Listener": 4.0,
    },
    "Claudia Puig": {
        "Snakes on a Plane": 3.5,
        "Just My Luck": 3.0,
        "The Night Listener": 4.5,
        "Superman Returns": 4.0,
        "You, Me and Dupree": 2.5,
    },
    "Mick LaSalle": {
        "Lady in the Water": 3.0,
        "Snakes on a Plane": 4.0,
        "Just My Luck": 2.0,
        "Superman Returns": 3.0,
        "The Night Listener": 3.0,
        "You, Me and Dupree": 2.0,
    },
    "Jack Matthews": {
        "Lady in the Water": 3.0,
        "Snakes on a Plane": 4.0,
        "The Night Listener": 3.0,
        "Superman Returns": 5.0,
        "You, Me and Dupree": 3.5,
    },
    "Toby": {
        "Snakes on a Plane": 4.5,
        "You, Me and Dupree": 1.0,
        "Superman Returns": 4.0,
    },
}

# pd.DataFrame(critics) puts one critic per column; transposing gives the
# rating matrix with one row per critic and one column per movie.
# Movies a critic did not rate show up as NaN.
df = pd.DataFrame(critics).transpose()
df

Unnamed: 0,Lady in the Water,Snakes on a Plane,Just My Luck,Superman Returns,"You, Me and Dupree",The Night Listener
Lisa Rose,2.5,3.5,3.0,3.5,2.5,3.0
Gene Seymour,3.0,3.5,1.5,5.0,3.5,3.0
Michael Phillips,2.5,3.0,,3.5,,4.0
Claudia Puig,,3.5,3.0,4.0,2.5,4.5
Mick LaSalle,3.0,4.0,2.0,3.0,2.0,3.0
Jack Matthews,3.0,4.0,,5.0,3.5,3.0
Toby,,4.5,,4.0,1.0,


In [3]:
# corr() compares columns with each other, so transpose first to put every
# user in a column and obtain the user-by-user correlation matrix.
df.transpose().corr()

Unnamed: 0,Lisa Rose,Gene Seymour,Michael Phillips,Claudia Puig,Mick LaSalle,Jack Matthews,Toby
Lisa Rose,1.0,0.396059,0.40452,0.566947,0.594089,0.747018,0.991241
Gene Seymour,0.396059,1.0,0.204598,0.31497,0.411765,0.963796,0.381246
Michael Phillips,0.40452,0.204598,1.0,1.0,-0.258199,0.13484,-1.0
Claudia Puig,0.566947,0.31497,1.0,1.0,0.566947,0.028571,0.893405
Mick LaSalle,0.594089,0.411765,-0.258199,0.566947,1.0,0.211289,0.924473
Jack Matthews,0.747018,0.963796,0.13484,0.028571,0.211289,1.0,0.662849
Toby,0.991241,0.381246,-1.0,0.893405,0.924473,0.662849,1.0


In [4]:
def similar_to(df, user, n=5):
    """
    Return the top-n rows of `df` that are most similar to `user`.

    Similarity is the pearson correlation score between the ratings in the
    row labelled `user` and the ratings in every other row.

    Parameters
    ----------
    df : pd.DataFrame
        Rating matrix. Its rows are the entities being compared (users, or
        items when the matrix is transposed before the call).
    user : label
        Index label of the row to compare against all other rows.
    n : int, default 5
        Maximum number of neighbours to return.

    Returns
    -------
    list of (label, float)
        `(row label, correlation)` pairs sorted from the most to the least
        similar. Rows whose correlation with `user` is undefined (NaN) are
        left out, so fewer than `n` pairs may be returned.
    """
    # corr() correlates columns pairwise, hence the transpose; the row's
    # correlation with itself is dropped.
    scores = df.T.corr()[user].drop(user)

    # An undefined correlation (NaN, for instance when two rows have too few
    # ratings in common) is neither smaller nor larger than any other value,
    # so leaving it in would make the sort order unreliable and could return
    # NaN "neighbours".
    scores = scores.dropna()

    return sorted(scores.items(), key=lambda t: t[1], reverse=True)[:n]

In [5]:
# Users whose ratings correlate most strongly with Toby's.
similar_to(df, user="Toby")

[('Lisa Rose', 0.9912407071619305),
 ('Mick LaSalle', 0.924473451641905),
 ('Claudia Puig', 0.8934051474415642),
 ('Jack Matthews', 0.6628489803598702),
 ('Gene Seymour', 0.3812464258315117)]

In [6]:
# Item-based collaborative filtering: transpose the rating matrix so that
# the movies become the rows being compared with each other.
similar_to(df.transpose(), user="Just My Luck")

[('The Night Listener', 0.5555555555555556),
 ('Snakes on a Plane', -0.3333333333333333),
 ('Superman Returns', -0.42289003161103117),
 ('You, Me and Dupree', -0.48566186425718266),
 ('Lady in the Water', -0.9449111825230683)]

In [7]:
def recommend(df, user, n=5):
    """
    Predict ratings for the movies `user` has not rated yet.

    The prediction for a movie is the average of the ratings given by the
    users most similar to `user`, weighted by their similarity score.

    Parameters
    ----------
    df : pd.DataFrame
        Rating matrix with one row per user and one column per movie;
        NaN marks a movie the user has not rated.
    user : label
        Index label of the user to recommend for.
    n : int, default 5
        Number of most similar users taken into account (forwarded to
        `similar_to`).

    Returns
    -------
    list of (label, float)
        `(movie, predicted rating)` pairs sorted from the highest to the
        lowest predicted rating. A movie for which no prediction can be
        computed (none of the similar users rated it, or their weights sum
        to zero) is left out.
    """
    similarity_scores = similar_to(df, user, n)
    recs = []

    # Only select movies that have no rating (NaN) from this user.
    not_watched = df.columns[df.loc[user].isnull()]
    for movie in not_watched:
        # Ratings for the movie from the users who did rate it.
        rated_by_user = dict(df[movie].dropna())

        sum_weight = 0
        sum_rating = 0

        # The loop variable is deliberately not called `user`, so that the
        # function argument is never rebound inside the loop.
        for neighbour, weight in similarity_scores:
            # Ignore neighbours that did not rate the movie, and undefined
            # (NaN) similarities, which would turn both sums into NaN.
            if neighbour not in rated_by_user or pd.isna(weight):
                continue

            sum_weight += weight
            sum_rating += weight * rated_by_user[neighbour]

        # Nothing to average (or the weights cancel out): skip the movie
        # instead of dividing by zero.
        if sum_weight == 0:
            continue

        recs.append((movie, sum_rating / sum_weight))

    # Sort by rating, in descending order (highest to lowest rating)
    return sorted(recs, key=lambda t: t[1], reverse=True)

In [8]:
# Predicted ratings for the movies Toby has not rated yet, best first.
recommend(df, user="Toby")

[('The Night Listener', 3.3477895267131013),
 ('Lady in the Water', 2.8325499182641622),
 ('Just My Luck', 2.5309807037655645)]

In [9]:
# Predicted ratings for the movies Michael Phillips has not rated yet, best first.
recommend(df, user="Michael Phillips")

[('Just My Luck', 2.963951538816176),
 ('You, Me and Dupree', 2.815352371380952)]