## Collaborative Filtering

In [331]:
# Import relevant libraries
import pandas as pd
import numpy as np
import math

In [332]:
def read_ratings_xlsx(path, sheet_name=0):
    """Load a critic-by-item ratings sheet from an Excel workbook.

    The first column of the sheet is treated as the critic name and every
    other column as one rated item; the first row is used as the header.

    Args:
        path: Workbook location, passed straight to ``pd.read_excel``.
        sheet_name: Sheet to read; defaults to the first sheet (``0``).

    Returns:
        A ``(ratings, items)`` tuple. ``ratings`` maps each critic name to a
        ``{item: rating}`` dict holding only the cells that parsed as numbers
        (ratings are stored as ``float``); ``items`` is the list of item
        column names after whitespace stripping.
    """
    table = pd.read_excel(path, sheet_name, header=0)

    # Whatever the first column is called, it becomes the "critic" index.
    table = table.rename(columns={table.columns[0]: "critic"})
    table["critic"] = table["critic"].astype(str).str.strip()
    table = table.set_index("critic")

    # Normalise the headers, then turn every non-numeric cell into NaN.
    table.columns = [str(label).strip() for label in table.columns]
    table = table.apply(pd.to_numeric, errors="coerce")

    # NaN cells are dropped, so each critic only lists what they rated.
    ratings = {
        critic: {item: float(value) for item, value in row.dropna().items()}
        for critic, row in table.iterrows()
    }
    return ratings, table.columns.to_list()

# Load the movie ratings: `critiques` maps each critic to a {movie: rating}
# dict and `movies` is the list of movie column names from the sheet.
critiques, movies = read_ratings_xlsx("data/movie_critique.xlsx")

# Display the {movie: rating} dict of a single critic, Lisa Rose.
critiques['Lisa Rose']

{'Lady': 2.5,
 'Snakes': 3.5,
 'Luck': 3.0,
 'Superman': 3.5,
 'Dupree': 2.5,
 'Night': 3.0}

### Simple Distance Functions

In [333]:
def sim_distanceManhattan(person1, person2):
    """Manhattan (L1) distance between two critics over their shared items.

    Args:
        person1: ``{item: rating}`` dict of the first critic.
        person2: ``{item: rating}`` dict of the second critic.

    Returns:
        The sum of absolute rating differences over the items both critics
        rated, or ``float("inf")`` when they have no item in common.
    """
    shared = [item for item in person1 if item in person2]
    if not shared:
        # Without any common item there is no evidence of similarity; a
        # distance of 0 would wrongly rank such a critic as a perfect match.
        return float("inf")
    return sum(abs(person1[item] - person2[item]) for item in shared)

# Manhattan distance between Lisa Rose and Gene Seymour, summed over the
# movies that both of them have rated.
sim_distanceManhattan(critiques['Lisa Rose'], critiques['Gene Seymour'])

4.5

In [334]:
def sim_distanceEuclidienne(person1, person2):
    """Euclidean (L2) distance between two critics over their shared items.

    Args:
        person1: ``{item: rating}`` dict of the first critic.
        person2: ``{item: rating}`` dict of the second critic.

    Returns:
        The square root of the summed squared rating differences over the
        items both critics rated, or ``float("inf")`` when they have no item
        in common.
    """
    shared = [item for item in person1 if item in person2]
    if not shared:
        # Without any common item there is no evidence of similarity; a
        # distance of 0 would wrongly rank such a critic as a perfect match.
        return float("inf")
    return sum((person1[item] - person2[item]) ** 2 for item in shared) ** 0.5

# Euclidean distance between Lisa Rose and Gene Seymour, computed over the
# movies that both of them have rated.
sim_distanceEuclidienne(critiques['Lisa Rose'], critiques['Gene Seymour'])

2.3979157616563596

In [335]:
def computeNearestNeighbor(nouveauCritique, critiques):
    """Rank every other critic by Manhattan distance to ``nouveauCritique``.

    Args:
        nouveauCritique: Name of the critic to find neighbours for.
        critiques: ``{critic: {item: rating}}`` dict of all ratings.

    Returns:
        A list of ``(distance, critic_name)`` tuples sorted in ascending
        order (ties on distance fall back to the critic name).
    """
    return sorted(
        (sim_distanceManhattan(critiques[other], critiques[nouveauCritique]), other)
        for other in critiques
        if other != nouveauCritique
    )

# Rank all other critics by Manhattan distance to Lisa Rose, closest first.
computeNearestNeighbor('Lisa Rose', critiques)

[(1.5, 'Michael Phillips'),
 (2.0, 'Claudia Puig'),
 (2.5, 'Anne'),
 (3.0, 'Mick Lasalle'),
 (3.0, 'Toby'),
 (3.5, 'Jack Matthews'),
 (4.5, 'Gene Seymour')]

In [336]:
def recommendNearestNeighbor(nouveauCritique, critiques):
    """Recommend the items rated by the single nearest critic (Manhattan).

    Args:
        nouveauCritique: Name of the critic to recommend for.
        critiques: ``{critic: {item: rating}}`` dict of all ratings.

    Returns:
        A list of ``(item, rating)`` tuples for the items the nearest
        neighbour rated and ``nouveauCritique`` did not, sorted by rating in
        descending order. The list is empty when there is no other critic or
        when the neighbour has nothing new to offer.
    """
    distances = computeNearestNeighbor(nouveauCritique, critiques)
    if not distances:
        # No other critic to borrow ratings from.
        return []

    _, nearestName = distances[0]
    nearestRatings = critiques[nearestName]
    alreadyRated = critiques[nouveauCritique]

    unseen = [
        (item, rating)
        for item, rating in nearestRatings.items()
        if item not in alreadyRated
    ]
    return sorted(unseen, key=lambda pair: pair[1], reverse=True)

# Get movie recommendations for Lisa Rose.
# An empty list means her nearest neighbour has rated nothing that she has
# not already rated herself.
recommendNearestNeighbor('Lisa Rose', critiques)

[]

In [337]:
# Rank all other critics by Manhattan distance to Toby, closest first.
computeNearestNeighbor('Toby', critiques)

[(1.0, 'Anne'),
 (2.0, 'Michael Phillips'),
 (2.5, 'Claudia Puig'),
 (2.5, 'Mick Lasalle'),
 (3.0, 'Lisa Rose'),
 (4.0, 'Jack Matthews'),
 (4.5, 'Gene Seymour')]

In [338]:
# Movies Toby has not rated, taken from his nearest neighbour's ratings and
# listed from highest to lowest rating.
recommendNearestNeighbor('Toby', critiques)

[('Luck', 4.0), ('Lady', 1.5)]

In [339]:
def computeNearestNeighborEuclidienne(nouveauCritique, critiques):
    """Rank every other critic by Euclidean distance to ``nouveauCritique``.

    Args:
        nouveauCritique: Name of the critic to find neighbours for.
        critiques: ``{critic: {item: rating}}`` dict of all ratings.

    Returns:
        A list of ``(distance, critic_name)`` tuples sorted in ascending
        order (ties on distance fall back to the critic name).
    """
    ranked = [
        (sim_distanceEuclidienne(critiques[other], critiques[nouveauCritique]), other)
        for other in critiques
        if other != nouveauCritique
    ]
    ranked.sort()
    return ranked

# Rank all other critics by Euclidean distance to Toby, closest first.
computeNearestNeighborEuclidienne('Toby', critiques)

[(1.0, 'Anne'),
 (1.5, 'Mick Lasalle'),
 (1.5811388300841898, 'Michael Phillips'),
 (1.8027756377319946, 'Claudia Puig'),
 (1.8708286933869707, 'Lisa Rose'),
 (2.7386127875258306, 'Jack Matthews'),
 (2.8722813232690143, 'Gene Seymour')]

Notably, when Euclidean distance is used instead, the ranking of the neighbours changes slightly.

In [340]:
def recommendNearestNeighborEuclidienne(nouveauCritique, critiques):
    """Recommend the items rated by the single nearest critic (Euclidean).

    Args:
        nouveauCritique: Name of the critic to recommend for.
        critiques: ``{critic: {item: rating}}`` dict of all ratings.

    Returns:
        A list of ``(item, rating)`` tuples for the items the nearest
        neighbour rated and ``nouveauCritique`` did not, sorted by rating in
        descending order. The list is empty when there is no other critic or
        when the neighbour has nothing new to offer.
    """
    distances = computeNearestNeighborEuclidienne(nouveauCritique, critiques)
    if not distances:
        # No other critic to borrow ratings from.
        return []

    _, nearestName = distances[0]
    nearestRatings = critiques[nearestName]
    alreadyRated = critiques[nouveauCritique]

    unseen = [
        (item, rating)
        for item, rating in nearestRatings.items()
        if item not in alreadyRated
    ]
    return sorted(unseen, key=lambda pair: pair[1], reverse=True)

# Movies Toby has not rated, taken from his nearest neighbour under the
# Euclidean distance and listed from highest to lowest rating.
recommendNearestNeighborEuclidienne('Toby', critiques)

[('Luck', 4.0), ('Lady', 1.5)]

However, the closest critic (Anne) remains the same, so the recommendations are identical to the Manhattan ones.

In [341]:
def BestRecommendNearestNeighbor(recommendations):
    """Pick the top entry of an already-sorted recommendation list.

    Args:
        recommendations: List of ``(item, rating)`` tuples sorted best-first,
            as returned by the nearest-neighbour recommenders.

    Returns:
        ``(item, rating)`` with the rating rounded to two decimals, or
        ``None`` when ``recommendations`` is empty.
    """
    if not recommendations:
        # The nearest-neighbour recommenders return [] when the neighbour has
        # nothing new to offer; report "no recommendation" instead of raising.
        return None
    name, score = recommendations[0]
    return (name, round(float(score), 2))

# Best single recommendation for Toby: the top-rated unseen movie of his
# nearest neighbour under the Manhattan distance.
BestRecommendNearestNeighbor(recommendNearestNeighbor('Toby', critiques))

('Luck', 4.0)

### Weighted Recommendations

#### Manhattan and Euclidean Recommendations

In [342]:
def Bestrecommend(nouveauCritique, critiques, movies):
    """Best weighted recommendation using the Manhattan distance.

    Every movie that ``nouveauCritique`` has not rated gets a predicted score:
    the average of the other critics' ratings, each weighted by
    ``1 / (1 + distance)``. Movies nobody else has rated keep a score of 0.

    Args:
        nouveauCritique: Name of the critic to recommend for.
        critiques: ``{critic: {movie: rating}}`` dict of all ratings.
        movies: Iterable of candidate movie names.

    Returns:
        ``(movie, score)`` for the highest predicted score, rounded to two
        decimals, or ``None`` when the critic has already rated every movie.
    """
    distances = computeNearestNeighbor(nouveauCritique, critiques)
    alreadyRated = critiques[nouveauCritique]

    globalScores = {}
    for movie in movies:
        if movie in alreadyRated:
            continue
        scoreSum = 0
        weightSum = 0
        for distance, criticName in distances:
            if movie in critiques[criticName]:
                # Closer critics (smaller distance) get a larger say.
                weight = 1 / (1 + distance)
                scoreSum += critiques[criticName][movie] * weight
                weightSum += weight
        globalScores[movie] = scoreSum / weightSum if weightSum > 0 else 0

    if not globalScores:
        # Nothing left to recommend.
        return None

    # max() keeps the first of several equal scores, like a stable
    # descending sort followed by taking element 0.
    name, score = max(globalScores.items(), key=lambda x: x[1])
    return (name, round(float(score), 2))

In [343]:
def BestrecommendEuclidienne(nouveauCritique, critiques, movies):
    """Best weighted recommendation using the Euclidean distance.

    Every movie that ``nouveauCritique`` has not rated gets a predicted score:
    the average of the other critics' ratings, each weighted by
    ``1 / (1 + distance)``. Movies nobody else has rated keep a score of 0.

    Args:
        nouveauCritique: Name of the critic to recommend for.
        critiques: ``{critic: {movie: rating}}`` dict of all ratings.
        movies: Iterable of candidate movie names.

    Returns:
        ``(movie, score)`` for the highest predicted score, rounded to two
        decimals, or ``None`` when the critic has already rated every movie.
    """
    distances = computeNearestNeighborEuclidienne(nouveauCritique, critiques)
    alreadyRated = critiques[nouveauCritique]

    globalScores = {}
    for movie in movies:
        if movie in alreadyRated:
            continue
        scoreSum = 0
        weightSum = 0
        for distance, criticName in distances:
            if movie in critiques[criticName]:
                # Closer critics (smaller distance) get a larger say.
                weight = 1 / (1 + distance)
                scoreSum += critiques[criticName][movie] * weight
                weightSum += weight
        globalScores[movie] = scoreSum / weightSum if weightSum > 0 else 0

    if not globalScores:
        # Nothing left to recommend.
        return None

    # max() keeps the first of several equal scores, like a stable
    # descending sort followed by taking element 0.
    name, score = max(globalScores.items(), key=lambda x: x[1])
    return (name, round(float(score), 2))

#### Exponential Recommendation

In [344]:
def BestrecommendwithExp(nouveauCritique, critiques, movies):
    """Best weighted recommendation with exponentially decaying weights.

    Every movie that ``nouveauCritique`` has not rated gets a predicted score:
    the average of the other critics' ratings, each weighted by
    ``exp(-distance)`` where ``distance`` is the Manhattan distance. Movies
    nobody else has rated keep a score of 0.

    Args:
        nouveauCritique: Name of the critic to recommend for.
        critiques: ``{critic: {movie: rating}}`` dict of all ratings.
        movies: Iterable of candidate movie names.

    Returns:
        ``(movie, score)`` for the highest predicted score, rounded to two
        decimals, or ``None`` when the critic has already rated every movie.
    """
    distances = computeNearestNeighbor(nouveauCritique, critiques)
    alreadyRated = critiques[nouveauCritique]

    globalScores = {}
    for movie in movies:
        if movie in alreadyRated:
            continue
        scoreSum = 0
        weightSum = 0
        for distance, criticName in distances:
            if movie in critiques[criticName]:
                # The weight decays exponentially as the distance grows.
                weight = np.exp(-distance)
                scoreSum += critiques[criticName][movie] * weight
                weightSum += weight
        globalScores[movie] = scoreSum / weightSum if weightSum > 0 else 0

    if not globalScores:
        # Nothing left to recommend.
        return None

    # max() keeps the first of several equal scores, like a stable
    # descending sort followed by taking element 0.
    name, score = max(globalScores.items(), key=lambda x: x[1])
    return (name, round(float(score), 2))

#### Pearson Recommendation

In [345]:
def pearson(person1, person2):
    """Pearson correlation between two critics over their shared items.

    Args:
        person1: ``{item: rating}`` dict of the first critic.
        person2: ``{item: rating}`` dict of the second critic.

    Returns:
        The correlation coefficient in ``[-1, 1]``, or ``0`` when the critics
        share no item or when either critic's shared ratings have no variance.
    """
    shared = [key for key in person1 if key in person2]
    n = len(shared)
    if n == 0:
        return 0

    sum_x = sum_y = sum_xy = sum_x2 = sum_y2 = 0
    for key in shared:
        x = person1[key]
        y = person2[key]
        sum_xy += x * y
        sum_x += x
        sum_y += y
        sum_x2 += x ** 2
        sum_y2 += y ** 2

    # The single-pass variance terms can come out slightly negative through
    # floating-point cancellation (e.g. near-constant ratings); clamp them so
    # math.sqrt never sees a negative argument.
    var_x = max(sum_x2 - (sum_x ** 2) / n, 0.0)
    var_y = max(sum_y2 - (sum_y ** 2) / n, 0.0)
    denominator = math.sqrt(var_x) * math.sqrt(var_y)
    if denominator == 0:
        return 0

    r = (sum_xy - (sum_x * sum_y) / n) / denominator
    # Rounding can push the ratio marginally outside the valid range.
    return max(-1.0, min(1.0, r))

def PearsonRecommend(nouveauCritique, critiques, movies):
    """Best recommendation using Pearson correlation as the critic weight.

    Every movie that ``nouveauCritique`` has not rated gets a predicted score:
    the sum of the other critics' ratings times their (signed) correlation
    with ``nouveauCritique``, divided by the sum of the absolute correlations.
    Movies nobody else has rated keep a score of 0.

    NOTE(review): negatively correlated critics contribute negative terms, so
    a predicted score can fall outside the rating scale — confirm that this
    is the intended weighting.

    Args:
        nouveauCritique: Name of the critic to recommend for.
        critiques: ``{critic: {movie: rating}}`` dict of all ratings.
        movies: Iterable of candidate movie names.

    Returns:
        ``(movie, score)`` for the highest predicted score, rounded to two
        decimals, or ``None`` when the critic has already rated every movie.
    """
    alreadyRated = critiques[nouveauCritique]

    # The similarity only depends on the pair of critics, so compute it once
    # per critic instead of once per (movie, critic) pair.
    weights = {
        critic: pearson(ratings, alreadyRated)
        for critic, ratings in critiques.items()
        if critic != nouveauCritique
    }

    globalScores = {}
    for movie in movies:
        if movie in alreadyRated:
            continue
        scoreSum = 0
        weightSum = 0
        for critic, weight in weights.items():
            if movie in critiques[critic]:
                scoreSum += critiques[critic][movie] * weight
                weightSum += abs(weight)
        globalScores[movie] = scoreSum / weightSum if weightSum > 0 else 0

    if not globalScores:
        # Nothing left to recommend.
        return None

    # max() keeps the first of several equal scores, like a stable
    # descending sort followed by taking element 0.
    name, score = max(globalScores.items(), key=lambda x: x[1])
    return (name, round(float(score), 2))

#### Cosine Recommendation

In [346]:
def cosine(person1, person2):
    """Cosine similarity between two critics over their shared items.

    Args:
        person1: ``{item: rating}`` dict of the first critic.
        person2: ``{item: rating}`` dict of the second critic.

    Returns:
        The dot product of the shared ratings divided by the product of their
        norms, or ``0`` when that product of norms is zero (which includes the
        case of no shared item).
    """
    # Only the items rated by both critics take part in the computation.
    pairs = [(person1[key], person2[key]) for key in person1 if key in person2]

    dot = sum(x * y for x, _ in pairs for y in [_])
    norm1_sq = sum(x ** 2 for x, _ in pairs)
    norm2_sq = sum(y ** 2 for _, y in pairs)

    denominator = math.sqrt(norm1_sq) * math.sqrt(norm2_sq)
    return dot / denominator if denominator != 0 else 0

def CosineRecommend(nouveauCritique, critiques, movies):
    """Best recommendation using cosine similarity as the critic weight.

    Every movie that ``nouveauCritique`` has not rated gets a predicted score:
    the sum of the other critics' ratings times their cosine similarity with
    ``nouveauCritique``, divided by the sum of the absolute similarities.
    Movies nobody else has rated keep a score of 0.

    Args:
        nouveauCritique: Name of the critic to recommend for.
        critiques: ``{critic: {movie: rating}}`` dict of all ratings.
        movies: Iterable of candidate movie names.

    Returns:
        ``(movie, score)`` for the highest predicted score, rounded to two
        decimals, or ``None`` when the critic has already rated every movie.
    """
    alreadyRated = critiques[nouveauCritique]

    # The similarity only depends on the pair of critics, so compute it once
    # per critic instead of once per (movie, critic) pair.
    weights = {
        critic: cosine(ratings, alreadyRated)
        for critic, ratings in critiques.items()
        if critic != nouveauCritique
    }

    globalScores = {}
    for movie in movies:
        if movie in alreadyRated:
            continue
        scoreSum = 0
        weightSum = 0
        for critic, weight in weights.items():
            if movie in critiques[critic]:
                scoreSum += critiques[critic][movie] * weight
                weightSum += abs(weight)
        globalScores[movie] = scoreSum / weightSum if weightSum > 0 else 0

    if not globalScores:
        # Nothing left to recommend.
        return None

    # max() keeps the first of several equal scores, like a stable
    # descending sort followed by taking element 0.
    name, score = max(globalScores.items(), key=lambda x: x[1])
    return (name, round(float(score), 2))

### Helper Functions

In [347]:
def percentageEmptyCells(critiques, movies):
    """Share of critic/movie cells that hold no rating, as a percentage.

    Args:
        critiques: ``{critic: {movie: rating}}`` dict of all ratings.
        movies: Collection of all movie names (one column per movie).

    Returns:
        ``100 * empty / total`` where ``total`` is critics times movies, or
        ``0`` when there are no cells at all.
    """
    n_movies = len(movies)
    total_cells = n_movies * len(critiques)
    # Each critic leaves empty every movie they did not rate.
    empty_cells = sum(n_movies - len(rated) for rated in critiques.values())
    if total_cells > 0:
        return (empty_cells / total_cells) * 100
    return 0

def recommendAll(critic, critiques, movies):
    """Print the best recommendation of every recommender for one critic.

    Prints the critic name, the percentage of empty cells in the ratings
    table and then, for each strategy defined in this notebook, a heading
    followed by that strategy's result.

    Args:
        critic: Name of the critic to recommend for.
        critiques: ``{critic: {movie: rating}}`` dict of all ratings.
        movies: List of all movie names.
    """
    print("Recommendations for", critic)

    print("Percentage of empty cells:", percentageEmptyCells(critiques, movies), "%")

    # (heading, thunk) pairs; each thunk is only evaluated after its heading
    # has been printed, so the output order matches a sequential script.
    sections = (
        ("\nBest Recommend with Nearest Neighbor (Manhattan):",
         lambda: BestRecommendNearestNeighbor(recommendNearestNeighbor(critic, critiques))),
        ("\nBest Recommend with Manhattan:",
         lambda: Bestrecommend(critic, critiques, movies)),
        ("\nBest Recommend with Euclidienne:",
         lambda: BestrecommendEuclidienne(critic, critiques, movies)),
        ("\nBest Recommend with Exponential Weighting:",
         lambda: BestrecommendwithExp(critic, critiques, movies)),
        ("\nPearson Recommend:",
         lambda: PearsonRecommend(critic, critiques, movies)),
        ("\nCosine Recommend:",
         lambda: CosineRecommend(critic, critiques, movies)),
    )
    for heading, compute in sections:
        print(heading)
        print(compute())

#### Simple Movie Critique

In [348]:
# Load the movie ratings again under a dataset-specific name and print every
# recommender's best pick for Anne.
movie_critiques, movies = read_ratings_xlsx("data/movie_critique.xlsx")
recommendAll("Anne", movie_critiques, movies)

Recommendations for Anne
Percentage of empty cells: 20.833333333333336 %

Best Recommend with Nearest Neighbor (Manhattan):
('Night', 4.0)

Best Recommend with Manhattan:
('Superman', 3.91)

Best Recommend with Euclidienne:
('Superman', 3.93)

Best Recommend with Exponential Weighting:
('Night', 3.93)

Pearson Recommend:
('Superman', 1.31)

Cosine Recommend:
('Superman', 3.99)


#### Music Critique



In [349]:
# Load the music ratings: `music_critiques` maps each critic to an
# {item: rating} dict and `music` is the list of item column names.
music_critiques, music = read_ratings_xlsx("data/music_critique.xlsx")

In [350]:
# Print every recommender's best pick for Veronica on the music dataset.
recommendAll("Veronica", music_critiques, music)

Recommendations for Veronica
Percentage of empty cells: 23.4375 %

Best Recommend with Nearest Neighbor (Manhattan):
('Broken Bells', 4.0)

Best Recommend with Manhattan:
('Broken Bells', 3.24)

Best Recommend with Euclidienne:
('Broken Bells', 3.12)

Best Recommend with Exponential Weighting:
('Broken Bells', 3.71)

Pearson Recommend:
('Vampire Weekend', 0.28)

Cosine Recommend:
('Broken Bells', 3.02)


In [351]:
# Print every recommender's best pick for Hailey on the music dataset.
recommendAll("Hailey", music_critiques, music)

Recommendations for Hailey
Percentage of empty cells: 23.4375 %

Best Recommend with Nearest Neighbor (Manhattan):
('Phoenix', 4.0)

Best Recommend with Manhattan:
('Phoenix', 4.14)

Best Recommend with Euclidienne:
('Phoenix', 4.18)

Best Recommend with Exponential Weighting:
('Phoenix', 4.13)

Pearson Recommend:
('Phoenix', 4.59)

Cosine Recommend:
('Phoenix', 4.18)


#### Movie Critique

In [352]:
# NOTE: this rebinds the notebook-level `critiques` and `movies` names, which
# were first bound to the movie_critique.xlsx data, to a different dataset.
# Cells above that read `critiques` give different results if re-run after
# this cell.
print("SAME RECOMMENDATIONS")
critiques, movies = read_ratings_xlsx("data/same_recommendation.xlsx")
recommendAll("Reviewer1", critiques, movies)

SAME RECOMMENDATIONS
Recommendations for Reviewer1
Percentage of empty cells: 44.44444444444444 %

Best Recommend with Nearest Neighbor (Manhattan):
('Movie15', 10.0)

Best Recommend with Manhattan:
('Movie15', 10.0)

Best Recommend with Euclidienne:
('Movie15', 10.0)

Best Recommend with Exponential Weighting:
('Movie15', 10.0)

Pearson Recommend:
('Movie15', 10.0)

Cosine Recommend:
('Movie15', 10.0)


In [353]:
print("DIFFERENT RECOMMENDATIONS")
# NOTE(review): the run below is commented out, so this cell only prints the
# banner above. Re-enable it or remove the cell.
# critiques, movies = read_ratings_xlsx("data/different_recommendations.xlsx", 2)
# recommendAll("Reviewer1", critiques, movies)

DIFFERENT RECOMMENDATIONS
