# Movie Recommendations

This notebook uses the [MovieLens dataset](https://grouplens.org/datasets/movielens/latest/),
together with content information linked from each movie's page on [TMDB](https://www.themoviedb.org/).

* The CSV files are included in our class repo on GitHub
* License info is included in the file https://raw.githubusercontent.com/benjum/UCLA-24W-DH150/main/Data/movielens-data/README.txt

In [None]:
import pandas as pd

In [None]:
# Read the MovieLens ratings and movie tables directly from the class repo.
ratings_url = 'https://raw.githubusercontent.com/benjum/UCLA-24W-DH150/main/Data/movielens-data/ratings.csv'
movies_url = 'https://raw.githubusercontent.com/benjum/UCLA-24W-DH150/main/Data/movielens-data/movies.csv'
ratings = pd.read_csv(ratings_url)
movies = pd.read_csv(movies_url)

In [None]:
ratings

In [None]:
movies

The ratings table covers 610 users and 9724 movies; the next two cells check these counts.

In [None]:
# Number of distinct users who have rated at least one movie
ratings['userId'].nunique(dropna=False)

In [None]:
# Number of distinct movies that have received at least one rating
ratings['movieId'].nunique(dropna=False)

In [None]:
ratings['rating'].unique()

# Idea 2: Recommend based on Collaborative Filtering

In [None]:
people = ['Alice','Ben','Charlie','Dan','Evelyn']

In [None]:
import random

In [None]:
random.seed(3)

In [None]:
random.randint(1,5)

In [None]:
movies['title'][:5]

In [None]:
# Titles of the first ten movies, as a plain Python list
m10 = movies['title'].head(10).tolist()

In [None]:
m10

In [None]:
# Build a toy ratings table from random 1-5 scores.
# Everyone rates the first five movies; everyone except Ben also rates the
# next five, which leaves those five as Ben's "unseen" movies.
# Ratings are drawn movie by movie and, within a movie, person by person.
movie_col, person_col, rating_col = [], [], []
for position, title in enumerate(m10[:10]):
    ben_has_seen = position < 5
    for name in people:
        if name == 'Ben' and not ben_has_seen:
            continue
        movie_col.append(title)
        person_col.append(name)
        rating_col.append(random.randint(1, 5))

df = pd.DataFrame({'people': person_col, 'movie': movie_col, 'rating': rating_col})

In [None]:
df

In [None]:
df[df['people']=='Evelyn']

In [None]:
df[df['people']=='Dan']

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Ben's ratings against Alice's on the five movies Ben has rated.
# Each person's first five rows in df are those five movies, in the same
# order, so slicing [:5] pairs the ratings movie by movie.
plt.scatter(df.loc[df['people']=='Ben', 'rating'], df.loc[df['people']=='Alice', 'rating'][:5])
plt.xlim([0,6])
plt.ylim([0,6])
plt.xlabel("Ben's rating")
plt.ylabel("Alice's rating")
plt.title('Ben vs. Alice on the movies both have rated')
plt.show()

In [None]:
# Ben's ratings against Charlie's on the five movies Ben has rated.
# Each person's first five rows in df are those five movies, in the same
# order, so slicing [:5] pairs the ratings movie by movie.
plt.scatter(df.loc[df['people']=='Ben', 'rating'], df.loc[df['people']=='Charlie', 'rating'][:5])
plt.xlim([0,6])
plt.ylim([0,6])
plt.xlabel("Ben's rating")
plt.ylabel("Charlie's rating")
plt.title('Ben vs. Charlie on the movies both have rated')
plt.show()

In [None]:
# Ben's ratings against Dan's on the five movies Ben has rated.
# Each person's first five rows in df are those five movies, in the same
# order, so slicing [:5] pairs the ratings movie by movie.
plt.scatter(df.loc[df['people']=='Ben', 'rating'], df.loc[df['people']=='Dan', 'rating'][:5])
plt.xlim([0,6])
plt.ylim([0,6])
plt.xlabel("Ben's rating")
plt.ylabel("Dan's rating")
plt.title('Ben vs. Dan on the movies both have rated')
plt.show()

In [None]:
# Ben's ratings against Evelyn's on the five movies Ben has rated.
# Each person's first five rows in df are those five movies, in the same
# order, so slicing [:5] pairs the ratings movie by movie.
plt.scatter(df.loc[df['people']=='Ben', 'rating'], df.loc[df['people']=='Evelyn', 'rating'][:5])
plt.xlim([0,6])
plt.ylim([0,6])
plt.xlabel("Ben's rating")
plt.ylabel("Evelyn's rating")
plt.title('Ben vs. Evelyn on the movies both have rated')
plt.show()

In [None]:
import scipy.spatial
import scipy.stats

In [None]:
def d1(a):
    """Similarity between Ben and person ``a`` based on Euclidean distance.

    The score is ``1 / (1 + d)``, where ``d`` is the Euclidean distance between
    the two people's ratings of the movies that both of them have rated.
    Identical ratings give 1.0; the score shrinks toward 0 as ratings diverge.

    Parameters
    ----------
    a : str
        A name from the ``people`` column of the notebook-level ``df``.

    Returns
    -------
    float
        The similarity score, or 0.0 when Ben and ``a`` share no rated movie.
    """
    ben = df.loc[df['people'] == 'Ben', ['movie', 'rating']]
    other = df.loc[df['people'] == a, ['movie', 'rating']]
    # Pair the ratings by movie title instead of by row position, so the
    # result does not depend on the order or number of rows in df.
    shared = ben.merge(other, on='movie', suffixes=('_ben', '_other'))
    if shared.empty:
        # Without a common movie there is no evidence of similarity
        return 0.0
    return 1 / (1 + scipy.spatial.distance.euclidean(shared['rating_ben'], shared['rating_other']))

In [None]:
d1('Evelyn')

In [None]:
# Print the similarity to Ben for every other person, one per line
others = [name for name in people if name != 'Ben']
for name in others:
    print(name, ':', d1(name))

In [None]:
def topsim(a):
    """Rank everyone except Ben by a similarity score.

    Parameters
    ----------
    a : callable
        Called with one person's name; returns that person's similarity score.

    Returns
    -------
    dict
        Maps each name in ``people`` other than 'Ben' to its score, ordered
        from the highest score to the lowest.
    """
    scores = {name: a(name) for name in people if name != 'Ben'}

    def descending(entry):
        # entry is a (name, score) pair; negating the score sorts high to low
        return -entry[1]

    return dict(sorted(scores.items(), key=descending))

In [None]:
topsim(d1)

In [None]:
def d_euclidean(a):
    """Similarity between Ben and person ``a`` based on Euclidean distance.

    The score is ``1 / (1 + d)``, where ``d`` is the Euclidean distance between
    the two people's ratings of the movies that both of them have rated.
    Identical ratings give 1.0; the score shrinks toward 0 as ratings diverge.

    Parameters
    ----------
    a : str
        A name from the ``people`` column of the notebook-level ``df``.

    Returns
    -------
    float
        The similarity score, or 0.0 when Ben and ``a`` share no rated movie.
    """
    ben = df.loc[df['people'] == 'Ben', ['movie', 'rating']]
    other = df.loc[df['people'] == a, ['movie', 'rating']]
    # Pair the ratings by movie title instead of by row position, so the
    # result does not depend on the order or number of rows in df.
    shared = ben.merge(other, on='movie', suffixes=('_ben', '_other'))
    if shared.empty:
        # Without a common movie there is no evidence of similarity
        return 0.0
    return 1 / (1 + scipy.spatial.distance.euclidean(shared['rating_ben'], shared['rating_other']))

In [None]:
def d_cosine(a):
    """Similarity between Ben and person ``a`` based on cosine distance.

    The score is ``1 / (1 + d)``, where ``d`` is the cosine distance between
    the two people's ratings of the movies that both of them have rated.

    Parameters
    ----------
    a : str
        A name from the ``people`` column of the notebook-level ``df``.

    Returns
    -------
    float
        The similarity score, or 0.0 when Ben and ``a`` share no rated movie.
    """
    ben = df.loc[df['people'] == 'Ben', ['movie', 'rating']]
    other = df.loc[df['people'] == a, ['movie', 'rating']]
    # Pair the ratings by movie title instead of by row position, so the
    # result does not depend on the order or number of rows in df.
    shared = ben.merge(other, on='movie', suffixes=('_ben', '_other'))
    if shared.empty:
        # Without a common movie there is no evidence of similarity
        return 0.0
    return 1 / (1 + scipy.spatial.distance.cosine(shared['rating_ben'], shared['rating_other']))

In [None]:
topsim(d_euclidean)

In [None]:
topsim(d_cosine)

In [None]:
scipy.stats.pearsonr(df.loc[df['people']=='Ben', 'rating'], df.loc[df['people']=='Dan', 'rating'][:5])

In [None]:
def d_pearson(a):
    """Pearson correlation between Ben's ratings and person ``a``'s ratings.

    Only the movies that both of them have rated are compared.

    Parameters
    ----------
    a : str
        A name from the ``people`` column of the notebook-level ``df``.

    Returns
    -------
    float
        The correlation coefficient, or 0.0 when fewer than two movies are
        shared (a correlation needs at least two pairs of ratings).
    """
    ben = df.loc[df['people'] == 'Ben', ['movie', 'rating']]
    other = df.loc[df['people'] == a, ['movie', 'rating']]
    # Pair the ratings by movie title instead of by row position, so the
    # result does not depend on the order or number of rows in df.
    shared = ben.merge(other, on='movie', suffixes=('_ben', '_other'))
    if len(shared) < 2:
        return 0.0
    return scipy.stats.pearsonr(shared['rating_ben'], shared['rating_other'])[0]

In [None]:
def d_spearman(a):
    """Spearman rank correlation between Ben's ratings and person ``a``'s.

    Only the movies that both of them have rated are compared.

    Parameters
    ----------
    a : str
        A name from the ``people`` column of the notebook-level ``df``.

    Returns
    -------
    float
        The rank correlation coefficient, or 0.0 when fewer than two movies
        are shared (a correlation needs at least two pairs of ratings).
    """
    ben = df.loc[df['people'] == 'Ben', ['movie', 'rating']]
    other = df.loc[df['people'] == a, ['movie', 'rating']]
    # Pair the ratings by movie title instead of by row position, so the
    # result does not depend on the order or number of rows in df.
    shared = ben.merge(other, on='movie', suffixes=('_ben', '_other'))
    if len(shared) < 2:
        return 0.0
    return scipy.stats.spearmanr(shared['rating_ben'], shared['rating_other'])[0]

In [None]:
topsim(d_pearson)

In [None]:
topsim(d_spearman)

In [None]:
# A first, crude ranking of the movies Ben has not seen: borrow the ratings of
# a single similar person (Alice, taken here to be the closest to Ben) and
# sort them from best to worst.
# Ben's unseen movies are picked by title rather than by row position.

ben_seen = df.loc[df['people'] == 'Ben', 'movie']
alice_ratings = df.loc[(df['people'] == 'Alice') & ~df['movie'].isin(ben_seen), ['movie', 'rating']]
alice_ratings.sort_values(by='rating', ascending=False)

In [None]:
df

In [None]:
people

In [None]:
# User-based prediction for the movies Ben has not rated:
# a weighted average of everyone else's rating for the movie, where each
# person's weight is their Euclidean similarity to Ben
# (sum of weight * rating, divided by the sum of the weights).

weights = topsim(d_euclidean)
movie_predictions = {}
others = [name for name in people if name != 'Ben']

# Movies 5-9 are the ones Ben has no rating for
for title in movies['title'][5:10]:
    is_title = df['movie'] == title
    weighted_rating = 0
    total_weight = 0
    for person in others:
        rating = df.loc[(df['people'] == person) & is_title, 'rating'].iloc[0]
        weighted_rating += weights[person] * rating
        total_weight += weights[person]
    movie_predictions[title] = weighted_rating / total_weight

# Highest predicted rating first
preds_sorted = dict(sorted(movie_predictions.items(), key=lambda entry: -entry[1]))
print(preds_sorted)

In [None]:
# Same user-based prediction, but weighting each person by their Spearman
# rank correlation with Ben instead of the Euclidean similarity.
#
# A correlation can be zero, negative, or NaN. Using such values as weights
# breaks the weighted average: the result can leave the 1-5 rating range, and
# the sum of the weights can be close to zero. So only people whose
# correlation with Ben is positive contribute to a prediction.

movie_predictions = {}
weights = topsim(d_spearman)

# Movies 5-9 are the ones Ben has no rating for
for title in movies['title'][5:10]:

    total_weight = 0
    weighted_rating = 0
    for person in people:
        if person == 'Ben':
            continue
        weight = weights[person]
        # `weight > 0` is False for zero, negative and NaN weights alike
        if not weight > 0:
            continue
        weighted_rating += weight * df.loc[(df['people'] == person) & (df['movie'] == title), 'rating'].iloc[0]
        total_weight += weight

    # With no positively correlated rater there is nothing to base a
    # prediction on, so the movie is left out of the results.
    if total_weight > 0:
        movie_predictions[title] = weighted_rating / total_weight

# Highest predicted rating first
preds_sorted = dict(sorted(movie_predictions.items(), key=lambda item: -item[1]))
print(preds_sorted)

In [None]:
preds_sorted

In [None]:
# Horizontal bar chart of the predicted ratings in preds_sorted.
# barh draws the first row at the bottom, so the y-axis is inverted to put
# the highest prediction at the top.
ax = pd.Series(preds_sorted, name='predicted rating', dtype=float).plot.barh()
ax.invert_yaxis()
ax.set_xlabel('Predicted rating')
ax.set_ylabel('Movie')
ax.set_title("User-based predictions for Ben's unseen movies");

## Item-based vs user-based collaborative filtering

What if we look not at the similarity between people, but the similarity between movies?

Conceptually, this just swaps the roles of people and movies.

In [None]:
m10

In [None]:
def topsim2(movie, method):
    """Rank the other movies in ``m10`` by their similarity to ``movie``.

    Parameters
    ----------
    movie : str
        Title of the reference movie.
    method : callable
        Called as ``method(movie, other_title)``; returns a similarity score.

    Returns
    -------
    dict
        Maps every title in ``m10`` other than ``movie`` to its score,
        ordered from the highest score to the lowest.
    """
    scores = {title: method(movie, title) for title in m10 if title != movie}

    def descending(entry):
        # entry is a (title, score) pair; negating the score sorts high to low
        return -entry[1]

    return dict(sorted(scores.items(), key=descending))

In [None]:
def d_euclidean2(movie1, movie2):
    """Similarity between two movies based on Euclidean distance.

    The ratings of the two movies are compared across the people (from the
    notebook-level ``people`` list) who have rated both. The score is
    ``1 / (1 + d)``, where ``d`` is the Euclidean distance between the two
    lists of ratings.

    Parameters
    ----------
    movie1, movie2 : str
        Movie titles as they appear in the ``movie`` column of ``df``.

    Returns
    -------
    float
        The similarity score, or 0.0 when nobody has rated both movies.
    """
    known = df[df['people'].isin(people)]
    # Keep one rating per person (the first one), then pair by person
    first = known.loc[known['movie'] == movie1, ['people', 'rating']].drop_duplicates('people')
    second = known.loc[known['movie'] == movie2, ['people', 'rating']].drop_duplicates('people')
    shared = first.merge(second, on='people', suffixes=('_1', '_2'))
    if shared.empty:
        # The distance between two empty lists is 0, which would otherwise
        # score as a perfect match despite there being no common rater.
        return 0.0
    return 1 / (1 + scipy.spatial.distance.euclidean(shared['rating_1'], shared['rating_2']))

In [None]:
topsim2('Toy Story (1995)', d_euclidean2)

In [None]:
# Item-based prediction for the movies Ben has not rated:
# a weighted average of Ben's own ratings for the movies he has seen, where
# each seen movie is weighted by its similarity to the unseen movie.

movie_predictions = {}

# Movies 5-9 are the ones Ben has no rating for
for unseen in m10[5:10]:

    # The weights are recomputed for every unseen movie
    weights = topsim2(unseen, d_euclidean2)

    total_weight = 0
    weighted_rating = 0
    # Movies 0-4 are the ones Ben has rated
    for seen in m10[:5]:
        weight = weights[seen]
        weighted_rating += weight * df.loc[(df['people'] == 'Ben') & (df['movie'] == seen), 'rating'].iloc[0]
        total_weight += weight

    # Skip the movie if none of the seen movies carries any weight
    if total_weight > 0:
        movie_predictions[unseen] = weighted_rating / total_weight

# Highest predicted rating first
preds_sorted = dict(sorted(movie_predictions.items(), key=lambda item: -item[1]))
print(preds_sorted)

In [None]:
# Horizontal bar chart of the predicted ratings in preds_sorted.
# barh draws the first row at the bottom, so the y-axis is inverted to put
# the highest prediction at the top.
ax = pd.Series(preds_sorted, name='predicted rating', dtype=float).plot.barh()
ax.invert_yaxis()
ax.set_xlabel('Predicted rating')
ax.set_ylabel('Movie')
ax.set_title("Item-based predictions for Ben's unseen movies");