In [14]:
import itertools
import math
import os
import pickle

import pandas as pd

In [15]:
# Directory that holds the pickled intermediate results used by the cells below.
CACHE_DIR = "cache"
# Create it up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(CACHE_DIR, exist_ok=True)

In [16]:
def load_or_compute(path, compute_fn, force=False, validate_fn=None):
    """Return the object cached at ``path``, computing and caching it if needed.

    Args:
        path: Location of the pickle file used as the cache.
        compute_fn: Zero-argument callable that produces the object.
        force: When true, ignore any existing cache file and recompute.
        validate_fn: Optional one-argument predicate. A cached object that
            fails it is discarded and recomputed; a freshly computed object
            that fails it is not written to disk.

    Returns:
        The cached or freshly computed object.

    Raises:
        ValueError: If the freshly computed object fails ``validate_fn``.
    """
    if not force and os.path.exists(path):
        print(f"Loading cache → {path}")
        # NOTE: unpickling can execute arbitrary code; only load cache files
        # that this notebook wrote itself.
        with open(path, "rb") as f:
            cached = pickle.load(f)
        if validate_fn is None or validate_fn(cached):
            return cached
        print(f"Cached object failed validation, recomputing → {path}")

    print(f"Computing & saving → {path}")
    obj = compute_fn()
    if validate_fn is not None and not validate_fn(obj):
        raise ValueError(f"computed object for {path} failed validation")

    # Write to a temporary file and rename it into place so an interrupted
    # dump never leaves a truncated pickle behind at `path`.
    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, "wb") as f:
            pickle.dump(obj, f)
        os.replace(tmp_path, path)
    finally:
        # Only present if the dump or the rename failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    return obj


In [17]:
# Load the movies and ratings tables from the ml-25m CSV files. Both reads go
# through load_or_compute, so the CSV is only parsed when the matching .pkl
# file is not already present in CACHE_DIR.
df_movies = load_or_compute(
    path=f"{CACHE_DIR}/df_movies.pkl",
    compute_fn=lambda: pd.read_csv("ml-25m/movies.csv", index_col="movieId"),
)
df_ratings = load_or_compute(
    path=f"{CACHE_DIR}/df_ratings.pkl",
    compute_fn=lambda: pd.read_csv("ml-25m/ratings.csv"),
)

# Quick sanity check on the size of each table.
for frame in (df_movies, df_ratings):
    print(frame.shape)

Loading cache → cache/df_movies.pkl
Loading cache → cache/df_ratings.pkl
(62423, 2)
(25000095, 4)


In [18]:
# Peek at the first rows of the ratings table.
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


Let's build a down-sampled CSV that contains only the entries for our top 5k movies.

### Converting ratings into interactions

| rating | interaction |
| ------ | ----------- |
| ≥ 4    | 1           |
| < 4   | ignore      |


In [19]:
# Minimum number of likes (ratings >= 4) a user needs in order to be kept.
MIN_LIKES = 8

def build_top_k(ratings=None, k=5000, min_likes=None, csv_path="top5kratings.csv"):
    """Reduce a ratings table to "like" interactions on the k most-rated movies.

    Steps: keep the ``k`` movies with the most ratings, keep only rows with
    ``rating >= 4`` and overwrite their rating with 1, then drop users who
    have fewer than ``min_likes`` remaining rows.

    Args:
        ratings: DataFrame with ``userId``, ``movieId`` and ``rating`` columns.
            Defaults to the notebook-level ``df_ratings``.
        k: Number of most-rated movies to keep.
        min_likes: Minimum likes per user. Defaults to the notebook-level
            ``MIN_LIKES`` (looked up at call time).
        csv_path: Where to write a human-readable CSV copy of the result;
            pass ``None`` to skip writing.

    Returns:
        The filtered DataFrame (a new frame; ``ratings`` is not modified).
    """
    if ratings is None:
        ratings = df_ratings
    if min_likes is None:
        min_likes = MIN_LIKES

    # Count ratings per movie in one vectorised pass instead of a Python loop
    # over every row. sort=False keeps movies in order of first appearance, so
    # the stable sort below breaks ties exactly like a dict filled row by row.
    counts = ratings.groupby("movieId", sort=False).size()
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    movie_ids = [movie_id for movie_id, _ in ranked[:k]]

    # Keep likes on the selected movies and binarise them.
    is_top_movie = ratings["movieId"].isin(movie_ids)
    is_like = ratings["rating"] >= 4
    top_k = ratings[is_top_movie & is_like].copy()
    top_k["rating"] = 1

    # filter users with minimum likes
    user_like_counts = top_k.groupby("userId").size()
    valid_users = user_like_counts[user_like_counts >= min_likes].index
    top_k = top_k[top_k["userId"].isin(valid_users)]

    # save human-readable copy
    if csv_path is not None:
        top_k.to_csv(csv_path, index=False)

    print("Remaining users:", top_k["userId"].nunique())
    return top_k

# Build the filtered like-interactions table, or reuse the pickled copy if one
# already exists in CACHE_DIR.
top_k = load_or_compute(
    path=f"{CACHE_DIR}/top_k.pkl",
    compute_fn=build_top_k,
)


Loading cache → cache/top_k.pkl


In [20]:
# top_k now holds only the "like" interactions (rating >= 4, stored as 1) on the
# 5,000 most-rated movies, restricted to users with at least MIN_LIKES likes.
top_k.shape

(11759456, 4)

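Users with fewer than 8 liked films (`MIN_LIKES`) were already removed when `top_k` was built. Next, collect each remaining user's liked movies into a list.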

In [21]:
def build_user_movie_lists():
    """Group the rows of ``top_k`` into a ``{userId: [movieId, ...]}`` dict.

    Reads the notebook-level ``top_k`` frame. Movie ids are appended in row
    order, so each user's list follows the order of that user's rows.
    """
    movies_by_user = {}
    for user_id, movie_id in zip(top_k["userId"], top_k["movieId"]):
        if user_id not in movies_by_user:
            movies_by_user[user_id] = []
        movies_by_user[user_id].append(movie_id)
    return movies_by_user

# Build the per-user lists of liked movie ids, or load them from the cache.
user_movie_lists = load_or_compute(
    path=f"{CACHE_DIR}/user_movie_lists.pkl",
    compute_fn=build_user_movie_lists,
)

Computing & saving → cache/user_movie_lists.pkl


For each user, we need to form every pair of movies that user likes.
If there are $n$ users and each user likes $m$ movies, we are left with $n \cdot \frac{m(m-1)}{2}$ pairs.

We can compress these pairs by building a movie-by-movie matrix in which

$mat[i][j] = count$

records how many times the pair $(i, j)$ occurs, i.e. how many users liked both movies.

In [22]:
# Replace the index labels inherited from the filtered frame with 0..n-1.
top_k = top_k.reset_index(drop=True)

# Re-indexing with drop=True does not change the shape.
rows, cols = top_k.shape

# Spot-check a single row by its new label.
top_k.loc[2]

userId                1
movieId             665
rating                1
timestamp    1147878820
Name: 2, dtype: int64

In [23]:
# This creates a Series where the index is userId and the value is a list of movieIds.
# NOTE(review): this rebinds `user_movie_lists`, replacing the dict returned by
# load_or_compute(..., build_user_movie_lists) with a pandas Series. Iterating a
# Series yields its values while iterating a dict yields its keys, so later
# cells can behave differently depending on which assignment ran last.
user_movie_lists = top_k.groupby("userId")["movieId"].apply(list)

Maybe we can write a better co-occurrence function using combinatorics.

In [24]:
def build_cooccur(user_lists=None):
    """Count, for every pair of movies, how many users have both in their list.

    Args:
        user_lists: Mapping-like object whose ``.items()`` yields
            ``(user_id, movie_ids)`` pairs (a dict or a pandas Series both
            work). Defaults to the notebook-level ``user_movie_lists``.

    Returns:
        Nested dict ``{smaller_id: {larger_id: count}}``. Each unordered pair
        is stored once, keyed by its smaller movie id first.
    """
    if user_lists is None:
        user_lists = user_movie_lists

    cooccur = {}
    for _, movies in user_lists.items():
        # De-duplicate so a user counts at most once per pair; sorting makes
        # every generated pair come out as (smaller, larger).
        unique_sorted = sorted(set(movies))
        for lo, hi in itertools.combinations(unique_sorted, 2):
            row = cooccur.setdefault(lo, {})
            row[hi] = row.get(hi, 0) + 1

    return cooccur

# Build the pair-count table, or load it from the cache.
cooccur = load_or_compute(
    path=f"{CACHE_DIR}/cooccur.pkl",
    compute_fn=build_cooccur,
)

Computing & saving → cache/cooccur.pkl


In [25]:
# Count the distinct movie pairs stored in `cooccur`.
# The counter is named `total_pairs` rather than `sum` so the built-in sum()
# is not shadowed for the rest of the notebook.
total_pairs = 0
for neighbours in cooccur.values():
    total_pairs += len(neighbours)

print(total_pairs)

11897161


Using Pointwise Mutual Information to remove popularity bias

In [28]:
def build_movie_likes(user_lists=None):
    """Count how many users have each movie in their list.

    Args:
        user_lists: Mapping-like object whose ``.items()`` yields
            ``(user_id, movie_ids)`` pairs. Defaults to the notebook-level
            ``user_movie_lists``.

    Returns:
        Dict ``{movie_id: number_of_users}``.
    """
    if user_lists is None:
        user_lists = user_movie_lists

    movie_likes = {}
    # .items() behaves the same for a dict and for a pandas Series, whereas
    # plain iteration yields keys for one and values for the other.
    for _, movies in user_lists.items():
        # Count each user once per movie, the same way build_cooccur treats a
        # user's list as a set.
        for m in set(movies):
            movie_likes[m] = movie_likes.get(m, 0) + 1
    return movie_likes

# Per-movie like counts, cached like the other intermediates.
movie_likes = load_or_compute(
    path=f"{CACHE_DIR}/movie_likes.pkl",
    compute_fn=build_movie_likes,
)

# One entry of user_movie_lists per user; used as the user total in the PMI formula.
TOTAL_USERS = len(user_movie_lists)

Computing & saving → cache/movie_likes.pkl


In [None]:

def build_pmi_graph(pair_counts=None, like_counts=None, total_users=None):
    """Turn pair counts into a graph weighted by pointwise mutual information.

    For a pair (i, j) with pair count ``cij`` the weight is
    ``log(cij * total_users / (likes[i] * likes[j]))``; only strictly positive
    weights are kept.

    Args:
        pair_counts: Nested dict ``{i: {j: cij}}``. Defaults to the
            notebook-level ``cooccur``.
        like_counts: Dict ``{movie_id: like_count}``. Defaults to the
            notebook-level ``movie_likes``.
        total_users: Number of users. Defaults to the notebook-level
            ``TOTAL_USERS``.

    Returns:
        Nested dict ``{i: {j: pmi}}``. Every ``i`` with a non-zero like count
        gets an entry, which may be empty if none of its pairs has positive PMI.
    """
    if pair_counts is None:
        pair_counts = cooccur
    if like_counts is None:
        like_counts = movie_likes
    if total_users is None:
        total_users = TOTAL_USERS

    pmi_graph = {}

    for i, neighbours in pair_counts.items():
        # .get() so a movie missing from like_counts is skipped by the zero
        # check instead of raising KeyError before the check can run.
        likes_i = like_counts.get(i, 0)
        if likes_i == 0:
            continue

        row = pmi_graph[i] = {}
        for j, cij in neighbours.items():
            likes_j = like_counts.get(j, 0)
            if likes_j == 0:
                continue

            val = math.log((cij * total_users) / (likes_i * likes_j))

            if val > 0:                     # keep only meaningful associations
                row[j] = val

    return pmi_graph

# Build (or load) the PMI graph. The validator accepts only a dict with more
# than 1000 entries.
# NOTE(review): this call needs a load_or_compute that accepts `validate_fn`.
pmi_graph = load_or_compute(
    path=f"{CACHE_DIR}/pmi_graph.pkl",
    compute_fn=build_pmi_graph,
    validate_fn=lambda x: isinstance(x, dict) and len(x) > 1000,
)


Computing & saving → cache/pmi_graph.pkl
