In [None]:
from collections import Counter
from functools import reduce

import numpy as np
import pandas as pd

from bagoftools.logger import Logger

# Notebook-wide logger used by the cells below via `L.info(...)`.
# NOTE(review): `colorize=True` presumably enables coloured output — confirm against bagoftools.
L = Logger(colorize=True)

In [None]:
# Read the four CSV tables from ./data, one DataFrame per file, in the same
# order as the names on the left-hand side.
links, movies, ratings, tags = map(
    pd.read_csv,
    (
        './data/links.csv',
        './data/movies.csv',
        './data/ratings.csv',
        './data/tags.csv',
    ),
)

In [None]:
# Distinct ids found in the ratings table, each shifted down by one.
# `user_ids` is later used to index rows of the rating matrix R.
# NOTE(review): R's columns are positional (the pivot index is reset), so the
# values in `movie_ids` only coincide with R's column indices if movieIds are
# contiguous and all rated — confirm before using them as indices.
user_ids = set(ratings['userId'].sub(1))
movie_ids = set(ratings['movieId'].sub(1))

L.info(f'{len(user_ids):6,d} unique UserIDs')
L.info(f'{len(movie_ids):6,d} unique MovieIDs')

In [None]:
# movieId -> title lookup built from the movies table.
mid2title = dict(zip(movies.movieId, movies.title))

# Column i of the rating matrix R is the i-th smallest movieId that has at
# least one non-null rating: pivot_table sorts its index and the following
# reset_index(drop=True) replaces the ids with positions. movieIds are not
# guaranteed to be contiguous, so `index + 1` is not a reliable movieId.
col2mid = sorted(ratings.loc[ratings.rating.notna(), 'movieId'].unique())


def title_of(uid):
    """Return the title of the movie stored in column ``uid`` of ``R``.

    Despite its name, ``uid`` is a movie column index (as produced by
    ``recommend`` and ``preferences_of``), not a user id.

    Parameters
    ----------
    uid : int
        0-based column index into the rating matrix.

    Returns
    -------
    str
        The movie title, or ``'N/A'`` when the index is out of range or the
        movie has no entry in the movies table.
    """
    if not 0 <= uid < len(col2mid):
        return 'N/A'
    return mid2title.get(col2mid[uid], 'N/A')
    
def genres_of(uid):
    """Count the genres of every movie rated by a user.

    Parameters
    ----------
    uid : int
        0-based user index, i.e. ``userId - 1`` — the same convention as
        ``user_ids`` and the rows of ``R``.

    Returns
    -------
    collections.Counter
        Genre name -> number of rated movies carrying that genre. Empty when
        the user has no ratings.
    """
    # The ratings table stores the original 1-based userId, so shift the
    # 0-based index back before comparing.
    rated_ids = ratings.loc[ratings.userId == uid + 1, 'movieId']
    rated_movies = movies[movies.movieId.isin(rated_ids)]

    # Split each '|'-separated genre string directly; this needs no `reduce`,
    # avoids quadratic string concatenation and works for zero movies.
    return Counter(
        genre
        for genre_list in rated_movies.genres
        for genre in genre_list.split('|')
    )

def preferences_of(uid):
    """Return every column index of ``R``, ordered by row ``uid``'s value, highest first.

    Entries that are 0 in the row are included and end up after the
    positive ones.
    """
    negated_row = -R[uid]
    return negated_row.argsort()

In [None]:
# Rating matrix, users x movies: one row per userId, one column per rated
# movie (columns are positional, in ascending movieId order). Missing
# ratings are stored as 0.
R = (
    ratings
    .pivot_table(index=['movieId'], columns=['userId'], values='rating')
    .reset_index(drop=True)
    .fillna(0)
    .transpose()
    .to_numpy()
)

L.info(f'rating matrix shape: {R.shape}')

In [None]:
def user_similarity(uid1, uid2, method='cosine'):
    """Similarity between two users, computed from their rows of ``R``.

    Parameters
    ----------
    uid1, uid2 : int
        Row indices into the rating matrix ``R``.
    method : str
        ``'cosine'`` (default) or ``'pearson'`` (not implemented yet).

    Returns
    -------
    float
        Cosine of the angle between the two rating vectors. ``0.0`` when
        either vector has zero norm, so callers never receive NaN.

    Raises
    ------
    NotImplementedError
        For ``method='pearson'``.
    ValueError
        For any other unknown method.
    """
    if method == 'cosine':
        u, v = R[uid1], R[uid2]
        denom = np.linalg.norm(u) * np.linalg.norm(v)
        # A user without any rating has a zero vector; 0/0 would yield NaN
        # and make any ordering by similarity unreliable.
        if denom == 0:
            return 0.0
        return np.dot(u, v) / denom

    elif method == 'pearson':
        raise NotImplementedError()

    else:
        raise ValueError(f'Invalid method {method}. Expected cosine or pearson.')

        
def similar_users(uid, count=5):
    """Return up to ``count`` other users, most similar to ``uid`` first.

    Candidates are all members of ``user_ids`` except ``uid`` itself, ranked
    by ``user_similarity`` with its default method.
    """
    def closeness(other):
        return user_similarity(uid, other)

    ranked = list(user_ids - {uid})
    ranked.sort(key=closeness, reverse=True)
    return ranked[:count]


def recommend(uid, count=5, n_neighbours=5):
    """Recommend movies the user has not rated yet.

    Each unrated movie is scored with the similarity-weighted mean of the
    ratings given by the user's nearest neighbours.

    Parameters
    ----------
    uid : int
        Row index of the user in the rating matrix ``R``.
    count : int
        Maximum number of recommendations to return.
    n_neighbours : int
        How many most-similar users contribute to the score (default 5).

    Returns
    -------
    numpy.ndarray
        Column indices of ``R``, best score first. Never contains a movie
        the user already rated; may be shorter than ``count`` when fewer
        unrated movies exist.
    """
    neighbours = list(similar_users(uid, count=n_neighbours))
    weights = np.array([user_similarity(uid, u) for u in neighbours], dtype=float)
    total = weights.sum()

    scores = np.zeros(R.shape[1], dtype=float)
    # With no neighbours, or a zero/NaN weight sum, the weighted mean is
    # undefined; keep neutral zero scores instead of dividing by zero.
    if neighbours and np.isfinite(total) and total != 0:
        scores = weights @ R[neighbours] / total

    # Rank only unrated movies. Giving rated movies a score of 0 (as a plain
    # zero-initialised array would) lets them tie with unrated zero-score
    # movies and leak into the result.
    candidates = np.flatnonzero(~(R[uid] > 0))
    ranking = np.argsort(-scores[candidates], kind='stable')
    return candidates[ranking[:count]]

In [None]:
# Top-10 recommendations for the user in row 12 of R, as column indices of R.
recommend(uid=12, count=10)

In [None]:
# Same recommendations as above, shown as titles.
list(map(title_of, recommend(uid=12, count=10)))

In [None]:
# Titles of the ten highest-valued entries in row 12 of R, for comparison.
list(map(title_of, preferences_of(uid=12)[:10]))