In [2]:
import tqdm
import json

import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

## User-based Collaborative Filtering

#### Основная идея: 
Рекомендовать пользователю треки, которые понравились похожим на него пользователям

$$\hat r_{ui} = h^{-1} \left( \frac{\sum_{v \in N_i(u)} w_{uv} h(r_{vi})}{\sum_{v \in N_i(u)} w_{uv}} \right)$$

$N_i(u)$ - соседи пользователя $u$, которые оценили айтем $i$,
$w_{uv}$ - веса соседей, т.е. близость пользователей $u$ и $v$ (в item-based варианте им соответствуют веса $w_{ij}$),
$h$ - функция нормализации



**Нормализация**: в качестве функции нормализации вычитаем среднее время прослушивания пользователя: $h(r_{ui}) = r_{ui} - \bar r_u$

**Веса**: Похожих пользователей будем искать по *cosine similarity*

**Отсутствующие данные**: заполним средним временем прослушивания по пользователю

**Соседи**: в качестве соседей будем рассматривать всех пользователей. Q: Как это упростит формулу?

In [3]:
# Directory that holds the Botify data files used by this notebook.
# NOTE: both paths in this cell are absolute and machine-specific.
BOTIFY_DATA_DIR = "/Users/n.anokhin/Projects/recsys-course/botify/data/"

# Read the line-delimited JSON log and keep only the three columns used below.
data = (
    pd.read_json("/Users/n.anokhin/Desktop/input.json", lines=True)
    .loc[:, ["user", "time", "track"]]
    .copy()
)

data.head()

Unnamed: 0,user,time,track
0,2992,1.0,5776
1,5146,1.0,6022
2,6892,1.0,10163
3,2992,0.0,8433
4,5146,0.02,41940


In [4]:
# Centre every listening time on the mean of its own user, so that a positive
# value means "listened longer than this user's average".
data["normalized_time"] = (
    data.groupby("user")["time"]
    .transform(lambda user_times: user_times.sub(user_times.mean()))
)

data.head()

Unnamed: 0,user,time,track,normalized_time
0,2992,1.0,5776,0.78375
1,5146,1.0,6022,0.7284
2,6892,1.0,10163,0.833333
3,2992,0.0,8433,-0.21625
4,5146,0.02,41940,-0.2516


In [21]:
# User x track matrix of mean-centred listening times. Repeated (user, track)
# pairs are averaged (pivot_table's default aggregation); unobserved pairs are
# filled with 0, which after centring stands for the user's mean time.
interactions = pd.pivot_table(data, values="normalized_time", index="user", columns="track").fillna(0)

# The share of non-zero cells is the matrix *density* (sparsity would be
# 1 - density), so it is reported under that name.
print(f"Interactions matrix: shape={interactions.shape}, density={(interactions != 0).values.sum() / interactions.size}")

Interactions matrix: shape=(8759, 46209), sparsity=0.00036724390792474776


In [22]:
# Pairwise cosine similarity between the user rows of the interaction matrix.
similarity_matrix = cosine_similarity(interactions)
# Zero the diagonal so that a user is never counted as their own neighbour.
np.fill_diagonal(similarity_matrix, 0)

# Average number of users with a strictly positive similarity, per user.
print(f"Mean positive neighbours per user: {np.count_nonzero(similarity_matrix > 0, axis=1).mean()}")

Mean positive neighbours per user: 40.980705559995435


In [23]:
# Average number of users with a strictly negative similarity, per user.
print(f"Mean negative neighbours per user: {np.count_nonzero(similarity_matrix < 0, axis=1).mean()}")

Mean negative neighbours per user: 21.3225254024432


In [24]:
# Raw user-based scores: for every user, the similarity-weighted sum of the
# other users' normalised listening times (shape: observed users x observed tracks).
# NOTE: this is the unnormalised numerator only - the sum is not divided by the
# sum of the weights and the user's mean time is not added back.
scores_matrix = similarity_matrix @ interactions.values

# Re-attach the user / track labels of the interaction matrix.
scores = pd.DataFrame(scores_matrix, index=interactions.index, columns=interactions.columns)

scores[[1, 2, 3, 4, 5]].head()

track,1,2,3,4,5
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,-0.055265
4,0.0,0.0,0.0,0.0,0.0


## Глянем на рекомендации

In [26]:
# Track catalogue (line-delimited JSON), indexed by track id so that it can be
# joined to per-track scores on the index.
products = (
    pd.read_json(BOTIFY_DATA_DIR + "tracks.json", lines=True)
    .set_index("track")
)
products.head()

Unnamed: 0_level_0,artist,title
track,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Jack Johnson,The Cove
1,Billy Preston,Nothing from Nothing
2,Paco De Lucia,Entre Dos Aguas
3,Josh Rouse,Under Cold Blue Stars
4,The Dead 60s,Riot Radio (Soundtrack Version)


In [46]:
# Number of top-scored tracks to display for the inspected user.
k = 10
# Pick a random user to inspect; no seed is set, so the choice differs between runs.
user = np.random.choice(scores.index)

In [47]:
# Raw listening log rows of the selected user.
data.loc[data["user"] == user]

Unnamed: 0,user,time,track,normalized_time
107149,9071,1.0,3918,0.828333
107152,9071,0.03,48975,-0.141667
107155,9071,0.0,46220,-0.171667
107157,9071,0.0,14204,-0.171667
107159,9071,0.0,5984,-0.171667
107161,9071,0.0,39056,-0.171667


In [49]:
# The k best-scored tracks of the selected user, enriched with artist / title.
# Tracks the user has already listened to are not filtered out here.
user_scores = (
    scores.loc[user]
    .sort_values(ascending=False)
    .head(k)
    .to_frame("score")
    .merge(products, how="inner", left_index=True, right_index=True)
)

user_scores

Unnamed: 0_level_0,score,artist,title
track,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3918,2.474152,Mike And The Mechanics,A Beggar On A Beach Of Gold
38988,0.456922,Cabaret Voltaire,Do The Mussolini (Head Kick) They Kill Him Dub
38763,0.355865,Hole,Rock Star
14754,0.348692,Mogwai,Glasgow Mega-Snake
14401,0.32757,Katie Melua,On The Road Again
3590,0.313707,The Ruts,West One (Shine On Me)
1487,0.297004,Justin Bieber,One Time
1495,0.297004,Justin Bieber,U Smile
8262,0.297004,The Prodigy,The Big Gundown
5683,0.277812,Echo And The Bunnymen,Zimbo (Live)


In [48]:
# The selected user's row of the interaction matrix, joined with track metadata
# and ordered from the highest to the lowest normalised time.
user_interactions = (
    interactions.loc[user]
    .sort_values(ascending=False)
    .to_frame("time")
    .merge(products, how="inner", left_index=True, right_index=True)
)

# Show only the non-zero entries of the row.
user_interactions.loc[user_interactions["time"] != 0]

Unnamed: 0_level_0,time,artist,title
track,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3918,0.828333,Mike And The Mechanics,A Beggar On A Beach Of Gold
48975,-0.141667,Blitzen Trapper,Summer Town (Album)
46220,-0.171667,Third Day,Believe
14204,-0.171667,Valley of the Giants,Bala Bay Inn
39056,-0.171667,Marshall Jefferson,Move Your Body
5984,-0.171667,Bad Company,Gone_ Gone_ Gone


## Подготавливаем рекомендации для продакшена

In [None]:
def recommend(user_id, scores, k):
    """Return the labels of the k highest-scored columns for one user.

    Args:
        user_id: Row label of the user in ``scores``.
        scores: DataFrame of scores with users as rows and tracks as columns.
        k: Number of tracks to return.

    Returns:
        A plain Python list of column labels, ordered by descending score.
    """
    ranked = scores.loc[user_id].sort_values(ascending=False)
    top_tracks = ranked.head(k)
    return top_tracks.index.tolist()

In [None]:
users = data["user"].unique()

# Write one JSON object per line: {"user": <id>, "tracks": [<track ids>]}.
with open(BOTIFY_DATA_DIR + "recommendations_ub.json", "w") as rf:
    # A dedicated loop variable keeps the notebook-level `user` (the one picked
    # for the inspection cells) from being overwritten by this loop.
    for user_id in tqdm.tqdm(users):
        recommendation = {
            # unique() yields numpy scalars; cast to a plain int for json.
            "user": int(user_id),
            # Top-100 tracks by score for this user.
            "tracks": recommend(user_id, scores, 100)
        }
        rf.write(json.dumps(recommendation) + "\n")