In [3]:
import tqdm
import json
import glob

import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

## User-based Collaborative Filtering

#### Основная идея: 
Рекомендовать пользователю треки, которые понравились похожим на него пользователям

$$\hat r_{ui} = h^{-1} \left( \frac{\sum_{v \in N_i(u)} w_{uv} h(r_{vi})}{\sum_{v \in N_i(u)} w_{uv}} \right)$$

$N_i(u)$ - соседи пользователя $u$, которые оценили айтем $i$,
$w_{uv}, w_{ij}$ - веса соседей, 
$h$ - функция нормализации



**Нормализация**: В качестве функции нормализации используем среднее время прослушивания

**Веса**: Похожих пользователей будем искать по *cosine similarity*

**Отсутствующие данные**: заполним средним времнем прослушивания по пользователю

**Соседи**: в качестве соседей будем рассматривать всех пользователей. Q: Как это упростит формулу?

$$\hat r_{ui} \propto \sum_{v} w_{uv} h(r_{vi})$$

In [5]:
data = pd.concat([
    pd.read_json(data_path, lines=True) 
    for data_path 
    in glob.glob("/Users/n.anokhin/Desktop/data/*/data.json")
])
data["rnd"] = np.random.random(len(data))

data.head(5)

Unnamed: 0,message,timestamp,user,track,time,latency,recommendation,experiments,rnd
0,next,2024-02-19 07:56:30.469,5650,16053,0.8,0.021466,1817.0,{'STICKY_ARTIST': 'T2'},0.800656
1,next,2024-02-19 07:56:30.504,5100,18840,0.45,0.002275,708.0,{'STICKY_ARTIST': 'T1'},0.27283
2,next,2024-02-19 07:56:30.534,6269,9804,0.0,0.001347,49810.0,{'STICKY_ARTIST': 'T1'},0.935573
3,next,2024-02-19 07:56:30.554,771,525,1.0,0.000751,49387.0,{'STICKY_ARTIST': 'T3'},0.733893
4,next,2024-02-19 07:56:30.573,771,49387,0.02,0.001765,1873.0,{'STICKY_ARTIST': 'T3'},0.334892


In [6]:
data["normalized_time"] = data.groupby("user")["time"].transform(lambda time: time - time.mean())

data.head()

Unnamed: 0,message,timestamp,user,track,time,latency,recommendation,experiments,rnd,normalized_time
0,next,2024-02-19 07:56:30.469,5650,16053,0.8,0.021466,1817.0,{'STICKY_ARTIST': 'T2'},0.800656,0.376852
1,next,2024-02-19 07:56:30.504,5100,18840,0.45,0.002275,708.0,{'STICKY_ARTIST': 'T1'},0.27283,0.108333
2,next,2024-02-19 07:56:30.534,6269,9804,0.0,0.001347,49810.0,{'STICKY_ARTIST': 'T1'},0.935573,-0.226667
3,next,2024-02-19 07:56:30.554,771,525,1.0,0.000751,49387.0,{'STICKY_ARTIST': 'T3'},0.733893,0.629038
4,next,2024-02-19 07:56:30.573,771,49387,0.02,0.001765,1873.0,{'STICKY_ARTIST': 'T3'},0.334892,-0.350962


In [7]:
interactions = pd.pivot_table(data, values="normalized_time", index="user", columns="track").fillna(0)

print(f"Interactions matrix: shape={interactions.shape}, sparsity={(interactions != 0).values.sum() / interactions.size}")

Interactions matrix: shape=(9515, 44610), sparsity=0.0004127250793736055


In [8]:
similarity_matrix = cosine_similarity(interactions)
np.fill_diagonal(similarity_matrix, 0)

print(f"Mean positive neighbours per user: {(similarity_matrix > 0).sum(axis=1).mean()}")

Mean positive neighbours per user: 59.66431949553337


In [9]:
print(f"Mean negative neighbours per user: {(similarity_matrix < 0).sum(axis=1).mean()}")

Mean negative neighbours per user: 33.41229637414609


In [19]:
# TODO Seminar 3 step 1.1: Compute user-based scores
# TODO: expected size: observed users x observed tracks
scores_matrix = np.zeros_like(interactions)

scores = pd.DataFrame(
    scores_matrix,
    index=interactions.index,
    columns=interactions.columns
)

scores[[1, 2, 3, 4, 5]].head()

track,1,2,3,4,5
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0


## Глянем на рекомендации

In [20]:
BOTIFY_DATA_DIR = "/Users/n.anokhin/Projects/recsys-course/botify/data/"

In [21]:
products = pd.read_json(BOTIFY_DATA_DIR + "tracks.json", lines=True).set_index("track")
products.head()

Unnamed: 0_level_0,artist,title,genre,pop
track,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
7,Harmonia,Sehr kosmisch,Pop_Rock,65688
0,Björk,Undo,,57660
2,Dwight Yoakam,You're The One,Country,55035
1,Florence + The Machine,Dog Days Are Over (Radio Edit),,52773
15,Kings Of Leon,Revelry,Pop_Rock,48290


In [22]:
user = np.random.choice(scores.index)
k = 10

data[data["user"] == user]

Unnamed: 0,message,timestamp,user,track,time,latency,recommendation,experiments,rnd,normalized_time
2452,next,2024-02-19 07:57:19.437,1045,5573,1.0,0.010495,35667.0,{'STICKY_ARTIST': 'T2'},0.038974,0.754118
2454,next,2024-02-19 07:57:19.464,1045,35667,0.0,0.003881,3553.0,{'STICKY_ARTIST': 'T2'},0.179203,-0.245882
3634,next,2024-02-19 07:57:39.357,1045,31657,0.01,0.003484,31657.0,{'STICKY_ARTIST': 'T2'},0.921108,-0.235882
3635,next,2024-02-19 07:57:39.368,1045,31657,0.0,0.002554,29891.0,{'STICKY_ARTIST': 'T2'},0.106091,-0.245882
10618,next,2024-02-19 07:59:27.452,1045,45218,0.0,0.001436,31187.0,{'STICKY_ARTIST': 'T2'},0.245053,-0.245882
12877,next,2024-02-19 07:59:54.913,1045,32528,0.0,0.005296,37551.0,{'STICKY_ARTIST': 'T2'},0.554635,-0.245882
14459,next,2024-02-19 08:00:22.156,1045,15093,1.0,0.001864,37545.0,{'STICKY_ARTIST': 'T2'},0.986381,0.754118
2453,next,2024-02-19 07:57:19.454,1045,35667,0.0,0.009436,35667.0,{'STICKY_ARTIST': 'T2'},0.50125,-0.245882
10620,last,2024-02-19 07:59:27.482,1045,13295,0.0,0.000484,,{'STICKY_ARTIST': 'T2'},0.858799,-0.245882
12878,last,2024-02-19 07:59:54.922,1045,37551,0.0,0.001572,,{'STICKY_ARTIST': 'T2'},0.305221,-0.245882


In [23]:
user_scores = pd.merge(
    scores.loc[user].sort_values(ascending=False)[:k].to_frame("score"),
    products, 
    left_index=True, 
    right_index=True,
    how="inner"
)

user_scores

Unnamed: 0_level_0,score,artist,title,genre,pop
track,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.0,Björk,Undo,,57660
32686,0.0,Franco Battiato,Aria Di Rivoluzione/Sequenze E Frequenze,,95
32678,0.0,Darkseed,Like To A Silver Bow,Pop_Rock,55
32679,0.0,Lunic,The Little Room,,60
32680,0.0,Benabar,Y'a Une Fille Qu'Habite Chez Moi,,122
32681,0.0,The Gathering,Even The Spirits Are Afraid,Pop_Rock,65
32682,0.0,Nick Drake,Horn,Folk,136
32683,0.0,Of Montreal,A Sentence Of Sorts In Kongsvinger,,288
32684,0.0,Nick Drake,Free Ride,Folk,133
32685,0.0,Okkervil River,A King And A Queen,,86


In [24]:
user_interactions = pd.merge(
    interactions.loc[user].sort_values(ascending=False).to_frame("time"),
    products, 
    left_index=True, 
    right_index=True, 
    how="inner"
)

user_interactions[user_interactions["time"] != 0]

Unnamed: 0_level_0,time,artist,title,genre,pop
track,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
11745,0.754118,Steve Miller Band,Abracadabra,,1344
5573,0.754118,John Mellencamp,Jack & Diane,,2291
15093,0.754118,The Killers,Bones,,2125
568,0.754118,The Killers,When You Were Young,,13571
1916,0.754118,Steve Miller,Jungle Love,,762
757,0.554118,The Killers,Human,,5618
1926,0.164118,Steve Miller Band,The Joker,,3686
37581,0.154118,Steve Miller,Threshold,,105
1908,0.149118,Steve Miller,True Fine Love,,221
37551,0.074118,Steve Miller Band,Space Cowboy (Digitally Remastered 90),Pop_Rock,262


## Подготавливаем рекомендации для продакшена

In [25]:
def recommend(user_id, scores, k):
    return scores.loc[user_id].sort_values(ascending=False)[:k].index.tolist()

In [26]:
users = data["user"].unique()

with open(BOTIFY_DATA_DIR + "recommendations_ub.json", "w") as rf:
    for user in tqdm.tqdm(users):
        recommendation = {
            "user": int(user),
            "tracks": recommend(user, scores, 100)
        }
        rf.write(json.dumps(recommendation) + "\n")

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9515/9515 [00:20<00:00, 461.26it/s]
