In [2]:
import tqdm
import json
import glob

import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

## User-based Collaborative Filtering

#### Основная идея: 
Рекомендовать пользователю треки, которые понравились похожим на него пользователям

$$\hat r_{ui} = h^{-1} \left( \frac{\sum_{v \in N_i(u)} w_{uv} h(r_{vi})}{\sum_{v \in N_i(u)} w_{uv}} \right)$$

$N_i(u)$ - соседи пользователя $u$, которые оценили айтем $i$,
$w_{uv}, w_{ij}$ - веса соседей, 
$h$ - функция нормализации



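**Нормализация**: В качестве функции нормализации используем центрирование по пользователю: из времени прослушивания вычитаем среднее время прослушивания этого пользователя, $h(r_{ui}) = r_{ui} - \bar r_u$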

**Веса**: Похожих пользователей будем искать по *cosine similarity*

**Отсутствующие данные**: заполним средним временем прослушивания по пользователю (после центрирования это значение равно 0)

**Соседи**: в качестве соседей будем рассматривать всех пользователей. Q: Как это упростит формулу?

$$\hat r_{ui} \propto \sum_{v} w_{uv} h(r_{vi})$$

In [4]:
# Interaction logs: one JSON Lines file named data.json per sub-directory of week_3.
DATA_GLOB = r"C:\Users\denis\PycharmProjects\recsys-course-spring-2025\rec_sys_data\week_3\*\data.json"

# glob returns matches in file-system-dependent order; sort them so the
# concatenated frame is assembled in the same order on every run and machine.
data_paths = sorted(glob.glob(DATA_GLOB))
if not data_paths:
    # Without this guard pd.concat fails with an opaque "No objects to concatenate".
    raise FileNotFoundError(f"No interaction logs matched {DATA_GLOB!r}")

data = pd.concat([pd.read_json(data_path, lines=True) for data_path in data_paths])
# Uniform random value in [0, 1) per row (unseeded, so it differs between runs).
data["rnd"] = np.random.random(len(data))

data.head(5)

Unnamed: 0,message,timestamp,user,track,time,latency,recommendation,experiments,rnd
0,next,2025-03-10 12:34:54.719,320,30000,1.0,0.001235,30000.0,{'STICKY_ARTIST': 'C'},0.46296
1,next,2025-03-10 12:34:54.721,8372,27544,1.0,0.000553,33964.0,{'STICKY_ARTIST': 'C'},0.653183
2,next,2025-03-10 12:34:54.723,6112,23651,1.0,0.000602,23651.0,{'STICKY_ARTIST': 'T2'},0.876458
3,next,2025-03-10 12:34:54.724,7979,8100,1.0,0.000553,6633.0,{'STICKY_ARTIST': 'T4'},0.197578
4,next,2025-03-10 12:34:54.774,3589,45174,0.04,0.000459,10071.0,{'STICKY_ARTIST': 'T5'},0.069644


In [5]:
# Center every listen on its user's own average:
# normalized_time = time - mean(time of the same user).
data["normalized_time"] = (
    data
    .groupby("user")["time"]
    .transform(lambda user_times: user_times - user_times.mean())
)

data.head(5)

Unnamed: 0,message,timestamp,user,track,time,latency,recommendation,experiments,rnd,normalized_time
0,next,2025-03-10 12:34:54.719,320,30000,1.0,0.001235,30000.0,{'STICKY_ARTIST': 'C'},0.46296,0.624571
1,next,2025-03-10 12:34:54.721,8372,27544,1.0,0.000553,33964.0,{'STICKY_ARTIST': 'C'},0.653183,0.416364
2,next,2025-03-10 12:34:54.723,6112,23651,1.0,0.000602,23651.0,{'STICKY_ARTIST': 'T2'},0.876458,0.605102
3,next,2025-03-10 12:34:54.724,7979,8100,1.0,0.000553,6633.0,{'STICKY_ARTIST': 'T4'},0.197578,0.649655
4,next,2025-03-10 12:34:54.774,3589,45174,0.04,0.000459,10071.0,{'STICKY_ARTIST': 'T5'},0.069644,-0.295263


In [6]:
# User x track matrix of centered listening times. Pairs that appear several
# times are averaged (pivot_table's default aggregation); pairs that never
# appear are filled with 0.
interactions = data.pivot_table(
    values="normalized_time",
    index="user",
    columns="track",
).fillna(0)

# Density = share of cells holding a non-zero value.
print(f"Interactions matrix: shape={interactions.shape}, density={(interactions.to_numpy() != 0).sum() / interactions.size}")

Interactions matrix: shape=(9815, 46864), density=0.0004946864379202338


In [7]:
# Peek at the first three user rows of the user x track matrix.
interactions.head(3)

track,0,1,2,3,4,5,6,7,8,9,...,49990,49991,49992,49993,49994,49995,49996,49997,49998,49999
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Pairwise cosine similarity between the user rows of the interactions matrix.
similarity_matrix = cosine_similarity(interactions)
# Zero the diagonal so a user is never counted as their own neighbour.
similarity_matrix[np.diag_indices_from(similarity_matrix)] = 0

# Average number of users with strictly positive similarity to a given user.
print(f"Mean positive neighbours per user: {np.mean(np.sum(similarity_matrix > 0, axis=1))}")

Mean positive neighbours per user: 44.41670911869588


In [9]:
# Average number of users with strictly negative similarity to a given user.
print(f"Mean negative neighbours per user: {np.mean(np.sum(similarity_matrix < 0, axis=1))}")

Mean negative neighbours per user: 36.3301069791136


In [10]:
# Score of track i for user u = sum over all other users v of
# similarity(u, v) * centered listening time of v on i.
# Expected size: observed users x observed tracks.
scores_matrix = similarity_matrix @ interactions.to_numpy()

# Re-attach the user / track labels of the interactions matrix.
scores = pd.DataFrame(data=scores_matrix, index=interactions.index, columns=interactions.columns)

scores.loc[:, [1, 2, 3, 4, 5]].head(5)

track,1,2,3,4,5
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0


## Глянем на рекомендации

In [31]:
# Directory that tracks.json is read from and recommendations_ub.json is written to.
# File paths are built with plain `+` concatenation, so the value has to end with
# a path separator. This is a raw string, so the trailing `\\` stays two literal
# backslash characters.
BOTIFY_DATA_DIR = r"C:\Users\denis\PycharmProjects\recsys-course-spring-2025\botify\data\\"

In [32]:
# Track catalogue, read as JSON Lines and keyed by the "track" column.
products = (
    pd.read_json(BOTIFY_DATA_DIR + "tracks.json", lines=True)
    .set_index("track")
)
products.head(5)

Unnamed: 0_level_0,artist,album,title,genre,pop,duration
track,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
41164,Михаил Бублик,ART-Обстрел I-часть,Сорок тысяч верст,"[1, 47]",-0.500252,282
27544,Xamdam Sobirov,Baxtli Bo'lolmadik,Baxtli Bo'lolmadik,[1],-0.942953,205
34702,Сергей Какенов,Ишимская шпана,Крутые лагеря,[147],-0.801382,252
45907,Loc-Dog,Electrodog 2,Еду убивать,[17],-0.577525,276
14978,Gafur,Февраль,Февраль,[1],-0.738636,160


In [33]:
# Number of top-scored tracks to show for the inspected user.
k = 30
# Pick one user at random among those that have a row in the scores matrix.
user = np.random.choice(scores.index)

# Raw interaction log of that user.
data.loc[data["user"] == user]

Unnamed: 0,message,timestamp,user,track,time,latency,recommendation,experiments,rnd,normalized_time
6802,next,2025-03-10 12:36:24.737,6314,15313,1.0,0.000464,11385.0,{'STICKY_ARTIST': 'T5'},0.223937,0.537879
6815,next,2025-03-10 12:36:24.899,6314,7480,0.51,0.000732,12499.0,{'STICKY_ARTIST': 'T5'},0.754547,0.047879
6822,next,2025-03-10 12:36:25.008,6314,9531,0.33,0.000654,13634.0,{'STICKY_ARTIST': 'T5'},0.133025,-0.132121
6830,next,2025-03-10 12:36:25.107,6314,1671,0.0,0.000649,12499.0,{'STICKY_ARTIST': 'T5'},0.435222,-0.462121
6834,next,2025-03-10 12:36:25.165,6314,12499,0.0,0.000695,13634.0,{'STICKY_ARTIST': 'T5'},0.755794,-0.462121
8919,next,2025-03-10 12:36:52.585,6314,11230,0.8,0.000613,18741.0,{'STICKY_ARTIST': 'T5'},0.624351,0.337879
8934,next,2025-03-10 12:36:52.804,6314,33962,0.0,0.000779,11231.0,{'STICKY_ARTIST': 'T5'},0.857949,-0.462121
8937,next,2025-03-10 12:36:52.854,6314,11231,0.26,0.000927,7720.0,{'STICKY_ARTIST': 'T5'},0.245674,-0.202121
8941,last,2025-03-10 12:36:52.903,6314,7720,0.21,0.000122,,{'STICKY_ARTIST': 'T5'},0.992774,-0.252121
102369,next,2025-03-10 12:57:17.226,6314,32649,1.0,0.000799,31837.0,{'STICKY_ARTIST': 'T5'},0.579625,0.537879


In [34]:
# The k best-scored tracks of the selected user, enriched with catalogue
# metadata. The inner join drops tracks that are absent from `products`.
user_scores = (
    scores.loc[user]
    .sort_values(ascending=False)
    .iloc[:k]
    .to_frame("score")
    .merge(products, left_index=True, right_index=True, how="inner")
)

user_scores

Unnamed: 0_level_0,score,artist,album,title,genre,pop,duration
track,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
32649,1.582505,BTS,"BTS, THE BEST",IDOL (Japanese ver.),"[152, 10]",-0.018736,223
44708,0.389012,BTS,Skool Luv Affair (Special Addition),Jump,[152],-0.287483,236
19647,0.252318,BTS,"Butter (Hotter, Sweeter, Cooler)",Butter,[152],-0.38912,164
33429,0.22414,BTS,"BTS, THE BEST",Airplane pt.2 (Japanese ver.),"[152, 10]",-0.042091,219
40343,0.211882,BTS,The Most Beautiful Moment in Life: Young Forever,Silver Spoon,[152],-0.51925,233
31837,0.183547,BTS,FAKE LOVE / Airplane pt.2,FAKE LOVE (Japanese Version / Remix),[10],-1.01229,246
37331,0.179737,BTS,You Never Walk Alone,Not Today,[152],-0.479174,231
35612,0.175814,BTS,Love Yourself 承 'Her',Pied Piper,[152],-0.510967,245
39316,0.172272,BTS,The Most Beautiful Moment in Life: Young Forever,House of Cards (Full Length Edition),[152],-0.486124,226
34190,0.164312,Сидоренко Вячеслав,Единственная женщина,Таял снег,[149],-1.015986,262


In [35]:
# Centered listening times of the selected user, highest first, enriched with
# catalogue metadata (inner join on the track id).
user_interactions = (
    interactions.loc[user]
    .sort_values(ascending=False)
    .to_frame("time")
    .merge(products, left_index=True, right_index=True, how="inner")
)

# Keep only tracks with a non-zero value in the interactions matrix.
user_interactions.loc[user_interactions["time"] != 0]

Unnamed: 0_level_0,time,artist,album,title,genre,pop,duration
track,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
15501,0.537879,Динара Залумханова,Это любовь,Это любовь,[1],-0.816551,254
15313,0.537879,Динара Залумханова,Наш апрель,Наш апрель,[1],-0.622484,235
32649,0.537879,BTS,"BTS, THE BEST",IDOL (Japanese ver.),"[152, 10]",-0.018736,223
11230,0.337879,Патимат Расулова,Мой аварец,Мой аварец,[81],-0.705478,245
31837,0.337879,BTS,FAKE LOVE / Airplane pt.2,FAKE LOVE (Japanese Version / Remix),[10],-1.01229,246
18741,0.177879,Патимат Расулова,Душа моя,Душа моя,[1],-0.878103,208
39316,0.177879,BTS,The Most Beautiful Moment in Life: Young Forever,House of Cards (Full Length Edition),[152],-0.486124,226
11385,0.071212,Динара Залумханова,Мотылек,Мотылек,[81],-0.404475,188
11263,0.047879,Патимат Расулова,Не твоя,Не твоя,[81],0.199286,197
6407,0.047879,Динара Залумханова,Пусть весь мир померкнет,Пусть весь мир померкнет,[1],-1.010349,195


## Подготавливаем рекомендации для продакшена

In [29]:
def recommend(user_id, scores, k):
    """Return the labels of the ``k`` best-scored tracks for one user.

    Args:
        user_id: Row label of the user in ``scores``.
        scores: DataFrame with one row per user and one column per track.
        k: Maximum number of track labels to return.

    Returns:
        A plain Python list of column labels, ordered by descending score.

    Raises:
        KeyError: If ``user_id`` is not a row label of ``scores``.
    """
    user_row = scores.loc[user_id]
    ranked_tracks = user_row.sort_values(ascending=False).index
    return ranked_tracks[:k].tolist()

In [30]:
# Export the user-based CF recommendations as JSON Lines:
# one {"user": ..., "tracks": [...]} object per user.
TOP_K = 100  # number of tracks stored per user

users = data["user"].unique()

# Explicit encoding keeps the file identical regardless of the platform default.
with open(BOTIFY_DATA_DIR + "recommendations_ub.json", "w", encoding="utf-8") as rf:
    # The loop variable is deliberately not called `user`: that name holds the
    # user picked for manual inspection, and overwriting it here would make
    # re-running the inspection cells silently show the last exported user.
    for user_id in tqdm.tqdm(users):
        recommendation = {
            # int() turns the numpy integer into a JSON-serializable Python int.
            "user": int(user_id),
            "tracks": recommend(user_id, scores, TOP_K)
        }
        rf.write(json.dumps(recommendation) + "\n")

100%|██████████| 9815/9815 [00:15<00:00, 624.31it/s]
