In [1]:
import tqdm
import json
import glob

import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

ModuleNotFoundError: No module named 'pandas'

## User-based Collaborative Filtering

#### Основная идея: 
Рекомендовать пользователю треки, которые понравились похожим на него пользователям

$$\hat r_{ui} = h^{-1} \left( \frac{\sum_{v \in N_i(u)} w_{uv} h(r_{vi})}{\sum_{v \in N_i(u)} w_{uv}} \right)$$

$N_i(u)$ - соседи пользователя $u$, которые оценили айтем $i$,
$w_{uv}, w_{ij}$ - веса соседей, 
$h$ - функция нормализации



**Нормализация**: В качестве функции нормализации используем среднее время прослушивания

**Веса**: Похожих пользователей будем искать по *cosine similarity*

**Отсутствующие данные**: заполним средним времнем прослушивания по пользователю

**Соседи**: в качестве соседей будем рассматривать всех пользователей. Q: Как это упростит формулу?

$$\hat r_{ui} \propto \sum_{v} w_{uv} h(r_{vi})$$

In [3]:
data = pd.concat([
    pd.read_json(data_path, lines=True) 
    for data_path 
    in glob.glob("/Users/n.anokhin/Desktop/data/*/data.json")
])
data["rnd"] = np.random.random(len(data))

data.head(5)

Unnamed: 0,message,timestamp,user,track,time,latency,recommendation,experiments,rnd
0,next,2024-02-19 07:56:30.469,5650,16053,0.8,0.021466,1817.0,{'STICKY_ARTIST': 'T2'},0.878263
1,next,2024-02-19 07:56:30.504,5100,18840,0.45,0.002275,708.0,{'STICKY_ARTIST': 'T1'},0.83699
2,next,2024-02-19 07:56:30.534,6269,9804,0.0,0.001347,49810.0,{'STICKY_ARTIST': 'T1'},0.197788
3,next,2024-02-19 07:56:30.554,771,525,1.0,0.000751,49387.0,{'STICKY_ARTIST': 'T3'},0.799105
4,next,2024-02-19 07:56:30.573,771,49387,0.02,0.001765,1873.0,{'STICKY_ARTIST': 'T3'},0.077482


In [4]:
data["normalized_time"] = data.groupby("user")["time"].transform(lambda time: time - time.mean())

data.head()

Unnamed: 0,message,timestamp,user,track,time,latency,recommendation,experiments,rnd,normalized_time
0,next,2024-02-19 07:56:30.469,5650,16053,0.8,0.021466,1817.0,{'STICKY_ARTIST': 'T2'},0.878263,0.376852
1,next,2024-02-19 07:56:30.504,5100,18840,0.45,0.002275,708.0,{'STICKY_ARTIST': 'T1'},0.83699,0.108333
2,next,2024-02-19 07:56:30.534,6269,9804,0.0,0.001347,49810.0,{'STICKY_ARTIST': 'T1'},0.197788,-0.226667
3,next,2024-02-19 07:56:30.554,771,525,1.0,0.000751,49387.0,{'STICKY_ARTIST': 'T3'},0.799105,0.629038
4,next,2024-02-19 07:56:30.573,771,49387,0.02,0.001765,1873.0,{'STICKY_ARTIST': 'T3'},0.077482,-0.350962


In [11]:
interactions = pd.pivot_table(data, values="normalized_time", index="user", columns="track").fillna(0)

print(f"Interactions matrix: shape={interactions.shape}, density={(interactions != 0).values.sum() / interactions.size}")

Interactions matrix: shape=(9515, 44610), density=0.0004127250793736055


In [12]:
interactions.head(3)

track,0,1,2,3,4,5,6,7,9,10,...,49989,49990,49991,49992,49993,49994,49995,49996,49997,49998
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
similarity_matrix = cosine_similarity(interactions)
np.fill_diagonal(similarity_matrix, 0)

print(f"Mean positive neighbours per user: {(similarity_matrix > 0).sum(axis=1).mean()}")

Mean positive neighbours per user: 59.66431949553337


In [14]:
print(f"Mean negative neighbours per user: {(similarity_matrix < 0).sum(axis=1).mean()}")

Mean negative neighbours per user: 33.41229637414609


In [15]:
# expected size: observed users x observed tracks
scores_matrix = np.matmul(similarity_matrix, interactions.values)

scores = pd.DataFrame(
    scores_matrix,
    index=interactions.index,
    columns=interactions.columns
)

scores[[1, 2, 3, 4, 5]].head()

track,1,2,3,4,5
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.0,0.0,0.0,0.0,0.0
1,0.0,-0.036917,0.0,0.0,-0.002214
2,0.0,0.089285,0.0,0.0,-0.031835
3,0.004679,0.0,0.0,0.0,0.003102
4,0.000528,0.0,0.0,0.0,-0.007229


## Глянем на рекомендации

In [16]:
BOTIFY_DATA_DIR = "/Users/n.anokhin/Projects/recsys-course/botify/data/"

In [17]:
products = pd.read_json(BOTIFY_DATA_DIR + "tracks.json", lines=True).set_index("track")
products.head()

Unnamed: 0_level_0,artist,title,genre,pop
track,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
7,Harmonia,Sehr kosmisch,Pop_Rock,65688
0,Björk,Undo,,57660
2,Dwight Yoakam,You're The One,Country,55035
1,Florence + The Machine,Dog Days Are Over (Radio Edit),,52773
15,Kings Of Leon,Revelry,Pop_Rock,48290


In [25]:
user = np.random.choice(scores.index)
k = 30

data[data["user"] == user]

Unnamed: 0,message,timestamp,user,track,time,latency,recommendation,experiments,rnd,normalized_time
4107,next,2024-02-19 07:57:48.139,156,45140,0.0,0.002757,17659.0,{'STICKY_ARTIST': 'T1'},0.950156,-0.1675
4109,next,2024-02-19 07:57:48.170,156,20549,0.01,0.005462,26043.0,{'STICKY_ARTIST': 'T1'},0.230883,-0.1575
10224,next,2024-02-19 07:59:22.972,156,22927,0.0,0.010818,22927.0,{'STICKY_ARTIST': 'T1'},0.953156,-0.1675
10223,next,2024-02-19 07:59:22.942,156,421,1.0,0.002987,22927.0,{'STICKY_ARTIST': 'T1'},0.53785,0.8325
4108,next,2024-02-19 07:57:48.156,156,17659,0.0,0.006016,20549.0,{'STICKY_ARTIST': 'T1'},0.284824,-0.1675
10226,next,2024-02-19 07:59:22.985,156,22927,0.0,0.004619,16367.0,{'STICKY_ARTIST': 'T1'},0.867001,-0.1675
10228,last,2024-02-19 07:59:23.073,156,22927,0.0,0.000651,,{'STICKY_ARTIST': 'T1'},0.085701,-0.1675
4107,next,2024-02-19 07:57:48.131,156,7838,1.0,0.004467,45140.0,{'STICKY_ARTIST': 'T1'},0.924995,0.8325
4111,next,2024-02-19 07:57:48.182,156,26043,0.0,0.004453,26043.0,{'STICKY_ARTIST': 'T1'},0.750087,-0.1675
10224,next,2024-02-19 07:59:22.953,156,22927,0.0,0.004999,22927.0,{'STICKY_ARTIST': 'T1'},0.198735,-0.1675


In [26]:
user_scores = pd.merge(
    scores.loc[user].sort_values(ascending=False)[:k].to_frame("score"),
    products, 
    left_index=True, 
    right_index=True,
    how="inner"
)

user_scores

Unnamed: 0_level_0,score,artist,title,genre,pop
track,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
4245,0.116487,Jonny L,Microdaze,,3798
12453,0.116487,Rebelution,From the Window,Reggae,290
443,0.116487,Abba,Lay All Your Love On Me,Pop_Rock,923
421,0.090071,Abba,Take A Chance On Me,Pop_Rock,2171
36919,0.086484,Abba,Slipping Through My Fingers,Pop_Rock,222
7838,0.050838,Telefon Tel Aviv,Your Every Idol,Electronic,191
25334,0.050838,Dead Can Dance,Enigma Of The Absolute,Pop_Rock,81
10235,0.046673,Miranda,Vete de aqui (con Fangoria),,104
6612,0.045285,Amy Winehouse,Wake Up Alone,,1935
43637,0.045285,Tandem,Le Général,,204


In [27]:
user_interactions = pd.merge(
    interactions.loc[user].sort_values(ascending=False).to_frame("time"),
    products, 
    left_index=True, 
    right_index=True, 
    how="inner"
)

user_interactions[user_interactions["time"] != 0]

Unnamed: 0_level_0,time,artist,title,genre,pop
track,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
7838,0.8325,Telefon Tel Aviv,Your Every Idol,Electronic,191
421,0.8325,Abba,Take A Chance On Me,Pop_Rock,2171
20549,-0.1575,Madeleine Peyroux,Once In A While,Vocal,118
16367,-0.1675,David Seville,Witch Doctor,,57
17659,-0.1675,Madeleine Peyroux,La Javanaise,Vocal,220
22927,-0.1675,David Seville,Witch Doctor,,346
45140,-0.1675,Confederate Railroad,Summer In Dixie (LP Version),Country,218
26043,-0.1675,Madeleine Peyroux,Reckless Blues (LP Version),Vocal,113


## Подготавливаем рекомендации для продакшена

In [28]:
def recommend(user_id, scores, k):
    return scores.loc[user_id].sort_values(ascending=False)[:k].index.tolist()

In [29]:
users = data["user"].unique()

with open(BOTIFY_DATA_DIR + "recommendations_ub.json", "w") as rf:
    for user in tqdm.tqdm(users):
        recommendation = {
            "user": int(user),
            "tracks": recommend(user, scores, 100)
        }
        rf.write(json.dumps(recommendation) + "\n")

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9515/9515 [00:20<00:00, 455.68it/s]
