In [1]:
import json
import os
import glob
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import defaultdict
import random
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix, hstack
from implicit.als import AlternatingLeastSquares
import pickle

def bm25_weight(X, K1=100, B=0.8):
    X = X.tocsr()
    N = float(X.shape[0])
    idf = np.log((N + 1) / (1 + np.bincount(X.indices)))

    row_sums = np.array(X.sum(axis=1)).flatten()
    avg_length = row_sums.mean()

    X = X.copy()
    for i in range(X.shape[0]):
        row = X.data[X.indptr[i]:X.indptr[i+1]]
        length = row.sum()
        row = row * (K1 + 1) / (row + K1 * (1 - B + B * length / avg_length))
        X.data[X.indptr[i]:X.indptr[i+1]] = row

    X = X.multiply(idf)
    return X.tocoo()

In [2]:
with open('../../../sim/data/tracks.json', 'r') as f:
    tracks = [json.loads(line) for line in f]

tracks_df = pd.DataFrame(tracks)
tracks_df.set_index('track', inplace=True)

tracks_df['genre_str'] = tracks_df['genre'].apply(lambda x: ' '.join(map(str, x)))

tracks_df.head()

Unnamed: 0_level_0,artist,album,title,genre,pop,duration,genre_str
track,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
41164,Михаил Бублик,ART-Обстрел I-часть,Сорок тысяч верст,"[1, 47]",-0.500252,282,1 47
27544,Xamdam Sobirov,Baxtli Bo'lolmadik,Baxtli Bo'lolmadik,[1],-0.942953,205,1
34702,Сергей Какенов,Ишимская шпана,Крутые лагеря,[147],-0.801382,252,147
45907,Loc-Dog,Electrodog 2,Еду убивать,[17],-0.577525,276,17
14978,Gafur,Февраль,Февраль,[1],-0.738636,160,1


In [3]:
with open('../../../sim/data/users.json', 'r') as f:
    users = [json.loads(line) for line in f]

users_df = pd.DataFrame(users)
users_df.set_index('user', inplace=True)

users_df.head()

Unnamed: 0_level_0,interests,consume_bias,consume_sharpness
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,"[47252, 47391, 47482]",8.923356,1.045157
1,"[11549, 5274, 9468]",1.576725,0.64534
2,"[7066, 16578, 3134]",3.419478,0.846186
3,"[11870, 3969, 27471]",5.116603,1.570454
4,"[7262, 3225, 27453]",8.063941,1.848041


In [4]:
def load_logs(log_files_pattern='../../../logs/botify-recommender-*/data.json*', size=3_000_000):
    logs_data = []

    log_files = glob.glob(log_files_pattern)

    for file_idx, file_path in enumerate(log_files, 1):
        try:
            file_lines = []
            with open(file_path, 'r') as f:
                sample_lines = [f.readline() for _ in range(1000) if f.readline()]
                avg_line_size = sum(len(line) for line in sample_lines) / max(1, len(sample_lines))

                f.seek(0, os.SEEK_END)
                file_size = f.tell()
                f.seek(0)

                estimated_lines = int(file_size / max(1, avg_line_size))

            with open(file_path, 'r') as f:
                for line in tqdm(f, total=estimated_lines, 
                                 desc=f'Processing files {file_idx}/{len(log_files)}: {os.path.basename(file_path)}',
                                 leave=False):
                    try:
                        file_lines.append(json.loads(line))
                    except json.JSONDecodeError:
                        continue

            logs_data.extend(file_lines)

        except Exception as e:
            print(f'Failed processing {file_path}: {e}')

        if len(logs_data) >= size:
            break

    logs_df = pd.DataFrame(logs_data)

    return logs_df

logs_df = load_logs()

logs_df['timestamp'] = pd.to_datetime(logs_df['timestamp'], unit='ms')

logs_df.head()

                                                                                                

Unnamed: 0,message,timestamp,user,track,time,latency,recommendation,experiments
0,next,2025-04-09 16:58:59.956,6483,41042,0.0,0.000368,39473.0,{'PERSONALIZED': 'T1'}
1,next,2025-04-09 16:58:59.967,4080,18211,0.26,0.000328,15632.0,{'PERSONALIZED': 'C'}
2,next,2025-04-09 16:58:59.968,6601,40130,1.0,0.000281,40130.0,{'PERSONALIZED': 'C'}
3,next,2025-04-09 16:58:59.969,3659,30715,0.23,0.000265,11891.0,{'PERSONALIZED': 'T1'}
4,next,2025-04-09 16:58:59.971,3659,11891,0.27,0.000295,48554.0,{'PERSONALIZED': 'T1'}


In [5]:
interactions = logs_df.groupby(['user', 'track']).size().reset_index(name='count')

user_ids = interactions['user'].astype('category').cat.codes.values
track_ids = interactions['track'].astype('category').cat.codes.values
counts = interactions['count'].values

interaction_matrix = csr_matrix((counts, (user_ids, track_ids)), 
                                shape=(len(users_df), len(tracks_df)))

In [6]:
track_popularity = np.array(interaction_matrix.sum(axis=0)).flatten()
tracks_df['popularity'] = track_popularity

scaler = MinMaxScaler()
tracks_df['popularity_norm'] = scaler.fit_transform(tracks_df[['popularity']])

tracks_df.sort_values('popularity', ascending=False).head()

Unnamed: 0_level_0,artist,album,title,genre,pop,duration,genre_str,popularity,popularity_norm
track,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
11454,Гузель Уразова,Мин яратам сине тормыш,Мин яратам сине тормыш,[1],-0.639363,223,1,473,1.0
10062,Сергей Куренков,У тебя в глазах,Слёзы это небес роса,[147],-0.305299,255,147,454,0.959574
28523,Bahh Tee,Моей последней бывшей,Моей последней бывшей,[10],-0.574473,194,10,452,0.955319
3497,Eternxlkz,SLAY!,SLAY! (Sped Up),[53],-0.931247,97,53,381,0.804255
3360,Hayk Asatryan,Amor,Amor,[1],-1.272383,197,1,361,0.761702


In [None]:
weighted_interactions = bm25_weight(interaction_matrix, K1=100, B=0.8)

weighted_interactions = weighted_interactions.T.tocsr()

als_model = AlternatingLeastSquares(factors=128, regularization=0.01, iterations=200, random_state=42)
als_model.fit(weighted_interactions)

  check_blas_config()


  0%|          | 0/200 [00:00<?, ?it/s]

In [8]:
genre_vectorizer = TfidfVectorizer()
genre_features = genre_vectorizer.fit_transform(tracks_df['genre_str'])

album_vectorizer = TfidfVectorizer()
album_features = album_vectorizer.fit_transform(tracks_df['album'])

title_vectorizer = TfidfVectorizer()
title_features = title_vectorizer.fit_transform(tracks_df['title'])

artist_encoder = TfidfVectorizer()
artist_features = artist_encoder.fit_transform(tracks_df['artist'])

content_features = hstack([genre_features, artist_features, album_features, title_features])

duration_pop = tracks_df[['duration', 'pop', 'popularity_norm']].values
duration_pop = MinMaxScaler().fit_transform(duration_pop)

track_features = hstack([content_features, csr_matrix(duration_pop)])

In [9]:
import gc

gc.collect()

batch_size = 1000
n_samples = track_features.shape[0]

similarity_matrix = np.zeros((n_samples, n_samples))

for i in tqdm(range(0, n_samples, batch_size)):
    for j in range(0, n_samples, batch_size):
        similarity_matrix[i:i+batch_size, j:j+batch_size] = cosine_similarity(track_features[i:i+batch_size], track_features[j:j+batch_size])

        gc.collect()

100%|██████████| 50/50 [02:22<00:00,  2.84s/it]


In [41]:
class HybridRecommender:
    def __init__(self, als_model, track_features, tracks_df, tracks_similarity, users_df, n_recommendations=200):
        self.als_model = als_model
        self.track_features = track_features
        self.tracks_df = tracks_df
        self.users_df = users_df
        self.n_recommendations = n_recommendations

        self.track_similarity = tracks_similarity

        self.track_index = {track: idx for idx, track in enumerate(tracks_df.index)}
        self.index_track = {idx: track for idx, track in enumerate(tracks_df.index)}

        self.interaction_matrix = weighted_interactions

    def recommend_for_user(self, user_id, prev_track=None, prev_track_time=None):
        if user_id is None:
            return self._get_popular_tracks()

        als_recs = self._get_als_recommendations(user_id)

        interest_recs = self._get_interest_recommendations(user_id)

        context_recs = self._get_context_recommendations(prev_track) if prev_track else []

        popular_recs = self._get_popular_tracks()

        recommendations = []

        all_recs = set()

        for rec, weight in [(als_recs, 0.4), (interest_recs, 0.3), (context_recs, 0.2), (popular_recs, 0.1)]:
            for track in rec:
                if track not in all_recs:
                    recommendations.append((track, weight * (1.0 - (len(recommendations) / (self.n_recommendations * 2)))))
                    all_recs.add(track)

        recommendations.sort(key=lambda x: x[1], reverse=True)

        return [track for track, score in recommendations[:self.n_recommendations]]

    def _get_als_recommendations(self, user_idx, n=100):
        ids, scores = self.als_model.recommend(user_idx, self.interaction_matrix[user_idx], N=n)
        return [self.index_track[idx] for idx in ids]

    def _get_interest_recommendations(self, user_id, n=100):
        user_interests = self.users_df.loc[user_id, 'interests']

        if not user_interests:
            return []

        interest_tracks = self.tracks_df[self.tracks_df['genre'].apply(lambda x: any(i in x for i in user_interests))]

        return interest_tracks.sort_values('popularity', ascending=False).head(n).index.tolist()

    def _get_context_recommendations(self, prev_track, n=50):
        track_idx = self.track_index.get(prev_track)

        if track_idx is None:
            return []

        sim_scores = list(enumerate(self.track_similarity[track_idx]))
        sim_scores.sort(key=lambda x: x[1], reverse=True)

        similar_tracks = [self.index_track[i] for i, score in sim_scores[1:n+1]]

        return similar_tracks

    def _get_popular_tracks(self, n=50):
        return self.tracks_df.sort_values('popularity', ascending=False).head(n).index.tolist()

In [42]:
recommender = HybridRecommender(als_model, track_features, tracks_df, similarity_matrix, users_df)

all_recommendations = {}

for user_id in tqdm(users_df.index, desc='Generating recommendations'):
    recommendations = recommender.recommend_for_user(user_id)
    all_recommendations[user_id] = recommendations

with open('../../../botify/data/ials_with_content_recommendations.json', 'w') as f:
    for user, tracks in all_recommendations.items():
        f.write(json.dumps({'user': user, 'tracks': tracks}) + '\n')

Generating recommendations: 100%|██████████| 10000/10000 [03:53<00:00, 42.86it/s]


In [43]:
def evaluate_recommender(recommender, logs_df, sample_size=1000):
    sample = logs_df.sample(min(sample_size, len(logs_df)))

    hits = 0
    total = 0

    for _, row in tqdm(sample.iterrows(), total=len(sample), desc='Eval'):
        user_id = row['user']
        track_id = row['track']

        recommendations = recommender.recommend_for_user(user_id, track_id)

        if track_id in recommendations:
            hits += 1

        total += 1

    return hits / total

hit_rate = evaluate_recommender(recommender, logs_df)
print(f'Hit Rate: {hit_rate:.4f}')

Eval: 100%|██████████| 1000/1000 [00:40<00:00, 24.47it/s]

Hit Rate: 0.0020



