In [2]:
import pandas as pd
import numpy as np
import json
import glob
import os

import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import ItemItemRecommender

from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
with open('../../../sim/data/tracks.json', 'r') as f:
    tracks_data = [json.loads(line) for line in f]
tracks_df = pd.DataFrame(tracks_data)

with open('../../../sim/data/users.json', 'r') as f:
    users_data = [json.loads(line) for line in f]
users_df = pd.DataFrame(users_data)

print(f'N tracks: {len(tracks_df)}, N users: {len(users_df)}')

N tracks: 50000, N users: 10000


In [5]:
def load_logs(log_files_pattern='../../../logs/botify-recommender-10/data.json'):
    logs_data = []

    log_files = glob.glob(log_files_pattern)

    for file_idx, file_path in enumerate(log_files, 1):
        try:
            file_lines = []
            with open(file_path, 'r') as f:
                sample_lines = [f.readline() for _ in range(1000) if f.readline()]
                avg_line_size = sum(len(line) for line in sample_lines) / max(1, len(sample_lines))

                f.seek(0, os.SEEK_END)
                file_size = f.tell()
                f.seek(0)

                estimated_lines = int(file_size / max(1, avg_line_size))

            with open(file_path, 'r') as f:
                for line in tqdm(f, total=estimated_lines, 
                                 desc=f'Processing files {file_idx}/{len(log_files)}: {os.path.basename(file_path)}',
                                 leave=False):
                    try:
                        file_lines.append(json.loads(line))
                    except json.JSONDecodeError:
                        continue

            logs_data.extend(file_lines)

        except Exception as e:
            print(f'Failed processing {file_path}: {e}')

    logs_df = pd.DataFrame(logs_data)

    return logs_df

logs_df = load_logs()

logs_df = logs_df[logs_df['message'] == 'next'].copy()

logs_df['user'] = logs_df['user'].astype('int32')
logs_df['track'] = logs_df['track'].astype('int32')
logs_df['recommendation'] = logs_df['recommendation'].astype('int32')
logs_df['time'] = logs_df['time'].astype('float32')

                                                                                            

In [6]:
def extract_genre_features(genres_list):
    return ','.join(map(str, genres_list))

tracks_df['genre_str'] = tracks_df['genre'].apply(extract_genre_features)
tracks_df['genre_count'] = tracks_df['genre'].apply(len)

track_info = {track['track']: track for track in tracks_data}

user_item_interactions = logs_df.groupby(['user', 'track']).size().reset_index(name='interactions')

In [13]:
def create_interaction_matrix(interactions_df):
    user_ids = interactions_df['user'].unique()
    track_ids = interactions_df['track'].unique()

    user_map = {user_id: idx for idx, user_id in enumerate(user_ids)}
    track_map = {track_id: idx for idx, track_id in enumerate(track_ids)}

    user_map_inv = {idx: user_id for user_id, idx in user_map.items()}
    track_map_inv = {idx: track_id for track_id, idx in track_map.items()}

    user_indices = [user_map[user] for user in interactions_df['user']]
    track_indices = [track_map[track] for track in interactions_df['track']]

    interaction_values = np.array(interactions_df['interactions'], dtype=np.float64)
    interactions = csr_matrix((interaction_values, 
                              (user_indices, track_indices)), 
                              shape=(len(user_ids), len(track_ids)),
                              dtype=np.float64)

    return interactions, user_map, track_map, user_map_inv, track_map_inv

interactions_matrix, user_map, track_map, user_map_inv, track_map_inv = create_interaction_matrix(user_item_interactions)

maps_data = {
    'user_map': user_map,
    'track_map': track_map,
    'user_map_inv': user_map_inv,
    'track_map_inv': track_map_inv
}

with open('maps_data.pkl', 'wb') as f:
    pickle.dump(maps_data, f)

In [None]:
def train_als_model(interactions, factors=200, regularization=0.01, iterations=20):
    model = AlternatingLeastSquares(
        factors=factors,
        regularization=regularization,
        iterations=iterations,
        calculate_training_loss=True,
        num_threads=16,
    )

    model.fit(interactions)

    return model

model = train_als_model(interactions_matrix)

with open('als_model.pkl', 'wb') as f:
    pickle.dump(model, f)

100%|██████████| 20/20 [00:27<00:00,  1.39s/it, loss=0.000452]


In [17]:
def train_item_item_model(interactions, K=50):
    if interactions.dtype != np.float64:
        interactions = interactions.astype(np.float64)

    model = ItemItemRecommender(K=K)

    try:
        model.fit(interactions.T)
        return model
    except ValueError as e:
        from scipy.sparse import coo_matrix

        coo = interactions.T.tocoo()
        new_matrix = coo_matrix((np.array(coo.data, dtype=np.float64), 
                               (coo.row, coo.col)), 
                               shape=coo.shape)

        model.fit(new_matrix)
        return model

item_model = train_item_item_model(interactions_matrix)

with open('item_item_model.pkl', 'wb') as f:
    pickle.dump(item_model, f)

100%|██████████| 9997/9997 [00:00<00:00, 322984.22it/s]


In [38]:
def generate_recommendations_for_user(user_id, als_model, user_map, track_map_inv, n=200):
    if user_id not in user_map:
        popular_tracks = user_item_interactions.groupby('track')['interactions'].sum().nlargest(n).index.tolist()
        return popular_tracks

    user_idx = user_map[user_id]

    recommended_indices = als_model.recommend(
        user_idx, 
        interactions_matrix[user_idx], 
        N=n, 
        filter_already_liked_items=False
    )

    if isinstance(recommended_indices, tuple) and len(recommended_indices) == 2:
        indices = recommended_indices[0]
        recommended_tracks = [track_map_inv[idx] for idx in indices]
    else:
        try:
            recommended_tracks = [track_map_inv[idx] for idx, _ in recommended_indices]
        except ValueError:
            recommended_tracks = []
            for item in recommended_indices:
                if isinstance(item, tuple) and len(item) == 2:
                    recommended_tracks.append(track_map_inv[item[0]])
                elif isinstance(item, (int, np.integer)):
                    recommended_tracks.append(track_map_inv[item])

            if not recommended_tracks:
                popular_tracks = user_item_interactions.groupby('track')['interactions'].sum().nlargest(n).index.tolist()
                recommended_tracks = popular_tracks
    
    return [int(track) for track in recommended_tracks]


def generate_recommendations_for_all_users(user_ids, als_model, user_map, track_map_inv, n=200):
    recommendations = {}

    for user_id in tqdm(user_ids, desc='Recs generation'):
        recommendations[user_id] = generate_recommendations_for_user(
            user_id, als_model, user_map, track_map_inv, n
        )

    return recommendations

with open('als_model.pkl', 'rb') as f:
    model = pickle.load(f)

with open('maps_data.pkl', 'rb') as f:
    maps_data = pickle.load(f)

user_map = maps_data['user_map']
track_map_inv = maps_data['track_map_inv']

all_user_ids = users_df['user'].tolist()

user_recommendations = generate_recommendations_for_all_users(
    all_user_ids, model, user_map, track_map_inv, n=200
)

with open('als_recommendations.json', 'w') as f:
    for user, tracks in user_recommendations.items():
        f.write(json.dumps({'user': user, 'tracks': tracks}) + '\n')

Recs generation: 100%|██████████| 10000/10000 [00:09<00:00, 1092.81it/s]


In [26]:
def create_test_set(logs_df, test_size=0.2):
    logs_df = logs_df.sort_values('timestamp')

    cutoff_timestamp = logs_df['timestamp'].quantile(1 - test_size)

    train_df = logs_df[logs_df['timestamp'] < cutoff_timestamp]
    test_df = logs_df[logs_df['timestamp'] >= cutoff_timestamp]

    return train_df, test_df

def evaluate_recommendations(test_df, recommendations, top_n=10):
    hits = 0
    total = 0

    user_tracks = test_df.groupby('user')['track'].apply(list).to_dict()

    for user_id, actual_tracks in user_tracks.items():
        if user_id in recommendations:
            user_recs = recommendations[user_id][:top_n]

            user_hits = len(set(actual_tracks) & set(user_recs))

            hits += user_hits
            total += len(actual_tracks)

    hit_rate = hits / total if total > 0 else 0

    return hit_rate

train_logs, test_logs = create_test_set(logs_df)

hit_rate_10 = evaluate_recommendations(test_logs, user_recommendations, top_n=10)
hit_rate_50 = evaluate_recommendations(test_logs, user_recommendations, top_n=50)
hit_rate_200 = evaluate_recommendations(test_logs, user_recommendations, top_n=200)

print(f'Hit Rate@10: {hit_rate_10:.4f}')
print(f'Hit Rate@50: {hit_rate_50:.4f}')
print(f'Hit Rate@200: {hit_rate_200:.4f}')

Hit Rate@10: 0.1201
Hit Rate@50: 0.2933
Hit Rate@200: 0.4763


In [39]:
def enhance_recommendations_with_interests(user_id, base_recommendations, users_df, tracks_df, alpha=0.7):
    user_info = users_df[users_df['user'] == user_id].iloc[0]
    user_interests = user_info['interests']
    track_scores = {track_id: 1.0 for track_id in base_recommendations}

    for track_id in base_recommendations:
        if track_id in track_info:
            track_genres = track_info[track_id]['genre']

            if any(g in user_interests for g in track_genres):
                track_scores[track_id] *= (1 + (1 - alpha))

            track_scores[track_id] *= (1 + alpha * (track_info[track_id]['pop'] + 1) / 2)

    sorted_tracks = sorted(track_scores.items(), key=lambda x: x[1], reverse=True)

    return [track_id for track_id, _ in sorted_tracks]

def enhance_all_recommendations(recommendations, users_df, tracks_df):
    enhanced_recommendations = {}

    for user_id, recs in tqdm(recommendations.items(), desc='User interests recs'):
        enhanced_recommendations[user_id] = enhance_recommendations_with_interests(
            user_id, recs, users_df, tracks_df
        )
    return enhanced_recommendations

enhanced_recommendations = enhance_all_recommendations(user_recommendations, users_df, tracks_df)

with open('enhanced_recommendations.json', 'w') as f:
    for user, tracks in enhanced_recommendations.items():
        f.write(json.dumps({'user': user, 'tracks': tracks}) + '\n')

User interests recs: 100%|██████████| 10000/10000 [00:03<00:00, 3318.74it/s]


In [28]:
hit_rate_10_enhanced = evaluate_recommendations(test_logs, enhanced_recommendations, top_n=10)
hit_rate_50_enhanced = evaluate_recommendations(test_logs, enhanced_recommendations, top_n=50)
hit_rate_200_enhanced = evaluate_recommendations(test_logs, enhanced_recommendations, top_n=200)

print(f'Enhanced Hit Rate@10: {hit_rate_10_enhanced:.4f}')
print(f'Enhanced Hit Rate@50: {hit_rate_50_enhanced:.4f}')
print(f'Enhanced Hit Rate@200: {hit_rate_200_enhanced:.4f}')

Enhanced Hit Rate@10: 0.0253
Enhanced Hit Rate@50: 0.1261
Enhanced Hit Rate@200: 0.4763


In [40]:
def generate_hybrid_recommendations(user_id, als_model, item_model, user_map, track_map, track_map_inv, interactions_matrix, n=200, als_weight=0.7):
    if user_id not in user_map:
        popular_tracks = user_item_interactions.groupby('track')['interactions'].sum().nlargest(n).index.tolist()
        return popular_tracks

    user_idx = user_map[user_id]

    als_recommendations = als_model.recommend(
        user_idx, 
        interactions_matrix[user_idx], 
        N=n*2,
        filter_already_liked_items=False
    )

    user_items = interactions_matrix[user_idx].indices

    if len(user_items) == 0:
        if isinstance(als_recommendations, tuple) and len(als_recommendations) == 2:
            indices, _ = als_recommendations
            return [track_map_inv[idx] for idx in indices[:n]]
        else:
            return [track_map_inv[idx] for idx in als_recommendations[:n]]

    item_item_scores = {}

    for item_idx in user_items:
        similar_items = item_model.similar_items(item_idx, N=50)

        if isinstance(similar_items, tuple) and len(similar_items) == 2:
            similar_indices, similar_scores = similar_items

            for idx, score in zip(similar_indices, similar_scores):
                if idx not in item_item_scores:
                    item_item_scores[idx] = 0
                item_item_scores[idx] += score
        else:
            for idx, score in similar_items:
                if idx not in item_item_scores:
                    item_item_scores[idx] = 0
                item_item_scores[idx] += score

    if item_item_scores:
        max_item_score = max(item_item_scores.values())
        if max_item_score > 0:
            for idx in item_item_scores:
                item_item_scores[idx] /= max_item_score

    als_scores = {}

    if isinstance(als_recommendations, tuple) and len(als_recommendations) == 2:
        indices, scores = als_recommendations
        for idx, score in zip(indices, scores):
            als_scores[idx] = score
    else:
        for item in als_recommendations:
            if isinstance(item, tuple) and len(item) == 2:
                idx, score = item
                als_scores[idx] = score
            else:
                als_scores[item] = 1.0

    if als_scores:
        max_als_score = max(als_scores.values())
        if max_als_score > 0:
            for idx in als_scores:
                als_scores[idx] /= max_als_score

    combined_scores = {}

    all_indices = set(als_scores.keys()) | set(item_item_scores.keys())
    
    for idx in all_indices:
        als_score = als_scores.get(idx, 0)
        item_score = item_item_scores.get(idx, 0)

        combined_scores[idx] = als_weight * als_score + (1 - als_weight) * item_score

    sorted_items = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)

    top_indices = [idx for idx, _ in sorted_items[:n]]

    recommended_tracks = [int(track_map_inv[idx]) for idx in top_indices]

    return recommended_tracks

def generate_hybrid_recommendations_for_all_users(user_ids, als_model, item_model, user_map, track_map, track_map_inv, interactions_matrix, n=200, als_weight=0.7):
    recommendations = {}

    for user_id in tqdm(user_ids, desc='Hybric recs'):
        recommendations[user_id] = generate_hybrid_recommendations(
            user_id, als_model, item_model, user_map, track_map, track_map_inv, 
            interactions_matrix, n, als_weight
        )

    return recommendations

with open('als_model.pkl', 'rb') as f:
    als_model = pickle.load(f)

with open('item_item_model.pkl', 'rb') as f:
    item_model = pickle.load(f)

with open('maps_data.pkl', 'rb') as f:
    maps_data = pickle.load(f)

user_map = maps_data['user_map']
track_map = maps_data['track_map']
track_map_inv = maps_data['track_map_inv']

all_user_ids = users_df['user'].tolist()

hybrid_recommendations = generate_hybrid_recommendations_for_all_users(
    all_user_ids, 
    als_model,
    item_model,
    user_map,
    track_map,
    track_map_inv, 
    interactions_matrix,
    n=200,
    als_weight=0.7
)

with open('hybrid_recommendations.json', 'w') as f:
    for user, tracks in hybrid_recommendations.items():
        f.write(json.dumps({'user': user, 'tracks': tracks}) + '\n')

hit_rate_10 = evaluate_recommendations(test_logs, hybrid_recommendations, top_n=10)
hit_rate_50 = evaluate_recommendations(test_logs, hybrid_recommendations, top_n=50)
hit_rate_200 = evaluate_recommendations(test_logs, hybrid_recommendations, top_n=200)

print(f'Hybrid Hit Rate@10: {hit_rate_10:.4f}')
print(f'Hybrid Hit Rate@50: {hit_rate_50:.4f}')
print(f'Hybrid Hit Rate@200: {hit_rate_200:.4f}')

Hybric recs: 100%|██████████| 10000/10000 [00:17<00:00, 575.38it/s]


Hybrid Hit Rate@10: 0.1529
Hybrid Hit Rate@50: 0.3525
Hybrid Hit Rate@200: 0.5318
