In [None]:
import pandas as pd
import numpy as np
from this_mapping import supported_genres

# === Parameters ===
# Lower bound used when track popularity is normalised into [0, 1] for the
# rating probabilities: (popularity - min_popularity) / (100 - min_popularity).
min_popularity = 65

n_users = 5500                  # user ids are range(n_users)
min_interactions = 70           # inclusive lower bound of the per-user target
max_interactions = 200          # inclusive upper bound of the per-user target
min_interactions_per_item = 60  # minimum number of interactions every track must end up with

# Seed NumPy's global RNG so repeated runs draw the same sequence.
np.random.seed(42)

# === Input data ===
tracks_df = pd.read_csv('../edadata.csv')

# === Track preparation ===
tracks_df['item_id'] = tracks_df['item_id'].astype(int)

# The 'genre' column is mapped through positions in supported_genres.
idx_to_genre = dict(enumerate(supported_genres))
tracks_df['genre_name'] = tracks_df['genre'].map(idx_to_genre)

# Any genre value without a counterpart in supported_genres maps to NaN.
if tracks_df['genre_name'].isna().any():
    raise ValueError("Обнаружены некорректные значения жанров в edadata.csv")

# item_id-indexed view for per-track lookups.
tracks_indexed = tracks_df.set_index('item_id')

# Per-genre sampling pools: track ids, popularity-proportional sampling
# weights (uniform when the genre's popularity sums to zero) and raw popularity.
genre_groups = tracks_df.groupby('genre_name')
genre_track_pool = {}
genre_global_pop = {}
for genre, group in genre_groups:
    popularity_column = group['popularity']
    pop = popularity_column.values.astype(float)
    pop_total = pop.sum()
    if pop_total > 0:
        weights = pop / pop_total
    else:
        weights = np.ones(len(pop)) / len(pop)
    genre_track_pool[genre] = {
        'item_ids': group['item_id'].values,
        'weights': weights,
        'popularity': popularity_column.values,
    }
    # Total popularity of the genre, used below to bias favourite-genre draws.
    genre_global_pop[genre] = popularity_column.sum()

all_genres = list(genre_track_pool.keys())

# Probability of each genre being drawn as a user's favourite: proportional
# to the genre's total popularity, uniform if every total is zero.
bias_weights = np.array([genre_global_pop[g] for g in all_genres], dtype=float)
bias_total = bias_weights.sum()
if bias_total > 0:
    bias_weights = bias_weights / bias_total
else:
    bias_weights = np.ones_like(bias_weights) / len(bias_weights)

# === Step 1: generate the core interactions ===
# For every user: draw 1-3 favourite genres, then build a history made of
#   a) "relevant" plays taken from the most popular tracks of those genres,
#   b) "background" plays from up to three other genres,
#   c) a few uniformly random "exploration" plays,
# keep one rating per track (the latest one) and cap the history at the
# user's target size.
interactions = []
user_main_genres = {}  # favourite genres per user, reused by the top-up step

# Loop-invariant lookups, computed once instead of once per user / per draw.
top_items_by_genre = {}
for genre, pool in genre_track_pool.items():
    top_k = min(250, len(pool['item_ids']))
    top_indices = np.argsort(-pool['popularity'])[:top_k]
    top_items_by_genre[genre] = pool['item_ids'][top_indices].tolist()
popularity_by_item = tracks_indexed['popularity'].to_dict()
all_item_ids = tracks_df['item_id'].to_numpy()

# Sampling without replacement with explicit probabilities fails when more
# genres are requested than have a non-zero probability, so cap the request.
max_main_genres = int(np.count_nonzero(bias_weights))

for user_id in range(n_users):
    n_main = np.random.choice([1, 2, 3], p=[0.3, 0.5, 0.2])  # 3 genres for 20% of users
    n_main = min(int(n_main), max_main_genres)
    main_genres = np.random.choice(all_genres, size=n_main, replace=False, p=bias_weights)
    user_main_genres[user_id] = set(main_genres)

    # Candidate tracks: the (up to) 250 most popular tracks of each favourite genre.
    relevant_items = []
    for genre in main_genres:
        relevant_items.extend(top_items_by_genre[genre])
    relevant_items = list(set(relevant_items))
    if not relevant_items:
        continue

    n_interactions = np.random.randint(min_interactions, max_interactions + 1)
    user_interactions = []

    # a) Relevant interactions (40-70% of the target size)
    n_relevant = np.random.randint(int(0.4 * n_interactions), int(0.7 * n_interactions) + 1)
    for _ in range(n_relevant):
        item = np.random.choice(relevant_items)
        pop_norm = (popularity_by_item[item] - min_popularity) / (100 - min_popularity)
        pop_norm = np.clip(pop_norm, 0, 1)
        # Rating 4 becomes more likely with popularity, rating 2 has a fixed
        # share, rating 3 takes the remainder.
        p_like = 0.4 + 0.3 * pop_norm
        p_skip = 0.1
        p_full = 1.0 - p_like - p_skip
        probs = np.clip(np.array([p_like, p_full, p_skip]), 0, 1)
        probs = probs / probs.sum()
        rating = np.random.choice([4, 3, 2], p=probs)
        user_interactions.append((item, rating))

    # b) Background interactions (spread over up to 3 other genres)
    n_background = n_interactions - n_relevant
    if n_background > 0:
        other_genres = [g for g in all_genres if g not in user_main_genres[user_id]]
        if other_genres:
            n_bg_genres = min(3, len(other_genres))
            bg_genres = np.random.choice(other_genres, size=n_bg_genres, replace=False)
            for _ in range(n_background):
                genre = np.random.choice(bg_genres)
                pool = genre_track_pool[genre]
                item = np.random.choice(pool['item_ids'], p=pool['weights'])
                rating = np.random.choice([2, 1, 3], p=[0.5, 0.2, 0.3])
                user_interactions.append((item, rating))
        else:
            # Every genre is a favourite: pad with rating-2 plays of relevant tracks.
            for _ in range(n_background):
                item = np.random.choice(relevant_items)
                user_interactions.append((item, 2))

    # c) Random interactions (exploration) over the whole catalogue
    n_random = max(0, n_interactions // 10)
    for _ in range(n_random):
        item = np.random.choice(all_item_ids)
        rating = np.random.choice([2, 3], p=[0.7, 0.3])
        user_interactions.append((item, rating))

    # Deduplicate: walking backwards keeps the LAST rating of each track;
    # reversing again restores chronological order before the cap is applied.
    seen = set()
    final_interactions = []
    for item, rating in reversed(user_interactions):
        if item not in seen:
            seen.add(item)
            final_interactions.append((item, rating))
    final_interactions = final_interactions[::-1][:n_interactions]

    interactions.extend(
        {'user_id': user_id, 'item_id': item, 'rating': rating}
        for item, rating in final_interactions
    )

# === Step 2: top up tracks that have fewer than min_interactions_per_item ===
# Explicit columns keep the frame usable even when step 1 produced no rows.
interaction_columns = ['user_id', 'item_id', 'rating']
interactions_df = pd.DataFrame(interactions, columns=interaction_columns)
existing_pairs = set(zip(interactions_df['user_id'], interactions_df['item_id']))
user_interaction_counts = interactions_df['user_id'].value_counts().to_dict()
current_counts = interactions_df['item_id'].value_counts().to_dict()

# Tracks that still lack interactions
undercovered_items = [
    item for item in tracks_df['item_id']
    if current_counts.get(item, 0) < min_interactions_per_item
]

print(f"Треков с < {min_interactions_per_item} взаимодействиями: {len(undercovered_items)}")

# genre -> set of users who have that genre among their favourites
genre_to_users = {}
for user_id, genres in user_main_genres.items():
    for g in genres:
        genre_to_users.setdefault(g, set()).add(user_id)

extra_interactions = []


def _add_extra_interactions(pool, item, count, p_like):
    """Add up to `count` interactions with `item` from distinct users in `pool`.

    Users are drawn without replacement, so no attempt is wasted on a repeated
    pick. Each new interaction is rated 4 with probability `p_like`, else 3.
    Updates `extra_interactions`, `existing_pairs` and
    `user_interaction_counts`; returns the number of interactions added.
    """
    take = min(count, len(pool))
    if take <= 0:
        return 0
    chosen = np.random.choice(pool, size=take, replace=False)
    ratings = np.random.choice([4, 3], size=take, p=[p_like, 1 - p_like])
    for uid, rating in zip(chosen.tolist(), ratings.tolist()):
        extra_interactions.append({'user_id': uid, 'item_id': item, 'rating': rating})
        existing_pairs.add((uid, item))
        user_interaction_counts[uid] = user_interaction_counts.get(uid, 0) + 1
    return take


for item in undercovered_items:
    needed = min_interactions_per_item - current_counts.get(item, 0)
    if needed <= 0:
        continue

    genre = tracks_indexed.loc[item, 'genre_name']

    # The rating distribution depends only on the track, so compute it once.
    pop_norm = (tracks_indexed.loc[item, 'popularity'] - min_popularity) / (100 - min_popularity)
    pop_norm = np.clip(pop_norm, 0, 1)
    p_like = float(np.clip(0.3 + 0.4 * pop_norm, 0, 1))

    # Tier 1: users who favour the track's genre, are below max_interactions
    # and have not interacted with this track yet.
    fan_pool = [
        uid for uid in sorted(genre_to_users.get(genre, ()))
        if user_interaction_counts.get(uid, 0) < max_interactions
        and (uid, item) not in existing_pairs
    ]
    needed -= _add_extra_interactions(fan_pool, item, needed, p_like)

    if needed > 0:
        # Not enough fans: fall back to the remaining users. Tier 2 is anyone
        # still below max_interactions; tier 3 (last resort) lets users go
        # over that limit so the per-track minimum is still reached.
        with_room, over_limit = [], []
        for uid in range(n_users):
            if (uid, item) in existing_pairs:
                continue
            if user_interaction_counts.get(uid, 0) < max_interactions:
                with_room.append(uid)
            else:
                over_limit.append(uid)
        needed -= _add_extra_interactions(with_room, item, needed, p_like)
        if needed > 0:
            needed -= _add_extra_interactions(over_limit, item, needed, p_like)

    # Record the new total so a repeated item id is not topped up twice.
    current_counts[item] = min_interactions_per_item - needed

all_interactions = interactions + extra_interactions
interactions_df = pd.DataFrame(all_interactions, columns=interaction_columns)

# === Timestamp generation ===
# Each interaction gets an independent, uniformly drawn moment inside the
# window [start, end); rows are then ordered per user by time.
start, end = pd.Timestamp('2024-10-26'), pd.Timestamp('2025-10-26')
time_diff = (end - start).total_seconds()
random_seconds = time_diff * np.random.rand(len(interactions_df))
interactions_df = (
    interactions_df
    .assign(timestamp=start + pd.to_timedelta(random_seconds, unit='s'))
    .sort_values(['user_id', 'timestamp'])
    .reset_index(drop=True)
)

# === Final check ===
final_counts = interactions_df['item_id'].value_counts()
# value_counts() only lists tracks that occur at least once, so a track with
# no interactions at all would be invisible to a plain .min(). Re-index over
# the whole catalogue so such tracks count as 0.
coverage = final_counts.reindex(tracks_df['item_id'].unique(), fill_value=0)
# Explicit raise instead of `assert`, which is skipped under `python -O`.
if len(coverage) and coverage.min() < min_interactions_per_item:
    raise AssertionError(f"Минимум: {coverage.min()}")

# === Save ===
interactions_df.to_csv("../interaction_data.csv", index=False)

Треков с < 60 взаимодействиями: 2078
