In [1]:
import json
import os
import time
import warnings

import pandas as pd
import requests
from tqdm import tqdm


In [2]:
# Load the input dataset; 'utf-8-sig' transparently drops a leading BOM if the
# CSV was saved with one.
df = pd.read_csv(
    'merged_vk_youtube_dataset.csv',
    encoding='utf-8-sig',
)


In [3]:
# Column names that may hold the YouTube video ID, in priority order.
possible_fields = ['video_id_youtube', 'videoId', 'youtube_video_id', 'youtube_id', 'id', 'youtubeId']

# First candidate that is an actual column of the frame; None when none match.
found_field = next(
    (candidate for candidate in possible_fields if candidate in df.columns),
    None,
)


In [4]:
# The API key is read from the environment so that it never lives in the
# notebook source or its version history.
# NOTE(review): a literal key used to be hard-coded on this line; treat that
# key as leaked and rotate it.
YOUTUBE_API_KEY = os.environ.get("YOUTUBE_API_KEY", "")
if not YOUTUBE_API_KEY:
    # Not fatal: IDs already present in the on-disk cache can still be
    # resolved without calling the API.
    warnings.warn(
        "YOUTUBE_API_KEY is not set; YouTube API requests will fail and only "
        "cached categories will be available.",
        RuntimeWarning,
    )

# Column holding the YouTube video ID: the detected one, else the default name.
YOUTUBE_VIDEO_ID_FIELD = found_field if found_field else "videoId"


In [5]:
# Lookup table: YouTube category ID (as a string) -> human-readable name.
# Built from ordered pairs so the insertion order matches the listing below.
YOUTUBE_CATEGORIES = dict([
    ('1', 'Film & Animation'),
    ('2', 'Autos & Vehicles'),
    ('10', 'Music'),
    ('15', 'Pets & Animals'),
    ('17', 'Sports'),
    ('19', 'Travel & Events'),
    ('20', 'Gaming'),
    ('22', 'People & Blogs'),
    ('23', 'Comedy'),
    ('24', 'Entertainment'),
    ('25', 'News & Politics'),
    ('26', 'Howto & Style'),
    ('27', 'Education'),
    ('28', 'Science & Technology'),
    ('29', 'Nonprofits & Activism'),
])


In [6]:
# Number of video IDs sent in a single request (the limit this function has
# always applied per call).
_MAX_IDS_PER_REQUEST = 50


def get_youtube_categories_batch(video_ids, api_key):
    """Look up the YouTube category of each video in ``video_ids``.

    IDs are sent to the ``videos`` endpoint in chunks of at most
    ``_MAX_IDS_PER_REQUEST``; inputs longer than one chunk are no longer
    silently truncated.

    Args:
        video_ids: Sized sequence of video IDs (each is converted with
            ``str``). ``None`` or an empty sequence yields ``{}``.
        api_key: YouTube Data API key.

    Returns:
        dict mapping video ID (str) to ``(category_id, category_name)``, both
        strings. Category IDs missing from ``YOUTUBE_CATEGORIES`` are named
        ``'Unknown (<id>)'``. Videos absent from the API response, or returned
        without a ``categoryId``, are left out of the dict.

    The lookup is best-effort: a failed request never raises. It emits a
    ``RuntimeWarning``, stops issuing further requests, and returns whatever
    was collected so far.
    """
    # len() rather than truthiness so array-like inputs are accepted too.
    if video_ids is None or len(video_ids) == 0:
        return {}

    url = "https://www.googleapis.com/youtube/v3/videos"
    ids = [str(vid) for vid in video_ids]
    result = {}

    for start in range(0, len(ids), _MAX_IDS_PER_REQUEST):
        chunk = ids[start:start + _MAX_IDS_PER_REQUEST]
        params = {
            'id': ','.join(chunk),
            'part': 'snippet',
            'key': api_key
        }
        try:
            response = requests.get(url, params=params, timeout=10)
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError) as exc:
            # Deliberately do not include str(exc): HTTP error messages embed
            # the full request URL, which contains the API key.
            status = getattr(getattr(exc, 'response', None), 'status_code', None)
            detail = type(exc).__name__ if status is None else f"{type(exc).__name__}, HTTP {status}"
            warnings.warn(
                f"YouTube API request failed ({detail}); "
                f"returning {len(result)} categories collected so far.",
                RuntimeWarning,
            )
            # A failure such as an exhausted quota or a bad key would repeat
            # for every remaining chunk, so stop here.
            break

        items = data.get('items', []) if isinstance(data, dict) else []
        for item in items:
            video_id = item.get('id')
            category_id = (item.get('snippet') or {}).get('categoryId')
            if video_id is None or category_id is None:
                # Skip incomplete items instead of recording the string 'None'
                # as a category ID.
                continue
            category_id = str(category_id)
            category_name = YOUTUBE_CATEGORIES.get(category_id, f'Unknown ({category_id})')
            result[str(video_id)] = (category_id, category_name)

    return result


In [7]:
# Rows that carry a video ID, and the distinct IDs that need a category.
youtube_rows = df[df[YOUTUBE_VIDEO_ID_FIELD].notna()].copy()
unique_video_ids = youtube_rows[YOUTUBE_VIDEO_ID_FIELD].unique()

# On-disk cache of earlier lookups: {video_id: (category_id, category_name)}.
progress_file = 'youtube_categories_cache.json'
category_cache = {}
if os.path.exists(progress_file):
    try:
        with open(progress_file, 'r', encoding='utf-8') as f:
            saved_cache = json.load(f)
    except (OSError, ValueError) as exc:
        # ValueError covers both malformed JSON and undecodable bytes. Start
        # from an empty cache, but say so: otherwise every ID is re-fetched
        # with no visible reason.
        warnings.warn(
            f"Ignoring unreadable cache file {progress_file!r}: {exc}",
            RuntimeWarning,
        )
    else:
        if isinstance(saved_cache, dict):
            for k, v in saved_cache.items():
                # JSON has no tuple type, so saved pairs come back as lists.
                category_cache[str(k)] = tuple(v) if isinstance(v, list) else v
        else:
            warnings.warn(
                f"Ignoring cache file {progress_file!r}: expected a JSON object, "
                f"got {type(saved_cache).__name__}.",
                RuntimeWarning,
            )


In [8]:
def _save_category_cache(cache, path):
    """Write ``cache`` to ``path`` as JSON; return True on success.

    The data is written to ``<path>.tmp`` first and then moved over ``path``
    with ``os.replace``, so an interrupted write cannot leave a truncated
    cache file behind. Failures are reported as a ``RuntimeWarning`` rather
    than raised, so a save problem never aborts the fetch loop.
    """
    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(cache, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, path)
    except (OSError, TypeError, ValueError) as exc:
        warnings.warn(f"Could not save category cache to {path!r}: {exc}", RuntimeWarning)
        return False
    return True


BATCH_SIZE = 50             # video IDs per API request
SAVE_EVERY_N_BATCHES = 5    # checkpoint cadence for the on-disk cache
REQUEST_PAUSE_SECONDS = 1.0 # pause after each API request
TOTAL_VIDEOS = len(unique_video_ids)
BATCHES_NEEDED = (TOTAL_VIDEOS + BATCH_SIZE - 1) // BATCH_SIZE  # ceiling division

unsaved_changes = False
for i in tqdm(range(0, TOTAL_VIDEOS, BATCH_SIZE), desc="Обработка батчей", total=BATCHES_NEEDED):
    batch = unique_video_ids[i:i+BATCH_SIZE]
    # Only IDs not already cached are requested. IDs the API does not return
    # are never added to the cache, so they are requested again on a later run.
    batch_to_fetch = [str(vid) for vid in batch if str(vid) not in category_cache]
    if batch_to_fetch:
        batch_results = get_youtube_categories_batch(batch_to_fetch, YOUTUBE_API_KEY)
        if batch_results:
            category_cache.update(batch_results)
            unsaved_changes = True
        time.sleep(REQUEST_PAUSE_SECONDS)
    # Periodic checkpoint, skipped when nothing new was fetched since the last save.
    if unsaved_changes and (i // BATCH_SIZE) % SAVE_EVERY_N_BATCHES == 0 and i > 0:
        if _save_category_cache(category_cache, progress_file):
            unsaved_changes = False

# Final save is unconditional so the cache file always exists after this cell.
_save_category_cache(category_cache, progress_file)


Обработка батчей: 100%|██████████| 177/177 [00:04<00:00, 40.16it/s]


In [9]:
def get_category_id(video_id):
    """Return the cached category ID for ``video_id``.

    The ID is looked up in ``category_cache`` by its string form. Returns
    ``None`` when ``video_id`` is missing (per ``pd.isna``) or has no cache
    entry; otherwise returns the first element of the cached entry.
    """
    if pd.isna(video_id):
        return None
    try:
        cached_entry = category_cache[str(video_id)]
    except KeyError:
        return None
    return cached_entry[0]

def get_category_name(video_id):
    """Return the cached category name for ``video_id``.

    The ID is looked up in ``category_cache`` by its string form. Returns
    ``None`` when ``video_id`` is missing (per ``pd.isna``) or has no cache
    entry; otherwise returns the second element of the cached entry.
    """
    if pd.isna(video_id):
        return None
    try:
        cached_entry = category_cache[str(video_id)]
    except KeyError:
        return None
    return cached_entry[1]

# Derive both category columns from the video-ID column, one lookup per column.
video_id_column = df[YOUTUBE_VIDEO_ID_FIELD]
for column_name, lookup in (
    ('youtube_category_id', get_category_id),
    ('youtube_category_name', get_category_name),
):
    df[column_name] = video_id_column.apply(lookup)


In [10]:
# CSV is the primary output; 'utf-8-sig' writes a BOM at the start of the file.
df.to_csv('merged_vk_youtube_dataset_with_categories.csv', index=False, encoding='utf-8-sig')

# The Excel copy is best-effort (it needs the openpyxl engine), so a failure
# must not abort the notebook. It is reported instead of being silently dropped.
try:
    df.to_excel('merged_vk_youtube_dataset_with_categories.xlsx', index=False, engine='openpyxl')
except Exception as exc:
    warnings.warn(
        f"Excel export skipped ({type(exc).__name__}: {exc}); the CSV export is unaffected.",
        RuntimeWarning,
    )
