In [1]:
import os
import sys
from pathlib import Path
import asyncio
import pandas as pd
from datetime import datetime, timedelta

# Put the parent of the current working directory on the import path.
# NOTE(review): this presumes the notebook is started from a folder directly
# below the project root (so that `collectors` can be imported) — confirm.
project_root = Path.cwd().parent
# Guard the append so that re-running this cell does not accumulate duplicate
# sys.path entries in a long-lived kernel.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Import collectors
from collectors.collectors import (
    collect_x_data,
    collect_youtube_data,
    collect_reddit_data,
    collect_tiktok_data,
)

# Load environment variables
from dotenv import load_dotenv
load_dotenv()

# Make sure the raw-data output folder exists (missing parents are created,
# and an already-existing folder is not an error).
os.makedirs("data/raw", exist_ok=True)

## 1. YouTube Data Collection

In [3]:
# Collect YouTube data
async def get_youtube_data(target):
    """Collect YouTube data for one target and store it as CSV under data/raw.

    Args:
        target: Forwarded to ``collect_youtube_data`` as ``song_id``. It is also
            turned into the output file-name prefix (spaces replaced by
            underscores, then lower-cased).

    Returns:
        None. On success two CSV files are written (the video record and its
        comments) and their paths are printed. If the collector result carries
        an ``'error'`` key, that error is printed and nothing is written.
    """
    print(f"Collecting YouTube data for {target}...")
    youtube_result = await collect_youtube_data(
        song_id=target,
        api_key=os.getenv('YOUTUBE_API_KEY'),
    )

    # Report a failed collection instead of returning silently, so the cell
    # output makes it obvious that no files were produced.
    if 'error' in youtube_result:
        print(f"✗ Error collecting YouTube data: {youtube_result['error']}")
        return

    # 'data' holds a single video record, so wrap it in a list to get one row.
    video_df = pd.DataFrame([youtube_result['data']])
    comments_df = pd.DataFrame(youtube_result['comments'])

    # File names carry no timestamp, so a repeated run for the same target
    # overwrites the previous CSVs.
    safe_target = target.replace(' ', '_').lower()
    video_file = f"data/raw/{safe_target}_youtube_video.csv"
    comments_file = f"data/raw/{safe_target}_youtube_comments.csv"

    video_df.to_csv(video_file, index=False)
    comments_df.to_csv(comments_file, index=False)

    print(f"✓{len(comments_df)} comments")
    print(f"Saved to: {video_file} and {comments_file}")


In [4]:
# Run the YouTube collection for one track. The string is forwarded as `song_id`
# and, lower-cased with spaces turned into underscores, prefixes the output CSVs.
await get_youtube_data("Kendrick Lamar - Luther")

2025-05-20 01:48:09,623 - googleapiclient.discovery_cache - INFO - file_cache is only supported with oauth2client<4.0.0


Collecting YouTube data for Kendrick Lamar - Luther...
✓50776 comments
Saved to: data/raw/kendrick_lamar_-_luther_youtube_video.csv and data/raw/kendrick_lamar_-_luther_youtube_comments.csv


## 3. X (Twitter) Data Collection

In [None]:
# Collect X data
print("Collecting X data...")
x_result = await collect_x_data(
    song_id=target,
    api_key=os.getenv('X_API_KEY'),
    api_secret=os.getenv('X_API_SECRET'),
    start_date=start_date,
    end_date=end_date
)

if 'error' not in x_result:
    # Convert to DataFrame
    x_df = pd.DataFrame(x_result['data'])
    
    # Save data
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    safe_target = target.replace(' ', '_').lower()
    x_file = f"data/raw/{safe_target}_x_{timestamp}.csv"
    
    x_df.to_csv(x_file, index=False)
    
    print(f"✓ Found {len(x_df)} posts")
    print(f"Saved to: {x_file}")
else:
    print(f"✗ Error collecting X data: {x_result['error']}")

## 4. Reddit Data Collection

In [None]:
# Collect Reddit data
print("Collecting Reddit data...")
reddit_result = await collect_reddit_data(
    song_id=target,
    client_id=os.getenv('REDDIT_CLIENT_ID'),
    client_secret=os.getenv('REDDIT_CLIENT_SECRET'),
    start_date=start_date,
    end_date=end_date
)

if 'error' not in reddit_result:
    # Convert to DataFrame
    reddit_df = pd.DataFrame(reddit_result['data'])
    
    # Save data
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    safe_target = target.replace(' ', '_').lower()
    reddit_file = f"data/raw/{safe_target}_reddit_{timestamp}.csv"
    
    reddit_df.to_csv(reddit_file, index=False)
    
    print(f"✓ Found {len(reddit_df)} posts")
    print(f"Saved to: {reddit_file}")
else:
    print(f"✗ Error collecting Reddit data: {reddit_result['error']}")

## 5. TikTok Data Collection

In [None]:
# Collect TikTok data
print("Collecting TikTok data...")
tiktok_result = await collect_tiktok_data(
    song_id=target,
    ms_token=os.getenv('TIKTOK_MS_TOKEN'),
    start_date=start_date,
    end_date=end_date
)

if 'error' not in tiktok_result:
    # Convert to DataFrame
    tiktok_df = pd.DataFrame(tiktok_result['data'])
    
    # Save data
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    safe_target = target.replace(' ', '_').lower()
    tiktok_file = f"data/raw/{safe_target}_tiktok_{timestamp}.csv"
    
    tiktok_df.to_csv(tiktok_file, index=False)
    
    print(f"✓ Found {len(tiktok_df)} videos")
    print(f"Saved to: {tiktok_file}")
else:
    print(f"✗ Error collecting TikTok data: {tiktok_result['error']}")

## 6. Save Collection Metadata

In [None]:
# Save collection metadata
# NOTE(review): `target`, `start_date` and `end_date` are not assigned in this
# cell; they must already exist in the kernel namespace — confirm where they are set.
import json

# Derive the file-name prefix here instead of relying on a `safe_target` that
# only exists if one of the earlier collection cells took its success branch.
safe_target = target.replace(' ', '_').lower()

# get_youtube_data() keeps its output paths in local variables, so they are never
# visible in this namespace. Rebuild the fixed paths it writes to and record each
# one only if the file is actually on disk.
# NOTE(review): this assumes get_youtube_data() was called with the same `target`.
youtube_video_file = f"data/raw/{safe_target}_youtube_video.csv"
youtube_comments_file = f"data/raw/{safe_target}_youtube_comments.csv"

metadata = {
    'target': target,
    'date_range': {
        'start': start_date.isoformat(),
        'end': end_date.isoformat()
    },
    'timestamp': datetime.now().strftime('%Y%m%d_%H%M%S'),
    'files': {
        'youtube': {
            'videos': youtube_video_file if Path(youtube_video_file).exists() else None,
            'comments': youtube_comments_file if Path(youtube_comments_file).exists() else None
        },
        # These names exist only if the corresponding collection cell has run;
        # a missing name is recorded as None.
        'x': globals().get('x_file'),
        'reddit': globals().get('reddit_file'),
        'tiktok': globals().get('tiktok_file')
    }
}

metadata_file = f'data/raw/{safe_target}_collection_metadata_{metadata["timestamp"]}.json'
with open(metadata_file, 'w', encoding='utf-8') as f:
    json.dump(metadata, f, indent=2)

print(f"\nCollection metadata saved to: {metadata_file}")