# Social Media Data Collection

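This notebook collects posts, videos, and discussions about music artists and songs from X, YouTube, Reddit, and TikTok, and saves each platform's results as a CSV file (plus one JSON metadata file per run) under `data/raw/` for further analysis.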

In [None]:
import asyncio
import json
import os
import re
import sys
from datetime import datetime
from pathlib import Path

import pandas as pd

# Add project root to Python path so the `src` package can be imported.
# The root is taken to be the parent of the current working directory
# (assumes the kernel was started from a sub-folder of the project — TODO confirm).
project_root = Path.cwd().parent
# Only append once: re-running this cell must not pile up duplicate entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Import collectors
from src.collectors.collectors import (
    collect_x_data,
    collect_youtube_data,
    collect_reddit_data,
    collect_tiktok_data
)

# Pull settings from a .env file into the process environment;
# the collection function later reads its credentials via os.getenv.
from dotenv import load_dotenv
load_dotenv()

# Make sure the output folder exists (relative to the working directory);
# no error is raised when it is already there.
Path("data", "raw").mkdir(exist_ok=True, parents=True)

## 1. Define Collection Targets

Define the artists or songs you want to analyze.

In [None]:
# Search terms for this run; each one is collected separately.
targets = [
    "Taylor Swift",            # artist-level query
    "Taylor Swift Anti-Hero",  # single-song query
    "Drake",                   # second artist, for comparison
    # Append further artists or songs here
]

# Settings shared by the whole collection run
params = dict(
    days=30,  # time window for data collection
    platforms=['x', 'youtube', 'reddit', 'tiktok'],  # platforms to collect from
)

## 2. Data Collection Function

In [None]:
async def collect_data(target: str, days: int = 30, platforms=None):
    """Collect data for one target from every enabled platform.

    Each platform is queried independently: if one collector raises or
    reports an error, the failure is printed and the remaining platforms
    are still collected.

    Args:
        target: Artist or song query, passed to each collector as ``song_id``.
        days: Time window forwarded to each collector as ``days``.
        platforms: Optional collection of platform keys to query
            (``'x'``, ``'youtube'``, ``'reddit'``, ``'tiktok'``). When
            ``None`` (the default), the notebook-level ``params['platforms']``
            is used.

    Returns:
        dict mapping platform key -> ``pandas.DataFrame`` built from the
        collector's ``'data'`` entry. Platforms that were disabled or failed
        are absent from the dict.
    """
    print(f"\nCollecting data for: {target}")
    enabled = params['platforms'] if platforms is None else platforms

    # (platform key, display name, item noun, collector, credential kwargs).
    # Built per call so credentials reflect the environment at collection time.
    sources = [
        ('x', 'X', 'posts', collect_x_data, {
            'api_key': os.getenv('X_API_KEY'),
            'api_secret': os.getenv('X_API_SECRET'),
        }),
        ('youtube', 'YouTube', 'videos', collect_youtube_data, {
            'api_key': os.getenv('YOUTUBE_API_KEY'),
        }),
        ('reddit', 'Reddit', 'discussions', collect_reddit_data, {
            'client_id': os.getenv('REDDIT_CLIENT_ID'),
            'client_secret': os.getenv('REDDIT_CLIENT_SECRET'),
        }),
        ('tiktok', 'TikTok', 'videos', collect_tiktok_data, {
            'ms_token': os.getenv('TIKTOK_MS_TOKEN'),
        }),
    ]

    platform_data = {}
    for key, label, noun, collector, credentials in sources:
        if key not in enabled:
            continue

        print(f"Collecting from {label}...")
        try:
            result = await collector(song_id=target, days=days, **credentials)
            # Collectors signal failure with an 'error' key in their result.
            if 'error' in result:
                print(f"✗ Error collecting {label} data: {result['error']}")
                continue
            platform_data[key] = pd.DataFrame(result['data'])
            print(f"✓ Found {len(platform_data[key])} {noun} on {label}")
        except Exception as e:
            # Isolate the failure to this platform so the others still run.
            print(f"✗ Error collecting {label} data: {e}")

    return platform_data

## 3. Collect and Save Data

In [None]:
# Generate timestamp for this collection run
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# Dictionary to store file paths
saved_files = {}

# Collect and save data for each target
for target in targets:
    # Collect data
    platform_data = await collect_data(target, params['days'])
    
    # Save data for each platform
    target_files = {}
    for platform, df in platform_data.items():
        # Create sanitized filename
        safe_target = target.replace(' ', '_').lower()
        filename = f"data/raw/{safe_target}_{platform}_{timestamp}.csv"
        
        # Save to CSV
        df.to_csv(filename, index=False)
        target_files[platform] = filename
        print(f"Saved {platform} data to: {filename}")
    
    saved_files[target] = target_files

# Save collection metadata
metadata = {
    'timestamp': timestamp,
    'targets': targets,
    'parameters': params,
    'files': saved_files
}

import json
with open(f'data/raw/collection_metadata_{timestamp}.json', 'w') as f:
    json.dump(metadata, f, indent=2)

print("\nCollection complete! Data saved to CSV files.")