# Download comments

In [4]:
import requests
import json
import time
import sys
import os
import dotenv
from urllib.parse import urlencode
from typing import Dict, Optional

In [12]:
class YouTubeCommentsDownloader:
    """Page through a video's comments via the ScrapeCreators endpoint.

    Each API response ("chunk") is stored verbatim under the key
    ``chunk<N>`` and the whole mapping is written to a JSON file.
    """

    def __init__(self, api_key: str):
        """Initialize the downloader with API key."""
        self.api_key = api_key
        self.base_url = "https://api.scrapecreators.com/v1/youtube/video/comments"
        # One session so the auth header is sent with every request.
        self.session = requests.Session()
        self.session.headers.update({
            'x-api-key': self.api_key,
            'User-Agent': 'YouTube-Comments-Downloader/1.0'
        })

    def make_request(self, video_url: str, order: str = "newest", continuation_token: Optional[str] = None) -> Dict:
        """Make a request to the API with optional continuation token.

        Args:
            video_url: YouTube video URL
            order: Comment order passed through to the API
            continuation_token: Token from the previous page, if any

        Returns:
            The decoded JSON object, or an empty dict when the request
            fails, the body is not valid JSON, or the JSON is not an object.
        """
        params = {
            'url': video_url,
            'order': order
        }

        if continuation_token:
            params['continuationToken'] = continuation_token

        try:
            # Request trace (the API key lives in the headers, not in params).
            print(self.base_url)
            print(params)
            response = self.session.get(self.base_url, params=params, timeout=30)
            response.raise_for_status()
            payload = response.json()
        except requests.exceptions.RequestException as e:
            print(f"Error making request: {e}")
            return {}
        except ValueError as e:
            # Depending on the installed requests version the decode error is
            # not always a RequestException, but it is always a ValueError.
            print(f"Error parsing JSON response: {e}")
            return {}

        if not isinstance(payload, dict):
            # Callers use .get() on the result, so anything else is unusable.
            print(f"Error parsing JSON response: expected an object, got {type(payload).__name__}")
            return {}
        return payload

    def download_all_comments(self, video_url: str, order: str = "newest", output_file: str = "output.json", delay: float = 1.0) -> None:
        """
        Download all comments for a video and save to JSON file.

        Progress is written every 5 chunks and once more when the loop ends
        for any reason (including an exception or Ctrl-C), so the chunks
        fetched so far are not lost.

        Args:
            video_url: YouTube video URL
            order: Comment order ("newest", "top", etc.)
            output_file: Output JSON file path
            delay: Delay between requests in seconds
        """
        print(f"Starting download for video: {video_url}")
        print(f"Order: {order}")
        print(f"Output file: {output_file}")

        all_chunks = {}
        chunk_number = 1
        continuation_token = None
        total_comments = 0
        seen_tokens = set()
        reached_last_page = False

        try:
            while True:
                print(f"\nFetching chunk {chunk_number}...")

                chunk_data = self.make_request(video_url, order, continuation_token)

                if not chunk_data:
                    print("Failed to fetch data or received empty response")
                    break

                all_chunks[f"chunk{chunk_number}"] = chunk_data

                # `or []` also covers an explicit `"comments": null`.
                comments_in_chunk = len(chunk_data.get('comments') or [])
                total_comments += comments_in_chunk

                print(f"Chunk {chunk_number}: {comments_in_chunk} comments")
                print(f"Total comments so far: {total_comments}")

                continuation_token = chunk_data.get('continuationToken')

                if not continuation_token:
                    print("No more pages available")
                    reached_last_page = True
                    break

                if continuation_token in seen_tokens:
                    # The same token would return the same page forever.
                    print("Continuation token repeated; stopping to avoid an endless loop")
                    break
                seen_tokens.add(continuation_token)

                print(f"Continuation token found: {continuation_token[:50]}...")

                # Save progress periodically
                if chunk_number % 5 == 0:
                    self._save_chunks(all_chunks, output_file)
                    print(f"Progress saved to {output_file}")

                chunk_number += 1

                # Add delay to be respectful to the API
                if delay > 0:
                    time.sleep(delay)
        finally:
            # Always persist what was fetched, even when interrupted.
            self._save_chunks(all_chunks, output_file)

        if reached_last_page:
            print(f"\n✅ Download completed!")
        else:
            print(f"\n⚠️ Download stopped before the last page; results are partial")
        print(f"Total chunks: {len(all_chunks)}")
        print(f"Total comments: {total_comments}")
        print(f"Results saved to: {output_file}")

    def _save_chunks(self, chunks_data: Dict, output_file: str) -> None:
        """Save chunks data to JSON file.

        The data is written to ``<output_file>.tmp`` first and then moved
        over the target, so an interrupted write cannot leave a truncated
        output file behind. Failures are reported, not raised.
        """
        tmp_file = f"{output_file}.tmp"
        try:
            with open(tmp_file, 'w', encoding='utf-8') as f:
                json.dump(chunks_data, f, indent=2, ensure_ascii=False)
            os.replace(tmp_file, output_file)
        except (OSError, TypeError, ValueError) as e:
            print(f"Error saving to file: {e}")

    def get_video_stats(self, output_file: str = "output.json") -> None:
        """Print statistics about downloaded comments.

        Reads the chunk file written by ``download_all_comments`` and prints
        chunk/comment totals plus counts of creator and verified-author
        comments. Errors while reading or walking the file are reported,
        not raised.
        """
        try:
            with open(output_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            total_chunks = len(data)
            total_comments = 0
            creators_comments = 0
            verified_comments = 0

            for chunk_data in data.values():
                comments = chunk_data.get('comments') or []
                total_comments += len(comments)

                for comment in comments:
                    author = comment.get('author') or {}
                    if author.get('isCreator'):
                        creators_comments += 1
                    if author.get('isVerified'):
                        verified_comments += 1

            # An empty file (failed first request) has zero chunks.
            average = total_comments / total_chunks if total_chunks else 0.0

            print(f"\n📊 Download Statistics:")
            print(f"Total chunks: {total_chunks}")
            print(f"Total comments: {total_comments}")
            print(f"Creator comments: {creators_comments}")
            print(f"Verified author comments: {verified_comments}")
            print(f"Average comments per chunk: {average:.1f}")

        except (OSError, ValueError, AttributeError, TypeError) as e:
            print(f"Error reading stats: {e}")

In [15]:
def run():
    """Main function to run the downloader.

    Loads environment variables from a ``.env`` file, reads the API key from
    ``SCRAPPER_API_KEY``, downloads all comments for the configured video and
    prints statistics about the result. Returns ``None``; problems are
    reported on stdout.
    """
    dotenv.load_dotenv()

    # Configuration
    api_key = os.environ.get('SCRAPPER_API_KEY')
    if not api_key:
        # Fail with an actionable message instead of a bare KeyError.
        print("❌ SCRAPPER_API_KEY is not set; add it to the environment or a .env file")
        return

    video_url = "https://youtu.be/eZM2Ik-FHEU?si=73Rw2GQxCCzDqg63"  # Replace with target video URL
    order = "newest"  # or "top", "relevance", etc.
    output_file = "output.json"
    request_delay = 1.0  # seconds between requests

    # Initialize downloader
    downloader = YouTubeCommentsDownloader(api_key)

    try:
        # Download all comments
        downloader.download_all_comments(
            video_url=video_url,
            order=order,
            output_file=output_file,
            delay=request_delay
        )

        # Show statistics
        downloader.get_video_stats(output_file)

    except KeyboardInterrupt:
        print("\n\n⚠️  Download interrupted by user")
        print("Partial results may be saved in the output file")
    except Exception as e:
        # Top-level boundary of the notebook cell: report, don't traceback.
        print(f"An error occurred: {e}")

In [16]:
# Run the download end to end (needs network access and SCRAPPER_API_KEY).
run()

Starting download for video: https://youtu.be/eZM2Ik-FHEU?si=73Rw2GQxCCzDqg63
Order: newest
Output file: output.json

Fetching chunk 1...
https://api.scrapecreators.com/v1/youtube/video/comments
{'url': 'https://youtu.be/eZM2Ik-FHEU?si=73Rw2GQxCCzDqg63', 'order': 'newest'}
Chunk 1: 20 comments
Total comments so far: 20
Continuation token found: Eg0SC2VaTTJJay1GSEVVGAYyjQEKZGdldF9uZXdlc3RfZmlyc3...

Fetching chunk 2...
https://api.scrapecreators.com/v1/youtube/video/comments
{'url': 'https://youtu.be/eZM2Ik-FHEU?si=73Rw2GQxCCzDqg63', 'order': 'newest', 'continuationToken': 'Eg0SC2VaTTJJay1GSEVVGAYyjQEKZGdldF9uZXdlc3RfZmlyc3QtLUNnZ0lnQVFWRjdmUk9CSUZDSWdnR0FBU0JRaUhJQmdBRWdVSWlTQVlBQklGQ0owZ0dBRVNCUWlvSUJnQUlnNEtEQWl3eGFIR0JoRElwNkstQVEiESILZVpNMklrLUZIRVUwAXgBKBRCEGNvbW1lbnRzLXNlY3Rpb24%3D'}
Chunk 2: 20 comments
Total comments so far: 40
Continuation token found: Eg0SC2VaTTJJay1GSEVVGAYyiwEKYmdldF9uZXdlc3RfZmlyc3...

Fetching chunk 3...
https://api.scrapecreators.com/v1/youtube/video/com

# Load comments into a pandas DataFrame

In [18]:
import json
import pandas as pd
from datetime import datetime
from typing import List, Dict, Any
import sys

In [19]:
def load_comments_data(json_file: str) -> Dict[str, Any]:
    """
    Read the chunked comments JSON file from disk.

    Args:
        json_file: Path to the JSON file containing comment chunks

    Returns:
        The parsed file content, or an empty dict if the file is missing,
        is not valid JSON, or cannot be loaded for any other reason
    """
    result = {}
    try:
        with open(json_file, encoding='utf-8') as handle:
            parsed = json.loads(handle.read())
        print(f"✅ Successfully loaded {len(parsed)} chunks from {json_file}")
        result = parsed
    except FileNotFoundError:
        print(f"❌ Error: File '{json_file}' not found")
    except json.JSONDecodeError as err:
        print(f"❌ Error parsing JSON: {err}")
    except Exception as err:
        print(f"❌ Error loading file: {err}")
    return result


def extract_comments_from_chunks(chunks_data: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Extract all comments from all chunks.

    A chunk that is not a dict, has no ``comments`` key, or has
    ``comments`` set to ``None`` contributes zero comments instead of
    raising.

    Args:
        chunks_data: Dictionary containing comment chunks

    Returns:
        List of all comments from all chunks, in chunk iteration order
    """
    all_comments = []

    for chunk_key, chunk_data in chunks_data.items():
        # Stored chunks are raw API responses, so their shape is not guaranteed.
        comments = chunk_data.get('comments') if isinstance(chunk_data, dict) else None
        comments = comments or []
        print(f"Chunk {chunk_key}: {len(comments)} comments")
        all_comments.extend(comments)

    print(f"📊 Total comments extracted: {len(all_comments)}")
    return all_comments


def parse_published_time(published_time_str: str) -> pd.Timestamp:
    """
    Parse published time string to pandas Timestamp.

    Args:
        published_time_str: Datetime string to parse (``None`` and blank
            strings are accepted and treated as missing)

    Returns:
        Pandas Timestamp object, or ``pd.NaT`` when the value is missing
        or cannot be parsed
    """
    # pd.to_datetime(None) returns None, not NaT, so handle missing values here.
    if published_time_str is None:
        return pd.NaT
    if isinstance(published_time_str, str) and not published_time_str.strip():
        return pd.NaT

    try:
        return pd.to_datetime(published_time_str)
    except (ValueError, TypeError, OverflowError):
        # Unparseable value: return NaT (Not a Time). Not a bare `except`,
        # so KeyboardInterrupt/SystemExit still propagate.
        return pd.NaT


def create_dataframe(comments: List[Dict[str, Any]]) -> pd.DataFrame:
    """
    Create pandas DataFrame from comments list.

    Args:
        comments: List of comment dictionaries. The keys read are
            ``content``, ``publishedTime`` and ``engagement`` (with
            ``likes`` and ``replies``); missing or ``None`` values fall back
            to an empty string / zero.

    Returns:
        Pandas DataFrame with columns: content, published_at, likes, replies.
        An empty input yields an empty DataFrame with those same columns.
    """
    columns = ['content', 'published_at', 'likes', 'replies']
    processed_comments = []

    for comment in comments:
        # `or` also covers keys that are present but explicitly null.
        engagement = comment.get('engagement') or {}

        processed_comments.append({
            'content': comment.get('content') or '',
            'published_at': parse_published_time(comment.get('publishedTime', '')),
            'likes': engagement.get('likes', 0),
            'replies': engagement.get('replies', 0),
        })

    # Passing `columns` keeps the schema stable even when there are no rows.
    df = pd.DataFrame(processed_comments, columns=columns)

    # Ensure data types
    df['content'] = df['content'].astype(str)
    df['published_at'] = pd.to_datetime(df['published_at'])
    # Non-numeric counts are coerced to NaN and then counted as 0.
    df['likes'] = pd.to_numeric(df['likes'], errors='coerce').fillna(0).astype(int)
    df['replies'] = pd.to_numeric(df['replies'], errors='coerce').fillna(0).astype(int)

    return df


def analyze_dataframe(df: pd.DataFrame) -> None:
    """
    Print a summary report for a comments DataFrame.

    The report covers shape and columns, totals and averages for likes and
    replies, the date range, the three most liked comments (content cut to
    100 characters) and a few data-quality counters.

    Args:
        df: Pandas DataFrame with the columns content, published_at,
            likes and replies
    """
    print("\n📈 DataFrame Analysis:")
    print(f"Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")

    print("\n📊 Data Summary:")
    print(f"Total comments: {len(df)}")
    timestamps = df['published_at']
    print(f"Date range: {timestamps.min()} to {timestamps.max()}")
    like_counts = df['likes']
    print(f"Total likes: {like_counts.sum():,}")
    reply_counts = df['replies']
    print(f"Total replies: {reply_counts.sum():,}")
    print(f"Average likes per comment: {like_counts.mean():.2f}")
    print(f"Average replies per comment: {reply_counts.mean():.2f}")

    print("\n🔍 Top Stats:")
    print(f"Most liked comment: {like_counts.max()} likes")
    print(f"Most replied comment: {reply_counts.max()} replies")

    # Three most liked comments, long content shortened to a preview
    print("\n🏆 Top 3 Most Liked Comments:")
    rank = 0
    for _, row in df.nlargest(3, 'likes').iterrows():
        rank += 1
        text = row['content']
        if len(text) > 100:
            preview = text[:100] + "..."
        else:
            preview = text
        print(f"{rank}. {row['likes']} likes: {preview}")

    # Data quality counters
    print("\n🔍 Data Quality:")
    print(f"Missing published_at: {timestamps.isna().sum()}")
    blank_content = df['content'] == ''
    print(f"Empty content: {blank_content.sum()}")
    no_engagement = (like_counts == 0) & (reply_counts == 0)
    print(f"Zero engagement (no likes/replies): {no_engagement.sum()}")


def save_dataframe(df: pd.DataFrame, output_formats: Optional[List[str]] = None,
                   base_name: str = "youtube_comments") -> None:
    """
    Save DataFrame in various formats.

    Each format is attempted independently: a failure or an unknown format
    name is reported and the remaining formats are still written.

    Args:
        df: Pandas DataFrame to save
        output_formats: List of formats to save ('csv', 'excel', 'parquet').
            Defaults to ['csv'] when omitted.
        base_name: Output path without extension; the extension is chosen
            per format (.csv, .xlsx, .parquet)
    """
    # None instead of a list literal avoids a shared mutable default.
    if output_formats is None:
        output_formats = ['csv']

    for format_type in output_formats:
        try:
            if format_type == 'csv':
                filename = f"{base_name}.csv"
                df.to_csv(filename, index=False, encoding='utf-8')
                print(f"💾 Saved as CSV: {filename}")

            elif format_type == 'excel':
                filename = f"{base_name}.xlsx"
                df.to_excel(filename, index=False, engine='openpyxl')
                print(f"💾 Saved as Excel: {filename}")

            elif format_type == 'parquet':
                filename = f"{base_name}.parquet"
                df.to_parquet(filename, index=False)
                print(f"💾 Saved as Parquet: {filename}")

            else:
                # Previously a typo in the format name was silently ignored.
                print(f"⚠️ Unsupported format '{format_type}'; expected 'csv', 'excel' or 'parquet'")

        except Exception as e:
            # Deliberately broad: writers can fail with I/O errors or missing
            # optional engines, and one failure must not stop the others.
            print(f"❌ Error saving as {format_type}: {e}")

In [22]:
def load_to_dataframe(json_file: str = "output.json",
                      save_formats: Optional[List[str]] = None) -> Optional[pd.DataFrame]:
    """Main function to convert JSON to DataFrame.

    Args:
        json_file: Path of the chunked comments JSON file to convert
        save_formats: Formats to export ('csv', 'excel', 'parquet').
            Defaults to ['csv'] when omitted; pass an empty list to skip
            saving.

    Returns:
        The comments DataFrame, or None when the file is missing, cannot be
        loaded, or contains no comments
    """
    if save_formats is None:
        save_formats = ['csv']

    # Check if file exists
    if not os.path.exists(json_file):
        print(f"❌ File '{json_file}' does not exist")
        print("Make sure to run the YouTube comments downloader first")
        return None

    print(f"🔄 Converting {json_file} to Pandas DataFrame...")

    # Load data
    chunks_data = load_comments_data(json_file)
    if not chunks_data:
        return None

    # Extract comments
    all_comments = extract_comments_from_chunks(chunks_data)
    if not all_comments:
        print("❌ No comments found in the data")
        return None

    # Create DataFrame
    print("🔄 Creating Pandas DataFrame...")
    df = create_dataframe(all_comments)

    # Show DataFrame info
    print(f"✅ DataFrame created successfully!")
    print(f"\nDataFrame Head:")
    print(df.head())

    print(f"\nDataFrame Info:")
    # info() prints its report itself and returns None, so it must not be
    # wrapped in print() (that emitted a stray "None" line).
    df.info()

    # Analyze data
    analyze_dataframe(df)

    # Save DataFrame
    if save_formats:
        print(f"\n💾 Saving DataFrame in {len(save_formats)} format(s)...")
        save_dataframe(df, save_formats)

    # Return DataFrame for interactive use
    return df

In [23]:
# Build the DataFrame from output.json and keep it for the cells below.
df = load_to_dataframe()

🔄 Converting output.json to Pandas DataFrame...
✅ Successfully loaded 50 chunks from output.json
Chunk chunk1: 20 comments
Chunk chunk2: 20 comments
Chunk chunk3: 20 comments
Chunk chunk4: 20 comments
Chunk chunk5: 20 comments
Chunk chunk6: 20 comments
Chunk chunk7: 20 comments
Chunk chunk8: 20 comments
Chunk chunk9: 20 comments
Chunk chunk10: 20 comments
Chunk chunk11: 20 comments
Chunk chunk12: 20 comments
Chunk chunk13: 20 comments
Chunk chunk14: 20 comments
Chunk chunk15: 20 comments
Chunk chunk16: 20 comments
Chunk chunk17: 20 comments
Chunk chunk18: 20 comments
Chunk chunk19: 20 comments
Chunk chunk20: 20 comments
Chunk chunk21: 20 comments
Chunk chunk22: 20 comments
Chunk chunk23: 20 comments
Chunk chunk24: 20 comments
Chunk chunk25: 20 comments
Chunk chunk26: 20 comments
Chunk chunk27: 20 comments
Chunk chunk28: 20 comments
Chunk chunk29: 20 comments
Chunk chunk30: 20 comments
Chunk chunk31: 20 comments
Chunk chunk32: 20 comments
Chunk chunk33: 20 comments
Chunk chunk34: 20 com

In [24]:
# (rows, columns) of the comments DataFrame.
df.shape

(981, 4)

In [25]:
# Preview the first 10 rows.
df.head(10)

Unnamed: 0,content,published_at,likes,replies
0,Stay better informed https://ground.news/caspi...,2025-09-14 19:52:09.110000+00:00,50,14
1,it's not a conflict its a genocide,2025-09-15 19:52:09.110000+00:00,0,0
2,Lured-in with promises of peace...\nWas that B...,2025-09-15 19:52:09.110000+00:00,0,0
3,with friends like these who needs enemies and ...,2025-09-15 19:52:09.110000+00:00,0,0
4,Qatar mediates while funding terrorism lol,2025-09-15 19:52:09.110000+00:00,0,0
5,Funny the obvious route for Israel above Syria...,2025-09-15 19:52:09.110000+00:00,0,0
6,At war with all neighbors (sans US backed Jord...,2025-09-15 19:52:09.110000+00:00,0,0
7,"@CaspianReport In your video ""How Israel plans...",2025-09-15 19:52:09.110000+00:00,0,0
8,"The ""messenger"" here is the biggest provider o...",2025-09-15 19:52:09.110000+00:00,0,0
9,It seems that you forgot to mention the multib...,2025-09-15 19:52:09.110000+00:00,0,0
