In [0]:
import os
import time
from datetime import datetime

import requests
from pyspark.sql import SparkSession
from pyspark.sql.functions import current_timestamp, round
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, IntegerType, FloatType

# Identifier of the saved analysis query whose posts are ingested.
query_id = "6863c3830cc1b17b4d95a265"

# SECURITY: the bearer token is a credential and must never be committed to
# source control. It is read from the TELLAGENCE_API_TOKEN environment
# variable instead. Any token that was previously hard-coded here should be
# treated as leaked and rotated.
token = os.environ.get("TELLAGENCE_API_TOKEN", "")
if not token:
    # Warn early; without a token the API call below fails with an auth error.
    print("Warning: TELLAGENCE_API_TOKEN is not set; API requests will not be authorized")

# Request headers
request_headers = {
    'accept': 'application/json',
    'authorization': f'Bearer {token}',
}

base_url = f'https://discover-api-dev.tellagence.ai/api/v1/analysis/post/{query_id}'

# Query-string parameters sent with every page request; 'page' is the
# starting page for pagination.
payload = {
    'start_date': '2021-08-17T18:30:00.000Z',
    'end_date': '2025-06-30T18:29:59.999Z',
    'data_source_type': 'YouTube Comments',
    'page': 1,
    'limit': 2000  # raised from the API default of 10 to cut the number of requests
}

# Spark schema for one API record. Every column is nullable; fields are
# appended in the order they should appear in the resulting DataFrame.
schema = (
    StructType()
    .add("record_type", StringType(), True)
    .add("author", StringType(), True)
    .add("author_channel_url", StringType(), True)
    .add("author_profile_image_url", StringType(), True)
    .add("date", TimestampType(), True)
    .add("like_count", IntegerType(), True)
    .add("reply_count", FloatType(), True)
    .add("text", StringType(), True)
    .add("video_id", StringType(), True)
    .add("cluster_sentiment", StringType(), True)
    .add("cluster_sentiment_reasoning", StringType(), True)
    .add("id", StringType(), True)
    .add("creator", StringType(), True)
    .add("video_title", StringType(), True)
    .add("video_thumbnail_url", StringType(), True)
    .add("channel_name", StringType(), True)
)

# Fetch API data
def fetch_all_data():
    """Download every page of API results and return them as a Spark DataFrame.

    Pages are requested one after another from ``base_url`` using
    ``request_headers`` and a private copy of ``payload``. Paging stops when
    a page has an empty ``result`` list or when the response reports that
    ``current_page`` has reached ``total_pages``.

    Returns:
        A DataFrame built with the module-level ``schema``, plus an
        ``ingestion_timestamp`` column, with ``reply_count`` rounded to two
        decimal places.

    Raises:
        Exception: if a response has a non-200 status code, or if no records
            were fetched at all.
        requests.exceptions.RequestException: on connection errors or when
            a request exceeds the timeout.
    """
    spark = SparkSession.builder.getOrCreate()

    # Page through a copy so the module-level payload is never mutated;
    # otherwise a second call would start from the last page of the first.
    params = dict(payload)

    # (connect, read) timeouts in seconds; without them a stalled connection
    # would block the job forever.
    request_timeout = (10, 120)

    all_data = []
    while True:
        response = requests.get(
            base_url,
            headers=request_headers,
            params=params,
            timeout=request_timeout,
        )
        if response.status_code != 200:
            raise Exception(f"API request failed: {response.status_code}")
        api_data = response.json()
        page_data = api_data.get('result', [])
        if not page_data:
            break
        for record in page_data:
            # Convert the date string to a datetime for the TimestampType
            # column; missing or empty values become None (a null timestamp)
            # rather than a string that would fail schema verification.
            raw_date = record.get('date')
            record['date'] = (
                datetime.strptime(raw_date, '%Y-%m-%dT%H:%M:%S') if raw_date else None
            )
            all_data.append(record)
        if api_data['current_page'] >= api_data['total_pages']:
            break
        params['page'] += 1
        time.sleep(3)  # pause between page requests

    if not all_data:
        raise Exception("No data fetched from API")

    df = spark.createDataFrame(all_data, schema)
    df = df.withColumn("ingestion_timestamp", current_timestamp())
    # `round` here is the pyspark.sql.functions column function imported at
    # the top of the file, not the Python builtin.
    df = df.withColumn("reply_count", round(df["reply_count"], 2))  # Limit to 2 decimal places
    return df

# Execute ingestion: fetch all API records and append them to the bronze
# Delta table youtube2.bronze.raw_api_data.
try:
    df = fetch_all_data()

    # Obtain the session explicitly instead of relying on a `spark` global,
    # which this script never defines itself.
    spark = SparkSession.builder.getOrCreate()

    # Save to Delta table
    spark.sql("CREATE CATALOG IF NOT EXISTS youtube2")
    spark.sql("CREATE SCHEMA IF NOT EXISTS youtube2.bronze")
    spark.sql("USE CATALOG youtube2")
    spark.sql("USE SCHEMA bronze")

    df.write.format("delta").mode("append").saveAsTable("youtube2.bronze.raw_api_data")

    print("Data saved to table youtube2.bronze.raw_api_data successfully")

except Exception as e:
    print(f"Error: {str(e)}")
    # Re-raise so a failed ingestion is reported as a failure to whatever
    # runs this script, instead of looking like a successful run.
    raise

Data saved to table youtube2.bronze.raw_api_data successfully
