In [None]:
import os
import uuid
import requests
import time
import pandas as pd
from tqdm import tqdm

# ---------------------------------------------------------------------
# Configuration and Setup
# ---------------------------------------------------------------------

# Output directory
os.makedirs("02_Result/images", exist_ok=True)
os.makedirs("02_Result", exist_ok=True)

# CSV file path (used both for saving and loading)
csv_path = "02_Result/results.csv"

# Global dictionary of previously saved rows, keyed by post id. When a post has
# several rows (one per image), the last row read wins.
collected_posts = {}

# Global list to store a row per image. This is the list save_to_csv() writes.
collected_images = []

# If the CSV exists, reload its rows. They must go into collected_images as
# well as collected_posts: save_to_csv() rewrites the whole file from
# collected_images, so rows that are not in that list are lost on the next save.
if os.path.exists(csv_path):
    try:
        df_existing = pd.read_csv(
            csv_path,
            encoding='utf-8-sig',
            # Keep ids/keywords as text so an all-digit id still compares
            # equal to the string ids used for freshly collected rows.
            dtype={'Id': str, 'Keyword': str},
            # Keep empty cells as '' instead of NaN so they round-trip as-is.
            keep_default_na=False,
        )
        existing_rows = df_existing.to_dict('records')
        existing_posts = {row['Id']: dict(row) for row in existing_rows}
    except Exception as e:
        # Best effort: an unreadable or empty CSV must not stop a fresh run.
        print(f"Error loading existing CSV: {e}")
    else:
        # Only touch the globals once the whole file was parsed successfully.
        collected_images.extend(existing_rows)
        collected_posts.update(existing_posts)

# ---------------------------------------------------------------------
# Helper Functions
# ---------------------------------------------------------------------

def download_image(image_url: str, post_id: str) -> tuple[str, str]:
    """Download one image into ``02_Result/images`` under a fresh UUID name.

    Args:
        image_url: Direct URL of the image to fetch.
        post_id: Id of the post the image belongs to. Not used by this
            function; kept so existing callers keep working.

    Returns:
        ``(image_id, file_name)`` on success, where ``image_id`` is the new
        UUID followed by ``:`` and ``file_name`` is the path that was written.
        ``('', '')`` when the request raises, the response status is not 200,
        or the file cannot be written.
    """
    try:
        image_id = str(uuid.uuid4())
        # Strip the query string and fragment *before* looking for the
        # extension; otherwise a dot inside the query (e.g. "?s=a.b") is
        # mistaken for the extension separator.
        url_path = image_url.split('?', 1)[0].split('#', 1)[0]
        image_extension = url_path.split('.')[-1]
        file_name = f"02_Result/images/{image_id}.{image_extension}"
        response = requests.get(image_url, timeout=10)
        if response.status_code == 200:
            with open(file_name, 'wb') as f:
                f.write(response.content)
            # NOTE(review): the trailing ':' on the id is preserved from the
            # original implementation; confirm whether it is intentional.
            return f"{image_id}:", file_name
    except Exception as e:
        # Best effort: a single failed image must not abort the crawl.
        print(f"Error downloading image from {image_url}: {e}")
    return '', ''

def _extract_image_urls(item: dict) -> list:
    """Return the direct image URLs referenced by one submission dict."""
    if item.get('gallery_data') and item.get('media_metadata'):
        urls = []
        for media_id, media in item.get('media_metadata', {}).items():
            if media.get('e') == 'Image':
                mime = media.get('m', '')
                ext = mime.split('/')[-1] if '/' in mime else 'jpg'
                urls.append(f'https://i.redd.it/{media_id}.{ext}')
        return urls

    # `url` may be present but null, so fall back to '' before calling lower().
    url = item.get('url') or ''
    if url.lower().endswith(('.png', '.jpg', '.jpeg', '.gif')):
        return [url]
    return []


def _build_image_row(item: dict, post_id: str, keyword: str, timestamp: str,
                     content: str, image_id: str, file_name: str) -> dict:
    """Build the CSV row describing one downloaded image of a submission."""
    return {
        'Id': post_id,
        'Keyword': keyword,
        'Timestamp': timestamp,
        'Content': content,
        'Author': item.get('author', ''),
        'Post_url': 'https://www.reddit.com' + item.get('permalink', ''),
        'Image_ID': image_id,
        'File': file_name,
        'Upvotes': item.get('ups', 0),
        'Downvotes': item.get('downs', 0),
        'Comments': item.get('num_comments', 0)
    }


def process_submission_data(json_data: dict, keyword: str, end_time: int) -> "tuple[int, int | None]":
    """Record the image posts found in one page of search results.

    For every submission created before ``end_time`` that references images,
    the images are downloaded and one row per image is appended to the global
    ``collected_images`` list. A submission that is already present in
    ``collected_images`` is not downloaded again; only ``keyword`` is merged
    into the comma-separated ``Keyword`` field of its existing rows.

    Args:
        json_data: Decoded API response; submissions are read from its
            ``'data'`` list.
        keyword: Search keyword that produced this page.
        end_time: Exclusive upper bound (epoch seconds) on ``created_utc``.

    Returns:
        ``(new_posts_count, last_timestamp)``. ``new_posts_count`` is the
        number of submissions newly added to ``collected_images``.
        ``last_timestamp`` is the largest ``created_utc`` among the
        submissions before ``end_time`` on this page, or ``None`` when there
        were none.
    """
    data = json_data.get('data', [])
    if not data:
        return 0, None

    last_timestamp = None
    new_posts_count = 0

    for item in data:
        created_utc = item.get('created_utc', 0)
        if created_utc >= end_time:
            continue

        # Advance the paging cursor for every in-window submission, not only
        # those with images; otherwise a page without images returns None and
        # the caller stops paginating even though later pages exist.
        if last_timestamp is None or created_utc > last_timestamp:
            last_timestamp = created_utc

        image_urls = _extract_image_urls(item)
        if not image_urls:
            continue

        post_id = item.get('id', '')
        existing_rows = [row for row in collected_images if row['Id'] == post_id]
        if existing_rows:
            # Already collected (typically via another keyword). Image ids are
            # fresh UUIDs, so downloading again could never be de-duplicated
            # and would only create duplicate files and rows.
            for row in existing_rows:
                existing_keywords = set(row['Keyword'].split(','))
                if keyword not in existing_keywords:
                    existing_keywords.add(keyword)
                    row['Keyword'] = ','.join(sorted(existing_keywords))
            continue

        image_tuples = []
        for url in image_urls:
            image_id, file_name = download_image(url, post_id)
            if image_id:
                image_tuples.append((image_id, file_name))

        # Posts whose images could not be downloaded are not recorded.
        if not image_tuples:
            continue

        timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_utc))
        content = item.get('selftext', '')
        if content in ['[removed]', '[deleted]']:
            content = ''

        new_rows = [
            _build_image_row(item, post_id, keyword, timestamp, content, image_id, file_name)
            for image_id, file_name in image_tuples
        ]
        for row in new_rows:
            row['Num_Images'] = len(new_rows)
        collected_images.extend(new_rows)
        new_posts_count += 1

    return new_posts_count, last_timestamp

def save_to_csv(csv_path: str) -> None:
    """Write every row of the global ``collected_images`` list to *csv_path*.

    The file is rewritten from scratch on each call, without an index column
    and encoded as UTF-8 with a byte-order mark.
    """
    pd.DataFrame(collected_images).to_csv(
        csv_path,
        encoding='utf-8-sig',
        index=False,
    )

# ---------------------------------------------------------------------
# Keyword Processing Function
# ---------------------------------------------------------------------

def process_keyword(keyword: str, start_time: int, end_time: int, size: int = 100,
                    max_retries: int = 5) -> None:
    """Collect image posts matching ``keyword`` one 24-hour window at a time.

    Each window is paged through in ascending ``created_utc`` order, every
    page is handed to ``process_submission_data``, and the CSV is saved after
    each window.

    Args:
        keyword: Search term sent as the ``q`` parameter.
        start_time: Start of the range, epoch seconds.
        end_time: End of the range, epoch seconds.
        size: Page size requested from the API. A page with fewer results
            than this ends the current window.
        max_retries: Number of consecutive failed attempts (non-200 status or
            raised exception) tolerated for one page before the rest of that
            window is skipped.
    """
    base_url = "https://api.pullpush.io/reddit/search/submission/"

    # The context manager closes the session's connections when done.
    with requests.Session() as session:
        current_day = start_time
        while current_day < end_time:
            day_start = current_day
            # Clamp the final window so it never reaches past end_time.
            day_end = min(current_day + 86400, end_time)
            day_str = time.strftime("%Y-%m-%d", time.localtime(day_start))
            print(f"{keyword} - {day_str}")

            day_current_after = day_start
            failures = 0
            while True:
                params = {
                    'q': keyword,
                    'after': day_current_after,
                    'before': day_end,
                    'sort': 'asc',
                    'sort_type': 'created_utc',
                    'size': size,
                }
                error = None
                try:
                    response = session.get(base_url, params=params, timeout=10)
                    if response.status_code == 200:
                        json_data = response.json()
                        num_results = len(json_data.get('data', []))
                        _, last_timestamp = process_submission_data(json_data, keyword, day_end)
                    else:
                        error = f"HTTP {response.status_code}"
                except Exception as e:
                    error = e

                if error is not None:
                    failures += 1
                    print(f"Error processing keyword {keyword} on {day_str}: {error}")
                    # Bound the retries: a permanent failure would otherwise
                    # repeat the same request forever.
                    if failures >= max_retries:
                        print(f"Giving up on {keyword} for {day_str} after {failures} failed attempts.")
                        break
                    time.sleep(2)
                    continue

                failures = 0
                if num_results < size or last_timestamp is None:
                    break

                # Resume just after the newest submission seen on this page.
                day_current_after = int(last_timestamp) + 1
                time.sleep(1)

            save_to_csv(csv_path)
            current_day = day_end



In [None]:
# ---------------------------------------------------------------------
# Main Execution Block
# ---------------------------------------------------------------------

if __name__ == '__main__':
    # The date range is parsed with time.mktime, i.e. as local time.
    start = "2024-07-14 00:00:00"
    end = "2025-05-14 00:00:00"
    start_time_epoch = int(time.mktime(time.strptime(start, "%Y-%m-%d %H:%M:%S")))
    end_time_epoch = int(time.mktime(time.strptime(end, "%Y-%m-%d %H:%M:%S")))

    keywords = [
        # Democrats
        'Bernie', 'Sanders',
        'Alexandria', 'Ocasio-Cortez', 'Ocasio', 'Cortez', 'AOC',
        'Kamala', 'Harris',
        'Gavin', 'Newsom',
        'Elizabeth', 'Warren',
        'Cory', 'Booker',
        'Nancy', 'Pelosi',
        'Charles', 'Schumer',
        # Republicans
        'Donald', 'Trump',
        'Elon', 'Musk',
        'JD', 'Vance',
        'Ron', 'DeSantis',
        'Marco', 'Rubio',
        'Ted', 'Cruz',
        'Marjorie', 'Greene',
    ]
    # Drop repeated keywords while keeping their order, so no keyword is
    # scraped over the whole date range more than once.
    keywords = list(dict.fromkeys(keywords))

    for keyword in tqdm(keywords, desc="Processing Keywords"):
        print(f"Processing keyword: {keyword}")
        process_keyword(keyword, start_time_epoch, end_time_epoch)

    save_to_csv(csv_path)
    print(f"Final data saved to CSV with {len(collected_images)} image rows.")
