In [11]:
# Import required libraries
import praw
import requests
import os
import re
import json
import time
from datetime import datetime
from urllib.parse import urlparse
from collections import defaultdict
import logging

# Load environment variables
from dotenv import load_dotenv
load_dotenv()

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Create directories for output
os.makedirs('images', exist_ok=True)
os.makedirs('metadata', exist_ok=True)

print("Libraries imported successfully!")
print("Environment variables loaded from .env file")
print("Directories created: images/, metadata/")

Libraries imported successfully!
Environment variables loaded from .env file
Directories created: images/, metadata/


In [12]:
# Reddit API Configuration
# Credentials are loaded from .env file

# Load credentials from environment variables
REDDIT_CLIENT_ID = os.getenv('REDDIT_CLIENT_ID')
REDDIT_CLIENT_SECRET = os.getenv('REDDIT_CLIENT_SECRET')
REDDIT_USER_AGENT = "PersonalApp/1.0 by reupped"

# Check if credentials are loaded
if not REDDIT_CLIENT_ID or not REDDIT_CLIENT_SECRET:
    print("Error: Reddit API credentials not found in .env file")
    print("Please make sure your .env file contains:")
    print("REDDIT_CLIENT_ID=your_client_id")
    print("REDDIT_CLIENT_SECRET=your_client_secret")
else:
    print(f"Loaded credentials - Client ID: {REDDIT_CLIENT_ID[:10]}...")

# Initialize Reddit instance
try:
    reddit = praw.Reddit(
        client_id=REDDIT_CLIENT_ID,
        client_secret=REDDIT_CLIENT_SECRET,
        user_agent=REDDIT_USER_AGENT
    )

    # Test the connection
    print("Reddit API connection successful!")
    print(f"Read-only mode: {reddit.read_only}")

except Exception as e:
    print(f"Error connecting to Reddit API: {e}")
    print("Please check your Reddit API credentials in the .env file")

Loaded credentials - Client ID: QH_iPhx9pf...
Reddit API connection successful!
Read-only mode: True


In [13]:
# Advanced bug type detection and classification system
BUG_TYPES = {
    'mosquito': ['mosquito', 'mosquitoes', 'skeeter', 'skeeters', 'mozzie', 'mozzies'],
    'spider': ['spider', 'spiders', 'arachnid', 'black widow', 'brown recluse', 'wolf spider', 'house spider'],
    'tick': ['tick', 'ticks', 'lyme', 'deer tick', 'wood tick', 'lone star tick'],
    'flea': ['flea', 'fleas', 'flea bite', 'flea bites', 'cat flea', 'dog flea'],
    'bed_bug': ['bed bug', 'bedbug', 'bed bugs', 'bedbugs', 'bed-bug', 'bed-bugs'],
    'ant': ['ant', 'ants', 'fire ant', 'carpenter ant', 'red ant', 'black ant'],
    'bee': ['bee', 'bees', 'honey bee', 'bumble bee', 'wasp', 'hornet', 'yellow jacket'],
    'fly': ['fly', 'flies', 'horse fly', 'deer fly', 'black fly', 'sand fly', 'biting fly'],
    'mite': ['mite', 'mites', 'chigger', 'chiggers', 'dust mite', 'scabies'],
    'gnat': ['gnat', 'gnats', 'no-see-um', 'biting midge']
}

# Contextual clues that might help identify bug types
CONTEXTUAL_CLUES = {
    'mosquito': ['itchy', 'raised', 'red bump', 'welts', 'summer', 'evening', 'outdoors', 'water nearby'],
    'spider': ['two puncture', 'fang marks', 'necrotic', 'dark center', 'spreading', 'painful'],
    'tick': ['bullseye', 'bulls eye', 'circular', 'rash', 'lyme', 'woods', 'hiking', 'attached'],
    'flea': ['ankles', 'lower legs', 'clusters', 'small red', 'pets', 'cat', 'dog'],
    'bed_bug': ['linear', 'line', 'breakfast lunch dinner', 'bed', 'mattress', 'multiple', 'arms', 'back'],
    'bee': ['stinger', 'swollen', 'allergic', 'painful', 'immediate', 'wasp'],
    'ant': ['burning', 'fire', 'pustule', 'white head', 'yard', 'mound'],
    'mite': ['extremely itchy', 'burrow', 'between fingers', 'scabies', 'chigger'],
    'fly': ['painful', 'horse', 'deer', 'immediate pain', 'bleeding']
}

# Seasonal patterns
SEASONAL_PATTERNS = {
    'mosquito': ['summer', 'spring', 'warm', 'humid'],
    'tick': ['spring', 'summer', 'fall', 'hiking season'],
    'flea': ['year round', 'indoor'],
    'bed_bug': ['year round', 'indoor'],
    'bee': ['spring', 'summer', 'warm weather'],
    'fly': ['summer', 'warm'],
    'mite': ['year round']
}

def detect_bug_type_advanced(text, comments_text="", debug=False):
    """
    Advanced bug type detection using multiple strategies
    """
    text = text.lower()
    comments_text = comments_text.lower()
    combined_text = f"{text} {comments_text}"

    if debug:
        print(f"Analyzing text: '{text[:100]}...'")
        if comments_text:
            print(f"Comments: '{comments_text[:100]}...'")

    # Strategy 1: Direct keyword matching
    keyword_scores = defaultdict(int)
    found_keywords = defaultdict(list)

    for bug_type, keywords in BUG_TYPES.items():
        for keyword in keywords:
            pattern = r'\b' + re.escape(keyword) + r'\b'
            matches = len(re.findall(pattern, combined_text))
            if matches > 0:
                keyword_scores[bug_type] += matches * 3  # High weight for direct mentions
                found_keywords[bug_type].append(f"{keyword}({matches})")

    # Strategy 2: Contextual clue analysis
    context_scores = defaultdict(int)
    found_context = defaultdict(list)

    for bug_type, clues in CONTEXTUAL_CLUES.items():
        for clue in clues:
            if clue in combined_text:
                context_scores[bug_type] += 1
                found_context[bug_type].append(clue)

    # Strategy 3: Pattern-based detection for common scenarios
    pattern_scores = defaultdict(int)
    found_patterns = defaultdict(list)

    # Common bite patterns
    if any(phrase in combined_text for phrase in ['line of bites', 'three in a row', 'breakfast lunch dinner']):
        pattern_scores['bed_bug'] += 2
        found_patterns['bed_bug'].append('linear pattern')

    if any(phrase in combined_text for phrase in ['cluster', 'grouped', 'multiple small']):
        pattern_scores['flea'] += 1
        found_patterns['flea'].append('clustered bites')

    if any(phrase in combined_text for phrase in ['bullseye', 'bulls eye', 'expanding', 'circular rash']):
        pattern_scores['tick'] += 3
        found_patterns['tick'].append('bullseye pattern')

    if any(phrase in combined_text for phrase in ['two holes', 'fang marks', 'puncture']):
        pattern_scores['spider'] += 2
        found_patterns['spider'].append('puncture marks')

    # Strategy 4: Location-based hints
    location_scores = defaultdict(int)
    found_locations = defaultdict(list)

    location_hints = {
        'bed_bug': ['bed', 'mattress', 'hotel', 'couch', 'furniture'],
        'flea': ['ankle', 'lower leg', 'sock line', 'pet'],
        'mosquito': ['arm', 'leg', 'exposed skin', 'outside', 'evening'],
        'tick': ['hairline', 'scalp', 'armpit', 'groin', 'hiking', 'woods'],
        'spider': ['corner', 'basement', 'garage', 'closet', 'undisturbed area']
    }

    for bug_type, locations in location_hints.items():
        for location in locations:
            if location in combined_text:
                location_scores[bug_type] += 1
                found_locations[bug_type].append(location)

    # Combine all scores
    total_scores = defaultdict(int)
    for bug_type in BUG_TYPES.keys():
        total_scores[bug_type] = (
            keyword_scores[bug_type] +
            context_scores[bug_type] +
            pattern_scores[bug_type] +
            location_scores[bug_type]
        )

    if debug:
        print("\nScoring breakdown:")
        for bug_type in BUG_TYPES.keys():
            if total_scores[bug_type] > 0:
                print(f"  {bug_type}: {total_scores[bug_type]} points")
                if found_keywords[bug_type]:
                    print(f"    Keywords: {', '.join(found_keywords[bug_type])}")
                if found_context[bug_type]:
                    print(f"    Context: {', '.join(found_context[bug_type])}")
                if found_patterns[bug_type]:
                    print(f"    Patterns: {', '.join(found_patterns[bug_type])}")
                if found_locations[bug_type]:
                    print(f"    Locations: {', '.join(found_locations[bug_type])}")

    # Return best match or use fallback strategy
    if total_scores:
        best_match = max(total_scores.items(), key=lambda x: x[1])
        if best_match[1] > 0:
            if debug:
                print(f"Best match: {best_match[0]} (score: {best_match[1]})")
            return best_match[0]

    # Fallback strategy: Use simple heuristics based on common post patterns
    fallback_result = fallback_classification(combined_text, debug)
    if debug:
        print(f"Using fallback classification: {fallback_result}")

    return fallback_result

def fallback_classification(text, debug=False):
    """
    Fallback classification for posts with minimal information
    Uses heuristics based on common post characteristics
    """
    # If it's asking for identification and has minimal context,
    # try to infer from post characteristics

    if any(phrase in text for phrase in ['what bit', 'what is this', 'help id', 'identify']):
        # These are identification requests - try to infer from any available context

        # Look for size/appearance clues
        if any(word in text for word in ['small', 'tiny', 'little']):
            if any(word in text for word in ['red', 'bump', 'itchy']):
                return 'mosquito'  # Small, red, itchy = likely mosquito

        # Look for location clues
        if any(word in text for word in ['bed', 'night', 'morning']):
            return 'bed_bug'

        if any(word in text for word in ['ankle', 'leg', 'pet']):
            return 'flea'

        # Look for severity clues
        if any(word in text for word in ['painful', 'swollen', 'severe']):
            return 'spider'  # More severe reactions often spiders

        # If mentioned outdoors/nature
        if any(word in text for word in ['outside', 'yard', 'woods', 'hiking']):
            return 'tick'

    # Default for truly ambiguous cases
    return 'unknown'

# Update the main function for backward compatibility
detect_bug_type = detect_bug_type_advanced

def reclassify_existing_data():
    """
    Reclassify all existing data with the new improved algorithm
    """
    try:
        with open('metadata/scraping_results.json', 'r') as f:
            data = json.load(f)

        print(f"Reclassifying {len(data)} existing entries...")
        print("=" * 60)

        reclassified_count = 0
        type_changes = defaultdict(int)

        for item in data:
            old_type = item['bug_type']

            # Use title and any available context for reclassification
            new_type = detect_bug_type_advanced(
                item['post_title'],
                comments_text="",  # We don't have comments stored
                debug=False
            )

            if old_type != new_type:
                item['bug_type'] = new_type
                item['reclassified'] = True
                reclassified_count += 1
                type_changes[f"{old_type} -> {new_type}"] += 1

        # Save the reclassified data
        with open('metadata/scraping_results_reclassified.json', 'w') as f:
            json.dump(data, f, indent=2)

        print(f"Reclassification complete!")
        print(f"Changes made: {reclassified_count}/{len(data)} items")
        print(f"Results saved to: metadata/scraping_results_reclassified.json")

        if type_changes:
            print("\nClassification changes:")
            for change, count in sorted(type_changes.items()):
                print(f"  {change}: {count} items")

        # Show new distribution
        new_distribution = defaultdict(int)
        for item in data:
            new_distribution[item['bug_type']] += 1

        print(f"\nNew bug type distribution:")
        for bug_type, count in sorted(new_distribution.items()):
            percentage = (count / len(data)) * 100
            print(f"  {bug_type.upper()}: {count} images ({percentage:.1f}%)")

        return data

    except FileNotFoundError:
        print("No existing data found to reclassify")
        return None

# Test the improved function
test_texts = [
    "What bit my kid?",
    "Spider bite or something else?",
    "Mosquito bite on my arm",
    "Bed bug infestation help!",
    "Can anyone help me ID these bug bites?",
    "What is biting my son?",
    "Three bites in a row on my arm",
    "Small red bumps on ankles after being outside",
    "Painful bite with two puncture marks"
]

print("Testing advanced bug detection:")
for text in test_texts:
    bug_type = detect_bug_type_advanced(text, debug=False)
    print(f"'{text}' -> {bug_type}")

print("\nAdvanced bug type detection system ready!")

# Reclassify existing data if available
if os.path.exists('metadata/scraping_results.json'):
    print("\n" + "="*60)
    print("RECLASSIFYING EXISTING DATA")
    print("="*60)
    reclassify_existing_data()

Testing advanced bug detection:
'What bit my kid?' -> unknown
'Spider bite or something else?' -> spider
'Mosquito bite on my arm' -> mosquito
'Bed bug infestation help!' -> bed_bug
'Can anyone help me ID these bug bites?' -> unknown
'What is biting my son?' -> unknown
'Three bites in a row on my arm' -> mosquito
'Small red bumps on ankles after being outside' -> flea
'Painful bite with two puncture marks' -> spider

Advanced bug type detection system ready!


In [14]:
# File renaming utility based on reclassification
def rename_files_by_classification():
    """
    Rename existing downloaded files based on new classifications
    """
    try:
        # Load the reclassified data
        reclassified_file = 'metadata/scraping_results_reclassified.json'
        if not os.path.exists(reclassified_file):
            print("No reclassified data found. Run reclassification first!")
            return

        with open(reclassified_file, 'r') as f:
            data = json.load(f)

        # Group by new bug type and create new counters
        new_counters = defaultdict(int)
        rename_mapping = []

        # Sort data by bug type to ensure consistent numbering
        data_by_type = defaultdict(list)
        for item in data:
            data_by_type[item['bug_type']].append(item)

        # Create new filenames for each type
        for bug_type in sorted(data_by_type.keys()):
            items = data_by_type[bug_type]
            for item in items:
                old_filename = item['filename']
                new_counters[bug_type] += 1
                new_filename = f"images/{bug_type.upper()}_{new_counters[bug_type]}.jpg"

                if old_filename != new_filename:
                    rename_mapping.append((old_filename, new_filename))
                    item['filename'] = new_filename  # Update metadata

        # Perform the actual renaming
        renamed_count = 0
        for old_path, new_path in rename_mapping:
            if os.path.exists(old_path):
                # Ensure no conflict with existing files
                if os.path.exists(new_path):
                    # Create a temporary name to avoid conflicts
                    temp_path = f"{new_path}.temp"
                    os.rename(old_path, temp_path)
                    old_path = temp_path

                os.rename(old_path, new_path)
                renamed_count += 1
                print(f"Renamed: {old_path} -> {new_path}")

        # Save updated metadata
        with open(reclassified_file, 'w') as f:
            json.dump(data, f, indent=2)

        print(f"\nFile renaming complete!")
        print(f"Renamed {renamed_count} files")
        print(f"Updated metadata saved to {reclassified_file}")

        # Show current file structure
        print(f"\nCurrent images directory:")
        try:
            images = sorted([f for f in os.listdir('images') if f.endswith('.jpg')])
            type_counts = defaultdict(int)
            for img in images:
                bug_type = img.split('_')[0].lower()
                type_counts[bug_type] += 1

            for bug_type, count in sorted(type_counts.items()):
                print(f"  {bug_type.upper()}: {count} files")

        except FileNotFoundError:
            print("  No images directory found")

    except Exception as e:
        print(f"Error during file renaming: {e}")

print("File renaming utility ready!")

File renaming utility ready!


In [15]:
# Image download and processing functions with timestamped runs
def create_run_directory():
    """Create a timestamped directory for this scraping run"""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_dir = f"images/run{timestamp}"
    os.makedirs(run_dir, exist_ok=True)

    print(f"Created run directory: {run_dir}")
    return run_dir, timestamp

def is_image_url(url):
    """Check if URL points to an image"""
    image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp']
    parsed_url = urlparse(url.lower())
    return any(parsed_url.path.endswith(ext) for ext in image_extensions)

def get_reddit_image_urls(submission):
    """Extract image URLs from a Reddit submission"""
    urls = []

    # Direct image link
    if hasattr(submission, 'url') and is_image_url(submission.url):
        urls.append(submission.url)

    # Reddit gallery
    if hasattr(submission, 'is_gallery') and submission.is_gallery:
        try:
            for item in submission.gallery_data['items']:
                media_id = item['media_id']
                if media_id in submission.media_metadata:
                    media_info = submission.media_metadata[media_id]
                    if 's' in media_info and 'u' in media_info['s']:
                        # Convert preview URL to full resolution
                        url = media_info['s']['u'].replace('preview.redd.it', 'i.redd.it')
                        url = url.split('?')[0]  # Remove query parameters
                        urls.append(url)
        except Exception as e:
            logger.warning(f"Error processing gallery: {e}")

    # Check if it's an Imgur link
    if 'imgur.com' in submission.url:
        # Convert imgur links to direct image links
        if '/a/' in submission.url or '/gallery/' in submission.url:
            # Album/gallery - would need imgur API for full access
            logger.info(f"Imgur album detected: {submission.url}")
        else:
            # Single image
            imgur_id = submission.url.split('/')[-1].split('.')[0]
            direct_url = f"https://i.imgur.com/{imgur_id}.jpg"
            urls.append(direct_url)

    return urls

def download_image(url, filename):
    """Download an image from URL and save it"""
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()

        # Check if response is actually an image
        content_type = response.headers.get('content-type', '')
        if not content_type.startswith('image/'):
            logger.warning(f"URL doesn't return an image: {url}")
            return False

        with open(filename, 'wb') as f:
            f.write(response.content)

        logger.info(f"Downloaded: {filename}")
        return True

    except Exception as e:
        logger.error(f"Error downloading {url}: {e}")
        return False

# Global variables for run management
current_run_dir = None
current_timestamp = None
bug_counters = defaultdict(int)

def initialize_new_run():
    """Initialize a new scraping run with timestamped directory"""
    global current_run_dir, current_timestamp, bug_counters

    current_run_dir, current_timestamp = create_run_directory()
    bug_counters = defaultdict(int)  # Reset counters for new run

    return current_run_dir, current_timestamp

def get_next_filename(bug_type):
    """Get the next filename for a bug type in the current run"""
    global current_run_dir, bug_counters

    if current_run_dir is None:
        # Initialize if not already done
        initialize_new_run()

    bug_counters[bug_type] += 1
    return f"{current_run_dir}/{bug_type.upper()}_{bug_counters[bug_type]}.jpg"

def get_run_summary():
    """Get summary of the current run"""
    global current_run_dir, current_timestamp, bug_counters

    if current_run_dir is None:
        return "No active run"

    summary = {
        'run_directory': current_run_dir,
        'timestamp': current_timestamp,
        'bug_type_counts': dict(bug_counters),
        'total_images': sum(bug_counters.values())
    }

    return summary

print("Image processing functions with run management ready!")

Image processing functions with run management ready!


In [16]:
# Main scraping function with run management
def scrape_bugbites_subreddit(limit=50, time_filter='week'):
    """
    Scrape r/bugbites subreddit for images and classify them
    Each run gets its own timestamped directory

    Args:
        limit: Number of posts to scrape
        time_filter: Time filter for posts ('hour', 'day', 'week', 'month', 'year', 'all')
    """

    # Initialize new run
    run_dir, timestamp = initialize_new_run()
    scraped_data = []

    print(f"Starting new scraping run: {timestamp}")
    print(f"Images will be saved to: {run_dir}")
    print("=" * 60)

    try:
        subreddit = reddit.subreddit('bugbites')

        # Get posts from the subreddit
        posts = subreddit.hot(limit=limit)  # You can also use .new(), .top(time_filter=time_filter)

        for post_count, submission in enumerate(posts, 1):
            print(f"\n--- Processing post {post_count}/{limit} ---")
            print(f"Title: {submission.title[:80]}...")

            # Analyze title and selftext for bug type
            combined_text = f"{submission.title} {submission.selftext}"

            # Also check comments for additional context
            submission.comments.replace_more(limit=0)  # Don't load "more comments"
            comments_text = ""
            for comment in submission.comments.list()[:10]:  # First 10 comments
                if hasattr(comment, 'body'):
                    comments_text += f" {comment.body}"

            # Use the advanced detection system
            bug_type = detect_bug_type_advanced(combined_text, comments_text, debug=False)

            print(f"Detected bug type: {bug_type}")

            # Get image URLs
            image_urls = get_reddit_image_urls(submission)

            if image_urls:
                print(f"Found {len(image_urls)} image(s)")

                for img_url in image_urls:
                    filename = get_next_filename(bug_type)

                    if download_image(img_url, filename):
                        # Store metadata with run information
                        metadata = {
                            'run_timestamp': timestamp,
                            'run_directory': run_dir,
                            'filename': filename,
                            'bug_type': bug_type,
                            'post_title': submission.title,
                            'post_url': f"https://reddit.com{submission.permalink}",
                            'image_url': img_url,
                            'post_score': submission.score,
                            'num_comments': submission.num_comments,
                            'created_utc': submission.created_utc,
                            'author': str(submission.author) if submission.author else '[deleted]',
                            'scraped_at': datetime.now().isoformat()
                        }
                        scraped_data.append(metadata)

                        print(f"Saved as: {filename}")
            else:
                print("No images found in this post")

            # Be respectful to Reddit's API
            time.sleep(1)

    except Exception as e:
        logger.error(f"Error scraping subreddit: {e}")

    return scraped_data, timestamp

def save_metadata_with_run(scraped_data, timestamp):
    """Save scraping metadata with run-specific information"""

    # Save run-specific metadata
    run_metadata_file = f'metadata/scraping_results_run{timestamp}.json'
    with open(run_metadata_file, 'w') as f:
        json.dump(scraped_data, f, indent=2)

    # Also append to master metadata file
    master_metadata_file = 'metadata/all_scraping_results.json'

    # Load existing master data if it exists
    all_data = []
    if os.path.exists(master_metadata_file):
        try:
            with open(master_metadata_file, 'r') as f:
                all_data = json.load(f)
        except json.JSONDecodeError:
            all_data = []

    # Add new data
    all_data.extend(scraped_data)

    # Save updated master file
    with open(master_metadata_file, 'w') as f:
        json.dump(all_data, f, indent=2)

    print(f"Run metadata saved to: {run_metadata_file}")
    print(f"Master metadata updated: {master_metadata_file}")

    # Save run summary
    run_summary = get_run_summary()
    run_summary['metadata_file'] = run_metadata_file
    run_summary['total_posts_processed'] = len(scraped_data)

    summary_file = f'metadata/run_summary_{timestamp}.json'
    with open(summary_file, 'w') as f:
        json.dump(run_summary, f, indent=2)

    print(f"Run summary saved to: {summary_file}")

    return run_metadata_file, master_metadata_file

def list_all_runs():
    """List all previous scraping runs"""
    try:
        runs = []

        # Look for run directories
        if os.path.exists('images'):
            for item in os.listdir('images'):
                if item.startswith('run') and os.path.isdir(f'images/{item}'):
                    timestamp = item[3:]  # Remove 'run' prefix

                    # Count files in directory
                    run_dir = f'images/{item}'
                    image_count = len([f for f in os.listdir(run_dir) if f.endswith('.jpg')])

                    # Try to load summary if available
                    summary_file = f'metadata/run_summary_{timestamp}.json'
                    bug_counts = {}
                    if os.path.exists(summary_file):
                        with open(summary_file, 'r') as f:
                            summary = json.load(f)
                            bug_counts = summary.get('bug_type_counts', {})

                    runs.append({
                        'timestamp': timestamp,
                        'directory': run_dir,
                        'image_count': image_count,
                        'bug_counts': bug_counts
                    })

        # Sort by timestamp
        runs.sort(key=lambda x: x['timestamp'], reverse=True)

        if runs:
            print(f"Found {len(runs)} previous runs:")
            print("-" * 80)
            for run in runs:
                print(f"Run {run['timestamp']}: {run['image_count']} images in {run['directory']}")
                if run['bug_counts']:
                    for bug_type, count in run['bug_counts'].items():
                        print(f"  {bug_type.upper()}: {count}")
                print()
        else:
            print("No previous runs found")

        return runs

    except Exception as e:
        print(f"Error listing runs: {e}")
        return []

print("Main scraping function with run management ready!")

Main scraping function with run management ready!


In [17]:
# Execute the scraping with timestamped runs
# Note: Make sure you've updated the Reddit API credentials above before running this!

# Configure scraping parameters
POSTS_TO_SCRAPE = 10  # Start with a smaller number for testing
TIME_FILTER = 'week'  # 'hour', 'day', 'week', 'month', 'year', 'all'

print("Starting new Reddit scraping run...")
print(f"Will scrape {POSTS_TO_SCRAPE} posts from the past {TIME_FILTER}")
print("=" * 60)

# List existing runs first
print("\nPrevious runs:")
list_all_runs()

print("\n" + "=" * 60)
print("STARTING NEW RUN")
print("=" * 60)

# Run the scraper with new timestamped system
scraped_data, run_timestamp = scrape_bugbites_subreddit(limit=POSTS_TO_SCRAPE, time_filter=TIME_FILTER)

# Save metadata with run information
if scraped_data:
    run_metadata_file, master_metadata_file = save_metadata_with_run(scraped_data, run_timestamp)

    # Print detailed summary
    print("\n" + "=" * 60)
    print("SCRAPING RUN COMPLETE!")
    print("=" * 60)
    print(f"Run timestamp: {run_timestamp}")
    print(f"Total images downloaded: {len(scraped_data)}")

    # Count by bug type for this run
    run_bug_counts = defaultdict(int)
    for item in scraped_data:
        run_bug_counts[item['bug_type']] += 1

    print(f"\nImages by bug type (this run):")
    for bug_type, count in sorted(run_bug_counts.items()):
        print(f"  {bug_type.upper()}: {count} images")

    # Show run directory structure
    print(f"\nRun directory: {current_run_dir}")
    try:
        files = sorted([f for f in os.listdir(current_run_dir) if f.endswith('.jpg')])
        print(f"Files created:")
        for f in files[:10]:  # Show first 10 files
            print(f"  {f}")
        if len(files) > 10:
            print(f"  ... and {len(files) - 10} more")
    except:
        pass

    print(f"\nMetadata files:")
    print(f"  Run-specific: {run_metadata_file}")
    print(f"  Master file: {master_metadata_file}")

else:
    print("\nNo images were downloaded. Check your Reddit API credentials and internet connection.")

print(f"\nRun completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

Starting new Reddit scraping run...
Will scrape 10 posts from the past week

Previous runs:
Found 1 previous runs:
--------------------------------------------------------------------------------
Run 0: 19 images in images/run0


STARTING NEW RUN
Created run directory: images/run20250806_200212
Starting new scraping run: 20250806_200212
Images will be saved to: images/run20250806_200212

--- Processing post 1/10 ---
Title: Read this before posting....

--- Processing post 1/10 ---
Title: Read this before posting....
Detected bug type: flea
No images found in this post
Detected bug type: flea
No images found in this post

--- Processing post 2/10 ---
Title: What bit my kid?...
Detected bug type: bed_bug
Found 2 image(s)

--- Processing post 2/10 ---
Title: What bit my kid?...
Detected bug type: bed_bug
Found 2 image(s)


INFO:__main__:Downloaded: images/run20250806_200212/BED_BUG_1.jpg
INFO:__main__:Downloaded: images/run20250806_200212/BED_BUG_2.jpg
INFO:__main__:Downloaded: images/run20250806_200212/BED_BUG_2.jpg


Saved as: images/run20250806_200212/BED_BUG_1.jpg
Saved as: images/run20250806_200212/BED_BUG_2.jpg

--- Processing post 3/10 ---
Title: How do I get rid of this?...
Detected bug type: mosquito
Found 1 image(s)

--- Processing post 3/10 ---
Title: How do I get rid of this?...
Detected bug type: mosquito
Found 1 image(s)


INFO:__main__:Downloaded: images/run20250806_200212/MOSQUITO_1.jpg


Saved as: images/run20250806_200212/MOSQUITO_1.jpg

--- Processing post 4/10 ---
Title: What got me ??? Texas...
Detected bug type: mosquito
Found 2 image(s)

--- Processing post 4/10 ---
Title: What got me ??? Texas...
Detected bug type: mosquito
Found 2 image(s)


INFO:__main__:Downloaded: images/run20250806_200212/MOSQUITO_2.jpg
INFO:__main__:Downloaded: images/run20250806_200212/MOSQUITO_3.jpg
INFO:__main__:Downloaded: images/run20250806_200212/MOSQUITO_3.jpg


Saved as: images/run20250806_200212/MOSQUITO_2.jpg
Saved as: images/run20250806_200212/MOSQUITO_3.jpg

--- Processing post 5/10 ---
Title: Bug bites?? on dad...
Detected bug type: mosquito
Found 2 image(s)

--- Processing post 5/10 ---
Title: Bug bites?? on dad...
Detected bug type: mosquito
Found 2 image(s)


INFO:__main__:Downloaded: images/run20250806_200212/MOSQUITO_4.jpg
INFO:__main__:Downloaded: images/run20250806_200212/MOSQUITO_5.jpg
INFO:__main__:Downloaded: images/run20250806_200212/MOSQUITO_5.jpg


Saved as: images/run20250806_200212/MOSQUITO_4.jpg
Saved as: images/run20250806_200212/MOSQUITO_5.jpg

--- Processing post 6/10 ---
Title: What is biting my son?...
Detected bug type: unknown
Found 8 image(s)

--- Processing post 6/10 ---
Title: What is biting my son?...
Detected bug type: unknown
Found 8 image(s)


INFO:__main__:Downloaded: images/run20250806_200212/UNKNOWN_1.jpg
INFO:__main__:Downloaded: images/run20250806_200212/UNKNOWN_2.jpg
INFO:__main__:Downloaded: images/run20250806_200212/UNKNOWN_2.jpg


Saved as: images/run20250806_200212/UNKNOWN_1.jpg
Saved as: images/run20250806_200212/UNKNOWN_2.jpg


INFO:__main__:Downloaded: images/run20250806_200212/UNKNOWN_3.jpg
INFO:__main__:Downloaded: images/run20250806_200212/UNKNOWN_4.jpg
INFO:__main__:Downloaded: images/run20250806_200212/UNKNOWN_4.jpg


Saved as: images/run20250806_200212/UNKNOWN_3.jpg
Saved as: images/run20250806_200212/UNKNOWN_4.jpg


INFO:__main__:Downloaded: images/run20250806_200212/UNKNOWN_5.jpg
INFO:__main__:Downloaded: images/run20250806_200212/UNKNOWN_6.jpg
INFO:__main__:Downloaded: images/run20250806_200212/UNKNOWN_6.jpg
INFO:__main__:Downloaded: images/run20250806_200212/UNKNOWN_7.jpg
INFO:__main__:Downloaded: images/run20250806_200212/UNKNOWN_7.jpg


Saved as: images/run20250806_200212/UNKNOWN_5.jpg
Saved as: images/run20250806_200212/UNKNOWN_6.jpg
Saved as: images/run20250806_200212/UNKNOWN_7.jpg


INFO:__main__:Downloaded: images/run20250806_200212/UNKNOWN_8.jpg


Saved as: images/run20250806_200212/UNKNOWN_8.jpg

--- Processing post 7/10 ---
Title: Bug bite...
Detected bug type: mosquito
Found 1 image(s)

--- Processing post 7/10 ---
Title: Bug bite...
Detected bug type: mosquito
Found 1 image(s)


INFO:__main__:Downloaded: images/run20250806_200212/MOSQUITO_6.jpg


Saved as: images/run20250806_200212/MOSQUITO_6.jpg

--- Processing post 8/10 ---
Title: Can anyone help me ID these bug bites?...
Detected bug type: bed_bug
Found 1 image(s)

--- Processing post 8/10 ---
Title: Can anyone help me ID these bug bites?...
Detected bug type: bed_bug
Found 1 image(s)


INFO:__main__:Downloaded: images/run20250806_200212/BED_BUG_3.jpg


Saved as: images/run20250806_200212/BED_BUG_3.jpg

--- Processing post 9/10 ---
Title: What is this ?? Showed up a couple days ago with another small dot on my stomach...
Detected bug type: mosquito
Found 1 image(s)

--- Processing post 9/10 ---
Title: What is this ?? Showed up a couple days ago with another small dot on my stomach...
Detected bug type: mosquito
Found 1 image(s)


INFO:__main__:Downloaded: images/run20250806_200212/MOSQUITO_7.jpg


Saved as: images/run20250806_200212/MOSQUITO_7.jpg

--- Processing post 10/10 ---
Title: What the freak is this...
Detected bug type: mosquito
Found 1 image(s)

--- Processing post 10/10 ---
Title: What the freak is this...
Detected bug type: mosquito
Found 1 image(s)


INFO:__main__:Downloaded: images/run20250806_200212/MOSQUITO_8.jpg


Saved as: images/run20250806_200212/MOSQUITO_8.jpg
Run metadata saved to: metadata/scraping_results_run20250806_200212.json
Master metadata updated: metadata/all_scraping_results.json
Run summary saved to: metadata/run_summary_20250806_200212.json

SCRAPING RUN COMPLETE!
Run timestamp: 20250806_200212
Total images downloaded: 19

Images by bug type (this run):
  BED_BUG: 3 images
  MOSQUITO: 8 images
  UNKNOWN: 8 images

Run directory: images/run20250806_200212
Files created:
  BED_BUG_1.jpg
  BED_BUG_2.jpg
  BED_BUG_3.jpg
  MOSQUITO_1.jpg
  MOSQUITO_2.jpg
  MOSQUITO_3.jpg
  MOSQUITO_4.jpg
  MOSQUITO_5.jpg
  MOSQUITO_6.jpg
  MOSQUITO_7.jpg
  ... and 9 more

Metadata files:
  Run-specific: metadata/scraping_results_run20250806_200212.json
  Master file: metadata/all_scraping_results.json

Run completed at: 2025-08-06 20:02:26
Run metadata saved to: metadata/scraping_results_run20250806_200212.json
Master metadata updated: metadata/all_scraping_results.json
Run summary saved to: metadata

In [18]:
# Run management and analysis utilities

def analyze_run(timestamp=None):
    """Analyze a specific run or the most recent run"""

    if timestamp is None:
        # Find most recent run
        runs = list_all_runs()
        if not runs:
            print("No runs found")
            return
        timestamp = runs[0]['timestamp']
        print(f"Analyzing most recent run: {timestamp}")

    metadata_file = f'metadata/scraping_results_run{timestamp}.json'

    try:
        with open(metadata_file, 'r') as f:
            data = json.load(f)

        if not data:
            print("No data found in run")
            return

        print(f"\nAnalysis of run {timestamp}:")
        print("=" * 50)
        print(f"Total images: {len(data)}")
        print(f"Run directory: {data[0]['run_directory']}")
        print(f"Scraped at: {data[0]['scraped_at']}")

        # Bug type distribution
        bug_counts = defaultdict(int)
        total_score = 0
        total_comments = 0

        for item in data:
            bug_counts[item['bug_type']] += 1
            total_score += item.get('post_score', 0)
            total_comments += item.get('num_comments', 0)

        print(f"\nBug Type Distribution:")
        for bug_type, count in sorted(bug_counts.items(), key=lambda x: x[1], reverse=True):
            percentage = (count / len(data)) * 100
            print(f"  {bug_type.upper()}: {count} images ({percentage:.1f}%)")

        print(f"\nAverage post score: {total_score / len(data):.1f}")
        print(f"Average comments per post: {total_comments / len(data):.1f}")

        # Show some sample classifications
        print(f"\nSample classifications:")
        for i, item in enumerate(data[:5]):
            print(f"  {item['bug_type'].upper()}: {item['post_title'][:60]}...")

    except FileNotFoundError:
        print(f"Run metadata not found: {metadata_file}")
    except Exception as e:
        print(f"Error analyzing run: {e}")

def compare_runs():
    """Compare all runs"""
    runs = list_all_runs()

    if len(runs) < 2:
        print("Need at least 2 runs to compare")
        return

    print("Run Comparison:")
    print("=" * 80)

    # Load all run data
    all_run_data = []
    for run in runs:
        timestamp = run['timestamp']
        metadata_file = f'metadata/scraping_results_run{timestamp}.json'

        try:
            with open(metadata_file, 'r') as f:
                data = json.load(f)

            bug_counts = defaultdict(int)
            for item in data:
                bug_counts[item['bug_type']] += 1

            all_run_data.append({
                'timestamp': timestamp,
                'total_images': len(data),
                'bug_counts': dict(bug_counts)
            })
        except:
            continue

    # Print comparison table
    print(f"{'Run':<16} {'Total':<8} {'Unknown':<8} {'Mosquito':<10} {'Spider':<8} {'Others':<8}")
    print("-" * 80)

    for run_data in all_run_data:
        timestamp = run_data['timestamp']
        total = run_data['total_images']
        unknown = run_data['bug_counts'].get('unknown', 0)
        mosquito = run_data['bug_counts'].get('mosquito', 0)
        spider = run_data['bug_counts'].get('spider', 0)
        others = total - unknown - mosquito - spider

        print(f"{timestamp:<16} {total:<8} {unknown:<8} {mosquito:<10} {spider:<8} {others:<8}")

def cleanup_old_runs(keep_recent=5):
    """Clean up old runs, keeping only the most recent ones"""
    runs = list_all_runs()

    if len(runs) <= keep_recent:
        print(f"Only {len(runs)} runs found, nothing to clean up")
        return

    runs_to_delete = runs[keep_recent:]

    print(f"Will delete {len(runs_to_delete)} old runs, keeping {keep_recent} most recent:")
    for run in runs_to_delete:
        print(f"  {run['timestamp']}")

    confirm = input("\nProceed with deletion? (y/N): ")
    if confirm.lower() != 'y':
        print("Cleanup cancelled")
        return

    deleted_count = 0
    for run in runs_to_delete:
        timestamp = run['timestamp']

        # Delete run directory
        run_dir = f"images/run{timestamp}"
        if os.path.exists(run_dir):
            import shutil
            shutil.rmtree(run_dir)
            deleted_count += 1

        # Delete metadata files
        for file_pattern in [f'metadata/scraping_results_run{timestamp}.json',
                           f'metadata/run_summary_{timestamp}.json']:
            if os.path.exists(file_pattern):
                os.remove(file_pattern)

    print(f"Cleanup complete! Deleted {deleted_count} run directories")

# Quick analysis of existing data
print("Run Management Utilities Ready!")
print("\nCurrent runs:")
list_all_runs()

if os.path.exists('images') and any(d.startswith('run') for d in os.listdir('images')):
    print("\nAnalyzing most recent run:")
    analyze_run()

Run Management Utilities Ready!

Current runs:
Found 2 previous runs:
--------------------------------------------------------------------------------
Run 20250806_200212: 19 images in images/run20250806_200212
  BED_BUG: 3
  MOSQUITO: 8
  UNKNOWN: 8

Run 0: 19 images in images/run0


Analyzing most recent run:
Found 2 previous runs:
--------------------------------------------------------------------------------
Run 20250806_200212: 19 images in images/run20250806_200212
  BED_BUG: 3
  MOSQUITO: 8
  UNKNOWN: 8

Run 0: 19 images in images/run0

Analyzing most recent run: 20250806_200212

Analysis of run 20250806_200212:
Total images: 19
Run directory: images/run20250806_200212
Scraped at: 2025-08-06T20:02:14.159615

Bug Type Distribution:
  MOSQUITO: 8 images (42.1%)
  UNKNOWN: 8 images (42.1%)
  BED_BUG: 3 images (15.8%)

Average post score: 1.1
Average comments per post: 1.0

Sample classifications:
  BED_BUG: What bit my kid?...
  BED_BUG: What bit my kid?...
  MOSQUITO: How do I ge

In [19]:
# Analysis and utility functions
def analyze_scraped_data(metadata_file='metadata/scraping_results.json'):
    """Analyze the scraped data and show statistics"""

    try:
        with open(metadata_file, 'r') as f:
            data = json.load(f)

        if not data:
            print("No data found in metadata file")
            return

        print(f"Analysis of {len(data)} scraped images:")
        print("=" * 40)

        # Bug type distribution
        bug_counts = defaultdict(int)
        total_score = 0
        total_comments = 0

        for item in data:
            bug_counts[item['bug_type']] += 1
            total_score += item.get('post_score', 0)
            total_comments += item.get('num_comments', 0)

        print("\nBug Type Distribution:")
        for bug_type, count in sorted(bug_counts.items(), key=lambda x: x[1], reverse=True):
            percentage = (count / len(data)) * 100
            print(f"  {bug_type.upper()}: {count} images ({percentage:.1f}%)")

        print(f"\nAverage post score: {total_score / len(data):.1f}")
        print(f"Average comments per post: {total_comments / len(data):.1f}")

        # Show some sample filenames
        print(f"\nSample downloaded files:")
        for i, item in enumerate(data[:5]):
            print(f"  {item['filename']} - {item['bug_type']} - {item['post_title'][:50]}...")

    except FileNotFoundError:
        print(f"Metadata file {metadata_file} not found. Run the scraper first!")
    except Exception as e:
        print(f"Error analyzing data: {e}")

def list_downloaded_images():
    """List all downloaded images"""
    try:
        images = [f for f in os.listdir('images') if f.endswith('.jpg')]
        if images:
            print(f"Downloaded images ({len(images)} total):")
            for img in sorted(images):
                print(f"  {img}")
        else:
            print("No images found in images/ directory")
    except FileNotFoundError:
        print("Images directory not found")

# Run analysis if metadata exists
if os.path.exists('metadata/scraping_results.json'):
    analyze_scraped_data()
else:
    print("No metadata file found yet. Run the scraper first!")

No metadata file found yet. Run the scraper first!


## Instructions for Use

### Before Running the Scraper:

1. **Set up Reddit API credentials:**
   - Go to https://www.reddit.com/prefs/apps/
   - Click "Create App" or "Create Another App"
   - Choose "script" as the app type
   - Fill in the required fields
   - Copy your `client_id` and `client_secret`
   - Update the credentials in the "Reddit API Configuration" cell above

2. **Configure scraping parameters:**
   - Adjust `POSTS_TO_SCRAPE` (recommended: start with 10-25 for testing)
   - Choose `TIME_FILTER` ('hour', 'day', 'week', 'month', 'year', 'all')

### How the Script Works:

1. **Connects to Reddit** using PRAW in read-only mode
2. **Fetches posts** from r/bugbites subreddit
3. **Analyzes text** (title + comments) to detect bug types using keyword matching
4. **Downloads images** from posts and saves them as `BUGTYPE_N.jpg`
5. **Saves metadata** about each image in JSON format

### Output Structure:
```
images/
├── MOSQUITO_1.jpg
├── MOSQUITO_2.jpg
├── SPIDER_1.jpg
├── UNKNOWN_1.jpg
└── ...

metadata/
└── scraping_results.json
```

### Troubleshooting:

- **"Error connecting to Reddit API"**: Check your credentials
- **"No images found"**: Some posts may not contain direct image links
- **Rate limiting**: The script includes delays to respect Reddit's API limits
- **Imgur albums**: Currently only handles single Imgur images, not full albums

### Notes:

- The script respects Reddit's API rate limits with built-in delays
- Images are classified based on text analysis of titles and comments
- Unknown or ambiguous posts are classified as 'UNKNOWN'
- All metadata is preserved for later analysis