In [12]:
# Bug classification system with improved accuracy and comment upvote weighting

# Define bug bite types with specific keywords (FIXED - removed overly generic terms)
BUG_TYPES = {
    'mosquito': [
        'mosquito', 'mosquitos', 'mosquitoes', 'skeeter', 'skeeters',
        'itchy', 'itching', 'itch', 'itches', 'scratching',
        'red bumps', 'small bumps', 'tiny bumps', 'welts', 'welt',
        'swollen', 'swelling', 'inflammation', 'inflamed'
    ],
    'spider': [
        'spider', 'spiders', 'arachnid', 'eight legs', 'web',
        'black widow', 'brown recluse', 'recluse', 'widow',
        'fang marks', 'fangs', 'puncture', 'necrosis', 'necrotic'
    ],
    'bed_bug': [
        'bed bug', 'bedbug', 'bed bugs', 'bedbugs',
        'hotel', 'motel', 'mattress', 'bed', 'sleeping',
        'line of bites', 'breakfast lunch dinner', 'three bites',
        'cluster', 'clustered', 'pattern', 'row'
    ],
    'flea': [
        'flea', 'fleas', 'pet', 'dog', 'cat', 'pets',
        'ankle', 'ankles', 'lower leg', 'feet', 'foot',
        'carpet', 'jumping', 'tiny bites'
    ],
    'tick': [
        'tick', 'ticks', 'lyme', 'bullseye', 'bulls eye',
        'hiking', 'woods', 'forest', 'outdoors', 'camping',
        'embedded', 'attached', 'circular rash', 'rash'
    ],
    'ant': [
        'ant', 'ants', 'fire ant', 'fire ants',
        'burning', 'burn', 'stinging', 'sting',
        'pustule', 'pustules', 'pus', 'white head'
    ],
    'bee': [
        'bee', 'bees', 'wasp', 'wasps', 'hornet', 'hornets',
        'sting', 'stinger', 'stung', 'swollen', 'allergic',
        'yellow jacket', 'bumble bee'
    ],
    'chigger': [
        'chigger', 'chiggers', 'harvest mite', 'red bug',
        'grass', 'tall grass', 'vegetation', 'mowing',
        'waistline', 'waist', 'belt line', 'socks'
    ],
    'mite': [
        'mite', 'mites', 'dust mite', 'scabies',
        'burrow', 'tunnels', 'between fingers', 'wrists'
    ],
    'unknown': [
        'unknown', 'unidentified', 'mystery', 'unclear', 'unsure'
        # REMOVED: 'bug', 'bite', 'what bit', 'help', 'identify' - these were too generic!
    ]
}

# Enhanced contextual clues for better classification
CONTEXTUAL_CLUES = {
    'location_hints': {
        'bed_bug': ['bed', 'mattress', 'hotel', 'motel', 'sleeping', 'woke up'],
        'flea': ['pet', 'dog', 'cat', 'ankle', 'lower leg', 'carpet'],
        'mosquito': ['outside', 'evening', 'dusk', 'water', 'pond', 'lake'],
        'tick': ['hiking', 'woods', 'forest', 'camping', 'outdoors', 'grass'],
        'spider': ['corner', 'basement', 'garage', 'shed', 'dark'],
        'chigger': ['grass', 'lawn', 'mowing', 'gardening']
    },
    'pattern_hints': {
        'bed_bug': ['line', 'row', 'breakfast lunch dinner', 'three', 'cluster'],
        'flea': ['multiple', 'scattered', 'random'],
        'mosquito': ['single', 'isolated', 'few'],
        'spider': ['two', 'pair', 'double', 'fang']
    },
    'symptom_hints': {
        'mosquito': ['itchy', 'itch', 'scratching', 'red', 'swollen'],
        'spider': ['pain', 'necrosis', 'spreading', 'severe'],
        'bee': ['swollen', 'allergic', 'immediate', 'painful'],
        'ant': ['burning', 'fire', 'pustule', 'pus']
    }
}

# Seasonal patterns for additional context
SEASONAL_PATTERNS = {
    'mosquito': ['summer', 'warm', 'humid', 'rain'],
    'tick': ['spring', 'summer', 'fall', 'warm weather'],
    'chigger': ['late summer', 'fall', 'humid']
}

def detect_bug_type_advanced(text, comments="", comment_scores=None, debug=False):
    """
    Advanced bug bite detection with comment upvote weighting for community validation

    Args:
        text: Post title and content to analyze
        comments: Comments from the post
        comment_scores: List of comment upvote scores for weighting
        debug: Whether to print debug information

    Returns:
        Detected bug type as string
    """

    # Combine text and comments for analysis
    combined_text = f"{text} {comments}".lower()

    # Calculate comment score weighting multiplier
    comment_weight_multiplier = 1.0
    if comment_scores and len(comment_scores) > 0:
        # Filter out negative scores for calculations
        positive_scores = [score for score in comment_scores if score > 0]

        if positive_scores:
            max_score = max(comment_scores)
            avg_score = sum(positive_scores) / len(positive_scores)

            # Tiered weighting system based on community validation
            if max_score >= 10:
                comment_weight_multiplier = 2.5  # Very high confidence
            elif max_score >= 5:
                comment_weight_multiplier = 2.0  # High confidence
            elif avg_score >= 2:
                comment_weight_multiplier = 1.5  # Moderate confidence
            elif avg_score >= 1:
                comment_weight_multiplier = 1.2  # Slight confidence boost
            # else: 1.0 (no boost for low/negative scores)

            if debug:
                print(f"Comment scores: {comment_scores}")
                print(f"Max score: {max_score}, Avg score: {avg_score:.1f}")
                print(f"Weight multiplier: {comment_weight_multiplier}x")

    scores = {}

    # Score each bug type
    for bug_type, keywords in BUG_TYPES.items():
        score = 0
        matched_keywords = []

        for keyword in keywords:
            if keyword in combined_text:
                # Smart weighting: generic terms get lower weight
                if keyword in ['bug', 'bite', 'what bit', 'help', 'identify']:
                    weight = 0.5  # Very low weight for generic terms
                elif len(keyword) <= 3:  # Short terms
                    weight = 1.0
                else:
                    weight = 2.0  # Higher weight for specific terms

                score += weight
                matched_keywords.append(keyword)

        # Bonus scoring for contextual clues
        if bug_type in CONTEXTUAL_CLUES['location_hints']:
            for hint in CONTEXTUAL_CLUES['location_hints'][bug_type]:
                if hint in combined_text:
                    score += 3.0  # Location context is very valuable
                    matched_keywords.append(f"location:{hint}")

        if bug_type in CONTEXTUAL_CLUES['pattern_hints']:
            for hint in CONTEXTUAL_CLUES['pattern_hints'][bug_type]:
                if hint in combined_text:
                    score += 2.5  # Pattern context is valuable
                    matched_keywords.append(f"pattern:{hint}")

        if bug_type in CONTEXTUAL_CLUES['symptom_hints']:
            for hint in CONTEXTUAL_CLUES['symptom_hints'][bug_type]:
                if hint in combined_text:
                    score += 2.0  # Symptom context helps
                    matched_keywords.append(f"symptom:{hint}")

        # Apply comment upvote weighting to final score
        final_score = score * comment_weight_multiplier
        scores[bug_type] = final_score

        if debug and final_score > 0:
            print(f"{bug_type}: {score:.1f} * {comment_weight_multiplier}x = {final_score:.1f} (keywords: {matched_keywords})")

    # Find the highest scoring type
    if not scores or max(scores.values()) == 0:
        if debug:
            print("No matches found, defaulting to unknown")
        return 'unknown'

    best_type = max(scores.items(), key=lambda x: x[1])

    # Enhanced fallback logic for question-style titles
    if best_type[1] < 1.0:  # Very low confidence
        # Look for any specific clues in questions like "What bit me?"
        question_indicators = ['what bit', 'what is', 'help identify', 'any idea']
        if any(indicator in combined_text for indicator in question_indicators):
            # Re-examine for any specific clues
            for bug_type, keywords in BUG_TYPES.items():
                if bug_type == 'unknown':
                    continue
                for keyword in keywords:
                    if len(keyword) > 4 and keyword in combined_text:  # Only specific terms
                        if debug:
                            print(f"Fallback detection: found '{keyword}' for {bug_type}")
                        return bug_type

    if debug:
        print(f"Best match: {best_type[0]} with score {best_type[1]:.1f}")

    return best_type[0]

print("✅ Bug classification system loaded with comment upvote weighting!")
print()
print("🏆 NEW FEATURE: Comment Upvote Weighting")
print("   • High-scoring comments (10+ upvotes): 2.5x classification weight")
print("   • Medium-scoring comments (5+ upvotes): 2.0x classification weight")
print("   • Positive comments (2+ avg): 1.5x classification weight")
print("   • Slight positive (1+ avg): 1.2x classification weight")
print("   • Low/negative scores: No weight boost")
print()
print("🎯 Benefits:")
print("   • Community validation improves accuracy")
print("   • Expert identifications get priority")
print("   • Uncertain comments don't mislead classification")
print()
print("📚 Classification categories available:")
for bug_type in sorted(BUG_TYPES.keys()):
    if bug_type != 'unknown':
        print(f"   • {bug_type}")
print(f"   • unknown (fallback category)")

✅ Bug classification system loaded with comment upvote weighting!

🏆 NEW FEATURE: Comment Upvote Weighting
   • High-scoring comments (10+ upvotes): 2.5x classification weight
   • Medium-scoring comments (5+ upvotes): 2.0x classification weight
   • Positive comments (2+ avg): 1.5x classification weight
   • Slight positive (1+ avg): 1.2x classification weight
   • Low/negative scores: No weight boost

🎯 Benefits:
   • Community validation improves accuracy
   • Expert identifications get priority
   • Uncertain comments don't mislead classification

📚 Classification categories available:
   • ant
   • bed_bug
   • bee
   • chigger
   • flea
   • mite
   • mosquito
   • spider
   • tick
   • unknown (fallback category)


In [13]:
# Import required libraries
import praw
import requests
import os
import re
import json
import time
from datetime import datetime
from urllib.parse import urlparse
from collections import defaultdict
import logging

# Load environment variables
from dotenv import load_dotenv
load_dotenv()

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Create directories for output
os.makedirs('images', exist_ok=True)
os.makedirs('metadata', exist_ok=True)

print("Libraries imported successfully!")
print("Environment variables loaded from .env file")
print("Directories created: images/, metadata/")

Libraries imported successfully!
Environment variables loaded from .env file
Directories created: images/, metadata/


In [14]:
# Reddit API Configuration
# Credentials are loaded from .env file

# Load credentials from environment variables
REDDIT_CLIENT_ID = os.getenv('REDDIT_CLIENT_ID')
REDDIT_CLIENT_SECRET = os.getenv('REDDIT_CLIENT_SECRET')
REDDIT_USER_AGENT = "PersonalApp/1.0 by reupped"
REDDIT_SUBREDDIT = os.getenv('REDDIT_SUBREDDIT', 'bugbites')  # Default to 'bugbites' if not set
REDDIT_POST_FETCH_STRATEGY = os.getenv('REDDIT_POST_FETCH_STRATEGY', 'balanced')  # Default strategy
REDDIT_POSTS_COUNT = int(os.getenv('REDDIT_POSTS_COUNT', '15'))  # Default count

# Check if credentials are loaded
if not REDDIT_CLIENT_ID or not REDDIT_CLIENT_SECRET:
    print("Error: Reddit API credentials not found in .env file")
    print("Please make sure your .env file contains:")
    print("REDDIT_CLIENT_ID=your_client_id")
    print("REDDIT_CLIENT_SECRET=your_client_secret")
    print("REDDIT_SUBREDDIT=subreddit_name (optional, defaults to 'bugbites')")
    print("REDDIT_POST_FETCH_STRATEGY=strategy (optional, defaults to 'balanced')")
    print("REDDIT_POSTS_COUNT=number (optional, defaults to 15)")
else:
    print(f"Loaded credentials - Client ID: {REDDIT_CLIENT_ID[:10]}...")
    print(f"Target subreddit: r/{REDDIT_SUBREDDIT}")
    print(f"Fetch strategy: {REDDIT_POST_FETCH_STRATEGY}")
    print(f"Posts count: {REDDIT_POSTS_COUNT}")

# Initialize Reddit instance
try:
    reddit = praw.Reddit(
        client_id=REDDIT_CLIENT_ID,
        client_secret=REDDIT_CLIENT_SECRET,
        user_agent=REDDIT_USER_AGENT
    )

    # Test the connection
    print("Reddit API connection successful!")
    print(f"Read-only mode: {reddit.read_only}")

except Exception as e:
    print(f"Error connecting to Reddit API: {e}")
    print("Please check your Reddit API credentials in the .env file")

Loaded credentials - Client ID: QH_iPhx9pf...
Target subreddit: r/bugbites
Fetch strategy: discussion_heavy
Posts count: 25
Reddit API connection successful!
Read-only mode: True


In [15]:
# File renaming utility based on reclassification
def rename_files_by_classification():
    """
    Rename existing downloaded files based on new classifications
    """
    try:
        # Load the reclassified data
        reclassified_file = 'metadata/scraping_results_reclassified.json'
        if not os.path.exists(reclassified_file):
            print("No reclassified data found. Run reclassification first!")
            return

        with open(reclassified_file, 'r') as f:
            data = json.load(f)

        # Group by new bug type and create new counters
        new_counters = defaultdict(int)
        rename_mapping = []

        # Sort data by bug type to ensure consistent numbering
        data_by_type = defaultdict(list)
        for item in data:
            data_by_type[item['bug_type']].append(item)

        # Create new filenames for each type
        for bug_type in sorted(data_by_type.keys()):
            items = data_by_type[bug_type]
            for item in items:
                old_filename = item['filename']
                new_counters[bug_type] += 1
                new_filename = f"images/{bug_type.upper()}_{new_counters[bug_type]}.jpg"

                if old_filename != new_filename:
                    rename_mapping.append((old_filename, new_filename))
                    item['filename'] = new_filename  # Update metadata

        # Perform the actual renaming
        renamed_count = 0
        for old_path, new_path in rename_mapping:
            if os.path.exists(old_path):
                # Ensure no conflict with existing files
                if os.path.exists(new_path):
                    # Create a temporary name to avoid conflicts
                    temp_path = f"{new_path}.temp"
                    os.rename(old_path, temp_path)
                    old_path = temp_path

                os.rename(old_path, new_path)
                renamed_count += 1
                print(f"Renamed: {old_path} -> {new_path}")

        # Save updated metadata
        with open(reclassified_file, 'w') as f:
            json.dump(data, f, indent=2)

        print(f"\nFile renaming complete!")
        print(f"Renamed {renamed_count} files")
        print(f"Updated metadata saved to {reclassified_file}")

        # Show current file structure
        print(f"\nCurrent images directory:")
        try:
            images = sorted([f for f in os.listdir('images') if f.endswith('.jpg')])
            type_counts = defaultdict(int)
            for img in images:
                bug_type = img.split('_')[0].lower()
                type_counts[bug_type] += 1

            for bug_type, count in sorted(type_counts.items()):
                print(f"  {bug_type.upper()}: {count} files")

        except FileNotFoundError:
            print("  No images directory found")

    except Exception as e:
        print(f"Error during file renaming: {e}")

print("File renaming utility ready!")

File renaming utility ready!


In [16]:
# Image download and processing functions with timestamped runs
def create_run_directory():
    """Create a timestamped directory for this scraping run"""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_dir = f"images/run{timestamp}"
    os.makedirs(run_dir, exist_ok=True)

    print(f"Created run directory: {run_dir}")
    return run_dir, timestamp

def is_image_url(url):
    """Check if URL points to an image"""
    image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp']
    parsed_url = urlparse(url.lower())
    return any(parsed_url.path.endswith(ext) for ext in image_extensions)

def get_reddit_image_urls(submission):
    """Extract image URLs from a Reddit submission"""
    urls = []

    # Direct image link
    if hasattr(submission, 'url') and is_image_url(submission.url):
        urls.append(submission.url)

    # Reddit gallery
    if hasattr(submission, 'is_gallery') and submission.is_gallery:
        try:
            for item in submission.gallery_data['items']:
                media_id = item['media_id']
                if media_id in submission.media_metadata:
                    media_info = submission.media_metadata[media_id]
                    if 's' in media_info and 'u' in media_info['s']:
                        # Convert preview URL to full resolution
                        url = media_info['s']['u'].replace('preview.redd.it', 'i.redd.it')
                        url = url.split('?')[0]  # Remove query parameters
                        urls.append(url)
        except Exception as e:
            logger.warning(f"Error processing gallery: {e}")

    # Check if it's an Imgur link
    if 'imgur.com' in submission.url:
        # Convert imgur links to direct image links
        if '/a/' in submission.url or '/gallery/' in submission.url:
            # Album/gallery - would need imgur API for full access
            logger.info(f"Imgur album detected: {submission.url}")
        else:
            # Single image
            imgur_id = submission.url.split('/')[-1].split('.')[0]
            direct_url = f"https://i.imgur.com/{imgur_id}.jpg"
            urls.append(direct_url)

    return urls

def download_image(url, filename):
    """Download an image from URL and save it"""
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()

        # Check if response is actually an image
        content_type = response.headers.get('content-type', '')
        if not content_type.startswith('image/'):
            logger.warning(f"URL doesn't return an image: {url}")
            return False

        with open(filename, 'wb') as f:
            f.write(response.content)

        logger.info(f"Downloaded: {filename}")
        return True

    except Exception as e:
        logger.error(f"Error downloading {url}: {e}")
        return False

# Global variables for run management
current_run_dir = None
current_timestamp = None
bug_counters = defaultdict(int)

def initialize_new_run():
    """Initialize a new scraping run with timestamped directory"""
    global current_run_dir, current_timestamp, bug_counters

    current_run_dir, current_timestamp = create_run_directory()
    bug_counters = defaultdict(int)  # Reset counters for new run

    return current_run_dir, current_timestamp

def get_next_filename(bug_type):
    """Get the next filename for a bug type in the current run"""
    global current_run_dir, bug_counters

    if current_run_dir is None:
        # Initialize if not already done
        initialize_new_run()

    bug_counters[bug_type] += 1
    return f"{current_run_dir}/{bug_type.upper()}_{bug_counters[bug_type]}.jpg"

def get_run_summary():
    """Get summary of the current run"""
    global current_run_dir, current_timestamp, bug_counters

    if current_run_dir is None:
        return "No active run"

    summary = {
        'run_directory': current_run_dir,
        'timestamp': current_timestamp,
        'bug_type_counts': dict(bug_counters),
        'total_images': sum(bug_counters.values())
    }

    return summary

print("Image processing functions with run management ready!")

Image processing functions with run management ready!


In [17]:
# Main scraping function with run management and improved post selection
def scrape_bugbites_subreddit(limit=50, time_filter='week', sort_method='top'):
    """
    Scrape a configurable subreddit for images and classify them
    Subreddit is set via REDDIT_SUBREDDIT env variable (defaults to 'bugbites')
    Each run gets its own timestamped directory

    Args:
        limit: Number of posts to scrape
        time_filter: Time filter for posts ('hour', 'day', 'week', 'month', 'year', 'all')
        sort_method: How to sort posts ('top', 'hot', 'new', 'controversial', 'rising')
    """

    # Initialize new run
    run_dir, timestamp = initialize_new_run()
    scraped_data = []

    print(f"Starting new scraping run: {timestamp}")
    print(f"Target subreddit: r/{REDDIT_SUBREDDIT}")
    print(f"Images will be saved to: {run_dir}")
    print(f"Sort method: {sort_method}, Time filter: {time_filter}")
    print("=" * 60)

    try:
        subreddit = reddit.subreddit(REDDIT_SUBREDDIT)

        # Get posts from the subreddit using different sorting methods
        if sort_method == 'top':
            posts = subreddit.top(time_filter=time_filter, limit=limit)
            print(f"Fetching top {limit} posts from the past {time_filter}")
        elif sort_method == 'hot':
            posts = subreddit.hot(limit=limit)
            print(f"Fetching {limit} hot posts")
        elif sort_method == 'new':
            posts = subreddit.new(limit=limit)
            print(f"Fetching {limit} newest posts")
        elif sort_method == 'controversial':
            posts = subreddit.controversial(time_filter=time_filter, limit=limit)
            print(f"Fetching {limit} controversial posts from the past {time_filter}")
        elif sort_method == 'rising':
            posts = subreddit.rising(limit=limit)
            print(f"Fetching {limit} rising posts")
        else:
            # Default to top if invalid method
            posts = subreddit.top(time_filter=time_filter, limit=limit)
            print(f"Unknown sort method '{sort_method}', defaulting to top posts")

        # Collect posts first to analyze engagement
        posts_list = list(posts)

        # Sort by number of comments (descending) to prioritize posts with more discussion
        posts_with_engagement = []
        for submission in posts_list:
            posts_with_engagement.append({
                'submission': submission,
                'num_comments': submission.num_comments,
                'score': submission.score,
                'engagement_score': submission.num_comments * 2 + submission.score  # Weight comments more heavily
            })

        # Sort by engagement score (comments weighted more heavily than upvotes)
        posts_with_engagement.sort(key=lambda x: x['engagement_score'], reverse=True)

        print(f"\nPost engagement analysis:")
        print("Top 5 posts by engagement (comments weighted 2x):")
        for i, post_data in enumerate(posts_with_engagement[:5]):
            submission = post_data['submission']
            print(f"  {i+1}. Comments: {post_data['num_comments']}, Score: {post_data['score']}, "
                  f"Engagement: {post_data['engagement_score']}")
            print(f"     Title: {submission.title[:70]}...")

        print(f"\nProcessing posts in order of engagement:")
        print("=" * 60)

        for post_count, post_data in enumerate(posts_with_engagement, 1):
            submission = post_data['submission']

            print(f"\n--- Processing post {post_count}/{len(posts_with_engagement)} ---")
            print(f"Title: {submission.title[:80]}...")
            print(f"Comments: {submission.num_comments}, Score: {submission.score}")

            # Analyze title and selftext for bug type
            combined_text = f"{submission.title} {submission.selftext}"

            # Load more comments since we're prioritizing posts with comments
            submission.comments.replace_more(limit=2)  # Load more comment threads
            comments_text = ""
            comment_count = 0
            comment_scores = []  # Track comment upvotes for weighting
            total_comment_score = 0

            # Get more comments for better classification with upvote tracking
            for comment in submission.comments.list()[:20]:  # Increased from 10 to 20
                if hasattr(comment, 'body') and len(comment.body) > 10:  # Skip very short comments
                    comment_score = getattr(comment, 'score', 0)  # Get comment upvotes
                    comments_text += f" {comment.body}"
                    comment_scores.append(comment_score)
                    total_comment_score += max(comment_score, 0)  # Don't count negative scores
                    comment_count += 1

            print(f"Loaded {comment_count} comments for analysis")
            if comment_scores:
                avg_comment_score = total_comment_score / comment_count
                max_comment_score = max(comment_scores)
                print(f"Comment upvotes: avg={avg_comment_score:.1f}, max={max_comment_score}, total={total_comment_score}")

            # Use the advanced detection system with comments and upvote weighting
            bug_type = detect_bug_type_advanced(
                combined_text,
                comments_text,
                comment_scores=comment_scores,
                debug=False
            )

            print(f"Detected bug type: {bug_type}")

            # Get image URLs
            image_urls = get_reddit_image_urls(submission)

            if image_urls:
                print(f"Found {len(image_urls)} image(s)")

                for img_url in image_urls:
                    filename = get_next_filename(bug_type)

                    if download_image(img_url, filename):
                        # Store metadata with run information, engagement data, and comment scores
                        metadata = {
                            'run_timestamp': timestamp,
                            'run_directory': run_dir,
                            'filename': filename,
                            'bug_type': bug_type,
                            'post_title': submission.title,
                            'post_url': f"https://reddit.com{submission.permalink}",
                            'image_url': img_url,
                            'post_score': submission.score,
                            'num_comments': submission.num_comments,
                            'engagement_score': post_data['engagement_score'],
                            'comments_analyzed': comment_count,
                            'comment_scores': comment_scores,  # NEW: Store individual comment scores
                            'total_comment_score': total_comment_score,  # NEW: Sum of positive comment scores
                            'avg_comment_score': total_comment_score / comment_count if comment_count > 0 else 0,  # NEW: Average comment score
                            'max_comment_score': max(comment_scores) if comment_scores else 0,  # NEW: Highest comment score
                            'sort_method': sort_method,
                            'time_filter': time_filter,
                            'created_utc': submission.created_utc,
                            'author': str(submission.author) if submission.author else '[deleted]',
                            'scraped_at': datetime.now().isoformat()
                        }
                        scraped_data.append(metadata)

                        print(f"Saved as: {filename}")
            else:
                print("No images found in this post")

            # Be respectful to Reddit's API - slightly longer delay for comment loading
            time.sleep(1.5)

    except Exception as e:
        logger.error(f"Error scraping subreddit: {e}")

    return scraped_data, timestamp

def save_metadata_with_run(scraped_data, timestamp):
    """Save scraping metadata with run-specific information"""

    # Save run-specific metadata
    run_metadata_file = f'metadata/scraping_results_run{timestamp}.json'
    with open(run_metadata_file, 'w') as f:
        json.dump(scraped_data, f, indent=2)

    # Also append to master metadata file
    master_metadata_file = 'metadata/all_scraping_results.json'

    # Load existing master data if it exists
    all_data = []
    if os.path.exists(master_metadata_file):
        try:
            with open(master_metadata_file, 'r') as f:
                all_data = json.load(f)
        except json.JSONDecodeError:
            all_data = []

    # Add new data
    all_data.extend(scraped_data)

    # Save updated master file
    with open(master_metadata_file, 'w') as f:
        json.dump(all_data, f, indent=2)

    print(f"Run metadata saved to: {run_metadata_file}")
    print(f"Master metadata updated: {master_metadata_file}")

    # Save run summary with engagement statistics
    run_summary = get_run_summary()
    run_summary['metadata_file'] = run_metadata_file
    run_summary['total_posts_processed'] = len(scraped_data)

    # Add engagement statistics
    if scraped_data:
        total_comments = sum(item.get('comments_analyzed', 0) for item in scraped_data)
        avg_engagement = sum(item.get('engagement_score', 0) for item in scraped_data) / len(scraped_data)
        run_summary['total_comments_analyzed'] = total_comments
        run_summary['average_engagement_score'] = avg_engagement
        run_summary['sort_method'] = scraped_data[0].get('sort_method', 'unknown')
        run_summary['time_filter'] = scraped_data[0].get('time_filter', 'unknown')

    summary_file = f'metadata/run_summary_{timestamp}.json'
    with open(summary_file, 'w') as f:
        json.dump(run_summary, f, indent=2)

    print(f"Run summary saved to: {summary_file}")

    return run_metadata_file, master_metadata_file

def list_all_runs():
    """List all previous scraping runs"""
    try:
        runs = []

        # Look for run directories
        if os.path.exists('images'):
            for item in os.listdir('images'):
                if item.startswith('run') and os.path.isdir(f'images/{item}'):
                    timestamp = item[3:]  # Remove 'run' prefix

                    # Count files in directory
                    run_dir = f'images/{item}'
                    image_count = len([f for f in os.listdir(run_dir) if f.endswith('.jpg')])

                    # Try to load summary if available
                    summary_file = f'metadata/run_summary_{timestamp}.json'
                    bug_counts = {}
                    sort_method = 'unknown'
                    engagement_info = {}

                    if os.path.exists(summary_file):
                        with open(summary_file, 'r') as f:
                            summary = json.load(f)
                            bug_counts = summary.get('bug_type_counts', {})
                            sort_method = summary.get('sort_method', 'unknown')
                            engagement_info = {
                                'total_comments': summary.get('total_comments_analyzed', 0),
                                'avg_engagement': summary.get('average_engagement_score', 0)
                            }

                    runs.append({
                        'timestamp': timestamp,
                        'directory': run_dir,
                        'image_count': image_count,
                        'bug_counts': bug_counts,
                        'sort_method': sort_method,
                        'engagement_info': engagement_info
                    })

        # Sort by timestamp
        runs.sort(key=lambda x: x['timestamp'], reverse=True)

        if runs:
            print(f"Found {len(runs)} previous runs:")
            print("-" * 100)
            for run in runs:
                print(f"Run {run['timestamp']}: {run['image_count']} images, "
                      f"Sort: {run['sort_method']}, "
                      f"Comments: {run['engagement_info'].get('total_comments', 0)}")
                print(f"  Directory: {run['directory']}")
                if run['bug_counts']:
                    bug_summary = ", ".join([f"{bt.upper()}: {cnt}" for bt, cnt in run['bug_counts'].items()])
                    print(f"  Types: {bug_summary}")
                print()
        else:
            print("No previous runs found")

        return runs

    except Exception as e:
        print(f"Error listing runs: {e}")
        return []

print("Enhanced main scraping function with engagement-based sorting ready!")

Enhanced main scraping function with engagement-based sorting ready!


In [18]:
# Execute the scraping with enhanced post selection
# Note: Make sure you've updated the Reddit API credentials above before running this!

# Configure scraping parameters from environment variables
POSTS_TO_SCRAPE = REDDIT_POSTS_COUNT  # Use count from .env file
TIME_FILTER = 'month'  # Changed to 'month' to get more posts with established discussions

# Map strategy names to sort methods and configurations
STRATEGY_CONFIGS = {
    'balanced': {'sort_method': 'top', 'time_filter': 'month'},
    'discussion_heavy': {'sort_method': 'controversial', 'time_filter': 'all'},
    'recent_active': {'sort_method': 'hot', 'time_filter': 'week'},
    'controversial': {'sort_method': 'controversial', 'time_filter': 'month'},
    'quality_focused': {'sort_method': 'top', 'time_filter': 'year'}
}

# Get configuration from strategy
if REDDIT_POST_FETCH_STRATEGY in STRATEGY_CONFIGS:
    strategy_config = STRATEGY_CONFIGS[REDDIT_POST_FETCH_STRATEGY]
    SORT_METHOD = strategy_config['sort_method']
    if REDDIT_POST_FETCH_STRATEGY in ['discussion_heavy', 'quality_focused']:
        TIME_FILTER = strategy_config['time_filter']  # Override for specific strategies
else:
    SORT_METHOD = 'top'  # Default fallback
    print(f"⚠️ Unknown strategy '{REDDIT_POST_FETCH_STRATEGY}', using default 'top' sort")

# Sort method explanation:
# 'top' - Most upvoted posts (good for quality content with discussions)
# 'hot' - Currently trending (mix of new and popular)
# 'new' - Newest posts (may have fewer comments)
# 'controversial' - Posts with mixed reactions (often more discussion)
# 'rising' - Posts gaining traction quickly

print("Starting new Reddit scraping run with enhanced post selection...")
print(f"Configuration:")
print(f"  Target subreddit: r/{REDDIT_SUBREDDIT}")
print(f"  Fetch strategy: {REDDIT_POST_FETCH_STRATEGY}")
print(f"  Posts to scrape: {POSTS_TO_SCRAPE}")
print(f"  Time filter: {TIME_FILTER}")
print(f"  Sort method: {SORT_METHOD}")
print(f"  Strategy: Prioritizing posts with more comments for better classification")
print("=" * 70)

# List existing runs first
print("\nPrevious runs:")
list_all_runs()

print("\n" + "=" * 70)
print("STARTING NEW RUN WITH ENHANCED POST SELECTION")
print("=" * 70)

# Run the scraper with new engagement-focused system
scraped_data, run_timestamp = scrape_bugbites_subreddit(
    limit=POSTS_TO_SCRAPE,
    time_filter=TIME_FILTER,
    sort_method=SORT_METHOD
)

# Save metadata with run information
if scraped_data:
    run_metadata_file, master_metadata_file = save_metadata_with_run(scraped_data, run_timestamp)

    # Print detailed summary
    print("\n" + "=" * 70)
    print("SCRAPING RUN COMPLETE!")
    print("=" * 70)
    print(f"Run timestamp: {run_timestamp}")
    print(f"Total images downloaded: {len(scraped_data)}")

    # Calculate engagement statistics
    total_comments_analyzed = sum(item.get('comments_analyzed', 0) for item in scraped_data)
    avg_engagement = sum(item.get('engagement_score', 0) for item in scraped_data) / len(scraped_data)
    posts_with_comments = sum(1 for item in scraped_data if item.get('comments_analyzed', 0) > 0)

    print(f"Engagement Statistics:")
    print(f"  Total comments analyzed: {total_comments_analyzed}")
    print(f"  Average engagement score: {avg_engagement:.1f}")
    print(f"  Posts with comments: {posts_with_comments}/{len(scraped_data)}")

    # Count by bug type for this run
    run_bug_counts = defaultdict(int)
    unknown_count = 0
    for item in scraped_data:
        bug_type = item['bug_type']
        run_bug_counts[bug_type] += 1
        if bug_type == 'unknown':
            unknown_count += 1

    print(f"\nImages by bug type (this run):")
    for bug_type, count in sorted(run_bug_counts.items()):
        percentage = (count / len(scraped_data)) * 100
        print(f"  {bug_type.upper()}: {count} images ({percentage:.1f}%)")

    # Show improvement
    unknown_percentage = (unknown_count / len(scraped_data)) * 100
    if unknown_percentage < 80:  # Arbitrary threshold
        print(f"\n✅ Classification improvement! Only {unknown_percentage:.1f}% unknown (down from previous runs)")
    else:
        print(f"\n⚠️  Still {unknown_percentage:.1f}% unknown classifications")

    # Show run directory structure
    print(f"\nRun directory: {current_run_dir}")
    try:
        files = sorted([f for f in os.listdir(current_run_dir) if f.endswith('.jpg')])
        print(f"Files created ({len(files)} total):")

        # Group files by type for better display
        files_by_type = defaultdict(list)
        for f in files:
            bug_type = f.split('_')[0]
            files_by_type[bug_type].append(f)

        for bug_type, type_files in sorted(files_by_type.items()):
            print(f"  {bug_type}: {len(type_files)} files")
            # Show first few files of each type
            for f in type_files[:3]:
                print(f"    {f}")
            if len(type_files) > 3:
                print(f"    ... and {len(type_files) - 3} more")

    except Exception as e:
        print(f"Error listing files: {e}")

    print(f"\nMetadata files:")
    print(f"  Run-specific: {run_metadata_file}")
    print(f"  Master file: {master_metadata_file}")

    # Show top posts by engagement
    print(f"\nTop posts by engagement in this run:")
    sorted_data = sorted(scraped_data, key=lambda x: x.get('engagement_score', 0), reverse=True)
    for i, item in enumerate(sorted_data[:3]):
        print(f"  {i+1}. {item['bug_type'].upper()}: {item['post_title'][:50]}...")
        print(f"     Comments: {item.get('comments_analyzed', 0)}, "
              f"Score: {item.get('post_score', 0)}, "
              f"Engagement: {item.get('engagement_score', 0)}")

else:
    print("\nNo images were downloaded. Check your Reddit API credentials and internet connection.")

print(f"\nRun completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Next run suggestions:")
print(f"  - Try TIME_FILTER='week' for more recent discussions")
print(f"  - Try SORT_METHOD='controversial' for posts with more debate")
print(f"  - Try SORT_METHOD='hot' for currently trending posts")

Starting new Reddit scraping run with enhanced post selection...
Configuration:
  Target subreddit: r/bugbites
  Fetch strategy: discussion_heavy
  Posts to scrape: 25
  Time filter: all
  Sort method: controversial
  Strategy: Prioritizing posts with more comments for better classification

Previous runs:
Found 6 previous runs:
----------------------------------------------------------------------------------------------------
Run 20250806_203307: 36 images, Sort: top, Comments: 271
  Directory: images/run20250806_203307
  Types: MOSQUITO: 14, BED_BUG: 13, ANT: 1, TICK: 3, FLEA: 5

Run 20250806_201819: 36 images, Sort: top, Comments: 271
  Directory: images/run20250806_201819
  Types: TICK: 5, BED_BUG: 16, DERMATITIS: 1, SCABIES: 1, SPIDER: 2, FLEA: 2, MITE: 4, BEE: 5

Run 20250806_201209: 36 images, Sort: top, Comments: 271
  Directory: images/run20250806_201209
  Types: TICK: 1, UNKNOWN: 24, BED_BUG: 10, SCABIES: 1

Run 20250806_200730: 36 images, Sort: top, Comments: 271
  Director

INFO:__main__:Downloaded: images/run20250806_203706/BED_BUG_1.jpg


Saved as: images/run20250806_203706/BED_BUG_1.jpg

--- Processing post 2/25 ---
Title: Are these bed bug bites?...
Comments: 20, Score: 0

--- Processing post 2/25 ---
Title: Are these bed bug bites?...
Comments: 20, Score: 0
Loaded 17 comments for analysis
Comment upvotes: avg=1.0, max=1, total=17
Detected bug type: bed_bug
Found 7 image(s)
Loaded 17 comments for analysis
Comment upvotes: avg=1.0, max=1, total=17
Detected bug type: bed_bug
Found 7 image(s)


INFO:__main__:Downloaded: images/run20250806_203706/BED_BUG_2.jpg


Saved as: images/run20250806_203706/BED_BUG_2.jpg


INFO:__main__:Downloaded: images/run20250806_203706/BED_BUG_3.jpg
INFO:__main__:Downloaded: images/run20250806_203706/BED_BUG_4.jpg
INFO:__main__:Downloaded: images/run20250806_203706/BED_BUG_4.jpg


Saved as: images/run20250806_203706/BED_BUG_3.jpg
Saved as: images/run20250806_203706/BED_BUG_4.jpg


INFO:__main__:Downloaded: images/run20250806_203706/BED_BUG_5.jpg
INFO:__main__:Downloaded: images/run20250806_203706/BED_BUG_6.jpg
INFO:__main__:Downloaded: images/run20250806_203706/BED_BUG_6.jpg


Saved as: images/run20250806_203706/BED_BUG_5.jpg
Saved as: images/run20250806_203706/BED_BUG_6.jpg


INFO:__main__:Downloaded: images/run20250806_203706/BED_BUG_7.jpg
INFO:__main__:Downloaded: images/run20250806_203706/BED_BUG_8.jpg
INFO:__main__:Downloaded: images/run20250806_203706/BED_BUG_8.jpg


Saved as: images/run20250806_203706/BED_BUG_7.jpg
Saved as: images/run20250806_203706/BED_BUG_8.jpg

--- Processing post 3/25 ---
Title: What bit my toddler while sleeping? ...
Comments: 13, Score: 0

--- Processing post 3/25 ---
Title: What bit my toddler while sleeping? ...
Comments: 13, Score: 0
Loaded 12 comments for analysis
Comment upvotes: avg=1.1, max=2, total=13
Detected bug type: mosquito
Found 3 image(s)
Loaded 12 comments for analysis
Comment upvotes: avg=1.1, max=2, total=13
Detected bug type: mosquito
Found 3 image(s)


INFO:__main__:Downloaded: images/run20250806_203706/MOSQUITO_1.jpg


Saved as: images/run20250806_203706/MOSQUITO_1.jpg


INFO:__main__:Downloaded: images/run20250806_203706/MOSQUITO_2.jpg


Saved as: images/run20250806_203706/MOSQUITO_2.jpg


INFO:__main__:Downloaded: images/run20250806_203706/MOSQUITO_3.jpg


Saved as: images/run20250806_203706/MOSQUITO_3.jpg

--- Processing post 4/25 ---
Title: Anyone know what this could be?...
Comments: 8, Score: 0
Loaded 8 comments for analysis
Comment upvotes: avg=1.6, max=4, total=13
Detected bug type: bed_bug
Found 1 image(s)

--- Processing post 4/25 ---
Title: Anyone know what this could be?...
Comments: 8, Score: 0
Loaded 8 comments for analysis
Comment upvotes: avg=1.6, max=4, total=13
Detected bug type: bed_bug
Found 1 image(s)


INFO:__main__:Downloaded: images/run20250806_203706/BED_BUG_9.jpg


Saved as: images/run20250806_203706/BED_BUG_9.jpg

--- Processing post 5/25 ---
Title: Can anyone identify this bite....
Comments: 4, Score: 0

--- Processing post 5/25 ---
Title: Can anyone identify this bite....
Comments: 4, Score: 0


INFO:__main__:Downloaded: images/run20250806_203706/ANT_1.jpg


Loaded 2 comments for analysis
Comment upvotes: avg=1.5, max=2, total=3
Detected bug type: ant
Found 3 image(s)
Saved as: images/run20250806_203706/ANT_1.jpg


INFO:__main__:Downloaded: images/run20250806_203706/ANT_2.jpg
INFO:__main__:Downloaded: images/run20250806_203706/ANT_3.jpg
INFO:__main__:Downloaded: images/run20250806_203706/ANT_3.jpg


Saved as: images/run20250806_203706/ANT_2.jpg
Saved as: images/run20250806_203706/ANT_3.jpg

--- Processing post 6/25 ---
Title: My husband has had this on his leg for 3 days now....
Comments: 4, Score: 0

--- Processing post 6/25 ---
Title: My husband has had this on his leg for 3 days now....
Comments: 4, Score: 0


INFO:__main__:Downloaded: images/run20250806_203706/SPIDER_1.jpg


Loaded 4 comments for analysis
Comment upvotes: avg=1.8, max=3, total=7
Detected bug type: spider
Found 2 image(s)
Saved as: images/run20250806_203706/SPIDER_1.jpg


INFO:__main__:Downloaded: images/run20250806_203706/SPIDER_2.jpg


Saved as: images/run20250806_203706/SPIDER_2.jpg

--- Processing post 7/25 ---
Title: What type of bite is this? New bites weekly...
Comments: 3, Score: 0
Loaded 3 comments for analysis
Comment upvotes: avg=1.3, max=2, total=4
Detected bug type: flea
Found 2 image(s)

--- Processing post 7/25 ---
Title: What type of bite is this? New bites weekly...
Comments: 3, Score: 0
Loaded 3 comments for analysis
Comment upvotes: avg=1.3, max=2, total=4
Detected bug type: flea
Found 2 image(s)


INFO:__main__:Downloaded: images/run20250806_203706/FLEA_1.jpg
INFO:__main__:Downloaded: images/run20250806_203706/FLEA_2.jpg
INFO:__main__:Downloaded: images/run20250806_203706/FLEA_2.jpg


Saved as: images/run20250806_203706/FLEA_1.jpg
Saved as: images/run20250806_203706/FLEA_2.jpg

--- Processing post 8/25 ---
Title: freaking out. what bug bite is this weird circle on my hand...
Comments: 3, Score: 0
Loaded 3 comments for analysis
Comment upvotes: avg=1.0, max=1, total=3
Detected bug type: flea
Found 1 image(s)

--- Processing post 8/25 ---
Title: freaking out. what bug bite is this weird circle on my hand...
Comments: 3, Score: 0
Loaded 3 comments for analysis
Comment upvotes: avg=1.0, max=1, total=3
Detected bug type: flea
Found 1 image(s)


INFO:__main__:Downloaded: images/run20250806_203706/FLEA_3.jpg


Saved as: images/run20250806_203706/FLEA_3.jpg

--- Processing post 9/25 ---
Title: Help...
Comments: 3, Score: 0
Loaded 3 comments for analysis
Comment upvotes: avg=1.0, max=1, total=3
Detected bug type: bed_bug
Found 1 image(s)

--- Processing post 9/25 ---
Title: Help...
Comments: 3, Score: 0
Loaded 3 comments for analysis
Comment upvotes: avg=1.0, max=1, total=3
Detected bug type: bed_bug
Found 1 image(s)


INFO:__main__:Downloaded: images/run20250806_203706/BED_BUG_10.jpg


Saved as: images/run20250806_203706/BED_BUG_10.jpg

--- Processing post 10/25 ---
Title: Pls help is this tick in there...
Comments: 3, Score: 0
Loaded 3 comments for analysis
Comment upvotes: avg=2.0, max=3, total=6
Detected bug type: flea
Found 2 image(s)

--- Processing post 10/25 ---
Title: Pls help is this tick in there...
Comments: 3, Score: 0
Loaded 3 comments for analysis
Comment upvotes: avg=2.0, max=3, total=6
Detected bug type: flea
Found 2 image(s)


INFO:__main__:Downloaded: images/run20250806_203706/FLEA_4.jpg
INFO:__main__:Downloaded: images/run20250806_203706/FLEA_5.jpg
INFO:__main__:Downloaded: images/run20250806_203706/FLEA_5.jpg


Saved as: images/run20250806_203706/FLEA_4.jpg
Saved as: images/run20250806_203706/FLEA_5.jpg

--- Processing post 11/25 ---
Title: The heck did this??...
Comments: 2, Score: 0
Loaded 2 comments for analysis
Comment upvotes: avg=1.0, max=1, total=2
Detected bug type: mosquito
Found 2 image(s)

--- Processing post 11/25 ---
Title: The heck did this??...
Comments: 2, Score: 0
Loaded 2 comments for analysis
Comment upvotes: avg=1.0, max=1, total=2
Detected bug type: mosquito
Found 2 image(s)


INFO:__main__:Downloaded: images/run20250806_203706/MOSQUITO_4.jpg
INFO:__main__:Downloaded: images/run20250806_203706/MOSQUITO_5.jpg
INFO:__main__:Downloaded: images/run20250806_203706/MOSQUITO_5.jpg


Saved as: images/run20250806_203706/MOSQUITO_4.jpg
Saved as: images/run20250806_203706/MOSQUITO_5.jpg

--- Processing post 12/25 ---
Title: Whaat is this?...
Comments: 2, Score: 0
Loaded 1 comments for analysis
Comment upvotes: avg=1.0, max=1, total=1
Detected bug type: flea
Found 1 image(s)

--- Processing post 12/25 ---
Title: Whaat is this?...
Comments: 2, Score: 0
Loaded 1 comments for analysis
Comment upvotes: avg=1.0, max=1, total=1
Detected bug type: flea
Found 1 image(s)


INFO:__main__:Downloaded: images/run20250806_203706/FLEA_6.jpg


Saved as: images/run20250806_203706/FLEA_6.jpg

--- Processing post 13/25 ---
Title: Won't stop itching...
Comments: 2, Score: 0
Loaded 2 comments for analysis
Comment upvotes: avg=1.0, max=1, total=2
Detected bug type: mosquito
Found 3 image(s)

--- Processing post 13/25 ---
Title: Won't stop itching...
Comments: 2, Score: 0
Loaded 2 comments for analysis
Comment upvotes: avg=1.0, max=1, total=2
Detected bug type: mosquito
Found 3 image(s)


INFO:__main__:Downloaded: images/run20250806_203706/MOSQUITO_6.jpg


Saved as: images/run20250806_203706/MOSQUITO_6.jpg


INFO:__main__:Downloaded: images/run20250806_203706/MOSQUITO_7.jpg
INFO:__main__:Downloaded: images/run20250806_203706/MOSQUITO_8.jpg
INFO:__main__:Downloaded: images/run20250806_203706/MOSQUITO_8.jpg


Saved as: images/run20250806_203706/MOSQUITO_7.jpg
Saved as: images/run20250806_203706/MOSQUITO_8.jpg

--- Processing post 14/25 ---
Title: Bugbites...
Comments: 1, Score: 0
Loaded 1 comments for analysis
Comment upvotes: avg=1.0, max=1, total=1
Detected bug type: flea
Found 1 image(s)

--- Processing post 14/25 ---
Title: Bugbites...
Comments: 1, Score: 0
Loaded 1 comments for analysis
Comment upvotes: avg=1.0, max=1, total=1
Detected bug type: flea
Found 1 image(s)


INFO:__main__:Downloaded: images/run20250806_203706/FLEA_7.jpg


Saved as: images/run20250806_203706/FLEA_7.jpg

--- Processing post 15/25 ---
Title: Juneau AK - What is this...
Comments: 1, Score: 0
Loaded 1 comments for analysis
Comment upvotes: avg=1.0, max=1, total=1
Detected bug type: mosquito
Found 1 image(s)

--- Processing post 15/25 ---
Title: Juneau AK - What is this...
Comments: 1, Score: 0
Loaded 1 comments for analysis
Comment upvotes: avg=1.0, max=1, total=1
Detected bug type: mosquito
Found 1 image(s)


INFO:__main__:Downloaded: images/run20250806_203706/MOSQUITO_9.jpg


Saved as: images/run20250806_203706/MOSQUITO_9.jpg

--- Processing post 16/25 ---
Title: Are these bed bugs??...
Comments: 1, Score: 0

--- Processing post 16/25 ---
Title: Are these bed bugs??...
Comments: 1, Score: 0
Loaded 1 comments for analysis
Comment upvotes: avg=1.0, max=1, total=1
Detected bug type: bed_bug
No images found in this post
Loaded 1 comments for analysis
Comment upvotes: avg=1.0, max=1, total=1
Detected bug type: bed_bug
No images found in this post

--- Processing post 17/25 ---
Title: I have these bites on my lower leg—what bit me?...
Comments: 1, Score: 0
Loaded 1 comments for analysis
Comment upvotes: avg=1.0, max=1, total=1
Detected bug type: flea
Found 2 image(s)

--- Processing post 17/25 ---
Title: I have these bites on my lower leg—what bit me?...
Comments: 1, Score: 0
Loaded 1 comments for analysis
Comment upvotes: avg=1.0, max=1, total=1
Detected bug type: flea
Found 2 image(s)


INFO:__main__:Downloaded: images/run20250806_203706/FLEA_8.jpg


Saved as: images/run20250806_203706/FLEA_8.jpg


INFO:__main__:Downloaded: images/run20250806_203706/FLEA_9.jpg


Saved as: images/run20250806_203706/FLEA_9.jpg

--- Processing post 18/25 ---
Title: Bed bugs or fleas or ?...
Comments: 1, Score: 0
Loaded 1 comments for analysis
Comment upvotes: avg=1.0, max=1, total=1
Detected bug type: bed_bug
Found 1 image(s)

--- Processing post 18/25 ---
Title: Bed bugs or fleas or ?...
Comments: 1, Score: 0
Loaded 1 comments for analysis
Comment upvotes: avg=1.0, max=1, total=1
Detected bug type: bed_bug
Found 1 image(s)


INFO:__main__:Downloaded: images/run20250806_203706/BED_BUG_11.jpg


Saved as: images/run20250806_203706/BED_BUG_11.jpg

--- Processing post 19/25 ---
Title: Is this just a mozzie bite ?...
Comments: 1, Score: 0
Loaded 1 comments for analysis
Comment upvotes: avg=1.0, max=1, total=1
Detected bug type: mosquito
Found 1 image(s)

--- Processing post 19/25 ---
Title: Is this just a mozzie bite ?...
Comments: 1, Score: 0
Loaded 1 comments for analysis
Comment upvotes: avg=1.0, max=1, total=1
Detected bug type: mosquito
Found 1 image(s)


INFO:__main__:Downloaded: images/run20250806_203706/MOSQUITO_10.jpg


Saved as: images/run20250806_203706/MOSQUITO_10.jpg

--- Processing post 20/25 ---
Title: Large bite -kinda nsfw...
Comments: 1, Score: 0

--- Processing post 20/25 ---
Title: Large bite -kinda nsfw...
Comments: 1, Score: 0
Loaded 1 comments for analysis
Comment upvotes: avg=1.0, max=1, total=1
Detected bug type: mosquito
Found 2 image(s)
Loaded 1 comments for analysis
Comment upvotes: avg=1.0, max=1, total=1
Detected bug type: mosquito
Found 2 image(s)


INFO:__main__:Downloaded: images/run20250806_203706/MOSQUITO_11.jpg


Saved as: images/run20250806_203706/MOSQUITO_11.jpg


INFO:__main__:Downloaded: images/run20250806_203706/MOSQUITO_12.jpg


Saved as: images/run20250806_203706/MOSQUITO_12.jpg

--- Processing post 21/25 ---
Title: Bugs...
Comments: 1, Score: 0
Loaded 1 comments for analysis
Comment upvotes: avg=1.0, max=1, total=1
Detected bug type: flea
No images found in this post

--- Processing post 21/25 ---
Title: Bugs...
Comments: 1, Score: 0
Loaded 1 comments for analysis
Comment upvotes: avg=1.0, max=1, total=1
Detected bug type: flea
No images found in this post

--- Processing post 22/25 ---
Title: What bug are these bites from!...
Comments: 1, Score: 0
Loaded 1 comments for analysis
Comment upvotes: avg=1.0, max=1, total=1
Detected bug type: bed_bug
Found 2 image(s)

--- Processing post 22/25 ---
Title: What bug are these bites from!...
Comments: 1, Score: 0
Loaded 1 comments for analysis
Comment upvotes: avg=1.0, max=1, total=1
Detected bug type: bed_bug
Found 2 image(s)


INFO:__main__:Downloaded: images/run20250806_203706/BED_BUG_12.jpg
INFO:__main__:Downloaded: images/run20250806_203706/BED_BUG_13.jpg
INFO:__main__:Downloaded: images/run20250806_203706/BED_BUG_13.jpg


Saved as: images/run20250806_203706/BED_BUG_12.jpg
Saved as: images/run20250806_203706/BED_BUG_13.jpg

--- Processing post 23/25 ---
Title: Bed Bugs?? What are these?...
Comments: 1, Score: 0
Loaded 1 comments for analysis
Comment upvotes: avg=1.0, max=1, total=1
Detected bug type: mosquito
Found 5 image(s)

--- Processing post 23/25 ---
Title: Bed Bugs?? What are these?...
Comments: 1, Score: 0
Loaded 1 comments for analysis
Comment upvotes: avg=1.0, max=1, total=1
Detected bug type: mosquito
Found 5 image(s)


INFO:__main__:Downloaded: images/run20250806_203706/MOSQUITO_13.jpg
INFO:__main__:Downloaded: images/run20250806_203706/MOSQUITO_14.jpg
INFO:__main__:Downloaded: images/run20250806_203706/MOSQUITO_14.jpg


Saved as: images/run20250806_203706/MOSQUITO_13.jpg
Saved as: images/run20250806_203706/MOSQUITO_14.jpg


INFO:__main__:Downloaded: images/run20250806_203706/MOSQUITO_15.jpg
INFO:__main__:Downloaded: images/run20250806_203706/MOSQUITO_16.jpg
INFO:__main__:Downloaded: images/run20250806_203706/MOSQUITO_16.jpg


Saved as: images/run20250806_203706/MOSQUITO_15.jpg
Saved as: images/run20250806_203706/MOSQUITO_16.jpg


INFO:__main__:Downloaded: images/run20250806_203706/MOSQUITO_17.jpg


Saved as: images/run20250806_203706/MOSQUITO_17.jpg

--- Processing post 24/25 ---
Title: What bug bit my 1 year old...
Comments: 1, Score: 0
Loaded 1 comments for analysis
Comment upvotes: avg=1.0, max=1, total=1
Detected bug type: mosquito
Found 3 image(s)

--- Processing post 24/25 ---
Title: What bug bit my 1 year old...
Comments: 1, Score: 0
Loaded 1 comments for analysis
Comment upvotes: avg=1.0, max=1, total=1
Detected bug type: mosquito
Found 3 image(s)


INFO:__main__:Downloaded: images/run20250806_203706/MOSQUITO_18.jpg
INFO:__main__:Downloaded: images/run20250806_203706/MOSQUITO_19.jpg
INFO:__main__:Downloaded: images/run20250806_203706/MOSQUITO_19.jpg
INFO:__main__:Downloaded: images/run20250806_203706/MOSQUITO_20.jpg
INFO:__main__:Downloaded: images/run20250806_203706/MOSQUITO_20.jpg


Saved as: images/run20250806_203706/MOSQUITO_18.jpg
Saved as: images/run20250806_203706/MOSQUITO_19.jpg
Saved as: images/run20250806_203706/MOSQUITO_20.jpg

--- Processing post 25/25 ---
Title: Possible black widow bite? Numb to touch...
Comments: 0, Score: 0
Loaded 0 comments for analysis
Detected bug type: spider
Found 1 image(s)

--- Processing post 25/25 ---
Title: Possible black widow bite? Numb to touch...
Comments: 0, Score: 0
Loaded 0 comments for analysis
Detected bug type: spider
Found 1 image(s)


INFO:__main__:Downloaded: images/run20250806_203706/SPIDER_3.jpg


Saved as: images/run20250806_203706/SPIDER_3.jpg
Run metadata saved to: metadata/scraping_results_run20250806_203706.json
Master metadata updated: metadata/all_scraping_results.json
Run summary saved to: metadata/run_summary_20250806_203706.json

SCRAPING RUN COMPLETE!
Run timestamp: 20250806_203706
Total images downloaded: 48
Engagement Statistics:
  Total comments analyzed: 240
  Average engagement score: 11.8
  Posts with comments: 47/48

Images by bug type (this run):
  ANT: 3 images (6.2%)
  BED_BUG: 13 images (27.1%)
  FLEA: 9 images (18.8%)
  MOSQUITO: 20 images (41.7%)
  SPIDER: 3 images (6.2%)

✅ Classification improvement! Only 0.0% unknown (down from previous runs)

Run directory: images/run20250806_203706
Files created (48 total):
  ANT: 3 files
    ANT_1.jpg
    ANT_2.jpg
    ANT_3.jpg
  BED: 13 files
    BED_BUG_1.jpg
    BED_BUG_10.jpg
    BED_BUG_11.jpg
    ... and 10 more
  FLEA: 9 files
    FLEA_1.jpg
    FLEA_2.jpg
    FLEA_3.jpg
    ... and 6 more
  MOSQUITO: 20 fil

In [19]:
# Quick Configuration Presets for Different Scraping Strategies

def run_strategy_preset(strategy=None):
    """
    Run predefined scraping strategies optimized for different goals
    If no strategy provided, uses REDDIT_POST_FETCH_STRATEGY from .env

    Strategies:
    - 'balanced': Good mix of popular posts with comments (recommended)
    - 'discussion_heavy': Focus on posts with lots of discussion
    - 'recent_active': Recent posts that are gaining traction
    - 'controversial': Posts with mixed reactions (often more detailed descriptions)
    - 'quality_focused': Top-rated posts from longer time period
    """

    # Use environment variable if no strategy specified
    if strategy is None:
        strategy = REDDIT_POST_FETCH_STRATEGY
        print(f"Using strategy from .env file: {strategy}")

    strategies = {
        'balanced': {
            'limit': REDDIT_POSTS_COUNT,
            'time_filter': 'month',
            'sort_method': 'top',
            'description': 'Top posts from past month with good engagement'
        },
        'discussion_heavy': {
            'limit': REDDIT_POSTS_COUNT,
            'time_filter': 'all',
            'sort_method': 'controversial',
            'description': 'Controversial posts (all time) that generate discussion'
        },
        'recent_active': {
            'limit': REDDIT_POSTS_COUNT,
            'time_filter': 'week',
            'sort_method': 'hot',
            'description': 'Currently hot posts from past week'
        },
        'controversial': {
            'limit': REDDIT_POSTS_COUNT,
            'time_filter': 'month',
            'sort_method': 'controversial',
            'description': 'Most controversial posts from past month'
        },
        'quality_focused': {
            'limit': REDDIT_POSTS_COUNT,
            'time_filter': 'year',
            'sort_method': 'top',
            'description': 'Highest quality posts from past year'
        }
    }

    if strategy not in strategies:
        print(f"Unknown strategy '{strategy}'. Available strategies:")
        for name, config in strategies.items():
            print(f"  {name}: {config['description']}")
        return

    config = strategies[strategy]
    print(f"Running '{strategy}' strategy:")
    print(f"  Target subreddit: r/{REDDIT_SUBREDDIT}")
    print(f"  {config['description']}")
    print(f"  Limit: {config['limit']}, Time: {config['time_filter']}, Sort: {config['sort_method']}")
    print("=" * 70)

    # Run the scraper with this configuration
    scraped_data, run_timestamp = scrape_bugbites_subreddit(
        limit=config['limit'],
        time_filter=config['time_filter'],
        sort_method=config['sort_method']
    )

    # Save and analyze results
    if scraped_data:
        run_metadata_file, master_metadata_file = save_metadata_with_run(scraped_data, run_timestamp)

        # Quick analysis
        print(f"\n🎯 Strategy '{strategy}' Results:")
        print(f"   Downloaded: {len(scraped_data)} images")

        bug_counts = defaultdict(int)
        for item in scraped_data:
            bug_counts[item['bug_type']] += 1

        unknown_pct = (bug_counts['unknown'] / len(scraped_data)) * 100 if scraped_data else 0
        print(f"   Unknown rate: {unknown_pct:.1f}%")

        if unknown_pct < 70:
            print("   ✅ Good classification rate!")
        elif unknown_pct < 85:
            print("   ⚠️ Moderate classification rate")
        else:
            print("   ❌ High unknown rate - try different strategy")

        return scraped_data, run_timestamp
    else:
        print("No data collected")
        return None, None

# Quick preset runner - examples:
print("Scraping Strategy Presets Available:")
print("1. run_strategy_preset() - Uses strategy from .env file")
print("2. run_strategy_preset('balanced') - Recommended for general use")
print("3. run_strategy_preset('discussion_heavy') - For posts with lots of comments")
print("4. run_strategy_preset('recent_active') - For trending recent posts")
print("5. run_strategy_preset('controversial') - For posts with debates")
print("6. run_strategy_preset('quality_focused') - For highest quality posts")
print()
print(f"Current .env settings:")
print(f"  Strategy: {REDDIT_POST_FETCH_STRATEGY}")
print(f"  Posts count: {REDDIT_POSTS_COUNT}")
print()
print("Example usage:")
print("  data, timestamp = run_strategy_preset()  # Uses .env settings")
print("  data, timestamp = run_strategy_preset('discussion_heavy')  # Override strategy")
print()
print("Or use the detailed configuration in the next cell.")

Scraping Strategy Presets Available:
1. run_strategy_preset() - Uses strategy from .env file
2. run_strategy_preset('balanced') - Recommended for general use
3. run_strategy_preset('discussion_heavy') - For posts with lots of comments
4. run_strategy_preset('recent_active') - For trending recent posts
5. run_strategy_preset('controversial') - For posts with debates
6. run_strategy_preset('quality_focused') - For highest quality posts

Current .env settings:
  Strategy: discussion_heavy
  Posts count: 25

Example usage:
  data, timestamp = run_strategy_preset()  # Uses .env settings
  data, timestamp = run_strategy_preset('discussion_heavy')  # Override strategy

Or use the detailed configuration in the next cell.


In [20]:
# Run management and analysis utilities

def analyze_run(timestamp=None):
    """Analyze a specific run or the most recent run"""

    if timestamp is None:
        # Find most recent run
        runs = list_all_runs()
        if not runs:
            print("No runs found")
            return
        timestamp = runs[0]['timestamp']
        print(f"Analyzing most recent run: {timestamp}")

    metadata_file = f'metadata/scraping_results_run{timestamp}.json'

    try:
        with open(metadata_file, 'r') as f:
            data = json.load(f)

        if not data:
            print("No data found in run")
            return

        print(f"\nAnalysis of run {timestamp}:")
        print("=" * 50)
        print(f"Total images: {len(data)}")
        print(f"Run directory: {data[0]['run_directory']}")
        print(f"Scraped at: {data[0]['scraped_at']}")

        # Bug type distribution
        bug_counts = defaultdict(int)
        total_score = 0
        total_comments = 0

        for item in data:
            bug_counts[item['bug_type']] += 1
            total_score += item.get('post_score', 0)
            total_comments += item.get('num_comments', 0)

        print(f"\nBug Type Distribution:")
        for bug_type, count in sorted(bug_counts.items(), key=lambda x: x[1], reverse=True):
            percentage = (count / len(data)) * 100
            print(f"  {bug_type.upper()}: {count} images ({percentage:.1f}%)")

        print(f"\nAverage post score: {total_score / len(data):.1f}")
        print(f"Average comments per post: {total_comments / len(data):.1f}")

        # Show some sample classifications
        print(f"\nSample classifications:")
        for i, item in enumerate(data[:5]):
            print(f"  {item['bug_type'].upper()}: {item['post_title'][:60]}...")

    except FileNotFoundError:
        print(f"Run metadata not found: {metadata_file}")
    except Exception as e:
        print(f"Error analyzing run: {e}")

def compare_runs():
    """Compare all runs"""
    runs = list_all_runs()

    if len(runs) < 2:
        print("Need at least 2 runs to compare")
        return

    print("Run Comparison:")
    print("=" * 80)

    # Load all run data
    all_run_data = []
    for run in runs:
        timestamp = run['timestamp']
        metadata_file = f'metadata/scraping_results_run{timestamp}.json'

        try:
            with open(metadata_file, 'r') as f:
                data = json.load(f)

            bug_counts = defaultdict(int)
            for item in data:
                bug_counts[item['bug_type']] += 1

            all_run_data.append({
                'timestamp': timestamp,
                'total_images': len(data),
                'bug_counts': dict(bug_counts)
            })
        except:
            continue

    # Print comparison table
    print(f"{'Run':<16} {'Total':<8} {'Unknown':<8} {'Mosquito':<10} {'Spider':<8} {'Others':<8}")
    print("-" * 80)

    for run_data in all_run_data:
        timestamp = run_data['timestamp']
        total = run_data['total_images']
        unknown = run_data['bug_counts'].get('unknown', 0)
        mosquito = run_data['bug_counts'].get('mosquito', 0)
        spider = run_data['bug_counts'].get('spider', 0)
        others = total - unknown - mosquito - spider

        print(f"{timestamp:<16} {total:<8} {unknown:<8} {mosquito:<10} {spider:<8} {others:<8}")

def cleanup_old_runs(keep_recent=5):
    """Clean up old runs, keeping only the most recent ones"""
    runs = list_all_runs()

    if len(runs) <= keep_recent:
        print(f"Only {len(runs)} runs found, nothing to clean up")
        return

    runs_to_delete = runs[keep_recent:]

    print(f"Will delete {len(runs_to_delete)} old runs, keeping {keep_recent} most recent:")
    for run in runs_to_delete:
        print(f"  {run['timestamp']}")

    confirm = input("\nProceed with deletion? (y/N): ")
    if confirm.lower() != 'y':
        print("Cleanup cancelled")
        return

    deleted_count = 0
    for run in runs_to_delete:
        timestamp = run['timestamp']

        # Delete run directory
        run_dir = f"images/run{timestamp}"
        if os.path.exists(run_dir):
            import shutil
            shutil.rmtree(run_dir)
            deleted_count += 1

        # Delete metadata files
        for file_pattern in [f'metadata/scraping_results_run{timestamp}.json',
                           f'metadata/run_summary_{timestamp}.json']:
            if os.path.exists(file_pattern):
                os.remove(file_pattern)

    print(f"Cleanup complete! Deleted {deleted_count} run directories")

# Quick analysis of existing data
print("Run Management Utilities Ready!")
print("\nCurrent runs:")
list_all_runs()

if os.path.exists('images') and any(d.startswith('run') for d in os.listdir('images')):
    print("\nAnalyzing most recent run:")
    analyze_run()

Run Management Utilities Ready!

Current runs:
Found 7 previous runs:
----------------------------------------------------------------------------------------------------
Run 20250806_203706: 48 images, Sort: controversial, Comments: 240
  Directory: images/run20250806_203706
  Types: BED_BUG: 13, MOSQUITO: 20, ANT: 3, SPIDER: 3, FLEA: 9

Run 20250806_203307: 36 images, Sort: top, Comments: 271
  Directory: images/run20250806_203307
  Types: MOSQUITO: 14, BED_BUG: 13, ANT: 1, TICK: 3, FLEA: 5

Run 20250806_201819: 36 images, Sort: top, Comments: 271
  Directory: images/run20250806_201819
  Types: TICK: 5, BED_BUG: 16, DERMATITIS: 1, SCABIES: 1, SPIDER: 2, FLEA: 2, MITE: 4, BEE: 5

Run 20250806_201209: 36 images, Sort: top, Comments: 271
  Directory: images/run20250806_201209
  Types: TICK: 1, UNKNOWN: 24, BED_BUG: 10, SCABIES: 1

Run 20250806_200730: 36 images, Sort: top, Comments: 271
  Directory: images/run20250806_200730
  Types: TICK: 5, BED_BUG: 16, ANT: 1, MITE: 5, SPIDER: 2, FLE

In [21]:
# Analysis and utility functions
def analyze_scraped_data(metadata_file='metadata/scraping_results.json'):
    """Analyze the scraped data and show statistics"""

    try:
        with open(metadata_file, 'r') as f:
            data = json.load(f)

        if not data:
            print("No data found in metadata file")
            return

        print(f"Analysis of {len(data)} scraped images:")
        print("=" * 40)

        # Bug type distribution
        bug_counts = defaultdict(int)
        total_score = 0
        total_comments = 0

        for item in data:
            bug_counts[item['bug_type']] += 1
            total_score += item.get('post_score', 0)
            total_comments += item.get('num_comments', 0)

        print("\nBug Type Distribution:")
        for bug_type, count in sorted(bug_counts.items(), key=lambda x: x[1], reverse=True):
            percentage = (count / len(data)) * 100
            print(f"  {bug_type.upper()}: {count} images ({percentage:.1f}%)")

        print(f"\nAverage post score: {total_score / len(data):.1f}")
        print(f"Average comments per post: {total_comments / len(data):.1f}")

        # Show some sample filenames
        print(f"\nSample downloaded files:")
        for i, item in enumerate(data[:5]):
            print(f"  {item['filename']} - {item['bug_type']} - {item['post_title'][:50]}...")

    except FileNotFoundError:
        print(f"Metadata file {metadata_file} not found. Run the scraper first!")
    except Exception as e:
        print(f"Error analyzing data: {e}")

def list_downloaded_images():
    """List all downloaded images"""
    try:
        images = [f for f in os.listdir('images') if f.endswith('.jpg')]
        if images:
            print(f"Downloaded images ({len(images)} total):")
            for img in sorted(images):
                print(f"  {img}")
        else:
            print("No images found in images/ directory")
    except FileNotFoundError:
        print("Images directory not found")

# Run analysis if metadata exists
if os.path.exists('metadata/scraping_results.json'):
    analyze_scraped_data()
else:
    print("No metadata file found yet. Run the scraper first!")

No metadata file found yet. Run the scraper first!


## Instructions for Use

### Before Running the Scraper:

1. **Set up Reddit API credentials:**
   - Go to https://www.reddit.com/prefs/apps/
   - Click "Create App" or "Create Another App"
   - Choose "script" as the app type
   - Fill in the required fields
   - Copy your `client_id` and `client_secret`
   - Update the credentials in the "Reddit API Configuration" cell above
   - Optionally set `REDDIT_SUBREDDIT=your_target_subreddit` (defaults to 'bugbites')
   - Optionally set `REDDIT_POST_FETCH_STRATEGY=strategy_name` (defaults to 'balanced')
     - Available strategies: 'balanced', 'discussion_heavy', 'recent_active', 'controversial', 'quality_focused'
   - Optionally set `REDDIT_POSTS_COUNT=number` (defaults to 15)

2. **Configure scraping parameters:**
   - Adjust `POSTS_TO_SCRAPE` (recommended: start with 10-25 for testing)
   - Choose `TIME_FILTER` ('hour', 'day', 'week', 'month', 'year', 'all')

### How the Script Works:

1. **Connects to Reddit** using PRAW in read-only mode
2. **Fetches posts** from r/bugbites subreddit
3. **Analyzes text** (title + comments) to detect bug types using keyword matching
4. **Downloads images** from posts and saves them as `BUGTYPE_N.jpg`
5. **Saves metadata** about each image in JSON format

### Output Structure:
```
images/
├── MOSQUITO_1.jpg
├── MOSQUITO_2.jpg
├── SPIDER_1.jpg
├── UNKNOWN_1.jpg
└── ...

metadata/
└── scraping_results.json
```

### Troubleshooting:

- **"Error connecting to Reddit API"**: Check your credentials
- **"No images found"**: Some posts may not contain direct image links
- **Rate limiting**: The script includes delays to respect Reddit's API limits
- **Imgur albums**: Currently only handles single Imgur images, not full albums

### Notes:

- The script respects Reddit's API rate limits with built-in delays
- Images are classified based on text analysis of titles and comments
- Unknown or ambiguous posts are classified as 'UNKNOWN'
- All metadata is preserved for later analysis

In [22]:
# DEBUG: Check what's in the current BUG_TYPES['unknown'] category
print("Current BUG_TYPES['unknown'] category:")
print(BUG_TYPES['unknown'])
print()
print("This should only contain: ['unknown', 'unidentified', 'mystery', 'unclear', 'unsure']")
print("If it contains 'bug', 'bite', 'what bit', etc. - that's the problem!")

Current BUG_TYPES['unknown'] category:
['unknown', 'unidentified', 'mystery', 'unclear', 'unsure']

This should only contain: ['unknown', 'unidentified', 'mystery', 'unclear', 'unsure']
If it contains 'bug', 'bite', 'what bit', etc. - that's the problem!
