# Scraper Testing Notebook

**Purpose**: Test ONLY the web scrapers - no sentiment analysis, minimal database

## What We Test
1. ✅ Gemini AI Search Term Generator
2. ✅ IMDb Rating Scraper
3. ✅ IMDb Review Scraper
4. ⚠️ Reddit Scraper (needs API keys)
5. ⚠️ Twitter Scraper (optional)

## What We DON'T Test
- ❌ Sentiment analysis (NLP team)
- ❌ Review weighting (NLP team)
- ❌ Complex SQL operations
- ❌ Recommendation models

## Step 1: Setup & Imports

In [None]:
import sys
from pathlib import Path
import os
from dotenv import load_dotenv
import json
from pprint import pprint

# Add src to path.
# NOTE(review): assumes the notebook is launched from a directory one level
# below the project root (e.g. <root>/notebooks) -- confirm.
project_root = Path.cwd().parent
src_path = str(project_root / 'src')
# Notebook cells get re-run; only insert once so sys.path does not fill up
# with duplicate entries.
if src_path not in sys.path:
    sys.path.insert(0, src_path)

# Load environment variables, and say so when the file is missing instead of
# letting later cells fail with confusing "key not found" messages.
env_file = project_root / '.env'
if env_file.is_file():
    load_dotenv(env_file)
else:
    print(f"⚠️  No .env file found at {env_file}")

print("✅ Setup complete!")
print(f"📁 Project root: {project_root}")

## Step 2: Test Gemini AI Search Term Generator

In [None]:
from scrapers.gemini_search import GeminiSearchTermGenerator

# Check API key
gemini_api_key = os.getenv('GEMINI_API_KEY')
if not gemini_api_key:
    print("❌ GEMINI_API_KEY not found in .env")
else:
    # Never echo any part of the secret: cell output is stored inside the
    # notebook file and is easily committed or shared along with it.
    print(f"✅ Gemini API key loaded ({len(gemini_api_key)} characters)")

# Initialize generator
# NOTE(review): constructed even when the key is missing, as before;
# presumably the generator reads GEMINI_API_KEY itself -- confirm.
gemini = GeminiSearchTermGenerator()
print("✅ Gemini generator initialized!")

In [None]:
# Test: Generate search terms for "Inception"
print("🎬 Testing Gemini with 'Inception (2010)'...\n")

search_terms = gemini.generate_search_terms(
    title="Inception",
    year=2010,
    genres=["Action", "Sci-Fi", "Thriller"],
    overview="A thief who steals corporate secrets through dream-sharing technology."
)

print("📊 Result type:", type(search_terms))
print("📊 Keys:", list(search_terms.keys()) if search_terms else "None")
print("\n" + "="*80)

# How many terms to preview per platform; kept in one place so the slice and
# the "... and N more" count cannot drift apart.
preview_limit = 3

if search_terms:
    for platform, terms in search_terms.items():
        print(f"\n{platform.upper()}:")
        for term in terms[:preview_limit]:
            print(f"  • {term}")
        if len(terms) > preview_limit:
            print(f"  ... and {len(terms)-preview_limit} more")
else:
    print("⚠️  No search terms generated")

## Step 3: Test IMDb Rating Scraper

In [None]:
from scrapers.imdb_scraper import IMDbScraper

# Single source of truth for the delay, so the message below always matches
# the value actually passed to the scraper.
imdb_rate_limit = 2.0

# Initialize scraper
imdb = IMDbScraper(rate_limit=imdb_rate_limit)
print("✅ IMDb scraper initialized!")
print(f"⏱️  Rate limit: {imdb_rate_limit:g} seconds between requests")

In [None]:
# Test: Scrape rating for "Inception"
print("🎬 Testing IMDb rating scraper with 'Inception (2010)'...\n")

rating_data = imdb.scrape_movie_rating(
    title="Inception",
    year=2010
)

print("📊 Result type:", type(rating_data))
print("\n" + "="*80)

if rating_data:
    vote_count = rating_data.get('vote_count')
    # The ',' format spec only works on numbers; a missing/None (or string)
    # vote count would raise instead of being shown, hiding the other fields.
    if isinstance(vote_count, (int, float)):
        votes_text = f"{vote_count:,}"
    else:
        votes_text = str(vote_count)

    print("\n✅ RATING DATA:")
    print(f"  Rating: {rating_data.get('rating')}/10")
    print(f"  Votes: {votes_text}")
    print(f"  IMDb ID: {rating_data.get('imdb_id')}")
else:
    print("⚠️  No rating data found")

## Step 4: Test IMDb Review Scraper

In [None]:
# Test: Scrape reviews for "Zootopia"
print("🎬 Testing IMDb review scraper with 'Zootopia (2016)'...\n")
print("⏱️  This will take ~20 seconds (10 reviews with 2-second rate limit)\n")

# Normalise a None result to an empty list so len() below cannot raise and
# the "no reviews" branch is reached instead.
reviews = imdb.scrape_movie_reviews(
    title="Zootopia",
    year=2016,
    max_reviews=10
) or []

print("\n" + "="*80)
print(f"\n✅ Scraped {len(reviews)} reviews\n")

if reviews:
    print("📊 SAMPLE REVIEW (first one):")
    sample = reviews[0]
    # `or ''` also covers a 'text' key that is present but None, which the
    # .get() default alone does not.
    text_preview = (sample.get('text') or '')[:200]
    print(f"\n  Source: {sample.get('source')}")
    print(f"  Rating: {sample.get('rating')}/10")
    print(f"  Author: {sample.get('author')}")
    print(f"  Helpful votes: {sample.get('helpful_count')}")
    print(f"  Text preview: {text_preview}...")
    print("\n  Full structure:")
    # The full text is omitted here; it was previewed above.
    pprint({k: v for k, v in sample.items() if k != 'text'}, indent=4)
else:
    print("⚠️  No reviews found")

## Step 5: Test Reddit Scraper (Optional)

In [None]:
# Check if Reddit API keys are available
reddit_client_id = os.getenv('REDDIT_CLIENT_ID')
reddit_client_secret = os.getenv('REDDIT_CLIENT_SECRET')

if reddit_client_id and reddit_client_secret:
    print("✅ Reddit API keys found!")
    # Imported only when keys exist, so the notebook still runs without them.
    from scrapers.reddit_scraper import RedditScraper

    reddit = RedditScraper()
    print("✅ Reddit scraper initialized!")

    # Test search
    print("\n🔍 Searching Reddit for 'Inception movie discussion'...\n")
    # Normalise a None result to an empty list so len() below cannot raise.
    reddit_posts = reddit.search_posts(
        search_terms=["Inception movie discussion"],
        max_results_per_term=5
    ) or []

    print(f"✅ Found {len(reddit_posts)} Reddit posts")
    if reddit_posts:
        print("\n📊 SAMPLE POST:")
        sample = reddit_posts[0]
        # `or ''` also covers a 'text' key that is present but None.
        text_preview = (sample.get('text') or '')[:200]
        print(f"  Subreddit: {sample.get('subreddit')}")
        print(f"  Score: {sample.get('score')}")
        print(f"  Text: {text_preview}...")
else:
    print("⚠️  Reddit API keys not found in .env")
    print("   To test Reddit scraper, add:")
    print("   REDDIT_CLIENT_ID=your_id")
    print("   REDDIT_CLIENT_SECRET=your_secret")

## Step 6: Test Twitter Scraper (Optional)

In [None]:
print("⚠️  Twitter scraper uses snscrape (no API key needed)")
print("   However, Twitter/X has been restricting scraping recently.")
print("   Test at your own risk - may not work reliably.\n")

# Opt-in switch: set to True to run the Twitter test. It is off by default,
# so this cell behaves exactly like the previous commented-out version, but
# the test code is now real code that gets syntax-checked.
RUN_TWITTER_TEST = False

if RUN_TWITTER_TEST:
    from scrapers.twitter_scraper import TwitterScraper
    twitter = TwitterScraper()
    tweets = twitter.search_tweets(
        search_terms=["#Inception movie"],
        max_tweets_per_term=5
    )
    print(f"Found {len(tweets)} tweets")

## Summary: Scraper Test Results

In [None]:
# Look up what the earlier test cells actually produced instead of claiming
# success unconditionally. A name is absent from globals() when its cell was
# skipped or failed before the assignment, so .get() keeps this cell runnable
# on its own.
_ns = globals()
gemini_ok = bool(_ns.get('search_terms'))
rating_ok = bool(_ns.get('rating_data'))
reviews_ok = bool(_ns.get('reviews'))
reddit_ok = bool(_ns.get('reddit_posts'))

# Status prefix per outcome; the warning marker carries an extra space so the
# text lines up the same way as in the other cells.
marker = {True: "✅", False: "⚠️ "}

print("="*80)
print("SCRAPER TEST SUMMARY")
print("="*80)
print()
print(f"{marker[gemini_ok]} Gemini AI: Generates search terms (dict with platform keys)")
print(f"{marker[rating_ok]} IMDb Rating: Returns rating, vote count, IMDb ID")
print(f"{marker[reviews_ok]} IMDb Reviews: Returns list of review dictionaries")
if reddit_ok:
    print("✅ Reddit: Returns posts with subreddit, score and text")
else:
    print("⚠️  Reddit: No posts scraped (needs API keys in .env)")
print("⚠️  Twitter: May not work due to platform restrictions")
print()
print("="*80)
print("NEXT STEPS")
print("="*80)
print()
# Reddit and Twitter are optional, so only the three core scrapers gate the
# "ready to integrate" message.
if gemini_ok and rating_ok and reviews_ok:
    print("1. Scrapers are working independently ✅")
    print("2. Data structures are correct ✅")
    print("3. Ready to integrate with database")
    print("4. NLP team can process scraped reviews")
    print("5. Recommendation team can use rated movies")
else:
    print("1. Re-run the cells marked ⚠️  above and fix any failures")
    print("2. Then integrate with database, NLP and recommendation steps")