In [4]:
import requests
import gzip
import json
import os
from datetime import datetime, timedelta
import time
import logging

# Root logger setup: emit INFO and above through a StreamHandler, with each
# record rendered as "<timestamp> - <level> - <message>".
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format=_LOG_FORMAT,
    level=logging.INFO,
)

def _select_movie_entries(lines, min_popularity):
    """Parse export lines and keep only the entries that pass the movie filter.

    Args:
        lines: Iterable of strings, each expected to hold one JSON object.
        min_popularity: Minimum ``popularity`` value an entry needs to be kept.

    Returns:
        A ``(movies, skipped)`` tuple: the list of kept movie dictionaries and
        the number of non-blank lines dropped because they could not be parsed
        or did not have the expected shape.
    """
    movies = []
    skipped = 0

    for line in lines:
        if not line.strip():
            continue

        try:
            entry = json.loads(line)

            # FILTER: only include actual movies. Entries without a 'video'
            # flag are treated as video content and therefore excluded.
            is_movie = (
                entry.get('media_type', 'movie') == 'movie'
                and not entry.get('video', True)
                and entry.get('popularity', 0) >= min_popularity
            )
            if not is_movie:
                continue

            release_date = entry.get('release_date')
            movies.append({
                'id': entry['id'],
                'title': entry.get('original_title', ''),
                'popularity': entry.get('popularity', 0),
                'adult': entry.get('adult', True),
                'release_year': release_date[:4] if isinstance(release_date, str) else None,
            })
        except (json.JSONDecodeError, AttributeError, KeyError, TypeError):
            # One bad record (invalid JSON, non-object value, missing 'id',
            # non-numeric popularity) must not abort the whole export.
            skipped += 1

    return movies, skipped


def fetch_filtered_movie_ids(output_dir="data", min_popularity=0.1, max_retry_days=3):
    """Download TMDB's daily movie-ID export and keep only the filtered movies.

    Starting with today's date and walking back one day at a time, the export
    ``movie_ids_<MM_DD_YYYY>.json.gz`` is requested until one download
    succeeds. The first export that downloads is decompressed, filtered and
    written to ``<output_dir>/raw_movies_<MM_DD_YYYY>.json``.

    Args:
        output_dir: Directory for the filtered JSON dump (created if missing).
        min_popularity: Minimum ``popularity`` value an entry needs to be kept.
        max_retry_days: Number of dates to try (today, yesterday, ...).

    Returns:
        List of dictionaries with the keys ``id``, ``title``, ``popularity``,
        ``adult`` and ``release_year``.

    Raises:
        RuntimeError: If no export could be downloaded for any attempted date.
            The last download error, if any, is attached as ``__cause__``.
    """
    os.makedirs(output_dir, exist_ok=True)
    last_error = None

    # Try recent dates, newest first.
    for days_back in range(max_retry_days):
        date_str = (datetime.now() - timedelta(days=days_back)).strftime("%m_%d_%Y")
        url = f"https://files.tmdb.org/p/exports/movie_ids_{date_str}.json.gz"
        raw_path = os.path.join(output_dir, f"raw_movies_{date_str}.json")

        try:
            logging.info(f"Downloading {url}")
            response = requests.get(url, timeout=30)
            response.raise_for_status()
        except requests.HTTPError as err:
            last_error = err
            logging.warning(f"Date {date_str} not available, trying previous day...")
        except requests.RequestException as err:
            # Timeouts / connection failures: fall back to the previous day
            # as well instead of aborting the whole run.
            last_error = err
            logging.warning(f"Download of {url} failed ({err}), trying previous day...")
        else:
            # Decompress directly in memory; the payload holds one JSON
            # object per line.
            decompressed = gzip.decompress(response.content).decode('utf-8')
            valid_movies, skipped = _select_movie_entries(
                decompressed.strip().split('\n'), min_popularity
            )
            if skipped:
                logging.warning(f"Skipped {skipped} malformed entries")

            # Save filtered data
            with open(raw_path, 'w') as f:
                json.dump(valid_movies, f, indent=2)

            logging.info(f"✅ Found {len(valid_movies)} valid movies")
            return valid_movies

        # Pause between attempts, but not after the final one.
        if days_back < max_retry_days - 1:
            time.sleep(2)

    raise RuntimeError("Failed to fetch movie data after multiple attempts") from last_error

def save_movie_data(movies, output_format='both'):
    """Save movie data to the current working directory.

    Files are named ``filtered_movies_<YYYYMMDD>.json`` / ``.csv`` using
    today's date.

    Args:
        movies: List of movie dictionaries to write.
        output_format: ``'json'``, ``'csv'`` or ``'both'``.

    Returns:
        The JSON path when ``output_format`` is ``'json'``; otherwise the CSV
        path (this includes ``'both'``, where both files are written).

    Raises:
        ValueError: If ``output_format`` is not one of the supported values.
    """
    # Validate up front: an unknown format used to write nothing and then
    # fail with UnboundLocalError in the return expression.
    if output_format not in ('json', 'csv', 'both'):
        raise ValueError(
            f"Unsupported output_format {output_format!r}; "
            "expected 'json', 'csv' or 'both'"
        )

    timestamp = datetime.now().strftime("%Y%m%d")
    json_path = None
    csv_path = None

    if output_format in ('json', 'both'):
        json_path = f"filtered_movies_{timestamp}.json"
        with open(json_path, 'w') as f:
            json.dump(movies, f, indent=2)
        logging.info(f"Saved JSON to {json_path}")

    if output_format in ('csv', 'both'):
        csv_path = f"filtered_movies_{timestamp}.csv"
        # Imported lazily so JSON-only use does not require pandas.
        import pandas as pd
        df = pd.DataFrame(movies)
        df.to_csv(csv_path, index=False)
        logging.info(f"Saved CSV to {csv_path}")

    return json_path if output_format == 'json' else csv_path

if __name__ == "__main__":
    # Download the export and keep the entries that pass the filter.
    # Raise min_popularity to keep only more popular titles.
    movies = fetch_filtered_movie_ids(output_dir="./movie_data", min_popularity=0.1)

    # Persist the result as both JSON and CSV.
    output_file = save_movie_data(movies, output_format='both')

    # Console summary: total count followed by the first five entries.
    divider = "=" * 50
    print("\n" + divider)
    print(f"FOUND {len(movies)} VALID MOVIES")
    print("Sample movies:")
    for rank, film in enumerate(movies[:5], start=1):
        print(f"{rank}. {film['title']} (ID: {film['id']}, Popularity: {film['popularity']})")
    print(divider)

2025-06-22 03:01:49,009 - INFO - Downloading https://files.tmdb.org/p/exports/movie_ids_06_22_2025.json.gz
2025-06-22 03:01:52,988 - INFO - Downloading https://files.tmdb.org/p/exports/movie_ids_06_21_2025.json.gz
2025-06-22 03:02:28,531 - INFO - ✅ Found 426847 valid movies
2025-06-22 03:02:39,755 - INFO - Saved JSON to filtered_movies_20250622.json
2025-06-22 03:02:42,857 - INFO - Saved CSV to filtered_movies_20250622.csv



FOUND 426847 VALID MOVIES
Sample movies:
1. Blondie (ID: 3924, Popularity: 0.666)
2. Der Mann ohne Namen (ID: 6124, Popularity: 0.1715)
3. L'Amour à vingt ans (ID: 8773, Popularity: 0.5937)
4. New World Disorder 9: Never Enough (ID: 25449, Popularity: 0.1416)
5. Ariel (ID: 2, Popularity: 1.7674)
