In [5]:
import requests
import pandas as pd
from datetime import datetime, timedelta
import time
import json
import logging
from pathlib import Path
import calendar

class NYTArticleScraper:
    """
    Collect NYT Article Search results day by day and save them per month.

    Every calendar day in the configured range is queried separately and
    paginated; the articles gathered for each month are written to one CSV
    file inside ``output_dir``. All requests are throttled by
    ``wait_for_rate_limit``.
    """

    # Maximum number of requests issued per calendar day before the scraper
    # sleeps until midnight.
    DAILY_REQUEST_LIMIT = 500
    # Minimum spacing, in seconds, between two consecutive requests.
    MIN_REQUEST_INTERVAL = 12
    # A page holding fewer documents than this is treated as the last page.
    PAGE_SIZE = 10
    # Seconds to wait for the API before a request is abandoned.
    REQUEST_TIMEOUT = 30

    def __init__(self, api_key, query, start_date, end_date=None, output_dir='Raw Data'):
        """
        Initialize the NYT Article Scraper

        Parameters:
        api_key (str): NYT API key
        query (str): Search query
        start_date (str): Start date in YYYYMMDD format
        end_date (str): Optional end date in YYYYMMDD format; defaults to
            start_date (a single-day scrape)
        output_dir (str or Path): Directory receiving the log file and the
            monthly CSV files; created if it does not exist
        """
        self.api_key = api_key
        self.base_url = "https://api.nytimes.com/svc/search/v2/articlesearch.json"
        self.query = query
        self.start_date = start_date
        self.end_date = end_date or start_date
        self.daily_requests = 0
        self.last_request_time = None
        self.current_date = datetime.now().strftime('%Y%m%d')

        # The log file and the CSV files both live here, so the directory has
        # to exist before the file handler is opened.
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Setup logging. basicConfig() is a no-op once the root logger has
        # handlers, so only build (and open) the file handler when it will
        # actually be installed.
        if not logging.getLogger().handlers:
            logging.basicConfig(
                level=logging.INFO,
                format='%(asctime)s - %(levelname)s - %(message)s',
                handlers=[
                    logging.FileHandler(self.output_dir / 'NYT_Scraper.log'),
                    logging.StreamHandler(),
                ],
            )
        self.logger = logging.getLogger(__name__)

    def wait_for_rate_limit(self):
        """
        Manage API rate limits

        Blocks until another request may be sent, then records the request:
        resets the daily counter on a date change, sleeps until midnight once
        DAILY_REQUEST_LIMIT is reached, and keeps at least
        MIN_REQUEST_INTERVAL seconds between consecutive requests.
        """
        now = datetime.now()

        # Check if we need to reset daily counter
        today = now.strftime('%Y%m%d')
        if today != self.current_date:
            self.daily_requests = 0
            self.current_date = today

        # Check daily limit
        if self.daily_requests >= self.DAILY_REQUEST_LIMIT:
            next_midnight = (now + timedelta(days=1)).replace(
                hour=0, minute=0, second=0, microsecond=0
            )
            # total_seconds() covers the whole interval, unlike .seconds,
            # which silently drops any day component.
            wait_seconds = max((next_midnight - now).total_seconds(), 0)
            self.logger.info(f"Daily limit reached. Waiting {int(wait_seconds)} seconds until midnight...")
            time.sleep(wait_seconds + 1)
            self.daily_requests = 0
            self.current_date = datetime.now().strftime('%Y%m%d')

        # Handle per-minute rate limit
        if self.last_request_time is not None:
            elapsed = time.time() - self.last_request_time
            if elapsed < self.MIN_REQUEST_INTERVAL:
                time.sleep(self.MIN_REQUEST_INTERVAL - elapsed)

        self.last_request_time = time.time()
        self.daily_requests += 1

    def get_params(self, page, current_date):
        """
        Generate parameters for API request

        Parameters:
        page (int): Zero-based result page
        current_date (str): Day to query, in YYYYMMDD format; used as both
            begin_date and end_date

        Returns:
        dict: Query-string parameters for the search request
        """
        return {
            'q': self.query,
            'news_desk': 'World;Politics',
            'begin_date': current_date,
            'end_date': current_date,
            'type_of_material': 'News',
            'api-key': self.api_key,
            'page': page,
            'sort': 'oldest'
        }

    def process_article(self, article):
        """
        Process a single article and extract relevant information

        Parameters:
        article (dict): One entry of the API's ``docs`` list

        Returns:
        dict or None: Flattened article record, or None when the document is
            malformed (missing keys, a null nested field, or an unparseable
            publication date); the problem is logged.
        """
        try:
            # Bucket keyword values by category in a single pass.
            keyword_groups = {
                'persons': [],
                'glocations': [],
                'subject': [],
                'organizations': [],
            }
            for kw in article['keywords']:
                bucket = keyword_groups.get(kw['name'])
                if bucket is not None:
                    bucket.append(kw['value'])

            return {
                'headline': article['headline']['main'],
                'publication_date': pd.to_datetime(article['pub_date']),
                'people_mentioned': '; '.join(keyword_groups['persons']),
                'places_mentioned': '; '.join(keyword_groups['glocations']),
                'subjects': '; '.join(keyword_groups['subject']),
                'organizations': '; '.join(keyword_groups['organizations']),
                'url': article['web_url'],
                'abstract': article.get('abstract', ''),
                'word_count': article.get('word_count', 0),
            }
        except (KeyError, TypeError, ValueError) as e:
            # TypeError: a nested field such as 'headline' is null.
            # ValueError: 'pub_date' could not be parsed.
            # Either way, skip this article rather than abort the scrape.
            self.logger.error(f"Error processing article: {e}")
            return None

    def get_month_range(self, date_str):
        """
        Get the start and end date for a month containing the given date

        Parameters:
        date_str (str): Any date in YYYYMMDD format

        Returns:
        tuple: (first day, last day) of that month, both as YYYYMMDD strings
        """
        date = datetime.strptime(date_str, '%Y%m%d')
        _, last_day = calendar.monthrange(date.year, date.month)
        month_start = date.replace(day=1).strftime('%Y%m%d')
        month_end = date.replace(day=last_day).strftime('%Y%m%d')
        return month_start, month_end

    def fetch_articles_for_date(self, current_date):
        """
        Fetch all articles for a specific date

        Pages through the results until an empty or short page is returned.
        A failed request or an undecodable body is logged and ends pagination
        for this date; articles collected so far are still returned.

        Parameters:
        current_date (str): Day to query, in YYYYMMDD format

        Returns:
        list: Processed article records (possibly empty)
        """
        page = 0
        articles_for_date = []

        while True:
            self.logger.info(
                f"Fetching page {page} for date {current_date} "
                f"(Daily requests: {self.daily_requests}/{self.DAILY_REQUEST_LIMIT})"
            )
            self.wait_for_rate_limit()

            try:
                # Without a timeout a stalled connection would block forever.
                response = requests.get(
                    self.base_url,
                    params=self.get_params(page, current_date),
                    timeout=self.REQUEST_TIMEOUT,
                )
                response.raise_for_status()
                data = response.json()
            except requests.exceptions.RequestException as e:
                self.logger.error(f"API request failed for date {current_date}, page {page}: {e}")
                break
            except json.JSONDecodeError as e:
                self.logger.error(f"JSON parsing error for date {current_date}, page {page}: {e}")
                break

            # Tolerate payloads lacking 'response' or 'docs' instead of
            # raising on the nested lookup.
            body = data.get('response') if isinstance(data, dict) else None
            docs = body.get('docs') if isinstance(body, dict) else None
            if not docs:
                break

            processed_articles = []
            for doc in docs:
                record = self.process_article(doc)
                if record is not None:
                    processed_articles.append(record)

            articles_for_date.extend(processed_articles)
            self.logger.info(f"Found {len(processed_articles)} articles on page {page}")

            # Check if we've reached the last page
            if len(docs) < self.PAGE_SIZE:
                break

            page += 1

        return articles_for_date

    def fetch_articles_by_month(self):
        """
        Fetch articles month by month

        Walks from start_date to end_date one day at a time and, for every
        month that produced at least one article, writes
        ``NYT_Articles_<YYYY>_<MM>.csv`` into the output directory.
        """
        current_date = datetime.strptime(self.start_date, '%Y%m%d')
        end_date = datetime.strptime(self.end_date, '%Y%m%d')

        # Make sure the CSV destination exists even if it was removed after
        # the scraper was constructed.
        self.output_dir.mkdir(parents=True, exist_ok=True)

        while current_date <= end_date:
            # Last day to process this month: month end or overall end date,
            # whichever comes first.
            _, month_end = self.get_month_range(current_date.strftime('%Y%m%d'))
            month_end_date = min(end_date, datetime.strptime(month_end, '%Y%m%d'))
            month_articles = []

            # Process each day in the month. current_date is either the
            # overall start date or the first day of a later month.
            current_day = current_date
            while current_day <= month_end_date:
                current_day_str = current_day.strftime('%Y%m%d')
                self.logger.info(f"Processing date: {current_day_str}")

                month_articles.extend(self.fetch_articles_for_date(current_day_str))
                current_day += timedelta(days=1)

            # Save month's articles
            if month_articles:
                file_stamp = current_date.strftime('%Y_%m')
                monthly_file = self.output_dir / f'NYT_Articles_{file_stamp}.csv'
                pd.DataFrame(month_articles).to_csv(monthly_file, index=False)
                self.logger.info(f"Saved {len(month_articles)} articles for {current_date.strftime('%Y-%m')} to {monthly_file}")

            # Move to first day of next month: day 28 exists in every month
            # and adding 4 days always lands in the following month.
            current_date = (current_date.replace(day=28) + timedelta(days=4)).replace(day=1)

def main():
    """
    Run the scraper for the configured query and date range.

    The API key is read from the NYT_API_KEY environment variable so that the
    credential is not stored in the source file.

    Raises:
    RuntimeError: If NYT_API_KEY is unset or empty.
    """
    # Imported here so main() brings its own dependency into scope.
    import os

    # Configuration
    api_key = os.environ.get("NYT_API_KEY")
    if not api_key:
        raise RuntimeError(
            "NYT_API_KEY environment variable is not set; "
            "export your NYT API key before running the scraper."
        )
    QUERY = "Israel-Gaza War (2023-)"
    START_DATE = "20240607"
    END_DATE = "20241007"

    # Initialize and run scraper
    scraper = NYTArticleScraper(
        api_key=api_key,
        query=QUERY,
        start_date=START_DATE,
        end_date=END_DATE
    )

    scraper.fetch_articles_by_month()

# Start the scrape only when this file is executed directly, so importing it
# has no side effects.
if __name__ == "__main__":
    main()

2024-11-16 23:00:10,596 - INFO - Processing date: 20240607
2024-11-16 23:00:10,596 - INFO - Fetching page 0 for date 20240607 (Daily requests: 0/500)
2024-11-16 23:00:11,181 - INFO - Found 10 articles on page 0
2024-11-16 23:00:11,183 - INFO - Fetching page 1 for date 20240607 (Daily requests: 1/500)
2024-11-16 23:00:23,034 - INFO - Found 1 articles on page 1
2024-11-16 23:00:23,036 - INFO - Processing date: 20240608
2024-11-16 23:00:23,037 - INFO - Fetching page 0 for date 20240608 (Daily requests: 2/500)
2024-11-16 23:00:35,156 - INFO - Found 10 articles on page 0
2024-11-16 23:00:35,157 - INFO - Fetching page 1 for date 20240608 (Daily requests: 3/500)
2024-11-16 23:00:47,103 - INFO - Found 4 articles on page 1
2024-11-16 23:00:47,114 - INFO - Processing date: 20240609
2024-11-16 23:00:47,115 - INFO - Fetching page 0 for date 20240609 (Daily requests: 4/500)
2024-11-16 23:00:59,057 - INFO - Found 9 articles on page 0
2024-11-16 23:00:59,058 - INFO - Processing date: 20240610
2024-11