<a href="https://colab.research.google.com/github/ahteshamsalamatansari/colabcodes/blob/main/MIT_News.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!/usr/bin/env python3
"""
MIT News Articles Scraper
Optimized for scraping 10,000+ articles in ~40 minutes
Run this entire cell in Google Colab with a single click
"""

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin, urlparse
import logging
from tqdm.notebook import tqdm
import warnings
# Suppress every Python warning category for the rest of the process.
# NOTE(review): this also hides deprecation notices from the imported libraries —
# presumably done to keep notebook output tidy; confirm that is still wanted.
warnings.filterwarnings('ignore')

# Configure logging for transparency: root logger at INFO with timestamped records.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger used by the scraper methods below for warnings and errors.
logger = logging.getLogger(__name__)

class MITNewsScraper:
    """Concurrent scraper that extracts the title and body text of article pages.

    One ``requests.Session`` is shared by all worker threads. Results
    accumulate on the instance in ``successful_scrapes`` / ``failed_scrapes``
    across calls to :meth:`scrape_articles`.
    """

    # Title selectors, tried in order; the first one yielding non-empty text wins.
    _TITLE_SELECTORS = (
        'h1.news-article--headline',
        'h1[itemprop="headline"]',
        'h1.article-title',
        'h1',
        '.news-article--headline',
        '[itemprop="headline"]',
    )

    # Article-body container selectors, tried in order.
    _CONTENT_SELECTORS = (
        '.news-article--content--body--inner',
        '.news-article--content--body',
        '[itemprop="articleBody"]',
        '.article-body',
        '.content-body',
    )

    # Elements stripped from <body> before the whole-page text fallback.
    _NON_CONTENT_TAGS = ["script", "style", "nav", "header", "footer", "aside"]

    # Whole-word share-widget vocabulary. Word boundaries keep "printing",
    # "blueprint" or "shared" from being mistaken for share buttons.
    _SOCIAL_PATTERN = re.compile(
        r'\b(?:share|tweet|facebook|linkedin|reddit|print|follow us)\b',
        re.IGNORECASE,
    )
    # Only paragraphs this short are treated as possible share-widget text;
    # longer paragraphs are article prose even if they mention a social network.
    _SOCIAL_MAX_WORDS = 15

    # "[...]" spans; the negated class stops at the first closing bracket and
    # also matches across line breaks.
    _BRACKET_PATTERN = re.compile(r'\[[^\]]*\]')
    # A single parenthesised group mentioning "email". Excluding parentheses
    # inside the group prevents a match from running from one "(...)" into the next.
    _EMAIL_PAREN_PATTERN = re.compile(r'\([^()]*email[^()]*\)', re.IGNORECASE)
    _WHITESPACE_PATTERN = re.compile(r'\s+')

    def __init__(self, max_workers=20, request_timeout=10, retry_attempts=3):
        """
        Initialize the scraper with optimized settings for speed

        Args:
            max_workers (int): Number of concurrent threads (20 for 40min target)
            request_timeout (int): Request timeout in seconds
            retry_attempts (int): Number of retry attempts for failed requests
        """
        # Local import: HTTPAdapter ships with the already-imported requests package.
        from requests.adapters import HTTPAdapter

        self.max_workers = max_workers
        self.request_timeout = request_timeout
        self.retry_attempts = retry_attempts
        self.session = requests.Session()

        # The default adapter keeps at most 10 pooled connections per host. With
        # more worker threads than that, surplus connections are discarded after
        # each request instead of being reused, so size the pool to the workers.
        pool_size = max_workers if isinstance(max_workers, int) and max_workers > 0 else 10
        adapter = HTTPAdapter(pool_connections=pool_size, pool_maxsize=pool_size)
        self.session.mount('https://', adapter)
        self.session.mount('http://', adapter)

        # Browser-like headers sent with every request
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })

        # Results storage
        self.successful_scrapes = []
        self.failed_scrapes = []

    def clean_text(self, text):
        """
        Clean extracted text from unwanted characters and formatting

        Removes "[...]" spans and parenthesised groups that mention "email",
        then collapses all whitespace runs to single spaces. Whitespace is
        normalised last so that a removed span does not leave a double space.

        Args:
            text (str | None): Raw text

        Returns:
            str: Cleaned text, or "" for empty/None input
        """
        if not text:
            return ""

        text = self._BRACKET_PATTERN.sub('', text)
        text = self._EMAIL_PAREN_PATTERN.sub('', text)
        text = self._WHITESPACE_PATTERN.sub(' ', text)

        return text.strip()

    def _is_boilerplate_paragraph(self, text, link_count):
        """
        Decide whether a paragraph is share-widget/navigation text, not prose

        Args:
            text (str): Stripped paragraph text
            link_count (int): Number of <a> tags inside the paragraph

        Returns:
            bool: True if the paragraph should be skipped
        """
        # Very short fragments are almost always navigation or labels
        if len(text) < 10:
            return True

        words_count = len(text.split())

        # Short paragraph made of share-button vocabulary
        if words_count <= self._SOCIAL_MAX_WORDS and self._SOCIAL_PATTERN.search(text):
            return True

        # Short paragraph that is mostly links (at least one link per five words)
        if link_count > 0 and words_count < 20 and link_count >= words_count / 5:
            return True

        return False

    def extract_article_content(self, soup):
        """
        Extract clean article content from BeautifulSoup object

        Falls back to the text of the whole <body> (minus script/style/nav/
        header/footer/aside elements, which are removed from ``soup`` in
        place) when no article container yields any paragraph text.

        Args:
            soup: BeautifulSoup parsed HTML

        Returns:
            tuple: (title, content); either may be "" when nothing was found
        """
        title = ""
        content = ""

        try:
            # Keep trying selectors until one yields non-empty text, so an empty
            # heading element does not mask a later, populated one.
            for selector in self._TITLE_SELECTORS:
                title_elem = soup.select_one(selector)
                if title_elem:
                    title = self.clean_text(title_elem.get_text())
                    if title:
                        break

            content_container = None
            for selector in self._CONTENT_SELECTORS:
                content_container = soup.select_one(selector)
                if content_container:
                    break

            if content_container:
                paragraphs = []
                for p_tag in content_container.find_all('p'):
                    p_text = p_tag.get_text().strip()
                    if self._is_boilerplate_paragraph(p_text, len(p_tag.find_all('a'))):
                        continue

                    # A paragraph can clean down to nothing (e.g. only "[...]");
                    # skip it so the joined content has no stray spaces.
                    cleaned = self.clean_text(p_text)
                    if cleaned:
                        paragraphs.append(cleaned)

                content = ' '.join(paragraphs)

            # Fallback if no content found
            if not content:
                body = soup.find('body')
                if body:
                    for unwanted in body(self._NON_CONTENT_TAGS):
                        unwanted.decompose()
                    content = self.clean_text(body.get_text())

        except Exception as e:
            # Best effort: a parsing problem on one page must not abort the run.
            logger.error(f"Error extracting content: {str(e)}")

        return title, content

    def scrape_single_article(self, url):
        """
        Scrape a single article with retry logic

        Request errors are retried up to ``retry_attempts`` times with a
        linearly growing pause; any other exception stops the retries.

        Args:
            url (str): Article URL to scrape

        Returns:
            dict: Always a dict with keys url, title, content, status
                ('success' or 'failed'), content_length and attempt (the
                number of attempts actually made)
        """
        attempts_made = 0

        for attempt in range(self.retry_attempts):
            attempts_made = attempt + 1
            try:
                response = self.session.get(url, timeout=self.request_timeout)
                response.raise_for_status()

                soup = BeautifulSoup(response.content, 'html.parser')
                title, content = self.extract_article_content(soup)

                if title and content and len(content) > 100:  # Minimum content length
                    return {
                        'url': url,
                        'title': title,
                        'content': content,
                        'status': 'success',
                        'content_length': len(content),
                        'attempt': attempts_made
                    }

                # Falls through to the next attempt (the page is fetched again).
                logger.warning(f"Insufficient content for {url}")

            except requests.exceptions.RequestException as e:
                if attempt < self.retry_attempts - 1:
                    time.sleep(1 * (attempt + 1))  # Linear backoff: 1s, 2s, 3s, ...
                    continue
                logger.error(f"Failed to scrape {url} after {self.retry_attempts} attempts: {str(e)}")

            except Exception as e:
                logger.error(f"Unexpected error scraping {url}: {str(e)}")
                break

        return {
            'url': url,
            'title': '',
            'content': '',
            'status': 'failed',
            'content_length': 0,
            'attempt': attempts_made
        }

    def scrape_articles(self, urls):
        """
        Scrape multiple articles concurrently with progress visualization

        Args:
            urls (list): List of URLs to scrape

        Returns:
            tuple: (successful_results, failed_results) — the instance-level
                lists, which also contain results from earlier calls
        """
        print(f"🚀 Starting to scrape {len(urls)} articles...")
        print(f"⚙️  Configuration: {self.max_workers} threads, {self.request_timeout}s timeout")

        start_time = time.time()

        with tqdm(total=len(urls), desc="Scraping Progress", ncols=100) as pbar:
            with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
                futures = [executor.submit(self.scrape_single_article, url) for url in urls]

                for future in as_completed(futures):
                    result = future.result()

                    if result['status'] == 'success':
                        self.successful_scrapes.append(result)
                    else:
                        self.failed_scrapes.append(result)

                    # Refresh the counters after every result so the failure
                    # count is never stale; guard against a zero-length interval.
                    elapsed = time.time() - start_time
                    rate = len(self.successful_scrapes) / elapsed if elapsed > 0 else 0.0
                    pbar.set_postfix({
                        'Success': len(self.successful_scrapes),
                        'Failed': len(self.failed_scrapes),
                        'Rate': f"{rate:.1f}/s"
                    })
                    pbar.update(1)

        duration = time.time() - start_time
        # An empty URL list on a coarse clock can give a zero duration.
        average_rate = len(urls) / duration if duration > 0 else 0.0

        # Print final statistics
        print(f"\n📊 Scraping Results:")
        print(f"✅ Successful: {len(self.successful_scrapes)}")
        print(f"❌ Failed: {len(self.failed_scrapes)}")
        print(f"⏱️  Total time: {duration:.2f} seconds ({duration/60:.2f} minutes)")
        print(f"📈 Average rate: {average_rate:.2f} articles/second")
        print(f"💾 Total content scraped: {sum(r['content_length'] for r in self.successful_scrapes):,} characters")

        return self.successful_scrapes, self.failed_scrapes

def load_urls_from_text(url_text):
    """Return the URL-looking lines found in a newline-separated text blob.

    Each line is stripped of surrounding whitespace; a line is kept only when
    it begins with ``http``. Blank lines and anything else are ignored, and
    the original line order is preserved.
    """
    trimmed_lines = (raw.strip() for raw in url_text.strip().split('\n'))
    return [entry for entry in trimmed_lines if entry.startswith('http')]

# MAIN EXECUTION SECTION - MODIFY THIS PART
def main():
    """Entry point: scrape the configured URLs and write the results to CSV.

    Edit ``sample_urls`` (or enable the file-loading variant shown in the
    comment below) to choose what gets scraped. Successful articles go to
    ``mit_news_articles_<unix-time>.csv``; URLs that could not be scraped go
    to ``failed_urls_<unix-time>.csv``.
    """
    print("🔧 MIT News Articles Scraper")
    print("=" * 50)

    # OPTION 1: URLs listed inline (handy for a quick test run)
    sample_urls = [
        "https://news.mit.edu/2025/ianacare-builds-lifeline-for-family-caregivers-across-us-0811",
        "https://news.mit.edu/2025/cloudian-helps-data-storage-keep-up-with-ai-revolution-0806",
        "https://news.mit.edu/2025/ushering-new-era-suture-free-tissue-reconstruction-better-healing-0801",
        "https://news.mit.edu/2025/supporting-mission-driven-space-innovation-aurelia-institute-0710",
        "https://news.mit.edu/2025/new-platform-foundation-alloy-developing-advanced-metals-scale-0703"
    ]

    # OPTION 2: read the URLs from an uploaded file (uncomment and adjust the path)
    # try:
    #     with open('your_urls_file.txt', 'r') as f:
    #         urls_text = f.read()
    #         urls = load_urls_from_text(urls_text)
    # except FileNotFoundError:
    #     print("URLs file not found, using sample URLs")
    #     urls = sample_urls

    # Currently the inline sample list is what gets scraped
    urls = sample_urls

    # Guard clause: nothing to do without URLs
    if not urls:
        print("❌ No URLs found! Please add URLs to the sample_urls list or upload a file.")
        return

    print(f"📋 Found {len(urls)} URLs to scrape")

    # More workers means faster but more aggressive scraping
    scraper = MITNewsScraper(max_workers=20, request_timeout=10, retry_attempts=2)

    successful_results, failed_results = scraper.scrape_articles(urls)

    if successful_results:
        # Keep only the columns of interest, in a fixed order
        wanted_columns = ['url', 'title', 'content', 'content_length']
        articles_frame = pd.DataFrame(successful_results)[wanted_columns]

        output_filename = f'mit_news_articles_{int(time.time())}.csv'
        articles_frame.to_csv(output_filename, index=False)
        print(f"💾 Results saved to: {output_filename}")

        # Preview the first two scraped articles
        divider = "-" * 50
        print("\n📖 Sample of scraped content:")
        print(divider)
        for article in successful_results[:2]:
            print(f"Title: {article['title']}")
            print(f"Content preview: {article['content'][:200]}...")
            print(f"Content length: {article['content_length']} characters")
            print(divider)

    if failed_results:
        # Persist the failures so they can be retried later
        failed_filename = f'failed_urls_{int(time.time())}.csv'
        pd.DataFrame(failed_results).to_csv(failed_filename, index=False)
        print(f"⚠️  Failed URLs saved to: {failed_filename}")

# Execute the scraper when this code runs as the top-level program.
if __name__ == "__main__":
    main()

# NOTE(review) FOR GOOGLE COLAB: a notebook cell normally executes with
# __name__ == "__main__", so the guard above should already call main().
# Uncommenting the line below would then run the scraper a second time —
# verify before enabling it.
# main()