## Step 1: Import Libraries 

In [39]:
# Import libraries 

import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
import logging
import time
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service  
from selenium.webdriver.chrome.options import Options
import matplotlib.pyplot as plt 

## Step 2: Scrape the Key Events

In [None]:
# Import libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
import logging

# Set up logging
# basicConfig configures the root logger with a default handler at INFO level;
# it has no effect if the root logger already has handlers attached.
logging.basicConfig(level=logging.INFO)
# getLogger() without a name returns the root logger.
logger = logging.getLogger()

# Line prefixes and keywords that mark navigation/boilerplate lines to drop.
_BOILERPLATE_PREFIXES = ('Jump to', 'Navigation menu', 'Main page')
_BOILERPLATE_KEYWORDS = ('cookie', 'javascript')
# A cleaned line must be longer than this many characters to be kept.
_MIN_LINE_LENGTH = 10
# A paragraph must be longer than this many characters to be used by the
# paragraph fallback.
_MIN_PARAGRAPH_LENGTH = 20


def _extract_content_text(soup):
    """Return the page text using the first extraction method that yields any.

    Methods are tried in order: the ``mw-parser-output`` div, then all ``<p>``
    elements, then the whole ``<body>``. Each fallback runs whenever the
    previous method produced only whitespace, including the case where the
    main content div exists but is empty.
    """
    # Method 1: the main content div
    main_content = soup.find('div', {'class': 'mw-parser-output'})
    if main_content:
        logger.info("Found content using mw-parser-output")
        content_text = main_content.get_text()
        if content_text.strip():
            return content_text

    # Method 2: all sufficiently long paragraphs
    logger.info("Trying alternative method - getting all paragraphs")
    paragraph_texts = (p.get_text().strip() for p in soup.find_all('p'))
    content_text = '\n\n'.join(
        text for text in paragraph_texts if len(text) > _MIN_PARAGRAPH_LENGTH
    )
    if content_text.strip():
        return content_text

    # Method 3: everything inside <body>
    logger.info("Using body text as fallback")
    body = soup.find('body')
    return body.get_text() if body else ""


def _clean_lines(content_text):
    """Split text into stripped lines, dropping short and boilerplate lines."""
    lines = []
    for raw_line in content_text.split('\n'):
        line = raw_line.strip()
        if len(line) <= _MIN_LINE_LENGTH:
            continue
        if line.startswith(_BOILERPLATE_PREFIXES):
            continue
        lowered = line.lower()
        if any(keyword in lowered for keyword in _BOILERPLATE_KEYWORDS):
            continue
        lines.append(line)
    return lines


def scrape_wikipedia_page(url, filename):
    """
    Robust Wikipedia scraper that handles different page structures

    Downloads ``url``, extracts its text, filters out short and boilerplate
    lines, prepends the first ``<h1>`` as a ``# title`` line when present,
    and writes the result to ``filename`` as UTF-8.

    Args:
        url: Address of the page to download.
        filename: Path of the text file to write.

    Returns:
        A tuple ``(cleaned_text, character_count, line_count)``. The line
        count covers the content lines only, not the prepended title.
        ``(None, 0, 0)`` is returned when no content could be extracted or
        when any error occurred; this function does not raise.
    """
    try:
        # Set headers to mimic a real browser
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        }

        logger.info("Scraping page: %s", url)

        # Send GET request with headers; HTTP error statuses raise here
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()

        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')

        content_text = _extract_content_text(soup)
        if not content_text.strip():
            logger.error("No content could be extracted from the page")
            return None, 0, 0

        lines = _clean_lines(content_text)
        cleaned_text = '\n'.join(lines)

        # Add page title at the top
        title = soup.find('h1')
        if title:
            page_title = title.get_text().strip()
            cleaned_text = f"# {page_title}\n\n" + cleaned_text

        # Save to file
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(cleaned_text)

        logger.info("Successfully saved content to %s", filename)
        logger.info("Total characters: %s", len(cleaned_text))
        logger.info("Total lines: %s", len(lines))

        return cleaned_text, len(cleaned_text), len(lines)

    except Exception as e:
        # Deliberate best-effort boundary: callers rely on the (None, 0, 0)
        # result instead of handling network, parsing, or file errors.
        logger.error("An error occurred: %s", e)
        return None, 0, 0

# Download the "Key events of the 20th century" article and save it as text
url = "https://en.wikipedia.org/wiki/Key_events_of_the_20th_century"
output_file = "key_events_20th_century.txt"

content, char_count, line_count = scrape_wikipedia_page(url, output_file)

if not content:
    # Nothing usable came back from the scraper
    print("Scraping failed.")
else:
    print("Scraping completed successfully!")
    print(f"File saved: {output_file}")
    print(f"Total characters: {char_count}")
    print(f"Total lines: {line_count}")

    # Preview: non-blank lines among the first 15, each cut at 120 characters
    print("\n--- PREVIEW OF CONTENT ---")
    lines = content.split('\n')
    for line in lines[:15]:
        if not line.strip():
            continue
        print(f"{line[:120]}{'...' if len(line) > 120 else ''}")

In [None]:
# Verify the created file
import os

def verify_file(filename='key_events_20th_century.txt'):
    """Print size information and a short preview of a saved text file.

    Args:
        filename: Path of the UTF-8 text file to inspect. Defaults to
            'key_events_20th_century.txt', so ``verify_file()`` with no
            argument behaves as before.

    Returns:
        True if the file was found and read, False if the path does not
        exist or is not a regular file (for example, a directory).
    """
    # isfile() rather than exists(): a directory "exists" but cannot be opened
    # for reading, which would raise instead of reporting "File not found".
    if not os.path.isfile(filename):
        print("File not found")
        return False

    with open(filename, 'r', encoding='utf-8') as f:
        content = f.read()

    print("File verification successful!")
    print(f"File location: {os.path.abspath(filename)}")
    # Character count and byte count can differ for non-ASCII text.
    print(f"File size: {len(content)} characters")
    print(f"File size: {os.path.getsize(filename)} bytes")

    # Show file structure
    lines = content.split('\n')
    print(f"Total lines: {len(lines)}")

    print("\n--- FILE STRUCTURE ---")
    # Show the non-blank lines among the first 20, each cut at 80 characters
    for line_number, line in enumerate(lines[:20], start=1):
        if line.strip():
            print(f"Line {line_number}: {line[:80]}{'...' if len(line) > 80 else ''}")

    return True

# Run the check on the default file, 'key_events_20th_century.txt'
verify_file()