In [1]:
# Environment setup: refresh the apt package lists, install the system
# chromium-chromedriver package, then install the Python dependencies.
# NOTE(review): selenium and schedule are installed here but are not imported by
# the scraper cell shown below — confirm they (and chromium-chromedriver) are
# still needed.
# NOTE(review): package versions are unpinned; consider pinning them so a fresh
# run reproduces the same environment.
!apt-get update
!apt install chromium-chromedriver
!pip install selenium beautifulsoup4 pandas requests schedule

Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:10 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:11 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [2,609 kB]
Get:12 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,315 kB]
Get:13 https://r2u.stat.illinois.edu/ubunt

In [13]:
import logging
import re
import time
from datetime import datetime
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Configure logging.
# force=True replaces any handlers already attached to the root logger.  Without
# it, basicConfig() silently does nothing when the root logger is already
# configured (e.g. the cell was re-run, or the notebook host installed its own
# handlers), and fca_scraper.log would never be written — which matters because
# the scraper reports its errors only through logging.
logging.basicConfig(
    filename='fca_scraper.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True
)

# User-Agent header sent with every request made by the scraper session.
USER_AGENT = 'Mozilla/5.0 (Research Project - UK Crypto Regulatory Tracking)'
# First listing page; the crawl follows "next" pager links from here.
WARNINGS_URL = "https://www.fca.org.uk/news/warnings"
RATE_LIMIT_DELAY = 3  # Compliance delay, in seconds (passed to time.sleep)
MAX_PAGES = 50  # Upper bound on listing pages visited; increased for older warnings

class EnhancedFCAScraper:
    """Crawl the FCA warnings listing and export one CSV row per warning page.

    The crawl starts at ``WARNINGS_URL``, follows the pager's "next" link for at
    most ``MAX_PAGES`` listing pages, fetches every linked warning page once and
    writes the collected rows to a timestamped, semicolon-separated CSV file.
    Problems are reported through the ``logging`` module rather than raised.
    """

    # (CSV column, CSS selector) pairs read from each warning page, listed in
    # output column order (they follow 'title' and 'date', precede 'source_url').
    # NOTE(review): the selectors presumably mirror the site's field class names —
    # verify against the live markup; a selector that matches nothing yields
    # the 'Not available' placeholder.
    _FIELD_SELECTORS = (
        ('description', '.field--name-field-summary'),
        ('category', '.field--name-field-warning-category'),
        ('risk_level', '.field--name-field-risk-level'),
        ('action_taken', '.field--name-field-action-taken'),
        ('status', '.field--name-field-status'),
        ('registration_status', '.field--name-field-registration-status'),
        ('contact_info', '.field--name-field-contact-information'),
        ('geographical_scope', '.field--name-field-geographical-scope'),
        ('fraud_type', '.field--name-field-fraud-type'),
    )

    def __init__(self, output_file='fca_warnings.csv', request_timeout=30):
        """Create a scraper.

        Args:
            output_file: Base name of the CSV file; a ``YYYYmmdd_HHMM_`` prefix
                is added when the file is written.
            request_timeout: Seconds to wait on each HTTP request before giving
                up, so a stalled connection cannot hang the crawl forever.
        """
        self.output_file = output_file
        self.request_timeout = request_timeout
        self.session = self._create_session()
        # Warning-page URLs already visited by this instance (kept across
        # scrape() calls, so a second call skips pages seen by the first).
        self.scraped_urls = set()

    def _create_session(self):
        """Return a ``requests.Session`` with retries and the project User-Agent.

        Requests to ``https://`` URLs are retried up to 5 times with
        exponential backoff when the server answers 429 or a listed 5xx status.
        """
        session = requests.Session()
        retries = Retry(
            total=5,
            backoff_factor=2,
            status_forcelist=[429, 500, 502, 503, 504]
        )
        session.mount('https://', HTTPAdapter(max_retries=retries))
        session.headers.update({'User-Agent': USER_AGENT})
        return session

    def _extract_text(self, soup, selector, default='Not available'):
        """Return the stripped text of the first match for ``selector``.

        Returns ``default`` when nothing in ``soup`` matches.
        """
        element = soup.select_one(selector)
        return element.get_text(strip=True) if element else default

    def _parse_date(self, date_str):
        """Convert a ``"12 March 2024"``-style string to an ISO-8601 string.

        Surrounding whitespace is ignored.  Returns ``None`` when the value is
        not a string or does not match the ``"%d %B %Y"`` format (which includes
        the 'Not available' placeholder produced by ``_extract_text``).
        """
        try:
            return datetime.strptime(date_str.strip(), "%d %B %Y").isoformat()
        except (ValueError, TypeError, AttributeError):
            # AttributeError/TypeError: a non-string (e.g. None) was passed in.
            return None

    def _scrape_page(self, page_url):
        """Fetch one warning page and return its fields as a one-item list.

        Sleeps ``RATE_LIMIT_DELAY`` seconds before the request.  Any failure is
        logged and yields an empty list, so one bad page does not abort the
        crawl.
        """
        try:
            time.sleep(RATE_LIMIT_DELAY)
            response = self.session.get(page_url, timeout=self.request_timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            record = {
                'title': self._extract_text(soup, 'h1', 'No title'),
                'date': self._parse_date(self._extract_text(soup, 'time')),
            }
            for column, selector in self._FIELD_SELECTORS:
                record[column] = self._extract_text(soup, selector)
            record['source_url'] = page_url
            return [record]
        except Exception as e:
            logging.error(f"Error scraping {page_url}: {str(e)}")
            return []

    def _find_next_page(self, soup, base_url=None):
        """Return the absolute URL of the next listing page, or ``None``.

        Args:
            soup: Parsed listing page.
            base_url: URL the pager link is resolved against; defaults to
                ``WARNINGS_URL``.  Resolving with ``urljoin`` handles both
                absolute-path hrefs (``/news/warnings?page=1``) and query-only
                hrefs (``?page=1``); plain concatenation onto the host would
                turn the latter into a URL that drops the listing path.
        """
        next_link = soup.select_one('.pager__item--next a')
        href = next_link.get('href') if next_link else None
        if not href:
            return None
        return urljoin(base_url or WARNINGS_URL, href)

    def _collect_warnings(self, all_warnings):
        """Walk the listing pages and append scraped rows to ``all_warnings``.

        Rows are appended in place so that the caller still holds everything
        gathered so far if this method raises part-way through.
        """
        current_page = WARNINGS_URL
        page_count = 0

        while current_page and page_count < MAX_PAGES:
            if page_count:
                # Apply the same politeness delay to listing requests as to
                # warning-page requests.
                time.sleep(RATE_LIMIT_DELAY)
            response = self.session.get(current_page, timeout=self.request_timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            for link in soup.select('.views-row a[href^="/news/warnings/"]'):
                page_url = urljoin(current_page, link['href'])
                if page_url not in self.scraped_urls:
                    all_warnings.extend(self._scrape_page(page_url))
                    # Marked as visited even when scraping failed, so a broken
                    # page is not retried within this instance.
                    self.scraped_urls.add(page_url)

            current_page = self._find_next_page(soup, current_page)
            page_count += 1

    def _save_warnings(self, all_warnings):
        """Write the collected rows to ``<timestamp>_<output_file>``.

        Logs a warning and writes nothing when there are no rows.  Failures
        while building or writing the file are logged, not raised.
        """
        try:
            df = pd.DataFrame(all_warnings)
            if df.empty:
                logging.warning("No warnings found")
                return
            timestamp = datetime.now().strftime("%Y%m%d_%H%M")
            final_file = f"{timestamp}_{self.output_file}"
            df.to_csv(final_file, sep=';', index=False, encoding='utf-8')
            logging.info(f"Saved {len(df)} warnings to {final_file}")
        except Exception as e:
            logging.critical(f"Fatal scraping error: {str(e)}")

    def scrape(self):
        """Run the full crawl and save the results to CSV.

        A failure while crawling is logged as critical, but the rows collected
        before the failure (or before a manual interrupt) are still written
        instead of being discarded.  Returns ``None``.
        """
        all_warnings = []
        try:
            self._collect_warnings(all_warnings)
        except Exception as e:
            logging.critical(f"Fatal scraping error: {str(e)}")
        finally:
            # Runs on success, on a logged failure and on KeyboardInterrupt.
            self._save_warnings(all_warnings)

# Entry point: build a scraper with the default output file name and run one
# full crawl.  `scraper` remains bound at module level after the run.
if __name__ == "__main__":
    scraper = EnhancedFCAScraper()
    scraper.scrape()
