In [None]:
!apt-get update
!apt install chromium-chromedriver
!pip install selenium beautifulsoup4 pandas requests schedule

0% [Working]            Get:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
0% [Connecting to archive.ubuntu.com (185.125.190.82)] [Connecting to security.ubuntu.com (91.189.91                                                                                                    Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,315 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:10 https://r2u.stat.illinois.edu/ubuntu jammy/m

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
import os
from datetime import datetime
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from typing import Dict, List, Optional

# --- Scraper configuration ---
# Sent as the User-Agent header on every request made through the scraper's session.
# NOTE(review): SEC guidance for automated access reportedly asks clients to include
# contact details in the User-Agent - confirm this string is acceptable.
USER_AGENT = 'Mozilla/5.0 (Research Project - Crypto Regulatory Tracking)'
# Listing page of litigation releases; the scraper appends "?page=N" to paginate.
BASE_URL = "https://www.sec.gov/enforcement-litigation/litigation-releases"
RATE_LIMIT_DELAY = 3  # Seconds slept before each case-page request and after each listing page

# Keyword groups used for classification. A text counts as crypto-related when it
# matches an 'assets' OR 'platforms' keyword AND a 'violations' keyword.
# Entries must be lower-case: texts are lower-cased before matching.
CRYPTO_KEYWORDS = {
    'assets': ['crypto', 'cryptocurrency', 'digital asset', 'token', 'coin', 'blockchain',
              'bitcoin', 'ethereum', 'stablecoin', 'defi', 'nft'],
    'platforms': ['trading platform', 'wallet', 'mining', 'staking'],  # 'exchange' is absent on purpose: it is handled separately with legal-context filtering
    'violations': ['unregistered', 'fraud', 'manipulation', 'misleading', 'ponzi',
                  'securities violation', 'unauthorized', 'unlicensed']
}

# Lower-case phrases typical of traditional (non-crypto) securities cases.
# If any of them occurs in a text (substring match), the text is rejected unless
# it also contains one of EXPLICIT_CRYPTO_TERMS as a whole word.
NON_CRYPTO_INDICATORS = [
    'securities exchange act of 1934',
    'exchange act of 1934',
    'section 10(b) of the exchange act',
    'rule 10b-5 of the exchange act',
    'listed on nasdaq',
    'registered broker-dealer',
    'investment company act of 1940',
    'investment advisers act of 1940'
]

# Unambiguous crypto vocabulary; a whole-word hit on any of these overrides the
# NON_CRYPTO_INDICATORS exclusion above.
EXPLICIT_CRYPTO_TERMS = ['bitcoin', 'ethereum', 'cryptocurrency', 'crypto', 'blockchain',
                         'digital currency', 'digital asset', 'defi', 'nft']

class SECCryptoScraper:
    """Scrape SEC litigation releases and keep the ones that look crypto-related.

    The scraper walks the paginated listing at ``BASE_URL``, fetches every linked
    case page, extracts a few fields with regular expressions, and keeps a case
    when its title and extracted fields mention both a crypto term and a
    violation term. Matching cases are returned as a DataFrame and also written
    to a timestamped, semicolon-separated CSV inside ``output_dir``.
    """

    # Seconds to wait on a single HTTP request; without a timeout ``requests``
    # can block forever on a stalled connection.
    _REQUEST_TIMEOUT = 30

    # Prefix used to turn site-relative hrefs from the listing into absolute URLs.
    _SITE_ROOT = "https://www.sec.gov"

    # Lower-case phrases in which "exchange" is securities-law / stock-market
    # vocabulary rather than a crypto trading venue. Shared by detection and
    # term reporting so both apply the same filter.
    _EXCHANGE_LEGAL_CONTEXTS = (
        'securities exchange act',
        'exchange act',
        'stock exchange',
        'national exchange',
        'exchange commission',
        'new york stock exchange',
        'nyse',
        'nasdaq',
    )

    def __init__(self, output_dir: str = 'crypto_regulatory_data'):
        """Create the HTTP session and make sure ``output_dir`` exists.

        Args:
            output_dir: Directory that receives the CSV exports.
        """
        self.output_dir = output_dir
        self.session = self._create_session()
        self._setup_output_dir()

    def _setup_output_dir(self) -> None:
        """Create the output directory (and parents) if it doesn't exist."""
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(self.output_dir, exist_ok=True)

    def _create_session(self) -> "requests.Session":
        """Create a session with retry logic and the scraper's request headers.

        GET requests are retried up to 5 times with exponential backoff when
        the server answers 429 or a 5xx status listed below.
        """
        session = requests.Session()
        retries = Retry(
            total=5,
            backoff_factor=2,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["GET"]
        )
        adapter = HTTPAdapter(max_retries=retries)
        # Listing hrefs may be absolute http:// links, so cover both schemes.
        session.mount('https://', adapter)
        session.mount('http://', adapter)
        session.headers.update({
            'User-Agent': USER_AGENT,
            'Accept': 'text/html,application/xhtml+xml',
            'Accept-Language': 'en-US,en;q=0.9',
        })
        return session

    def _log(self, message: str, level: str = "INFO") -> None:
        """Print ``message`` prefixed with a timestamp and the given level."""
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        print(f"[{timestamp}] [{level}] {message}")

    @staticmethod
    def _contains_term(text: str, term: str, allow_plural: bool = False) -> bool:
        """Return True if ``term`` occurs in ``text`` as a whole word or phrase.

        Args:
            text: Text to search (callers pass it already lower-cased).
            term: Keyword or phrase; it is escaped, so it is matched literally.
            allow_plural: Also accept a trailing "s"/"es" ("tokens", "nfts").
        """
        plural = r'(?:s|es)?' if allow_plural else ''
        pattern = r'\b' + re.escape(term) + plural + r'\b'
        return re.search(pattern, text) is not None

    @classmethod
    def _in_legal_exchange_context(cls, text: str) -> bool:
        """Return True if lower-cased ``text`` uses "exchange" in a securities-law sense."""
        return any(context in text for context in cls._EXCHANGE_LEGAL_CONTEXTS)

    @classmethod
    def _absolute_url(cls, href: str) -> str:
        """Resolve a listing href against the SEC site root; absolute URLs pass through."""
        if href.startswith(('http://', 'https://')):
            return href
        return f"{cls._SITE_ROOT}/{href.lstrip('/')}"

    @staticmethod
    def _compose_case_text(title: str, details: Dict) -> str:
        """Join the title and all extracted field values into one searchable string.

        Only values are used (list fields are flattened, empty ones skipped), so
        dictionary key names and ``None`` placeholders never reach the matchers.
        """
        parts = [title]
        for value in details.values():
            if isinstance(value, (list, tuple)):
                parts.extend(str(item) for item in value if item)
            elif value:
                parts.append(str(value))
        return ' '.join(parts)

    def is_crypto_related(self, text: str) -> bool:
        """
        Decide whether a text describes a crypto-related enforcement case.

        Returns True only if the text contains a crypto term (asset or platform
        keyword, or "exchange" outside legal contexts) AND a violation term.
        Texts containing a non-crypto indicator phrase are rejected unless an
        explicit crypto term is also present. Empty or None text returns False.
        """
        if not text:
            return False
        text = text.lower()

        # Traditional securities-law phrasing excludes the text unless an
        # unambiguous crypto term is present as well.
        if any(indicator in text for indicator in NON_CRYPTO_INDICATORS):
            if not any(self._contains_term(text, term) for term in EXPLICIT_CRYPTO_TERMS):
                return False

        has_crypto_asset = any(self._contains_term(text, kw) for kw in CRYPTO_KEYWORDS['assets'])
        has_crypto_platform = any(self._contains_term(text, kw) for kw in CRYPTO_KEYWORDS['platforms'])

        # "exchange" only counts as a crypto platform when the text has no
        # securities-law / stock-market usage of the word.
        if self._contains_term(text, 'exchange') and not self._in_legal_exchange_context(text):
            has_crypto_platform = True

        has_crypto = has_crypto_asset or has_crypto_platform
        has_violation = any(self._contains_term(text, kw) for kw in CRYPTO_KEYWORDS['violations'])

        return has_crypto and has_violation

    def extract_case_details(self, url: str) -> Optional[Dict]:
        """Fetch a case page and extract its details.

        Sleeps ``RATE_LIMIT_DELAY`` seconds before the request.

        Args:
            url: Absolute URL of the case page.

        Returns:
            Dict with keys 'projects', 'allegations', 'penalty',
            'key_violations', 'filing_date', 'respondents', 'enforcement_type',
            or None if the request or parsing failed (the error is logged).
        """
        try:
            time.sleep(RATE_LIMIT_DELAY)
            response = self.session.get(url, timeout=self._REQUEST_TIMEOUT)
            response.raise_for_status()
            return self._parse_case_page(response.text)

        except Exception as e:
            # Best effort: one bad case page must not abort the whole scrape.
            self._log(f"Error extracting details from {url}: {str(e)}", "ERROR")
            return None

    @staticmethod
    def _parse_case_page(html: str) -> Dict:
        """Extract the case fields from the HTML of a single case page."""
        soup = BeautifulSoup(html, 'html.parser')
        text = soup.get_text()

        # 'key_violations' and 'enforcement_type' are part of the returned
        # schema but are not populated by this parser.
        details = {
            'projects': [],
            'allegations': None,
            'penalty': None,
            'key_violations': [],
            'filing_date': None,
            'respondents': [],
            'enforcement_type': None
        }

        # The page heading is used as the project name.
        title = soup.find('h1')
        if title:
            details['projects'].append(title.get_text().strip())

        # Text following "alleges/alleged (that)": the first sentence plus up
        # to five more period-delimited chunks.
        allegations = re.search(r'(?:alleges|alleged)(?:\s+that)?\s+([^.]+(?:\.[^.]+){0,5})', text)
        if allegations:
            # Collapse newlines and runs of whitespace into single spaces.
            details['allegations'] = re.sub(r'\s+', ' ', allegations.group(1).strip())

        # First dollar amount directly followed by a penalty-type phrase.
        penalty = re.search(r'\$[\d,]+(?:\.\d+)?(?:\s+million|\s+billion)?\s+(?:in\s+penalties|fine|civil\s+penalty|disgorgement)', text)
        if penalty:
            details['penalty'] = penalty.group(0)

        date_match = re.search(r'Filing\s+Date:\s+([A-Za-z]+\s+\d{1,2},\s+\d{4})', text)
        if date_match:
            details['filing_date'] = date_match.group(1)

        respondents = soup.find('div', class_='field-name-field-respondent')
        if respondents:
            details['respondents'] = [r.strip() for r in respondents.get_text().split(',')]

        return details

    def _build_case_record(self, row, case_id: int) -> Optional[Dict]:
        """Turn one listing-table row into an output record.

        Returns None when the row has no usable link, its case page could not
        be fetched, or the case is not crypto-related.
        """
        cells = row.find_all('td')
        if len(cells) < 2:
            return None

        link_elem = cells[1].find('a')
        if not link_elem or not link_elem.get('href'):
            return None

        date = cells[0].text.strip()
        title = link_elem.text.strip()
        full_link = self._absolute_url(link_elem['href'])

        case_details = self.extract_case_details(full_link)
        if not case_details:
            return None

        # Classification works on the title plus the extracted field values.
        case_text = self._compose_case_text(title, case_details)
        if not self.is_crypto_related(case_text):
            return None

        return {
            'Case ID': case_id,
            'Date': date,
            'Title': title,
            'Projects': '; '.join(case_details['projects']),
            'Allegations': case_details['allegations'] or "Not specified",
            'Penalty': case_details['penalty'] or "Not specified",
            'Filing Date': case_details['filing_date'] or date,
            'Violation Type': self.determine_violation_type(case_text),
            'Assets': self.extract_assets(case_text),
            'Crypto-Related Terms': self.get_crypto_terms(case_text),
            'Respondents': '; '.join(case_details['respondents']) if case_details['respondents'] else "Not specified",
            'Link': full_link,
        }

    def scrape_investigations(self, pages: int = 5) -> pd.DataFrame:
        """Scrape listing pages and collect the crypto-related cases.

        Every case linked from a listing page is fetched individually, so the
        run time grows with the number of listed cases, not just ``pages``.

        Args:
            pages: Number of listing pages to request.

        Returns:
            DataFrame with one row per crypto-related case (also saved to CSV),
            or an empty DataFrame when nothing matched. A failing listing page
            is logged and skipped.
        """
        all_cases: List[Dict] = []

        # NOTE(review): this requests ?page=1..pages. If the site's pager is
        # zero-based, ?page=0 (the newest releases) is never fetched - confirm
        # against the live site before relying on coverage.
        for page in range(1, pages + 1):
            try:
                self._log(f"Processing page {page}/{pages}")
                url = f"{BASE_URL}?page={page}"

                response = self.session.get(url, timeout=self._REQUEST_TIMEOUT)
                response.raise_for_status()

                soup = BeautifulSoup(response.text, 'html.parser')
                table = soup.find('table', class_='usa-table')

                if not table:
                    self._log(f"No table found on page {page}", "WARNING")
                    continue

                for row in table.find_all('tr')[1:]:  # Skip header row
                    record = self._build_case_record(row, case_id=len(all_cases) + 1)
                    if record is not None:
                        all_cases.append(record)

                time.sleep(RATE_LIMIT_DELAY)  # Respect rate limits between pages

            except Exception as e:
                # Best effort: log and move on to the next listing page.
                self._log(f"Error processing page {page}: {str(e)}", "ERROR")
                continue

        if not all_cases:
            self._log("No cases found", "WARNING")
            return pd.DataFrame()

        df = pd.DataFrame(all_cases)
        self._save_results(df)
        return df

    def determine_violation_type(self, case_text: str) -> str:
        """Classify the violation described in ``case_text``.

        Rules are checked in order and the first match wins; texts matching
        none of them are labelled "Other Securities Violation".
        """
        case_text = case_text.lower()

        if "ponzi" in case_text:
            return "Ponzi Scheme"
        if "unregistered" in case_text and "offering" in case_text:
            return "Unregistered Securities Offering"
        if "manipulat" in case_text:
            return "Market Manipulation"
        if "insider" in case_text and "trading" in case_text:
            return "Insider Trading"
        # Require the actual phrase: a bare "wash" substring also matches
        # "Washington", which would mislabel any case that mentions trading.
        if re.search(r'\bwash[\s-]+trad', case_text):
            return "Wash Trading"
        if "pump" in case_text and "dump" in case_text:
            return "Pump and Dump"
        if "misleading" in case_text or "misrepresent" in case_text:
            return "Misrepresentation"
        if "fraud" in case_text:
            return "Fraud"
        return "Other Securities Violation"

    def extract_assets(self, case_text: str) -> str:
        """List the asset keywords mentioned in ``case_text``.

        Keywords are matched as whole words (simple plurals allowed), so
        "bitcoin" does not also report "coin" and "defined" does not report
        "defi".

        Returns:
            The matched assets joined with "; " in keyword order, or
            "Generic crypto assets" when none is mentioned.
        """
        case_text = case_text.lower()
        assets = []

        # Looks for a named token/coin, e.g. 'called "xyz" token'. It does not
        # depend on the keyword, so it is evaluated once.
        token_match = re.search(r'(?:called|named)\s+["\']?([A-Za-z0-9]+)[\s"\']?\s+(?:token|coin)', case_text)

        for kw in CRYPTO_KEYWORDS['assets']:
            if not self._contains_term(case_text, kw, allow_plural=True):
                continue

            if kw == 'bitcoin':
                assets.append('Bitcoin (BTC)')
            elif kw == 'ethereum':
                assets.append('Ethereum (ETH)')
            elif kw in ('token', 'coin') and token_match:
                assets.append(f"{token_match.group(1)} {kw}")
            else:
                assets.append(kw)

        return "; ".join(assets) if assets else "Generic crypto assets"

    def get_crypto_terms(self, case_text: str) -> str:
        """List the crypto asset/platform keywords mentioned in ``case_text``.

        Uses whole-word matching (simple plurals allowed). "exchange" is only
        reported when the text has no securities-law usage of the word.

        Returns:
            The matched keywords joined with "; ", or "None" when there is none.
        """
        case_text = case_text.lower()

        # Concatenation builds a new list, so the module constant is not mutated.
        keywords = CRYPTO_KEYWORDS['assets'] + [p for p in CRYPTO_KEYWORDS['platforms'] if p != 'exchange']

        if (self._contains_term(case_text, 'exchange', allow_plural=True)
                and not self._in_legal_exchange_context(case_text)):
            keywords.append('exchange')

        crypto_terms = [kw for kw in keywords if self._contains_term(case_text, kw, allow_plural=True)]
        return "; ".join(crypto_terms) if crypto_terms else "None"

    def _save_results(self, df: pd.DataFrame) -> None:
        """Write ``df`` to a timestamped semicolon-separated CSV in ``output_dir``.

        Semicolons inside text fields are replaced with commas in the exported
        file only; the caller's DataFrame is left untouched.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f'sec_crypto_investigations_{timestamp}.csv'
        filepath = os.path.join(self.output_dir, filename)

        # Sanitise a copy so the frame handed back to the caller keeps its
        # original values.
        export_df = df.copy()
        for col in export_df.columns:
            if export_df[col].dtype == 'object':  # Only process string columns
                export_df[col] = export_df[col].astype(str).str.replace(';', ',', regex=False)

        # quoting=1 is csv.QUOTE_ALL: every field is wrapped in quotes.
        export_df.to_csv(filepath, index=False, sep=';', quoting=1)
        self._log(f"Results saved to {filepath}")

# Build the scraper (this creates the HTTP session and the output directory) and run it.
# Any matching cases are also written to a timestamped CSV in the output directory.
# The recorded run below took roughly five minutes per listing page, because every
# case linked from a listing page is fetched individually with a delay.
scraper = SECCryptoScraper()
df = scraper.scrape_investigations(pages=5)  # 5 listing pages (same as the method's default)

# Print the resulting table as plain text (empty DataFrame if nothing matched)
print("\nStructured Crypto Regulatory Cases:")
print(df)

[2025-02-18 04:16:51] [INFO] Processing page 1/5
[2025-02-18 04:22:05] [INFO] Processing page 2/5
[2025-02-18 04:27:22] [INFO] Processing page 3/5
[2025-02-18 04:32:40] [INFO] Processing page 4/5
[2025-02-18 04:37:57] [INFO] Processing page 5/5
[2025-02-18 04:43:14] [INFO] Results saved to crypto_regulatory_data/sec_crypto_investigations_20250218_044314.csv

Structured Crypto Regulatory Cases:
    Case ID            Date  \
0         1  Sept. 25, 2024   
1         2   Aug. 30, 2024   
2         3   Aug. 26, 2024   
3         4    July 8, 2024   
4         5    July 1, 2024   
5         6  March 11, 2024   
6         7    Feb. 7, 2024   
7         8   Dec. 28, 2023   
8         9  Sept. 22, 2023   
9        10  Sept. 12, 2023   
10       11   Aug. 29, 2023   
11       12   Aug. 23, 2023   
12       13   Aug. 18, 2023   
13       14    Aug. 7, 2023   
14       15   July 31, 2023   
15       16   July 21, 2023   
16       17   July 20, 2023   
17       18   July 14, 2023   
18       19   