In [None]:
# Installation commands (run this cell first)
!pip install playwright beautifulsoup4 requests
!python -m playwright install chromium

import sqlite3
from bs4 import BeautifulSoup
import requests
from playwright.sync_api import sync_playwright
import json
import time
from datetime import datetime



In [None]:
import json
import re
import sqlite3
import time
from datetime import datetime
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright

# Initialize SQLite Database
def init_db(db_path='crypto_qa.db'):
    """Open the SQLite database and make sure the schema exists.

    Two tables are created if they are missing:

    * ``qa_pairs`` - one row per question (``question`` is UNIQUE), with the
      answer, category, source URL and an automatically filled
      ``last_updated`` timestamp.
    * ``scraped_sources`` - one row per source URL with the time it was last
      scraped.

    Args:
        db_path: Path of the database file (or ``':memory:'``). Defaults to
            ``'crypto_qa.db'``, the file the notebook has always used.

    Returns:
        An open ``sqlite3.Connection``; the caller is responsible for
        closing it.

    Raises:
        sqlite3.Error: If the schema cannot be created. The connection is
            closed before the error propagates.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        cursor.execute('''
        CREATE TABLE IF NOT EXISTS qa_pairs (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            question TEXT UNIQUE,
            answer TEXT,
            category TEXT,
            source_url TEXT,
            last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
        ''')

        cursor.execute('''
        CREATE TABLE IF NOT EXISTS scraped_sources (
            url TEXT PRIMARY KEY,
            last_scraped TIMESTAMP
        )
        ''')

        conn.commit()
    except sqlite3.Error:
        # Do not leak the handle when the schema could not be set up.
        conn.close()
        raise
    return conn

class QAScraper:
    """Scrape crypto Q&A pairs from several sites and persist them in SQLite.

    Every ``scrape_*`` method returns a list of dicts with the keys
    ``question``, ``answer``, ``category`` and ``source_url``. The web
    scrapers are best-effort: any failure is printed and an empty list is
    returned instead of raising.
    """

    # Ordered (category, pattern) rules for determine_category; first match
    # wins. Short tokens (eth, btc, nft, defi, key, mining) are anchored on
    # word boundaries so that words such as "together", "definition",
    # "monkey" or "examining" no longer trigger a category.
    _CATEGORY_RULES = (
        ("Blockchain Basics", re.compile(r"blockchain|distributed ledger")),
        ("Bitcoin", re.compile(r"bitcoin|\bbtc\b")),
        ("Ethereum", re.compile(r"ethereum|\beth\b|smart contract")),
        ("DeFi", re.compile(r"\bdefi\b|decentralized finance")),
        ("NFTs", re.compile(r"\bnfts?\b|non-fungible token")),
        ("Mining/Consensus", re.compile(r"\bmining\b|proof-of-")),
        ("Wallets/Keys", re.compile(r"wallet|\bkeys?\b")),
        ("Exchanges/Trading", re.compile(r"exchange|trading")),
        ("Regulation/Legal", re.compile(r"regulation|legal")),
    )

    # Runs in the browser: collects the text of every element sibling that
    # follows a heading, stopping at the next h2/h3. Doing the walk in one
    # evaluate() call avoids handing DOM nodes back to Python, which
    # ElementHandle.evaluate() cannot return as handles.
    _SECTION_TEXT_JS = """node => {
        const parts = [];
        let sib = node.nextElementSibling;
        while (sib && !['h2', 'h3'].includes(sib.tagName.toLowerCase())) {
            parts.push(sib.innerText || sib.textContent || '');
            sib = sib.nextElementSibling;
        }
        return parts;
    }"""

    def __init__(self, conn=None):
        """Bind the scraper to a database connection.

        Args:
            conn: Optional open ``sqlite3`` connection whose schema already
                exists. When omitted, ``init_db()`` is called to open the
                default database.
        """
        self.conn = init_db() if conn is None else conn
        self.cursor = self.conn.cursor()

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _record(question, answer, category, source_url):
        """Build one Q&A dict in the shape store_qa_pairs expects."""
        return {
            'question': question,
            'answer': answer,
            'category': category,
            'source_url': source_url
        }

    @staticmethod
    def _truncate(text, limit=500):
        """Cut *text* to *limit* characters, adding "..." only if it was cut."""
        return text[:limit] + "..." if len(text) > limit else text

    @staticmethod
    def _resolve_link(base_url, href):
        """Resolve *href* against *base_url*, treating the base as a directory."""
        return urljoin(base_url.rstrip('/') + '/', href)

    @staticmethod
    def _fetch_soup(url, timeout):
        """Download *url* and return the parsed HTML; raise on HTTP errors."""
        response = requests.get(url, timeout=timeout)
        # Surface 4xx/5xx instead of silently parsing an error page.
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')

    @staticmethod
    def _scrape_rendered(page_url, wait_selector, timeout, extract):
        """Render *page_url* in headless Chromium and return ``extract(page)``.

        The browser is closed even when navigation, waiting or extraction
        raises.
        """
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()
                page.goto(page_url)
                page.wait_for_selector(wait_selector, timeout=timeout)
                return extract(page)
            finally:
                browser.close()

    def _heading_sections(self, scope, label, category, source_url):
        """Turn each h2/h3 under *scope* into a Q&A record.

        The heading text becomes the question (prefixed with *label*) and the
        text of the following siblings, up to the next h2/h3, becomes the
        answer. Headings without any following text are dropped.
        """
        records = []
        for header in scope.query_selector_all('h2, h3'):
            try:
                question = header.inner_text()
                parts = header.evaluate(self._SECTION_TEXT_JS)
                answer = "\n".join(parts).strip()
                if answer:
                    records.append(self._record(
                        f"{label}: {question}", answer, category, source_url))
            except Exception as e:
                print(f"Skipping {label} section: {str(e)}")
        return records

    # ------------------------------------------------------------------
    # Scrapers
    # ------------------------------------------------------------------

    def scrape_gemini_cryptopedia(self, url):
        """Scrape Q&A pairs from Gemini Cryptopedia"""
        def extract(page):
            records = []
            for item in page.query_selector_all('.qa-pair'):
                try:
                    question = item.query_selector('h3').inner_text()
                    answer = item.query_selector('.answer-content').inner_text()
                    records.append(self._record(
                        question, answer, 'Blockchain Basics', url))
                except Exception as e:
                    print(f"Skipping item due to error: {str(e)}")
            return records

        try:
            return self._scrape_rendered(url, 'article', 60000, extract)
        except Exception as e:
            print(f"Error scraping {url}: {str(e)}")
            return []

    def scrape_coindesk_learn(self, url="https://www.coindesk.com/learn/"):
        """Scrape CoinDesk Learn section"""
        def extract(page):
            records = []
            for section in page.query_selector_all('.article-card'):
                try:
                    question = section.query_selector('h2').inner_text()
                    answer = self._truncate(
                        section.query_selector('p').inner_text())
                    records.append(self._record(
                        question, answer,
                        self.determine_category(question), url))
                except Exception as e:
                    print(f"Skipping section: {str(e)}")
            return records

        try:
            return self._scrape_rendered(url, '.main-content', 30000, extract)
        except Exception as e:
            print(f"Error scraping CoinDesk: {str(e)}")
            return []

    def scrape_cointelegraph_101(self, url="https://cointelegraph.com/crypto-101"):
        """Scrape CoinTelegraph Crypto 101"""
        try:
            soup = self._fetch_soup(url, timeout=10)
            scraped_data = []

            for item in soup.select('.faq-item'):
                try:
                    question = item.select_one('h3').get_text(strip=True)
                    answer = item.select_one('.faq-answer').get_text(strip=True)
                    scraped_data.append(self._record(
                        question, answer, 'Crypto Basics', url))
                except Exception as e:
                    print(f"Skipping FAQ item: {str(e)}")

            return scraped_data

        except Exception as e:
            print(f"Error scraping CoinTelegraph: {str(e)}")
            return []

    def scrape_static_qa(self, qna_list):
        """Process static Q&A pairs from a list.

        Each item needs the keys ``que`` and ``ans``; items missing either
        key are reported and skipped.
        """
        processed_data = []
        for item in qna_list:
            if 'que' in item and 'ans' in item:
                processed_data.append(self._record(
                    item['que'], item['ans'],
                    self.determine_category(item['que']), 'static_data'))
            else:
                print(f"Skipping static data item due to missing keys: {item}")
        return processed_data

    def scrape_trezor_docs(self, url="https://docs.trezor.io/"):
        """Scrape Trezor hardware wallet documentation"""
        def extract(page):
            records = []
            for section in page.query_selector_all('.theme-doc-markdown'):
                records.extend(self._heading_sections(
                    section, "Trezor", 'Hardware Wallets', url))
            return records

        try:
            return self._scrape_rendered(
                url + "learn/index.html", '.docPage', 30000, extract)
        except Exception as e:
            print(f"Error scraping Trezor docs: {str(e)}")
            return []

    def scrape_ledger_docs(self, url="https://developers.ledger.com/"):
        """Scrape Ledger developer documentation"""
        try:
            soup = self._fetch_soup(url + "docs", timeout=15)
            scraped_data = []

            for card in soup.select('.card'):
                try:
                    question = card.select_one('.card-title').get_text(strip=True)
                    answer = card.select_one('.card-text').get_text(strip=True)
                    scraped_data.append(self._record(
                        f"Ledger: {question}", answer, 'Hardware Wallets', url))
                except Exception as e:
                    print(f"Skipping Ledger card: {str(e)}")

            return scraped_data

        except Exception as e:
            print(f"Error scraping Ledger docs: {str(e)}")
            return []

    def scrape_keplr_docs(self, url="https://docs.keplr.app/"):
        """Scrape Keplr wallet documentation"""
        def extract(page):
            return self._heading_sections(page, "Keplr", 'Cosmos Wallets', url)

        try:
            return self._scrape_rendered(
                url, '.theme-doc-markdown', 30000, extract)
        except Exception as e:
            print(f"Error scraping Keplr docs: {str(e)}")
            return []

    def scrape_cosmos_docs(self, url="https://docs.cosmos.network/"):
        """Scrape Cosmos SDK documentation"""
        try:
            soup = self._fetch_soup(url + "main", timeout=15)
            scraped_data = []

            for section in soup.select('.bd-section'):
                try:
                    question = section.select_one('h2').get_text(strip=True)
                    answer = "\n".join(
                        p.get_text(strip=True) for p in section.select('p'))
                    scraped_data.append(self._record(
                        f"Cosmos: {question}", answer, 'Cosmos SDK', url))
                except Exception as e:
                    print(f"Skipping Cosmos section: {str(e)}")

            return scraped_data

        except Exception as e:
            print(f"Error scraping Cosmos docs: {str(e)}")
            return []

    def scrape_polygon_docs(self, url="https://wiki.polygon.technology/docs/"):
        """Scrape Polygon documentation"""
        def extract(page):
            return self._heading_sections(page, "Polygon", 'EVM Chains', url)

        try:
            return self._scrape_rendered(url, '.markdown', 30000, extract)
        except Exception as e:
            print(f"Error scraping Polygon docs: {str(e)}")
            return []

    def scrape_metamask_docs(self, url="https://docs.metamask.io/guide/"):
        """Scrape MetaMask documentation"""
        try:
            soup = self._fetch_soup(url, timeout=15)
            scraped_data = []

            for faq in soup.select('.faq-item'):
                try:
                    question = faq.select_one('h3').get_text(strip=True)
                    answer = faq.select_one('.faq-answer').get_text(strip=True)
                    scraped_data.append(self._record(
                        f"MetaMask: {question}", answer, 'Browser Wallets', url))
                except Exception as e:
                    print(f"Skipping MetaMask FAQ: {str(e)}")

            return scraped_data

        except Exception as e:
            print(f"Error scraping MetaMask docs: {str(e)}")
            return []

    def scrape_ethereum_docs(self, url="https://ethereum.org/en/developers/docs/"):
        """Scrape Ethereum developer documentation"""
        def extract(page):
            records = []
            for card in page.query_selector_all('.card'):
                try:
                    question = card.query_selector('h3').inner_text()
                    answer = card.query_selector('p').inner_text()
                    records.append(self._record(
                        f"Ethereum: {question}", answer, 'Ethereum', url))
                except Exception as e:
                    print(f"Skipping Ethereum card: {str(e)}")
            return records

        try:
            return self._scrape_rendered(
                url, '.developers-docs', 30000, extract)
        except Exception as e:
            print(f"Error scraping Ethereum docs: {str(e)}")
            return []

    def scrape_bitcoin_bips(self, url="https://github.com/bitcoin/bips"):
        """Scrape Bitcoin Improvement Proposals"""
        try:
            soup = self._fetch_soup(url, timeout=15)
            scraped_data = []

            for row in soup.select('.js-navigation-item'):
                try:
                    if 'bip-' in row.get_text().lower():
                        link = row.select_one('a')
                        title = link.get_text(strip=True)
                        # hrefs may be root-relative; plain concatenation
                        # would duplicate the repository path.
                        target = self._resolve_link(url, link['href'])
                        scraped_data.append(self._record(
                            f"BIP: {title}",
                            f"Bitcoin Improvement Proposal - {title}. See full text at: {target}",
                            'Bitcoin Standards', url))
                except Exception as e:
                    print(f"Skipping BIP row: {str(e)}")

            return scraped_data

        except Exception as e:
            print(f"Error scraping Bitcoin BIPs: {str(e)}")
            return []

    # ------------------------------------------------------------------
    # Classification and persistence
    # ------------------------------------------------------------------

    def determine_category(self, question):
        """Pick a category for *question* from keyword rules.

        Matching is case-insensitive and the first matching rule in
        ``_CATEGORY_RULES`` wins; "General Crypto" is returned when no rule
        matches.
        """
        question_lower = question.lower()
        for category, pattern in self._CATEGORY_RULES:
            if pattern.search(question_lower):
                return category
        return "General Crypto"

    def store_qa_pairs(self, qa_data):
        """Store Q&A pairs in the database.

        Rows are inserted with ``INSERT OR IGNORE`` so an existing question
        is left untouched. Items without a question or an answer are skipped
        rather than stored as NULL rows. All inserts are committed once at
        the end.
        """
        for item in qa_data:
            question = item.get('question')
            answer = item.get('answer')
            if not question or not answer:
                print(f"Skipping item with missing question or answer: {item}")
                continue
            try:
                self.cursor.execute('''
                    INSERT OR IGNORE INTO qa_pairs (question, answer, category, source_url)
                    VALUES (?, ?, ?, ?)
                ''', (question, answer, item.get('category', 'Unknown'), item.get('source_url', 'Unknown')))
                if self.cursor.rowcount > 0:
                    print(f"Inserted new question: {question}")
                else:
                    print(f"Question already exists, skipping insertion: {question}")
            except sqlite3.Error as e:
                print(f"Database error storing {question}: {e}")
        self.conn.commit()

    def is_recently_scraped(self, url, threshold_days=7):
        """Check if a URL was scraped recently.

        Returns True when ``scraped_sources`` holds a timestamp for *url*
        that is fewer than *threshold_days* calendar days old. A missing,
        NULL or unparsable timestamp counts as "not recent".
        """
        self.cursor.execute('SELECT last_scraped FROM scraped_sources WHERE url = ?', (url,))
        row = self.cursor.fetchone()
        if not row:
            return False

        last_scraped_str = row[0]
        try:
            last_scraped_date = datetime.strptime(last_scraped_str, '%Y-%m-%d %H:%M:%S').date()
        except (TypeError, ValueError):
            # NULL or foreign-format value: rescrape instead of crashing.
            return False

        if (datetime.now().date() - last_scraped_date).days < threshold_days:
            print(f"Source {url} was scraped recently ({last_scraped_str}). Skipping.")
            return True
        return False

    def update_source_record(self, url):
        """Update or insert the last scraped time for a URL."""
        now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        try:
            self.cursor.execute('''
                INSERT OR REPLACE INTO scraped_sources (url, last_scraped)
                VALUES (?, ?)
            ''', (url, now))
            self.conn.commit()
            print(f"Updated last scraped time for {url} to {now}")
        except sqlite3.Error as e:
            print(f"Database error updating source record for {url}: {e}")

    def export_to_jsonl(self, filename="crypto_qa.jsonl"):
        """Export all QA pairs from the database to a JSONL file.

        One JSON object per line, written as UTF-8 without ASCII escaping.
        Database and file errors are printed, not raised.
        """
        columns = ('question', 'answer', 'category', 'source_url', 'last_updated')
        try:
            self.cursor.execute('SELECT question, answer, category, source_url, last_updated FROM qa_pairs')
            rows = self.cursor.fetchall()
            with open(filename, 'w', encoding='utf-8') as f:
                for row in rows:
                    data = dict(zip(columns, row))
                    f.write(json.dumps(data, ensure_ascii=False) + '\n')
            print(f"Exported {len(rows)} QA pairs to {filename}")
        except sqlite3.Error as e:
            print(f"Database error during export: {e}")
        except IOError as e:
            print(f"File error during export: {e}")

    def close(self):
        """Close the underlying database connection."""
        self.conn.close()

In [8]:
import json
import time
import gzip

In [9]:
if __name__ == "__main__":
    # Initialize list to store all QA pairs
    all_qa_pairs = []

    # Complete qna_data list with all entries
    # Example entry - include all others here

    # Save original data in JSONL
    def save_jsonl(dataset, output_path="database_0520.jsonl.gz"):
        """Save dataset to a compressed JSONL file.

        Each entry is written as one JSON object per line into a gzip file
        opened in UTF-8 text mode. Non-ASCII characters are written as-is
        (``ensure_ascii=False``) rather than as ``\\uXXXX`` escapes.
        Failures are printed, not raised.

        Args:
            dataset: Iterable of JSON-serializable entries.
            output_path: Destination ``.jsonl.gz`` path.
        """
        try:
            with gzip.open(output_path, "wt", encoding="utf-8") as f:
                f.writelines(
                    json.dumps(entry, ensure_ascii=False) + "\n"
                    for entry in dataset
                )
            print(f"✅ JSONL file saved: {output_path}")
        except (OSError, TypeError, ValueError) as e:
            # OSError: file problems; TypeError/ValueError: unserializable entry.
            print(f"❌ Failed to save JSONL: {str(e)}")

    # Initialize scraper
    scraper = QAScraper()
    qna_data = [
        {"que": "Hardware wallets", "ans": "Devices (Ledger, Trezor) store keys offline, requiring physical confirmation for transactions. They mitigate remote hacks but are vulnerable to physical theft or supply-chain compromises."},
        {"que": "Cryptocurrency privacy", "ans": "Pseudonymous by default, but chain analysis (Elliptic) can link addresses to identities. Privacy coins (Monero) use ring signatures; mixers (CoinJoin) obscure trails but face regulatory pushback."},
        {"que": "HD wallet vulnerability", "ans": "Master keys can be exposed if a child key and master public key are leaked. BIP-32 hardened keys prevent this but limit public key derivation, complicating certain use cases."},
        {"que": "Staking in Ethereum", "ans": "Requires 32 ETH to become a validator. Participants earn rewards (~4% APR) but face slashing for attacks. Services like Rocket Pool allow pooled staking with less ETH."},
        {"que": "EIP process", "ans": "Ethereum Improvement Proposals (e.g., EIP-1559) are debated on forums, tested on testnets (Goerli), and implemented via hard forks. Annual upgrades bundle multiple EIPs for efficiency."},
        {"que": "The Merge (Ethereum)", "ans": "Transitioned Ethereum to PoS in 2022, merging the Beacon Chain with Mainnet. Reduced ETH issuance by ~90% and energy use by ~99.95%, enhancing sustainability."},
        {"que": "Ethereum's weaknesses", "ans": "High fees during peak usage and competition from faster chains (Solana). Modular scaling (rollups) may dilute value capture compared to monolithic blockchains."},
        {"que": "Ethereum's strengths", "ans": "First-mover advantage, robust developer ecosystem, and transition to deflationary ETH via EIP-1559. Its security and decentralization make it a preferred platform for institutional DeFi."},
        {"que": "Ethereum forking", "ans": "Forks update protocols; contentious splits create new chains (ETH vs. ETC in 2016). Upgrades like London (EIP-1559) and the Merge are coordinated via community consensus."},
        {"que": "EVM", "ans": "Ethereum Virtual Machine executes smart contracts in a sandboxed environment. Gas fees limit computation, preventing infinite loops. Compilers convert high-level code (Solidity) to EVM bytecode."},
        {"que": "Blockchain 3.0", "ans": "Focuses on scalability (Polkadot, Solana) and interoperability via cross-chain bridges. DAOs and DApps aim for fully decentralized internet infrastructure, though governance remains a challenge."},
        {"que": "Consensus mechanisms", "ans": "Solve Byzantine Generals' Problem via PoW (Bitcoin), PoS (Ethereum), or delegated systems (DPoS). They ensure agreement in decentralized networks despite malicious actors."},
        {"que": "Blockchain 2.0", "ans": "Ethereum's introduction of smart contracts (2015) expanded blockchain beyond currency to programmable apps (DeFi, NFTs). Challenges include scalability and high gas fees during congestion."},
        {"que": "Blockchain governance models", "ans": "Vary from Bitcoin's decentralized BIP process to enterprise consortiums (Hyperledger). PoS chains often use on-chain voting (MakerDAO), while others rely on core dev teams."},
        {"que": "Staking", "ans": "Locking crypto (e.g., ETH) to validate PoS networks. Validators earn rewards but risk penalties (slashing) for downtime or fraud. Platforms like Lido offer liquid staking derivatives."},
        {"que": "HTLCs", "ans": "Hash Time-Locked Contracts enable cross-chain atomic swaps. A payer locks funds with a hash; the payee must reveal the preimage within a timeframe to claim payment, ensuring trustless exchanges."},
        {"que": "Nonces", "ans": "Unique numbers in blocks (PoW) or transactions to prevent replay attacks. In Bitcoin mining, miners vary the nonce to find a valid block hash."},
        {"que": "Smart Contracts", "ans": "Code-based agreements executing automatically when conditions are met. Used in DeFi for loans (Compound) and exchanges (Uniswap), they require audits to prevent exploits (e.g., The DAO hack)."},
        {"que": "Tokens", "ans": "Digital assets on blockchains, created via smart contracts (ERC-20). They represent ownership (NFTs), utility (governance tokens), or stable value (USDC), enabling diverse applications."},
        {"que": "Ledger", "ans": "A blockchain's public, append-only record of transactions. Immutable and transparent, it allows anyone to audit activity without central oversight."},
        {"que": "Security concerns of Bitcoin", "ans": "While Bitcoin's protocol is secure, exchanges (Mt. Gox) and wallets remain hack targets. Cold storage and multisig wallets mitigate risks, but phishing and user error persist."},
        {"que": "Bitcoin's volatility", "ans": "Driven by speculative trading, regulatory news, and macroeconomic trends. Derivatives (CME futures) and institutional adoption (ETFs) have dampened volatility compared to early years."},
        {"que": "Bitcoin's supply", "ans": "Capped at 21 million, with ~19 million mined by 2023. Halvings every 210,000 blocks (~4 years) reduce issuance geometrically, mimicking gold's scarcity. Final BTC will mine around 2140."},
        {"que": "What is the blockchain trilemma?", "ans": "The challenge of balancing decentralization, security, and scalability. Bitcoin prioritizes decentralization/security; Solana emphasizes scalability. Layer-2 solutions (Lightning Network) and sharding (Ethereum) aim to resolve this."},
        {"que": "How are regulators responding to blockchain?", "ans": "Implementing KYC/AML rules for exchanges, scrutinizing stablecoin reserves, and testing CBDCs. The EU's MiCA and U.S. executive orders aim to balance innovation with consumer protection."},
        {"que": "What are sidechains?", "ans": "Independent blockchains (e.g., Liquid Network) pegged to Bitcoin, enabling faster transactions or privacy features. They use two-way pegs to move assets between chains, balancing innovation with Bitcoin's security."},
        {"que": "What are smart contracts?", "ans": "Self-executing code on blockchains (e.g., Ethereum) that automate agreements. They enable trustless transactions (e.g., token swaps) but rely on oracles (Chainlink) for external data, introducing potential vulnerabilities."},
        {"que": "Why is Ethereum shifting to PoS?", "ans": "To reduce energy consumption (~99.95% post-Merge) and enable scalability via sharding. PoS also lowers entry barriers for validators compared to ASIC mining, promoting decentralization."},
        {"que": "What is DeSci?", "ans": "Decentralized Science uses blockchain for transparent research funding, data sharing, and peer review. It combats issues like publication bias and IP disputes via DAOs and tokenized incentives."},
        {"que": "What are Bitcoin Improvement Proposals (BIPs)?", "ans": "Community-driven standards for upgrading Bitcoin. Examples include BIP-32 (HD wallets), BIP-141 (SegWit), and BIP-340 (Schnorr signatures). They undergo peer review before implementation, ensuring backward compatibility."},
        {"que": "What is the 51% attack risk?", "ans": "If a entity controls >50% of a network's hash rate, they can reverse transactions or double-spend. Large networks like Bitcoin are resilient due to prohibitive costs (billions in hardware/energy), but smaller chains are vulnerable."},
        {"que": "How does Bitcoin mining work?", "ans": "Miners compete to solve cryptographic puzzles using ASICs. The first to solve it adds a block, earning BTC rewards (~6.25 BTC/block in 2023) and fees. Difficulty adjusts every 2,016 blocks to maintain ~10-minute block times."},
        {"que": "What are stablecoins?", "ans": "Tokens pegged to stable assets like the USD. Centralized types (USDC, USDT) hold reserves; algorithmic ones (DAI) use smart contracts to balance supply/demand. They reduce crypto volatility risks in trading and payments."},
        {"que": "How does DeFi differ from traditional finance?", "ans": "DeFi uses smart contracts on blockchains (e.g., Ethereum) to automate lending, trading, and derivatives without intermediaries. It offers global access and transparency but faces risks like smart contract bugs and regulatory uncertainty."},
        {"que": "What are PoS attack risks?", "ans": "Grinding attacks (manipulating randomness), prediction attacks (bribing validators), and long-range attacks (rewriting old blocks). PoS chains mitigate these via penalties, diversified validator selection, and checkpointing."},
        {"que": "What are PoS advantages?", "ans": "Lower energy costs, faster transactions, and validator accountability via staking. Malicious actors lose staked funds, enhancing security. It supports scalability solutions like sharding, enabling networks like Ethereum to process thousands of transactions per second."},
        {"que": "What is PoS?", "ans": "Proof of Stake replaces mining with validators who stake tokens to propose/blocks. Selection depends on stake size and duration, reducing energy use. Validators earn fees but risk losing stakes (slashing) for malicious acts. Ethereum's shift to PoS cut energy use by 99.95%."},
        {"que": "What are PoW drawbacks?", "ans": "High energy consumption (Bitcoin uses ~130 TWh/year) and centralization risks due to specialized mining hardware (ASICs) and mining pools. These factors contradict Bitcoin's original decentralized vision and raise environmental concerns."},
        {"que": "How do crypto wallets secure assets?", "ans": "Wallets store private keys, which control blockchain access. Cold wallets (offline, e.g., Ledger) offer high security; hot wallets (online, e.g., MetaMask) prioritize convenience. Losing the private key or seed phrase results in irreversible fund loss, emphasizing self-custody responsibility."},
        {"que": "What is the double-spending problem?", "ans": "The risk of digital currency being spent twice, absent a central authority. Blockchain solves this by timestamping transactions and validating them via consensus. Once recorded, transactions are irreversible, preventing duplication. PoW and PoS mechanisms further secure the ledger against such attacks."},
        {"que": "What are altcoins?", "ans": "Cryptocurrencies other than Bitcoin, such as Litecoin (LTC) or Ethereum (ETH). Some are Bitcoin forks with minor tweaks, while others, like Ethereum, introduce smart contracts or unique features. They often serve niche roles, like privacy (Monero) or scalability (Solana), but many are speculative investments."},
        {"que": "How does blockchain technology work?", "ans": "Transactions are broadcast to a network of nodes, validated via consensus (e.g., mining in PoW), and grouped into blocks. Each block contains a cryptographic hash of the prior block, ensuring tamper resistance. Once validated, blocks are added permanently to the chain, visible to all participants. This decentralization prevents single points of failure or control."},
        {"que": "What is Blockchain?", "ans": "A decentralized, distributed ledger that records transactions across a network of computers. Each transaction is grouped into a block, cryptographically linked to the previous one, creating an immutable chain. It relies on consensus mechanisms (e.g., PoW, PoS) for validation, eliminating the need for central authority. Bitcoin pioneered its use, but applications now span finance, supply chain, and more."}
    ]

    try:
        # Process static data
        static_qa = scraper.scrape_static_qa(qna_data)
        scraper.store_qa_pairs(static_qa)
        all_qa_pairs.extend(static_qa)  # Collect static data

        # Dynamic sources: (URL, name of the QAScraper method that handles it)
        sources = [
            ("https://www.gemini.com/cryptopedia", "scrape_gemini_cryptopedia"),
            ("https://www.coindesk.com/learn/", "scrape_coindesk_learn"),
            ("https://cointelegraph.com/crypto-101", "scrape_cointelegraph_101"),
            ("https://docs.trezor.io/", "scrape_trezor_docs"),
            ("https://developers.ledger.com/", "scrape_ledger_docs"),
            ("https://docs.keplr.app/", "scrape_keplr_docs"),
            ("https://docs.cosmos.network/", "scrape_cosmos_docs"),
            ("https://wiki.polygon.technology/docs/", "scrape_polygon_docs"),
            ("https://docs.metamask.io/guide/", "scrape_metamask_docs"),
            ("https://ethereum.org/en/developers/docs/", "scrape_ethereum_docs"),
            ("https://github.com/bitcoin/bips", "scrape_bitcoin_bips"),
        ]

        for url, method_name in sources:
            if scraper.is_recently_scraped(url):
                print(f"Skipping recent source: {url}")
                continue

            try:
                method = getattr(scraper, method_name)
                dynamic_qa = method(url)
                if not dynamic_qa:
                    # The scrapers swallow their own errors and return [],
                    # so an empty result may be a failure. Do not mark the
                    # source as scraped, otherwise it would be skipped for
                    # the whole freshness window.
                    print(f"No data scraped from {url}; it will be retried on the next run.")
                    continue
                scraper.store_qa_pairs(dynamic_qa)
                scraper.update_source_record(url)
                all_qa_pairs.extend(dynamic_qa)  # Collect dynamic data
                time.sleep(2)  # pause between sources
            except Exception as e:
                print(f"Failed to scrape {url} using {method_name}: {str(e)}")

        # Save the data collected during this run to JSONL.
        # NOTE: sources skipped as "recent" contribute nothing here; their
        # rows exist only in the database.
        save_jsonl(all_qa_pairs, "database_0520.jsonl.gz")

        # Database Statistics
        print("\nDatabase Statistics:")
        scraper.cursor.execute('SELECT COUNT(*) FROM qa_pairs')
        print(f"Total Q&A pairs: {scraper.cursor.fetchone()[0]}")

        print("\nCategories:")
        scraper.cursor.execute('SELECT category, COUNT(*) FROM qa_pairs GROUP BY category')
        for category, count in scraper.cursor.fetchall():
            print(f"{category}: {count}")

    finally:
        # Ensure scraper is closed even if an error occurs
        scraper.close()

Question already exists, skipping insertion: Hardware wallets
Question already exists, skipping insertion: Cryptocurrency privacy
Question already exists, skipping insertion: HD wallet vulnerability
Question already exists, skipping insertion: Staking in Ethereum
Question already exists, skipping insertion: EIP process
Question already exists, skipping insertion: The Merge (Ethereum)
Question already exists, skipping insertion: Ethereum's weaknesses
Question already exists, skipping insertion: Ethereum's strengths
Question already exists, skipping insertion: Ethereum forking
Question already exists, skipping insertion: EVM
Question already exists, skipping insertion: Blockchain 3.0
Question already exists, skipping insertion: Consensus mechanisms
Question already exists, skipping insertion: Blockchain 2.0
Question already exists, skipping insertion: Blockchain governance models
Question already exists, skipping insertion: Staking
Question already exists, skipping insertion: HTLCs
Quest