In [1]:
# Clean up existing installations
!pip uninstall -y numpy torch transformers 2>/dev/null || echo "Cleanup complete"

# Core package installation
!pip install -q --upgrade pip
!pip install -q \
    numpy==1.26.4 \
    pandas==2.2.2 \
    scipy==1.13.0 \
    scikit-learn==1.3.2 \
    spacy==3.7.4 \
    pdfplumber==0.11.0 \
    requests==2.31.0

# PyTorch installation with CUDA 12.1 support
!pip install -q \
    torch==2.2.1+cu121 \
    torchvision==0.17.1+cu121 \
    torchaudio==2.2.1+cu121 \
    --index-url https://download.pytorch.org/whl/cu121

# NLP and ML ecosystem
!pip install -q \
    transformers==4.41.2 \
    peft==0.10.0 \
    datasets==2.18.0 \
    accelerate==0.29.1 \
    bitsandbytes==0.43.0 \
    sentence-transformers==3.4.1

# SpaCy model
!python -m spacy download en_core_web_lg

Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Found existing installation: torch 2.6.0+cu124
Uninstalling torch-2.6.0+cu124:
  Successfully uninstalled torch-2.6.0+cu124
Found existing installation: transformers 4.51.3
Uninstalling transformers-4.51.3:
  Successfully uninstalled transformers-4.51.3
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
fastai 2.7.19 requires torch<2.7,>=1.10, which is not installed.[0m[31m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m55.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m67.4 MB/s[0m eta [36m0:00:0

In [2]:
import re
import pdfplumber
import spacy
import requests
import pandas as pd
import torch
import time
from io import BytesIO
from typing import Dict, List
from collections import OrderedDict
from spacy.matcher import Matcher
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    pipeline,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import Dataset

class SectionDetector:
    def __init__(self, nlp=None):
        self.nlp = nlp or spacy.load("en_core_web_lg")
        self._initialize_section_patterns()
        self._refresh_matcher()

    def _initialize_section_patterns(self):
        self.section_hierarchy = {
            'abstract': {'level': 1, 'patterns': [[{"LOWER": {"REGEX": r"^(abstract|summary)$"}}]]},
            'introduction': {'level': 1, 'patterns': [[{"LOWER": {"IN": ["introduction", "intro"]}}]]},
            'methods': {'level': 1, 'patterns': [[{"LOWER": {"IN": ["methods", "methodology"]}}]]},
            'results': {'level': 1, 'patterns': [[{"LOWER": {"IN": ["results", "findings"]}}]]},
            'discussion': {'level': 1, 'patterns': [[{"LOWER": {"IN": ["discussion", "analysis"]}}]]},
            'conclusion': {'level': 1, 'patterns': [[{"LOWER": {"IN": ["conclusion", "summary"]}}]]},
            'references': {'level': 1, 'patterns': [[{"LOWER": "references"}]]}
        }

    def _refresh_matcher(self):
        self.matcher = Matcher(self.nlp.vocab)
        for section, info in self.section_hierarchy.items():
            for pattern in info['patterns']:
                self.matcher.add(section.upper(), [pattern])

    def process_document(self, text: str) -> OrderedDict:
        doc = self.nlp(text)
        matches = sorted(self.matcher(doc), key=lambda x: x[1])
        sections = OrderedDict()
        current_section = "header"
        last_end = 0

        for match_id, start, end in matches:
            section_name = self.nlp.vocab.strings[match_id].lower()
            content = doc[last_end:start].text.strip()
            if content:
                if current_section not in sections:
                    sections[current_section] = []
                sections[current_section].append(content)
            current_section = section_name
            last_end = end

        if last_end < len(doc):
            if current_section not in sections:
                sections[current_section] = []
            sections[current_section].append(doc[last_end:].text.strip())
            
        return self._postprocess_sections(sections)

    def _postprocess_sections(self, raw_sections: Dict) -> OrderedDict:
        processed = OrderedDict()
        previous_level = 0
        
        for section, content_list in raw_sections.items():
            content = "\n".join(content_list)
            current_level = self.section_hierarchy.get(section.lower(), {}).get('level', 1)
            
            if current_level > previous_level:
                processed[section] = content
                previous_level = current_level
            else:
                if processed:
                    last_section = next(reversed(processed))
                    processed[last_section] += "\n\n" + content
                    
        return processed

class PaperProcessor:
    def __init__(self, detector):
        self.detector = detector
        self.session = self._create_session()

    def _create_session(self):
        session = requests.Session()
        retries = Retry(
            total=5,
            backoff_factor=1,
            status_forcelist=[500, 502, 503, 504],
            allowed_methods=["GET"]
        )
        
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive'
        }
        
        session.mount('https://', HTTPAdapter(max_retries=retries))
        session.mount('http://', HTTPAdapter(max_retries=retries))
        session.headers.update(headers)
        return session

    def process_paper(self, url):
        text = self._get_paper_text(url)
        return self.detector.process_document(text) if text else None

    def _get_paper_text(self, url):
        try:
            time.sleep(2)  # Increased delay for rate limiting
            response = self.session.get(url, timeout=60, stream=True)
            response.raise_for_status()
            
            content_type = response.headers.get('Content-Type', '')
            if 'application/pdf' not in content_type and 'octet-stream' not in content_type:
                print(f"URL {url} doesn't return a PDF (Content-Type: {content_type})")
                return None
                
            with BytesIO() as pdf_buffer:
                for chunk in response.iter_content(chunk_size=8192):
                    pdf_buffer.write(chunk)
                pdf_buffer.seek(0)
                
                try:
                    with pdfplumber.open(pdf_buffer) as pdf:
                        return "\n".join(page.extract_text() or '' for page in pdf.pages)
                except pdfplumber.PDFSyntaxError:
                    print(f"PDF parsing failed for {url}")
                    return None
                    
        except requests.exceptions.RequestException as e:
            print(f"Error processing {url}: {str(e)}")
            return None
        except Exception as e:
            print(f"Unexpected error processing {url}: {str(e)}")
            return None

def setup_environment():
    import os
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    os.environ["OMP_NUM_THREADS"] = "1"
    torch.backends.cudnn.benchmark = True

def load_model(model_name="gpt2"):
    compute_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=torch.cuda.is_available(),
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=True
    ) if torch.cuda.is_available() else None

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map="auto",
        torch_dtype=compute_dtype
    )
    return model

def train_model(model, tokenizer, dataset_path):
    df = pd.read_csv(dataset_path)
    dataset = Dataset.from_pandas(df[['text']])

    def tokenize_fn(examples):
        return tokenizer(
            examples["text"],
            truncation=True,
            max_length=512,
            padding="max_length",
            return_tensors="pt"
        )

    tokenized_dataset = dataset.map(tokenize_fn, batched=True)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    peft_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["c_attn", "c_proj", "c_fc"],  # GPT-2 specific modules
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        fan_in_fan_out=True  # Important for GPT-2
    )

    training_args = TrainingArguments(
        output_dir="./results",
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        num_train_epochs=3,  # Increased from 1
        learning_rate=1e-5,  # Reduced from 2e-5
        weight_decay=0.01,  # Added to prevent overfitting
        fp16=torch.cuda.is_available(),
        logging_steps=10,
        optim="adamw_torch",
        evaluation_strategy="steps",  # Added validation
        eval_steps=100,
        save_strategy="steps",
        save_steps=200,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        report_to="none",
        repetition_penalty=1.2,  # Added to reduce repetition
        length_penalty=1.0
    )

    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, peft_config)
    model.config.use_cache = False
    model.print_trainable_parameters()

    # Split your dataset
    dataset = dataset.train_test_split(test_size=0.1)
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],  # Added validation set
        data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    )
    
    trainer.train()
    return model

def save_model(model, tokenizer, output_dir):
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Model saved to {output_dir}")

def load_for_inference(model_path):
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path)
    return pipeline("text-generation", model=model, tokenizer=tokenizer)

def generate_text(pipeline, prompt, max_length=200):
    # Enhanced prompt template
    structured_prompt = (
        f"Question: {prompt}\n\n"
        "Answer concisely and technically accurate:\n"
    )
    
    return pipeline(
        structured_prompt,
        max_length=max_length,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.5,  # Increased from 1.2
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id
    )[0]['generated_text']

2025-05-28 00:51:54.217001: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748393514.522785      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748393514.602738      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
  warn("The installed version of bitsandbytes was compiled without GPU support. "


/usr/local/lib/python3.11/dist-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32


In [3]:
if __name__ == "__main__":
    setup_environment()

    # 1. Paper processing with multiple fallback URLs
    print("=== Starting Paper Processing ===")
    detector = SectionDetector()
    processor = PaperProcessor(detector)
    
    paper_urls = [
        "https://arxiv.org/pdf/2307.12874",
        "https://arxiv.org/pdf/2303.12940",
        "https://arxiv.org/pdf/1802.04351",
        "https://arxiv.org/pdf/2306.08168",
        "https://arxiv.org/pdf/2503.15964",
        "https://www.jetir.org/papers/JETIR2405D82.pdf",
        "https://www.cs.ucf.edu/~czou/research/subWallet-Blockchain-2019.pdf",
        "https://www.cs.ucf.edu/~czou/research/Hossein-TrustCom-2020.pdf",
        "https://www.cs.ucf.edu/~czou/research/HosseinDissertation-2020.pdf",
        "https://dl.gi.de/server/api/core/bitstreams/aaa640a1-f8dd-4514-ad72-b809932072cc/content",
        "https://eprint.iacr.org/2023/062.pdf",
        "https://eprint.iacr.org/2022/075.pdf",    
        "https://eprint.iacr.org/2023/1234.pdf",
        "https://eprint.iacr.org/2020/300.pdf",
        "https://eprint.iacr.org/2023/312.pdf",
        "https://policyreview.info/pdf/policyreview-2016-3-427.pdf",
        "https://eprint.iacr.org/2016/013.pdf",
        "https://arxiv.org/pdf/1906.00245",
        "https://escholarship.org/content/qt7fh678d6/qt7fh678d6.pdf?t=pn651y",
        "https://re.public.polimi.it/bitstream/11311/1056221/6/11311-1056221%20Giudici.pdf",
        "https://research-api.cbs.dk/ws/files/44436178/ole_bjerg_how_is_bitcoin_money_postprint.pdf",
        "https://www.bis.org/fsi/publ/insights49.pdf",
        "https://www.scirp.org/pdf/ojbm_1534496.pdf",
        "https://www.bis.org/publ/work1066.pdf",
        "http://khcnbinhduong.gov.vn/ImageUpload/file/TTTK%20KCN/2019/Nguon%20tin%20KHCN/Blockchain_A3.pdf",
        "https://e-space.mmu.ac.uk/627269/1/Manuscript_Final%20JCLP.pdf",
        "https://pdfs.semanticscholar.org/9900/c9c91f9f78fa0adb6915855084396654363c.pdf?_gl=1*7q1z9h*_gcl_au*MTkxMDg1NzA4NC4xNzQ4MDIxMDA4*_ga*Mjc1MDg5MDkuMTc0ODAyMTAwOA..*_ga_H7P4ZT52H5*czE3NDgwMjEwMDckbzEkZzEkdDE3NDgwMjExNzkkajE1JGwwJGgwJGR1YWNJOGg3VW43bWFscGZjZ056LU5TM0lXc0Jtc0drMW93",
        "https://www.newyorkfed.org/medialibrary/media/research/epr/2024/EPR_2024_digital-assets_azar.pdf",
        "https://journals.law.harvard.edu/hblr/wp-content/uploads/sites/87/2025/03/04_HLB_15_1_Noked171-216.pdf",
        "https://www.stern.nyu.edu/sites/default/files/2024-07/Glucksman_Sak_2024.pdf",
        "https://www.tigta.gov/sites/default/files/reports/2024-07/2024300030fr_0.pdf",
        "https://www.fsb.org/uploads/Crypto-Council-for-Innovation.pdf",
        "https://www.cs.ucf.edu/~czou/research/HosseinDissertation-2020.pdf",
        "https://ndbf.nebraska.gov/sites/default/files/industries/Digital%20Asset%20Depository%20Nebraska%20Custody%20and%20Fiduciary%20Services%20Examination%20Manual.pdf",
        "https://www.swlegal.com/media/filer_public/2d/f7/2df70b84-cb3c-4578-9943-8b3ea024abf9/sw_nl_january_2024_english.pdf",
        "https://www.willkie.com/-/media/files/publications/2024/12/law360---sec-custody-rule-creates-crypto-compliance-conundrum.pdf",
        "https://www.henrystewartpublications.com/sites/default/files/Opportunities%20in%20digital%20assets%20and%20digital%20custody-Tracking%20the%20modernisation%20of%20standard%20custody%20offering%20-%20Ignatowicz%20%26%20Taudes%20JSOC%2015-3.pdf",
        "https://www.gdf.io/wp-content/uploads/2019/02/GDF-Crypto-Asset-Safekeeping_20-April-2019-2-cust-providers-additions-1-2.pdf",
        "https://www.occ.gov/topics/charters-and-licensing/interpretations-and-actions/2020/int1170.pdf",
        "https://www.gemini.com/static/documents/guide-to-crypto-custody.pdf",
        "https://orbilu.uni.lu/bitstream/10993/62083/1/ZetzscheSinnigNikolakopoulou_Crypto%20custody_CMLJ%202024.pdf",
        "https://www.esrb.europa.eu/pub/pdf/reports/esrb.cryptoassetsanddecentralisedfinance202305~9792140acd.en.pdf",
        "https://repository.uel.ac.uk/download/df676586f4e9f8a89df529a36841d83d4750539805189a8951032ee4c2f0c16c/99798/challenges-and-approaches-to-regulating-decentralized-finance.pdf",
        "https://repository.uel.ac.uk/download/ca8bad2f5fab17596c44927643b4da1473ef7ef79862fe3ca05ea9251bd4db8b/1599957/Financial%20Crime%20update%20%282020%29.pdf",
        "https://www.iacpcybercenter.org/wp-content/uploads/2018/03/Bitcoin.pdf",
        "https://www.ussc.gov/sites/default/files/pdf/training/Podcasts/SPT_Emerging-Tech-Terms.pdf",
        "https://www.ussc.gov/sites/default/files/pdf/training/annual-national-training-seminar/2018-materials/emerging-tech_glossary-crypto.pdf",
        "https://www.ussc.gov/sites/default/files/pdf/training/annual-national-training-seminar/2018-materials/emerging-tech_glossary-phishing.pdf",
        "https://www.ussc.gov/sites/default/files/pdf/training/annual-national-training-seminar/2018/Emerging_Tech_Bitcoin_Crypto.pdf",
        "https://www.ussc.gov/sites/default/files/pdf/training/annual-national-training-seminar/2019/emerging-tech_white-paper.pdf",
        "https://openaccess.uoc.edu/bitstream/10609/151551/1/Rahmanikivi_cbt22_empirical.pdf",
        "https://ics.uci.edu/~dabrowsa/dabrowski-defi21-hwwallet.pdf",
        "https://fc19.ifca.ai/preproceedings/93-preproceedings.pdf",
        "https://www.jkroll.com/papers/bitcoin_threshold_signatures.pdf",
        "https://corporates.db.com/files/documents/publications/db-polygo-digital-id-wp-42pp-web-secured.pdf",
        "https://www.napier.ac.uk/-/media/worktribe/output-2839021/smart-contract-attacks-and-protections.ashx",
        "https://www.cyprusbarassociation.org/images/6._Crypto_Wallets.pdf",
        "https://computerscience.unicam.it/marcantoni/tesi/Ethereum%20Smart%20Contracts%20Optimization.pdf",
        "https://cspecc.utsa.edu/publications/files/Refereed_Papers/2020_Choo_BCPPA-blockchain-cond-priv-auth-prot.pdf",
        "https://www.ekonomika.org.rs/sr/PDF/ekonomika/2019/clanci19-3/7.pdf",
        "https://assets.cureusjournals.com/artifacts/upload/review_article/pdf/1099/20250319-214523-194a3z.pdf"
    ]
    
    processed_data = []
    for url in paper_urls:
        print(f"\nAttempting to process: {url}")
        sections = processor.process_paper(url)
        if sections:
            full_text = "\n\n".join(sections.values())
            processed_data.append({"text": full_text})
            print(f"Successfully processed paper from {url}")
            break  # Stop after first successful download
        else:
            print(f"Failed to process paper from {url}")
    
    if not processed_data:
        print("\nError: Could not process any papers. Using sample data instead.")
        processed_data.append({
            "text": "Blockchain is a distributed ledger technology that enables secure transactions. Consensus mechanisms like Proof of Work and Proof of Stake validate transactions."
        })

    # 2. Data preparation
    pd.DataFrame(processed_data).to_csv("processed_papers.csv", index=False)
    print("\nSaved processed papers to processed_papers.csv")

    # 3. Model training with error handling
    print("\n=== Starting Model Training ===")
    try:
        base_model = load_model()
        tokenizer = AutoTokenizer.from_pretrained("gpt2")
        tokenizer.pad_token = tokenizer.eos_token
        
        trained_model = train_model(base_model, tokenizer, "processed_papers.csv")
        
        # 4. Model persistence
        save_model(trained_model, tokenizer, "trained_model")

        # 5. Inference demonstration
        print("\n=== Testing Model Generation ===")
        gen_pipeline = load_for_inference("trained_model")
        test_prompts = [
            "Explain blockchain consensus mechanisms:",
            "What are the benefits of zero-knowledge proofs?",
            "Describe smart contract security considerations:"
        ]
        
        for prompt in test_prompts:
            print(f"\nPrompt: {prompt}")
            response = generate_text(gen_pipeline, prompt)
            print("Response:", response.split("\n")[0])  # Show first line of response
            
    except Exception as e:
        print(f"\nError during model training/inference: {str(e)}")
        print("Falling back to pretrained model for demonstration...")
        gen_pipeline = pipeline("text-generation", model="gpt2")
        print("\nSample generation with pretrained model:")
        print(generate_text(gen_pipeline, "Explain blockchain:"))

=== Starting Paper Processing ===

Attempting to process: https://arxiv.org/pdf/2307.12874
Successfully processed paper from https://arxiv.org/pdf/2307.12874

Saved processed papers to processed_papers.csv

=== Starting Model Training ===


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]


Error during model training/inference: TrainingArguments.__init__() got an unexpected keyword argument 'repetition_penalty'
Falling back to pretrained model for demonstration...


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.



Sample generation with pretrained model:
Question: Explain blockchain:

Answer concisely and technically accurate:
. The key difference between a bitcoin, an Ethereum or Ripple is that each has its own set of cryptographic algorithms used to produce the digital asset (e-money). For example there are several different cryptocurrencies which use various blockchains in order for their transactions to be processed efficiently - ethereum's blocksize limit allows transaction processing within one second whereas bitcoins have two minutes; with more than half these systems being built on top as well! In this way they all come together quite seamlessly through distributed consensus mechanisms like Bitcoin Core & Dogecoin. As you can see from my recent article I've described some basic concepts about how money works using cryptography by analogy...I'm sure other people will find it helpful here if anyone wants clarification...so let me know what questions do your readers want answered so we cou