In [1]:
# Clean up existing installations
!pip uninstall -y numpy torch transformers 2>/dev/null || echo "Cleanup complete"

# Core package installation
!pip install -q --upgrade pip
!pip install -q \
    numpy==1.26.4 \
    pandas==2.2.2 \
    scipy==1.13.0 \
    scikit-learn==1.3.2 \
    spacy==3.7.4 \
    pdfplumber==0.11.0 \
    requests==2.31.0

# PyTorch installation with CUDA 12.1 support
!pip install -q \
    torch==2.2.1+cu121 \
    torchvision==0.17.1+cu121 \
    torchaudio==2.2.1+cu121 \
    --index-url https://download.pytorch.org/whl/cu121

# NLP and ML ecosystem
!pip install -q \
    transformers==4.41.2 \
    peft==0.10.0 \
    datasets==2.18.0 \
    accelerate==0.29.1 \
    bitsandbytes==0.43.0 \
    sentence-transformers==3.4.1

# SpaCy model
!python -m spacy download en_core_web_lg

Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Found existing installation: torch 2.6.0+cu124
Uninstalling torch-2.6.0+cu124:
  Successfully uninstalled torch-2.6.0+cu124
Found existing installation: transformers 4.51.3
Uninstalling transformers-4.51.3:
  Successfully uninstalled transformers-4.51.3
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
fastai 2.7.19 requires torch<2.7,>=1.10, which is not installed.[0m[31m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m67.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m80.5 MB/s[0m eta [36m0:00:00

In [2]:
import re
import pdfplumber
import spacy
import heapq
from typing import Dict, List
from collections import OrderedDict, Counter
from spacy.matcher import Matcher

In [3]:
class SectionDetector:
    def __init__(self, nlp=None):
        self.nlp = nlp or spacy.load("en_core_web_lg")
        self._initialize_section_patterns()
        self._refresh_matcher()

    def _initialize_section_patterns(self):
        self.section_hierarchy = {
            'abstract': {'level': 1, 'patterns': [[{"LOWER": {"REGEX": r"^(abstract|summary)$"}}]]},
            'introduction': {'level': 1, 'patterns': [[{"LOWER": {"IN": ["introduction", "intro"]}}]]},
            'methods': {'level': 1, 'patterns': [[{"LOWER": {"IN": ["methods", "methodology"]}}]]},
            'results': {'level': 1, 'patterns': [[{"LOWER": {"IN": ["results", "findings"]}}]]},
            'discussion': {'level': 1, 'patterns': [[{"LOWER": {"IN": ["discussion", "analysis"]}}]]},
            'conclusion': {'level': 1, 'patterns': [[{"LOWER": {"IN": ["conclusion", "summary"]}}]]},
            'references': {'level': 1, 'patterns': [[{"LOWER": "references"}]]}
        }

    def _refresh_matcher(self):
        self.matcher = Matcher(self.nlp.vocab)
        for section, info in self.section_hierarchy.items():
            for pattern in info['patterns']:
                self.matcher.add(section.upper(), [pattern])

    def process_document(self, text: str) -> OrderedDict:
        doc = self.nlp(text)
        matches = sorted(self.matcher(doc), key=lambda x: x[1])
        sections = OrderedDict()
        current_section = "header"
        last_end = 0

        for match_id, start, end in matches:
            section_name = self.nlp.vocab.strings[match_id].lower()
            content = doc[last_end:start].text.strip()
            if content:
                sections[current_section] = sections.get(current_section, []) + [content]
            current_section = section_name
            last_end = end

        if last_end < len(doc):
            sections[current_section] = sections.get(current_section, []) + [doc[last_end:].text.strip()]
            
        return self._postprocess_sections(sections)

    def _postprocess_sections(self, raw_sections: Dict) -> OrderedDict:
        processed = OrderedDict()
        previous_level = 0
        for section, content in raw_sections.items():
            current_level = self.section_hierarchy.get(section.lower(), {}).get('level', 1)
            if current_level > previous_level:
                processed[section] = content
                previous_level = current_level
            else:
                if processed:
                    last_section = next(reversed(processed))
                    processed[last_section] += "\n" + content
        return processed

class PaperProcessor:
    def __init__(self, detector):
        self.detector = detector

    def process_paper(self, url):
        text = self._get_paper_text(url)
        return self.detector.process_document(text) if text else None

    def _get_paper_text(self, url):
        try:
            response = requests.get(url, timeout=10)
            with pdfplumber.open(BytesIO(response.content)) as pdf:
                return "\n".join(page.extract_text() or '' for page in pdf.pages)
        except Exception as e:
            print(f"Error processing {url}: {str(e)}")
            return ""

In [4]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    pipeline
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

def setup_environment():
    """Configure system settings for optimal performance"""
    import os
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    os.environ["OMP_NUM_THREADS"] = "1"
    torch.backends.cudnn.benchmark = True

def load_model(model_name="gpt2"):
    """Load model with flexible quantization support"""
    compute_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=torch.cuda.is_available(),
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=True
    ) if torch.cuda.is_available() else None

    return AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map="auto",
        torch_dtype=compute_dtype
    )

def train_model(model, tokenizer, dataset_path):
    """Complete training workflow"""
    # Data preparation
    from datasets import Dataset
    df = pd.read_csv(dataset_path)
    dataset = Dataset.from_pandas(df[['text']])

    # Tokenization
    def tokenize_fn(examples):
        return tokenizer(
            examples["text"],
            truncation=True,
            max_length=512,
            padding="max_length"
        )

    tokenized_dataset = dataset.map(tokenize_fn, batched=True)

    # LoRA configuration
    peft_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["attn.c_attn", "attn.c_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    )

    # Training setup
    training_args = TrainingArguments(
        output_dir="./results",
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        num_train_epochs=3,
        learning_rate=2e-5,
        fp16=torch.cuda.is_available(),
        logging_steps=20,
        optim="adamw_torch",
        report_to="none"
    )

    # Model preparation
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

    # Training execution
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
    )
    trainer.train()
    return model

2025-05-27 23:27:13.725316: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748388433.985839      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748388434.058701      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
  warn("The installed version of bitsandbytes was compiled without GPU support. "


/usr/local/lib/python3.11/dist-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32


In [5]:
def save_model(model, tokenizer, output_dir):
    """Save model artifacts with proper formatting"""
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Model saved to {output_dir}")

def load_for_inference(model_path):
    """Load trained model for generation"""
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path)
    return pipeline("text-generation", model=model, tokenizer=tokenizer)

def generate_text(pipeline, prompt, max_length=200):
    """Generation with temperature sampling"""
    return pipeline(
        prompt,
        max_length=max_length,
        do_sample=True,
        temperature=0.7,
        top_p=0.9
    )[0]['generated_text']

In [6]:
if __name__ == "__main__":
    # Environment configuration
    setup_environment()

    # 1. Paper processing
    detector = SectionDetector()
    processor = PaperProcessor(detector)
    
    paper_urls = [
        "https://arxiv.org/pdf/2307.12874",
        "https://arxiv.org/pdf/2303.12940",
        "https://arxiv.org/pdf/1802.04351",
        "https://arxiv.org/pdf/2306.08168",
        "https://arxiv.org/pdf/2503.15964",
        "https://www.jetir.org/papers/JETIR2405D82.pdf",
        "https://www.cs.ucf.edu/~czou/research/subWallet-Blockchain-2019.pdf",
        "https://www.cs.ucf.edu/~czou/research/Hossein-TrustCom-2020.pdf",
        "https://www.cs.ucf.edu/~czou/research/HosseinDissertation-2020.pdf",
        "https://dl.gi.de/server/api/core/bitstreams/aaa640a1-f8dd-4514-ad72-b809932072cc/content",
        "https://eprint.iacr.org/2023/062.pdf",
        "https://eprint.iacr.org/2022/075.pdf",    
        "https://eprint.iacr.org/2023/1234.pdf",
        "https://eprint.iacr.org/2020/300.pdf",
        "https://eprint.iacr.org/2023/312.pdf",
        "https://policyreview.info/pdf/policyreview-2016-3-427.pdf",
        "https://eprint.iacr.org/2016/013.pdf",
        "https://arxiv.org/pdf/1906.00245",
        "https://escholarship.org/content/qt7fh678d6/qt7fh678d6.pdf?t=pn651y",
        "https://re.public.polimi.it/bitstream/11311/1056221/6/11311-1056221%20Giudici.pdf",
        "https://research-api.cbs.dk/ws/files/44436178/ole_bjerg_how_is_bitcoin_money_postprint.pdf",
        "https://www.bis.org/fsi/publ/insights49.pdf",
        "https://www.scirp.org/pdf/ojbm_1534496.pdf",
        "https://www.bis.org/publ/work1066.pdf",
        "http://khcnbinhduong.gov.vn/ImageUpload/file/TTTK%20KCN/2019/Nguon%20tin%20KHCN/Blockchain_A3.pdf",
        "https://e-space.mmu.ac.uk/627269/1/Manuscript_Final%20JCLP.pdf",
        "https://pdfs.semanticscholar.org/9900/c9c91f9f78fa0adb6915855084396654363c.pdf?_gl=1*7q1z9h*_gcl_au*MTkxMDg1NzA4NC4xNzQ4MDIxMDA4*_ga*Mjc1MDg5MDkuMTc0ODAyMTAwOA..*_ga_H7P4ZT52H5*czE3NDgwMjEwMDckbzEkZzEkdDE3NDgwMjExNzkkajE1JGwwJGgwJGR1YWNJOGg3VW43bWFscGZjZ056LU5TM0lXc0Jtc0drMW93",
        "https://www.newyorkfed.org/medialibrary/media/research/epr/2024/EPR_2024_digital-assets_azar.pdf",
        "https://journals.law.harvard.edu/hblr/wp-content/uploads/sites/87/2025/03/04_HLB_15_1_Noked171-216.pdf",
        "https://www.stern.nyu.edu/sites/default/files/2024-07/Glucksman_Sak_2024.pdf",
        "https://www.tigta.gov/sites/default/files/reports/2024-07/2024300030fr_0.pdf",
        "https://www.fsb.org/uploads/Crypto-Council-for-Innovation.pdf",
        "https://www.cs.ucf.edu/~czou/research/HosseinDissertation-2020.pdf",
        "https://ndbf.nebraska.gov/sites/default/files/industries/Digital%20Asset%20Depository%20Nebraska%20Custody%20and%20Fiduciary%20Services%20Examination%20Manual.pdf",
        "https://www.swlegal.com/media/filer_public/2d/f7/2df70b84-cb3c-4578-9943-8b3ea024abf9/sw_nl_january_2024_english.pdf",
        "https://www.willkie.com/-/media/files/publications/2024/12/law360---sec-custody-rule-creates-crypto-compliance-conundrum.pdf",
        "https://www.henrystewartpublications.com/sites/default/files/Opportunities%20in%20digital%20assets%20and%20digital%20custody-Tracking%20the%20modernisation%20of%20standard%20custody%20offering%20-%20Ignatowicz%20%26%20Taudes%20JSOC%2015-3.pdf",
        "https://www.gdf.io/wp-content/uploads/2019/02/GDF-Crypto-Asset-Safekeeping_20-April-2019-2-cust-providers-additions-1-2.pdf",
        "https://www.occ.gov/topics/charters-and-licensing/interpretations-and-actions/2020/int1170.pdf",
        "https://www.gemini.com/static/documents/guide-to-crypto-custody.pdf",
        "https://orbilu.uni.lu/bitstream/10993/62083/1/ZetzscheSinnigNikolakopoulou_Crypto%20custody_CMLJ%202024.pdf",
        "https://www.esrb.europa.eu/pub/pdf/reports/esrb.cryptoassetsanddecentralisedfinance202305~9792140acd.en.pdf",
        "https://repository.uel.ac.uk/download/df676586f4e9f8a89df529a36841d83d4750539805189a8951032ee4c2f0c16c/99798/challenges-and-approaches-to-regulating-decentralized-finance.pdf",
        "https://repository.uel.ac.uk/download/ca8bad2f5fab17596c44927643b4da1473ef7ef79862fe3ca05ea9251bd4db8b/1599957/Financial%20Crime%20update%20%282020%29.pdf",
        "https://www.iacpcybercenter.org/wp-content/uploads/2018/03/Bitcoin.pdf",
        "https://www.ussc.gov/sites/default/files/pdf/training/Podcasts/SPT_Emerging-Tech-Terms.pdf",
        "https://www.ussc.gov/sites/default/files/pdf/training/annual-national-training-seminar/2018-materials/emerging-tech_glossary-crypto.pdf",
        "https://www.ussc.gov/sites/default/files/pdf/training/annual-national-training-seminar/2018-materials/emerging-tech_glossary-phishing.pdf",
        "https://www.ussc.gov/sites/default/files/pdf/training/annual-national-training-seminar/2018/Emerging_Tech_Bitcoin_Crypto.pdf",
        "https://www.ussc.gov/sites/default/files/pdf/training/annual-national-training-seminar/2019/emerging-tech_white-paper.pdf",
        "https://openaccess.uoc.edu/bitstream/10609/151551/1/Rahmanikivi_cbt22_empirical.pdf",
        "https://ics.uci.edu/~dabrowsa/dabrowski-defi21-hwwallet.pdf",
        "https://fc19.ifca.ai/preproceedings/93-preproceedings.pdf",
        "https://www.jkroll.com/papers/bitcoin_threshold_signatures.pdf",
        "https://corporates.db.com/files/documents/publications/db-polygo-digital-id-wp-42pp-web-secured.pdf",
        "https://www.napier.ac.uk/-/media/worktribe/output-2839021/smart-contract-attacks-and-protections.ashx",
        "https://www.cyprusbarassociation.org/images/6._Crypto_Wallets.pdf",
        "https://computerscience.unicam.it/marcantoni/tesi/Ethereum%20Smart%20Contracts%20Optimization.pdf",
        "https://cspecc.utsa.edu/publications/files/Refereed_Papers/2020_Choo_BCPPA-blockchain-cond-priv-auth-prot.pdf",
        "https://www.ekonomika.org.rs/sr/PDF/ekonomika/2019/clanci19-3/7.pdf",
        "https://assets.cureusjournals.com/artifacts/upload/review_article/pdf/1099/20250319-214523-194a3z.pdf"
    ]
    
    processed_data = []
    for url in paper_urls:
        if sections := processor.process_paper(url):
            processed_data.append({"text": "\n".join(sections.values())})

    # 2. Data preparation
    pd.DataFrame(processed_data).to_csv("processed_papers.csv", index=False)

    # 3. Model training
    base_model = load_model()
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token
    
    trained_model = train_model(base_model, tokenizer, "processed_papers.csv")
    
    # 4. Model persistence
    save_model(trained_model, tokenizer, "trained_model")

    # 5. Inference demonstration
    gen_pipeline = load_for_inference("trained_model")
    test_prompts = [
        "Explain blockchain consensus mechanisms:",
        "What are the benefits of zero-knowledge proofs?"
    ]
    
    for prompt in test_prompts:
        print(f"\nPrompt: {prompt}")
        print("Response:", generate_text(gen_pipeline, prompt))

Error processing https://arxiv.org/pdf/2307.12874: name 'requests' is not defined
Error processing https://arxiv.org/pdf/2303.12940: name 'requests' is not defined
Error processing https://arxiv.org/pdf/1802.04351: name 'requests' is not defined
Error processing https://arxiv.org/pdf/2306.08168: name 'requests' is not defined
Error processing https://arxiv.org/pdf/2503.15964: name 'requests' is not defined
Error processing https://www.jetir.org/papers/JETIR2405D82.pdf: name 'requests' is not defined
Error processing https://www.cs.ucf.edu/~czou/research/subWallet-Blockchain-2019.pdf: name 'requests' is not defined
Error processing https://www.cs.ucf.edu/~czou/research/Hossein-TrustCom-2020.pdf: name 'requests' is not defined
Error processing https://www.cs.ucf.edu/~czou/research/HosseinDissertation-2020.pdf: name 'requests' is not defined
Error processing https://dl.gi.de/server/api/core/bitstreams/aaa640a1-f8dd-4514-ad72-b809932072cc/content: name 'requests' is not defined
Error proce

NameError: name 'pd' is not defined