In [1]:
# Installation commands
!pip install numpy==1.24.0 scipy==1.10.0 spacy==3.7.4 scikit-learn==1.2.2 pandas==1.5.3 pdfplumber requests
!python -m spacy download en_core_web_lg
!python -m nltk.downloader punkt

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0mm
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Import statements
import re
import os
import joblib
import pdfplumber
import requests
import numpy as np
import spacy
import heapq
from spacy.matcher import Matcher
from io import BytesIO
from typing import Dict, List
from collections import OrderedDict, Counter
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score

# Initialize language models
nlp_en = spacy.load("en_core_web_lg")

In [3]:
class SectionDetector:
    def __init__(self, nlp=None, model_path=None):
        self.nlp = nlp or spacy.load("en_core_web_lg")
        self.section_classifier = None
        self.section_hierarchy = {}
        self.matcher = None
        self._initialize_section_patterns()
        
        if model_path:
            self.load_model(model_path)

    def add_custom_section(self, name: str, level: int, patterns: List[List[dict]]):
        self.section_hierarchy[name.lower()] = {'level': level, 'patterns': patterns}
        self._refresh_matcher()

    class SpacyTransformer(BaseEstimator, TransformerMixin):
        def __init__(self, nlp): 
            self.nlp = nlp
        def transform(self, X):
            return np.array([
                [len(doc), len(list(doc.sents)), 
                 sum(1 for t in doc if t.is_title),
                 sum(1 for t in doc if t.pos_ == "NOUN")
                ] for doc in [self.nlp(text) for text in X]])
        def fit(self, X, y=None): 
            return self

    def _initialize_section_patterns(self):
        self.section_hierarchy = {
            'abstract': {'level': 1, 'patterns': [
                [{"LOWER": {"REGEX": r"^(abstract|summary)$"}}],
                [{"IS_DIGIT": True}, {"LOWER": {"IN": ["abstract", "summary"]}}]
            ]},
            'introduction': {'level': 1, 'patterns': [
                [{"LOWER": {"IN": ["introduction", "intro"]}}],
                [{"TEXT": r"^1\.?\s?", "LOWER": "introduction"}]
            ]},
            'methods': {'level': 1, 'patterns': [
                [{"LOWER": {"IN": ["methods", "methodology"]}}],
                [{"TEXT": r"^2\.?\s?", "LOWER": "methods"}]
            ]},
            'results': {'level': 1, 'patterns': [
                [{"LOWER": {"IN": ["results", "findings"]}}],
                [{"TEXT": r"^3\.?\s?", "LOWER": "results"}]
            ]},
            'discussion': {'level': 1, 'patterns': [
                [{"LOWER": {"IN": ["discussion", "analysis"]}}],
                [{"TEXT": r"^4\.?\s?", "LOWER": "discussion"}]
            ]},
            'conclusion': {'level': 1, 'patterns': [
                [{"LOWER": {"IN": ["conclusion", "summary"]}}],
                [{"TEXT": r"^5\.?\s?", "LOWER": "conclusion"}]
            ]},
            'references': {'level': 1, 'patterns': [
                [{"LOWER": "references"}],
                [{"TEXT": r"^References$"}]
            ]}
        }
        self._refresh_matcher()

    def _refresh_matcher(self):
        self.matcher = Matcher(self.nlp.vocab)
        for section, info in self.section_hierarchy.items():
            for pattern in info['patterns']:
                self.matcher.add(section.upper(), [pattern])

    def train_classifier(self, X_train, y_train):
        if isinstance(X_train[0], tuple):
            X_train = [text for text, *_ in X_train]

        self.section_classifier = Pipeline([
            ('features', FeatureUnion([
                ('tfidf', TfidfVectorizer(
                    tokenizer=self._spacy_tokenizer,
                    token_pattern=None,
                    ngram_range=(1, 2),
                    max_features=3000,
                    lowercase=True
                )),
                ('spacy', self.SpacyTransformer(self.nlp))
            ])),
            ('clf', LogisticRegression(
                class_weight='balanced',
                max_iter=1000,
                C=0.1
            ))
        ])
        self.section_classifier.fit(X_train, y_train)

    def _spacy_tokenizer(self, text):
        doc = self.nlp(text)
        return [
            token.lemma_.lower() if not token.is_oov else token.text.lower()
            for token in doc
            if not token.is_stop and not token.is_punct and not token.is_space
        ]

    def process_document(self, text: str, use_ml: bool = True) -> OrderedDict:
        raw_sections = self._extract_raw_sections(text, use_ml)
        return self._postprocess_sections(raw_sections)

    def _extract_raw_sections(self, text, use_ml):
        doc = self.nlp(text)
        matches = self.matcher(doc)
        sections = OrderedDict()
        current_section = "header"
        last_end = 0

        matches = sorted(matches, key=lambda x: x[1])

        for match_id, start, end in matches:
            section_name = self.nlp.vocab.strings[match_id].lower()
            content = doc[last_end:start].text.strip()
            
            if content:
                sections[current_section] = sections.get(current_section, []) + [content]
            
            current_section = self._classify_section(
                doc[start:end].text, 
                section_name, 
                use_ml
            )
            last_end = end

        if last_end < len(doc):
            sections[current_section] = sections.get(current_section, []) + [doc[last_end:].text.strip()]

        return {k: "\n".join(v) for k, v in sections.items()}

    def _classify_section(self, header_text, rule_based_name, use_ml):
        if use_ml and self.section_classifier:
            try:
                return self.section_classifier.predict([header_text])[0]
            except Exception as e:
                print(f"ML classification failed: {e}, using rule-based")
        return rule_based_name

    def _postprocess_sections(self, raw_sections: Dict) -> OrderedDict:
        processed = OrderedDict()
        previous_level = 0
        
        for section, content in raw_sections.items():
            current_info = self.section_hierarchy.get(section.lower(), {})
            current_level = current_info.get('level', 1)
            
            if current_level > previous_level:
                processed[section] = content
                previous_level = current_level
            else:
                if processed:
                    last_section = next(reversed(processed))
                    processed[last_section] += "\n" + content
                else:
                    processed[section] = content
        
        return processed

    def save_model(self, path):
        state = {
            'classifier': self.section_classifier,
            'hierarchy': self.section_hierarchy,
            'nlp_lang': self.nlp.lang
        }
        joblib.dump(state, path)
    
    def load_model(self, path):
        state = joblib.load(path)
        self.section_classifier = state['classifier']
        self.section_hierarchy = state['hierarchy']
        self.nlp = spacy.load(f"{state['nlp_lang']}_core_web_sm")
        self._refresh_matcher()

    def calculate_accuracy(self, true_labels, predicted_labels):
        return {
            'precision': precision_score(true_labels, predicted_labels, average='weighted'),
            'recall': recall_score(true_labels, predicted_labels, average='weighted'),
            'f1': f1_score(true_labels, predicted_labels, average='weighted')
        }

class EnhancedSectionDetector(SectionDetector):
    def summarize_section(self, text, sentences_count=3):
        sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
        if len(sentences) <= sentences_count: 
            return text
        
        word_freq = Counter(re.findall(r'\w+', text.lower()))
        scores = [(i, sum(word_freq[word] for word in re.findall(r'\w+', sent.lower())))
                 for i, sent in enumerate(sentences)]
        top_indices = heapq.nlargest(sentences_count, range(len(scores)), key=lambda i: scores[i])
        return ' '.join([sentences[i] for i in sorted(top_indices)])

class PaperProcessor:
    def __init__(self, detector):
        self.detector = detector
        
    def process_paper(self, url):
        text = get_paper_text(url)
        if not text: 
            return None
        sections = self.detector.process_document(text)
        return {'url': url, 'sections': self._summarize_sections(sections)} if sections else None
    
    def _summarize_sections(self, sections):
        return {sec: {
            'summary': self.detector.summarize_section(content),
            'length': len(content),
            'keywords': self._extract_keywords(content)
        } for sec, content in sections.items()}
    
    def _extract_keywords(self, text, top_n=5):
        words = re.findall(r'\b\w{4,}\b', text.lower())
        return [w for w, _ in Counter(words).most_common(top_n)]

def get_paper_text(url):
    try:
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
        response.raise_for_status()
        with pdfplumber.open(BytesIO(response.content)) as pdf:
            text = "\n".join(page.extract_text() or '' for page in pdf.pages)
            return text if len(text) > 100 else ""
    except Exception as e:
        print(f"Error processing {url}: {str(e)}")
        return ""

In [4]:
if __name__ == "__main__":
    # Initialize detector with custom patterns
    detector = EnhancedSectionDetector(nlp_en)
    detector.add_custom_section('smart_contracts', 2, [
        [{"LOWER": {"IN": ["smart", "contract"]}}],
        [{"TEXT": {"REGEX": r"^4\.?\s?(contract|smart)"}}]
    ])
    
    # Process sample papers
    processor = PaperProcessor(detector)
    papers = [
        "https://arxiv.org/pdf/2307.12874",
        "https://arxiv.org/pdf/2303.12940",
        "https://arxiv.org/pdf/1802.04351",
        "https://arxiv.org/pdf/2306.08168",
        "https://arxiv.org/pdf/2503.15964",
        "https://www.jetir.org/papers/JETIR2405D82.pdf",
        "https://www.cs.ucf.edu/~czou/research/subWallet-Blockchain-2019.pdf",
        "https://www.cs.ucf.edu/~czou/research/Hossein-TrustCom-2020.pdf",
        "https://www.cs.ucf.edu/~czou/research/HosseinDissertation-2020.pdf",
        "https://dl.gi.de/server/api/core/bitstreams/aaa640a1-f8dd-4514-ad72-b809932072cc/content",
        "https://eprint.iacr.org/2023/062.pdf",
        "https://eprint.iacr.org/2022/075.pdf",    
        "https://eprint.iacr.org/2023/1234.pdf",
        "https://eprint.iacr.org/2020/300.pdf",
        "https://eprint.iacr.org/2023/312.pdf",
        "https://policyreview.info/pdf/policyreview-2016-3-427.pdf",
        "https://eprint.iacr.org/2016/013.pdf",
        "https://arxiv.org/pdf/1906.00245",
        "https://escholarship.org/content/qt7fh678d6/qt7fh678d6.pdf?t=pn651y",
        "https://re.public.polimi.it/bitstream/11311/1056221/6/11311-1056221%20Giudici.pdf",
        "https://research-api.cbs.dk/ws/files/44436178/ole_bjerg_how_is_bitcoin_money_postprint.pdf",
        "https://www.bis.org/fsi/publ/insights49.pdf",
        "https://www.scirp.org/pdf/ojbm_1534496.pdf",
        "https://www.bis.org/publ/work1066.pdf",
        "http://khcnbinhduong.gov.vn/ImageUpload/file/TTTK%20KCN/2019/Nguon%20tin%20KHCN/Blockchain_A3.pdf",
        "https://e-space.mmu.ac.uk/627269/1/Manuscript_Final%20JCLP.pdf",
        "https://pdfs.semanticscholar.org/9900/c9c91f9f78fa0adb6915855084396654363c.pdf?_gl=1*7q1z9h*_gcl_au*MTkxMDg1NzA4NC4xNzQ4MDIxMDA4*_ga*Mjc1MDg5MDkuMTc0ODAyMTAwOA..*_ga_H7P4ZT52H5*czE3NDgwMjEwMDckbzEkZzEkdDE3NDgwMjExNzkkajE1JGwwJGgwJGR1YWNJOGg3VW43bWFscGZjZ056LU5TM0lXc0Jtc0drMW93",
        "https://www.newyorkfed.org/medialibrary/media/research/epr/2024/EPR_2024_digital-assets_azar.pdf",
        "https://journals.law.harvard.edu/hblr/wp-content/uploads/sites/87/2025/03/04_HLB_15_1_Noked171-216.pdf",
        "https://www.stern.nyu.edu/sites/default/files/2024-07/Glucksman_Sak_2024.pdf",
        "https://www.tigta.gov/sites/default/files/reports/2024-07/2024300030fr_0.pdf",
        "https://www.fsb.org/uploads/Crypto-Council-for-Innovation.pdf",
        "https://www.cs.ucf.edu/~czou/research/HosseinDissertation-2020.pdf",
        "https://ndbf.nebraska.gov/sites/default/files/industries/Digital%20Asset%20Depository%20Nebraska%20Custody%20and%20Fiduciary%20Services%20Examination%20Manual.pdf",
        "https://www.swlegal.com/media/filer_public/2d/f7/2df70b84-cb3c-4578-9943-8b3ea024abf9/sw_nl_january_2024_english.pdf",
        "https://www.willkie.com/-/media/files/publications/2024/12/law360---sec-custody-rule-creates-crypto-compliance-conundrum.pdf",
        "https://www.henrystewartpublications.com/sites/default/files/Opportunities%20in%20digital%20assets%20and%20digital%20custody-Tracking%20the%20modernisation%20of%20standard%20custody%20offering%20-%20Ignatowicz%20%26%20Taudes%20JSOC%2015-3.pdf",
        "https://www.gdf.io/wp-content/uploads/2019/02/GDF-Crypto-Asset-Safekeeping_20-April-2019-2-cust-providers-additions-1-2.pdf",
        "https://www.occ.gov/topics/charters-and-licensing/interpretations-and-actions/2020/int1170.pdf",
        "https://www.gemini.com/static/documents/guide-to-crypto-custody.pdf",
        "https://orbilu.uni.lu/bitstream/10993/62083/1/ZetzscheSinnigNikolakopoulou_Crypto%20custody_CMLJ%202024.pdf",
        "https://www.esrb.europa.eu/pub/pdf/reports/esrb.cryptoassetsanddecentralisedfinance202305~9792140acd.en.pdf",
        "https://repository.uel.ac.uk/download/df676586f4e9f8a89df529a36841d83d4750539805189a8951032ee4c2f0c16c/99798/challenges-and-approaches-to-regulating-decentralized-finance.pdf",
        "https://repository.uel.ac.uk/download/ca8bad2f5fab17596c44927643b4da1473ef7ef79862fe3ca05ea9251bd4db8b/1599957/Financial%20Crime%20update%20%282020%29.pdf",
        "https://www.iacpcybercenter.org/wp-content/uploads/2018/03/Bitcoin.pdf",
        "https://www.ussc.gov/sites/default/files/pdf/training/Podcasts/SPT_Emerging-Tech-Terms.pdf",
        "https://www.ussc.gov/sites/default/files/pdf/training/annual-national-training-seminar/2018-materials/emerging-tech_glossary-crypto.pdf",
        "https://www.ussc.gov/sites/default/files/pdf/training/annual-national-training-seminar/2018-materials/emerging-tech_glossary-phishing.pdf",
        "https://www.ussc.gov/sites/default/files/pdf/training/annual-national-training-seminar/2018/Emerging_Tech_Bitcoin_Crypto.pdf",
        "https://www.ussc.gov/sites/default/files/pdf/training/annual-national-training-seminar/2019/emerging-tech_white-paper.pdf",
        "https://openaccess.uoc.edu/bitstream/10609/151551/1/Rahmanikivi_cbt22_empirical.pdf",
        "https://ics.uci.edu/~dabrowsa/dabrowski-defi21-hwwallet.pdf",
        "https://fc19.ifca.ai/preproceedings/93-preproceedings.pdf",
        "https://www.jkroll.com/papers/bitcoin_threshold_signatures.pdf",
        "https://corporates.db.com/files/documents/publications/db-polygo-digital-id-wp-42pp-web-secured.pdf",
        "https://www.napier.ac.uk/-/media/worktribe/output-2839021/smart-contract-attacks-and-protections.ashx",
        "https://www.cyprusbarassociation.org/images/6._Crypto_Wallets.pdf",
        "https://computerscience.unicam.it/marcantoni/tesi/Ethereum%20Smart%20Contracts%20Optimization.pdf",
        "https://cspecc.utsa.edu/publications/files/Refereed_Papers/2020_Choo_BCPPA-blockchain-cond-priv-auth-prot.pdf",
        "https://www.ekonomika.org.rs/sr/PDF/ekonomika/2019/clanci19-3/7.pdf",
        "https://assets.cureusjournals.com/artifacts/upload/review_article/pdf/1099/20250319-214523-194a3z.pdf"
    ]
    
    results = []
    for url in papers:
        print(f"\nProcessing: {url}")
        if result := processor.process_paper(url):
            results.append(result)
            print("Detected Sections:", list(result['sections'].keys()))
            if result['sections']:
                first_sec = next(iter(result['sections'].values()))
                print(f"Summary: {first_sec['summary'][:300]}...")

    print(f"\nProcessed {len(results)} papers successfully")


Processing: https://arxiv.org/pdf/2307.12874
Detected Sections: ['header', 'smart_contracts']
Summary: 70, no. 4, pp. 539–551, 4
2019....

Processing: https://arxiv.org/pdf/2303.12940
Detected Sections: ['header', 'smart_contracts']
Summary: [13], derived unlock pattern of the user and
the status of the foreground program without any authorization, revealing the
intensityofsecurityweaknessesinthetransmissionprocedure.Ifanintrudercan
disrupt the money transfer or query of the balances or transactions by blocking
the link between the wall...

Processing: https://arxiv.org/pdf/1802.04351
Detected Sections: ['header']
Summary: be interesting to investigate how expert users are (apparently
successfully) handling these challenges. Bitcoin presents a
newopportunityforpublickeycryptographytobecomemain-
stream, and our evaluation is a first step towards achieving
usable key management in decentralized cryptocurrencies. 10...

Processing: https://arxiv.org/pdf/2306.08168
Detected Sections: ['he