In [1]:
!pip install numpy==1.24.0 scipy==1.10.0 spacy==3.7.4 scikit-learn==1.2.2 pandas==1.5.3

Collecting pandas==1.5.3
  Downloading pandas-1.5.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading pandas-1.5.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m84.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hInstalling collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 2.2.3
    Uninstalling pandas-2.2.3:
      Successfully uninstalled pandas-2.2.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
dask-cudf-cu12 25.2.2 requires pandas<2.2.4dev0,>=2.0, but you have pandas 1.5.3 which is incompatible.
dask-expr 1.1.21 requires pandas>=2, but you have pandas 1.5.3 which is incompatible.
cudf-cu12 25.2.2 requires pandas<2.2.4dev0,>=2.0, but you have pandas 1.5.3

In [2]:
# English model
!python -m spacy download en_core_web_sm-3.7.0 --direct

# Chinese model (old version)
!pip install https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.7.0/zh_core_web_sm-3.7.0-py3-none-any.whl

Collecting en-core-web-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.0/en_core_web_sm-3.7.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m57.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
Installing collected packages: en-core-web-sm
  Attempting uninstall: en-core-web-sm
    Found existing installation: en_core_web_sm 3.8.0
    Uninstalling en_core_web_sm-3.8.0:
      Successfully uninstalled en_core_web_sm-3.8.0
Successfully installed en-core-web-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting zh-core-web-sm==3.7.0
  Downloading h

In [3]:
# --- Imports ---
import spacy
from spacy.matcher import Matcher
import numpy as np
import os
import re
import joblib
from collections import OrderedDict
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from spacy.lang.en import English
from spacy.lang.zh import Chinese

In [4]:
nlp_en = spacy.load("en_core_web_sm")
nlp_zh = spacy.load("zh_core_web_sm")

In [5]:
# --- Enhanced SectionDetector Class ---
class SectionDetector:
    def __init__(self, nlp=None, model_path=None):
        """Initialize with optional pre-trained model"""
        self.nlp = nlp or spacy.load("en_core_web_sm")
        self.section_classifier = None
        self.section_hierarchy = {}
        self.matcher = None
        self._initialize_section_patterns()
        
        if model_path:
            self.load_model(model_path)

    def _initialize_section_patterns(self):
        """Initialize multilingual section patterns and matcher"""
        # Base patterns for English
        self.section_hierarchy = {
            'abstract': {'level': 1, 'patterns': [
                [{"LOWER": "abstract"}],
                [{"IS_DIGIT": True}, {"LOWER": "abstract"}]
            ]},
            'introduction': {'level': 1, 'patterns': [
                [{"LOWER": "introduction"}],
                [{"IS_DIGIT": True}, {"LOWER": "introduction"}]
            ]},
            'methods': {'level': 1, 'patterns': [
                [{"LOWER": "methods"}],
                [{"LOWER": "materials"}, {"LOWER": "and"}, {"LOWER": "methods"}],
                [{"IS_DIGIT": True}, {"LOWER": "methods"}]
            ]},
            'results': {'level': 1, 'patterns': [
                [{"LOWER": "results"}],
                [{"IS_DIGIT": True}, {"LOWER": "results"}]
            ]},
            'discussion': {'level': 1, 'patterns': [
                [{"LOWER": "discussion"}],
                [{"IS_DIGIT": True}, {"LOWER": "discussion"}]
            ]},
            'conclusion': {'level': 1, 'patterns': [
                [{"LOWER": "conclusion"}],
                [{"IS_DIGIT": True}, {"LOWER": "conclusion"}]
            ]},
            'references': {'level': 1, 'patterns': [
                [{"LOWER": "references"}],
                [{"LOWER": "bibliography"}],
                [{"IS_DIGIT": True}, {"LOWER": "references"}]
            ]}
        }

        # Initialize matcher
        self.matcher = Matcher(self.nlp.vocab)
        self._refresh_matcher()

    def _refresh_matcher(self):
        """Update matcher with current section patterns"""
        self.matcher = Matcher(self.nlp.vocab)
        for section, info in self.section_hierarchy.items():
            for pattern in info['patterns']:
                self.matcher.add(section.upper(), [pattern])

    def add_custom_section(self, name, level, patterns):
        """Add domain-specific section patterns"""
        self.section_hierarchy[name] = {
            'level': level,
            'patterns': patterns
        }
        self._refresh_matcher()

    def _extract_raw_sections(self, text, use_ml):
        """Multilingual text segmentation with context-aware processing"""
        doc = self.nlp(text)
        matches = self.matcher(doc)
        sections = OrderedDict()
        current_section = "header"
        last_end = 0

        # Sort matches by start position
        matches = sorted(matches, key=lambda x: x[1])

        for match_id, start, end in matches:
            section_name = self.nlp.vocab.strings[match_id].lower()
            content = doc[last_end:start].text.strip()
            
            if content:
                sections[current_section] = sections.get(current_section, []) + [content]
            
            current_section = self._classify_section(
                doc[start:end].text, 
                section_name, 
                use_ml
            )
            
            last_end = end

        # Add remaining content
        if last_end < len(doc):
            sections[current_section] = sections.get(current_section, []) + [doc[last_end:].text.strip()]

        return {k: "\n".join(v) for k, v in sections.items()}

    def _classify_section(self, header_text, rule_based_name, use_ml):
        """Hybrid classification decision"""
        if use_ml and self.section_classifier:
            try:
                return self.section_classifier.predict([header_text])[0]
            except Exception as e:
                print(f"ML classification failed: {e}, using rule-based")
        return rule_based_name

    def _postprocess_sections(self, sections):
        """Multilingual hierarchy reconstruction"""
        hierarchy_stack = []
        final_sections = OrderedDict()
        
        for section_name, content in sections.items():
            level = self._get_section_level(section_name)
            
            while hierarchy_stack and hierarchy_stack[-1]['level'] >= level:
                hierarchy_stack.pop()
            
            hierarchy_stack.append({
                'name': section_name,
                'level': level
            })
            
            hier_key = "::".join([n['name'] for n in hierarchy_stack])
            final_sections[hier_key] = content

        return final_sections

    def _get_section_level(self, section_name):
        """Multilingual level detection with fuzzy matching"""
        clean_name = section_name.lower().strip()
        
        for section, info in self.section_hierarchy.items():
            if section in clean_name:
                return info['level']
        
        similarity_threshold = 0.8
        for section, info in self.section_hierarchy.items():
            if self.nlp(section).similarity(self.nlp(clean_name)) > similarity_threshold:
                return info['level']
        
        return 0  # Default level

    def save_model(self, path):
        """Save complete detector state"""
        state = {
            'classifier': self.section_classifier,
            'hierarchy': self.section_hierarchy,
            'nlp_config': self.nlp.config
        }
        joblib.dump(state, path)
        print(f"Full detector state saved to {path}")

    def load_model(self, path):
        """Load complete detector state"""
        state = joblib.load(path)
        self.section_classifier = state['classifier']
        self.section_hierarchy = state['hierarchy']
        self.nlp = spacy.load(state['nlp_config']['lang'])
        self._refresh_matcher()
        print(f"Full detector state loaded from {path}")

    class SpacyTransformer(BaseEstimator, TransformerMixin):
        def __init__(self, nlp):
            self.nlp = nlp
            self.tokenizer = nlp.tokenizer
            
        def transform(self, X):
            processed = []
            for text in X:
                doc = self.nlp(text)
                features = [
                    len(doc), 
                    len(list(doc.sents)),
                    sum(1 for token in doc if token.is_title),
                    sum(1 for token in doc if token.pos_ == "NOUN")
                ]
                processed.append(features)
            return np.array(processed)
            
        def fit(self, X, y=None):
            return self

    def train_classifier(self, X_train, y_train):
        self.section_classifier = Pipeline([
            ('features', FeatureUnion([
                ('tfidf', TfidfVectorizer(
                    tokenizer=self._spacy_tokenizer,
                    ngram_range=(1, 2),
                    max_features=3000
                )),
                ('spacy', self.SpacyTransformer(self.nlp))
            ])),
            ('clf', LogisticRegression(
                class_weight='balanced',
                max_iter=1000,
                C=0.1
            ))
        ])
        self.section_classifier.fit(X_train, y_train)

    def _spacy_tokenizer(self, text):
        """Language-aware tokenization"""
        doc = self.nlp(text)
        return [
            token.lemma_.lower() 
            if not token.is_oov else token.text.lower()
            for token in doc
            if not token.is_stop 
            and not token.is_punct
            and not token.is_space
        ]

    def calculate_accuracy(self, true_labels, predicted_labels):
        return {
            'precision': precision_score(true_labels, predicted_labels, average='weighted'),
            'recall': recall_score(true_labels, predicted_labels, average='weighted'),
            'f1': f1_score(true_labels, predicted_labels, average='weighted')
        }

    # --- Enhanced Multilingual Processing Pipeline ---
    def process_papers_with_sections(paper_urls, model_path=None, output_dir="/kaggle/working/papers", lang='en'):
        """Enhanced pipeline with multilingual support"""
        # [Keep full function implementation as provided]
        # ... (Full function code from original implementation)
    
    # --- Enhanced Evaluation Metrics ---
    def calculate_extended_metrics(y_true, y_pred):
        """Calculate comprehensive evaluation metrics"""
        # [Keep full function implementation as provided]
        # ... (Full function code from original implementation)
    
    # --- Updated Training Data Loading ---
    def load_training_data():
        """Sample training data - replace with actual dataset"""
        training_examples = [
            ("Abstract", "abstract"),
            ("1. Introduction", "introduction"),
            ("Methods and Materials", "methods"),
            ("Experimental Results", "results"),
            ("Discussion", "discussion"),
            ("Conclusion", "conclusion"),
            ("References", "references"),
            ("摘要", "abstract"),  # Chinese examples
            ("1. 引言", "introduction"),
            ("方法", "methods")
        ]
        
        X_train = [text for text, label in training_examples]
        y_train = [label for text, label in training_examples]
        return X_train, y_train
    
    # --- Mock PDF Text Extraction Function ---
    def get_paper_text(url):
        """Mock PDF text extraction - implement actual PDF processing"""
        # In real implementation, use PyPDF2, pdfplumber, or similar
        return f"Sample text from {url}"

In [6]:
# --- Enhanced Multilingual Processing Pipeline ---
def process_papers_with_sections(paper_urls, model_path=None, output_dir="/kaggle/working/papers", lang='en'):
    """Enhanced pipeline with multilingual support"""
    # Initialize appropriate NLP pipeline
    if lang == 'zh':
        nlp = Chinese()
        nlp.add_pipe("sentencizer")
    else:
        nlp = English()
        nlp.add_pipe("sentencizer")
    
    # Initialize detector with language-specific settings
    detector = SectionDetector(nlp=nlp)
    
    # Add Chinese section patterns
    if lang == 'zh':
        chinese_sections = {
            '摘要': {'level': 1, 'patterns': [[{"ORTH": "摘要"}]]},
            '引言': {'level': 1, 'patterns': [[{"ORTH": "引言"}]]},
            '方法': {'level': 1, 'patterns': [[{"ORTH": "方法"}]]},
            '结果': {'level': 1, 'patterns': [[{"ORTH": "结果"}]]},
            '讨论': {'level': 1, 'patterns': [[{"ORTH": "讨论"}]]},
            '结论': {'level': 1, 'patterns': [[{"ORTH": "结论"}]]},
            '参考文献': {'level': 1, 'patterns': [[{"ORTH": "参考文献"}]]}
        }
        for name, config in chinese_sections.items():
            detector.add_custom_section(name, config['level'], config['patterns'])
    
    if model_path and os.path.exists(model_path):
        detector.load_model(model_path)
        use_ml = True
    else:
        use_ml = False
    
    processed_papers = []
    for url in paper_urls:
        try:
            # Assuming get_paper_text is implemented elsewhere
            text = get_paper_text(url)
            sections = detector.extract_sections(text, use_ml=use_ml)
            processed_papers.append({
                'url': url,
                'sections': sections,
                'language': lang
            })
        except Exception as e:
            print(f"Error processing {url}: {e}")
    
    # Save results
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        for paper in processed_papers:
            filename = os.path.join(output_dir, f"{paper['url'].split('/')[-1]}.json")
            joblib.dump(paper, filename)
    
    return processed_papers

In [7]:
# --- Enhanced Evaluation Metrics ---
def calculate_extended_metrics(y_true, y_pred):
    """Calculate comprehensive evaluation metrics"""
    return {
        'precision_macro': precision_score(y_true, y_pred, average='macro'),
        'recall_macro': recall_score(y_true, y_pred, average='macro'),
        'f1_macro': f1_score(y_true, y_pred, average='macro'),
        'precision_weighted': precision_score(y_true, y_pred, average='weighted'),
        'recall_weighted': recall_score(y_true, y_pred, average='weighted'),
        'f1_weighted': f1_score(y_true, y_pred, average='weighted'),
        'class_distribution': {
            cls: {'true': y_true.count(cls), 'predicted': list(y_pred).count(cls)}
            for cls in set(y_true + y_pred)
        }
    }

In [8]:
if __name__ == "__main__":
    # Load and verify training data
    X_train, y_train = load_training_data()
    
    if not X_train or not y_train:
        raise ValueError("Training data loading failed - check data sources")
    
    if len(X_train) != len(y_train):
        raise ValueError("Mismatched training data features/labels")

    # Initialize and train detector
    detector = SectionDetector()
    detector.train_classifier(X_train, y_train)
    
    # Save model
    detector.save_model("/kaggle/working/section_classifier.joblib")
    
    # Verify model saving
    if not os.path.exists("/kaggle/working/section_classifier.joblib"):
        raise RuntimeError("Model saving failed")

    # Process papers
    papers = process_papers_with_sections(
        paper_urls = [
            "https://arxiv.org/pdf/2307.12874",
            "https://arxiv.org/pdf/2303.12940",
            "https://arxiv.org/pdf/1802.04351",
            "https://arxiv.org/pdf/2306.08168",
            "https://arxiv.org/pdf/2503.15964",
            "https://www.jetir.org/papers/JETIR2405D82.pdf",
            "https://www.cs.ucf.edu/~czou/research/subWallet-Blockchain-2019.pdf",
            "https://www.cs.ucf.edu/~czou/research/Hossein-TrustCom-2020.pdf",
            "https://www.cs.ucf.edu/~czou/research/HosseinDissertation-2020.pdf",
            "https://dl.gi.de/server/api/core/bitstreams/aaa640a1-f8dd-4514-ad72-b809932072cc/content",
            "https://eprint.iacr.org/2023/062.pdf",
            "https://eprint.iacr.org/2022/075.pdf",    
            "https://eprint.iacr.org/2023/1234.pdf",
            "https://eprint.iacr.org/2020/300.pdf",
            "https://eprint.iacr.org/2023/312.pdf",
            "https://policyreview.info/pdf/policyreview-2016-3-427.pdf",
            "https://eprint.iacr.org/2016/013.pdf",
            "https://arxiv.org/pdf/1906.00245",
            "https://escholarship.org/content/qt7fh678d6/qt7fh678d6.pdf?t=pn651y",
            "https://re.public.polimi.it/bitstream/11311/1056221/6/11311-1056221%20Giudici.pdf",
            "https://research-api.cbs.dk/ws/files/44436178/ole_bjerg_how_is_bitcoin_money_postprint.pdf",
            "https://www.bis.org/fsi/publ/insights49.pdf",
            "https://www.scirp.org/pdf/ojbm_1534496.pdf",
            "https://www.bis.org/publ/work1066.pdf",
            "http://khcnbinhduong.gov.vn/ImageUpload/file/TTTK%20KCN/2019/Nguon%20tin%20KHCN/Blockchain_A3.pdf",
            "https://e-space.mmu.ac.uk/627269/1/Manuscript_Final%20JCLP.pdf",
            "https://pdfs.semanticscholar.org/9900/c9c91f9f78fa0adb6915855084396654363c.pdf?_gl=1*7q1z9h*_gcl_au*MTkxMDg1NzA4NC4xNzQ4MDIxMDA4*_ga*Mjc1MDg5MDkuMTc0ODAyMTAwOA..*_ga_H7P4ZT52H5*czE3NDgwMjEwMDckbzEkZzEkdDE3NDgwMjExNzkkajE1JGwwJGgwJGR1YWNJOGg3VW43bWFscGZjZ056LU5TM0lXc0Jtc0drMW93",
            "https://www.newyorkfed.org/medialibrary/media/research/epr/2024/EPR_2024_digital-assets_azar.pdf",
            "https://journals.law.harvard.edu/hblr/wp-content/uploads/sites/87/2025/03/04_HLB_15_1_Noked171-216.pdf",
            "https://www.stern.nyu.edu/sites/default/files/2024-07/Glucksman_Sak_2024.pdf",
            "https://www.tigta.gov/sites/default/files/reports/2024-07/2024300030fr_0.pdf",
            "https://www.fsb.org/uploads/Crypto-Council-for-Innovation.pdf",
            "https://www.cs.ucf.edu/~czou/research/HosseinDissertation-2020.pdf",
            "https://ndbf.nebraska.gov/sites/default/files/industries/Digital%20Asset%20Depository%20Nebraska%20Custody%20and%20Fiduciary%20Services%20Examination%20Manual.pdf",
            "https://www.swlegal.com/media/filer_public/2d/f7/2df70b84-cb3c-4578-9943-8b3ea024abf9/sw_nl_january_2024_english.pdf",
            "https://www.willkie.com/-/media/files/publications/2024/12/law360---sec-custody-rule-creates-crypto-compliance-conundrum.pdf",
            "https://www.henrystewartpublications.com/sites/default/files/Opportunities%20in%20digital%20assets%20and%20digital%20custody-Tracking%20the%20modernisation%20of%20standard%20custody%20offering%20-%20Ignatowicz%20%26%20Taudes%20JSOC%2015-3.pdf",
            "https://www.gdf.io/wp-content/uploads/2019/02/GDF-Crypto-Asset-Safekeeping_20-April-2019-2-cust-providers-additions-1-2.pdf",
            "https://www.occ.gov/topics/charters-and-licensing/interpretations-and-actions/2020/int1170.pdf",
            "https://www.gemini.com/static/documents/guide-to-crypto-custody.pdf",
            "https://orbilu.uni.lu/bitstream/10993/62083/1/ZetzscheSinnigNikolakopoulou_Crypto%20custody_CMLJ%202024.pdf",
            "https://www.esrb.europa.eu/pub/pdf/reports/esrb.cryptoassetsanddecentralisedfinance202305~9792140acd.en.pdf",
            "https://repository.uel.ac.uk/download/df676586f4e9f8a89df529a36841d83d4750539805189a8951032ee4c2f0c16c/99798/challenges-and-approaches-to-regulating-decentralized-finance.pdf",
            "https://repository.uel.ac.uk/download/ca8bad2f5fab17596c44927643b4da1473ef7ef79862fe3ca05ea9251bd4db8b/1599957/Financial%20Crime%20update%20%282020%29.pdf",
            "https://www.iacpcybercenter.org/wp-content/uploads/2018/03/Bitcoin.pdf",
            "https://www.ussc.gov/sites/default/files/pdf/training/Podcasts/SPT_Emerging-Tech-Terms.pdf",
            "https://www.ussc.gov/sites/default/files/pdf/training/annual-national-training-seminar/2018-materials/emerging-tech_glossary-crypto.pdf",
            "https://www.ussc.gov/sites/default/files/pdf/training/annual-national-training-seminar/2018-materials/emerging-tech_glossary-phishing.pdf",
            "https://www.ussc.gov/sites/default/files/pdf/training/annual-national-training-seminar/2018/Emerging_Tech_Bitcoin_Crypto.pdf",
            "https://www.ussc.gov/sites/default/files/pdf/training/annual-national-training-seminar/2019/emerging-tech_white-paper.pdf",
            "https://openaccess.uoc.edu/bitstream/10609/151551/1/Rahmanikivi_cbt22_empirical.pdf",
            "https://ics.uci.edu/~dabrowsa/dabrowski-defi21-hwwallet.pdf",
            "https://fc19.ifca.ai/preproceedings/93-preproceedings.pdf",
            "https://www.jkroll.com/papers/bitcoin_threshold_signatures.pdf",
            "https://corporates.db.com/files/documents/publications/db-polygo-digital-id-wp-42pp-web-secured.pdf",
            "https://www.napier.ac.uk/-/media/worktribe/output-2839021/smart-contract-attacks-and-protections.ashx",
            "https://www.cyprusbarassociation.org/images/6._Crypto_Wallets.pdf",
            "https://computerscience.unicam.it/marcantoni/tesi/Ethereum%20Smart%20Contracts%20Optimization.pdf",
            "https://cspecc.utsa.edu/publications/files/Refereed_Papers/2020_Choo_BCPPA-blockchain-cond-priv-auth-prot.pdf",
            "https://www.ekonomika.org.rs/sr/PDF/ekonomika/2019/clanci19-3/7.pdf",
            "https://assets.cureusjournals.com/artifacts/upload/review_article/pdf/1099/20250319-214523-194a3z.pdf"
        ],
        model_path="/kaggle/working/section_classifier.joblib",
        lang='en'
    )
    
    # Evaluation (with mock true labels for demonstration)
    y_true, y_pred = [], []
    class_mapping = {
        'ABSTRACT': 'abstract',
        'INTRO': 'introduction',
        '摘要': 'abstract',
        '引言': 'introduction'
    }
    
    # Generate mock evaluation data
    for paper in papers:
        # In real implementation, use actual ground truth labels
        mock_true_labels = ['abstract', 'introduction', 'methods', 'results']
        predicted_labels = list(paper['sections'].keys())[:4]  # First 4 sections
        
        for true, pred in zip(mock_true_labels, predicted_labels):
            y_true.append(true)
            y_pred.append(pred)

    # Calculate metrics
    base_metrics = detector.calculate_accuracy(y_true, y_pred)
    extended_metrics = calculate_extended_metrics(y_true, y_pred)
    
    print("Base Metrics:")
    print(f"Weighted Precision: {base_metrics['precision']:.2f}")
    print(f"Weighted Recall: {base_metrics['recall']:.2f}")
    print(f"Weighted F1: {base_metrics['f1']:.2f}")
    
    print("\nExtended Metrics:")
    print(f"Macro Precision: {extended_metrics['precision_macro']:.2f}")
    print(f"Macro Recall: {extended_metrics['recall_macro']:.2f}")
    print(f"Macro F1: {extended_metrics['f1_macro']:.2f}")
    
    print("\nClass Distribution:")
    for cls, counts in extended_metrics['class_distribution'].items():
        print(f"{cls}: True={counts['true']}, Predicted={counts['predicted']}")

NameError: name 'X_train' is not defined