In [3]:
!pip install transformers

Collecting transformers
  Using cached transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Using cached huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Using cached tokenizers-0.22.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Using cached safetensors-0.6.2-cp38-abi3-win_amd64.whl.metadata (4.1 kB)
Using cached transformers-4.57.1-py3-none-any.whl (12.0 MB)
Using cached huggingface_hub-0.36.0-py3-none-any.whl (566 kB)
Using cached safetensors-0.6.2-cp38-abi3-win_amd64.whl (320 kB)
Using cached tokenizers-0.22.1-cp39-abi3-win_amd64.whl (2.7 MB)
Installing collected packages: safetensors, huggingface-hub, tokenizers, transformers
Successfully installed huggingface-hub-0.36.0 safetensors-0.6.2 tokenizers-0.22.1 transformers-4.57.1


In [41]:
import torch
import sys

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

PyTorch version: 2.5.1
CUDA available: False


In [43]:
torch_version = tuple(int(x) for x in torch.__version__.split('.')[:2])
if torch_version < (2, 6):
    print("\nPyTorch version is < 2.6")
    print("   Loading models with safetensors instead...")
    USE_SAFETENSORS = True
else:
    print("PyTorch version is compatible")
    USE_SAFETENSORS = False


PyTorch version is < 2.6
   Loading models with safetensors instead...


In [7]:
import sys
sys.path.append('..')
import torch
from transformers import AutoTokenizer, AutoModel
from src.common.data_utils import load_claims, load_corpus

In [9]:
train_claims = load_claims('../data/scifact/data/claims_train.jsonl')
corpus = load_corpus('../data/scifact/data/corpus.jsonl')
print(f"Loaded {len(train_claims)} claims, {len(corpus)} docs")

Loaded 809 claims, 5183 docs


In [11]:
def create_training_instance(claim, corpus):
    """Convert claim to model input."""
    # Use oracle retrieval (gold evidence docs)
    if claim.evidence:
        doc_id = list(claim.evidence.keys())[0]
        doc = corpus[doc_id]
        
        # Build input text
        text = claim.claim
        for sent in doc.abstract:
            text += " [SEP] " + sent
        
        # Get gold evidence mask
        evidence_mask = torch.zeros(len(doc.abstract))
        for ev_list in claim.evidence.values():
            for ev in ev_list:
                for sent_idx in ev['sentences']:
                    if sent_idx < len(doc.abstract):
                        evidence_mask[sent_idx] = 1
        
        # Get label
        label_map = {'SUPPORT': 0, 'CONTRADICT': 1, 'NOT_ENOUGH_INFO': 2}
        label = label_map[claim.label]
        
        return text, label, evidence_mask
    return None

In [13]:
example = create_training_instance(train_claims[0], corpus)
if example:
    text, label, ev_mask = example
    print(f"Input text (first 200 chars): {text[:200]}")
    print(f"Label: {label}")
    print(f"Evidence mask: {ev_mask}")

In [47]:
#Load SciBERT 
sys.path.append('..')
from transformers import AutoTokenizer, AutoModel

print("Loading SciBERT...")

# Use safetensors if PyTorch version < 2.6
if USE_SAFETENSORS:
    from huggingface_hub import snapshot_download
    import os
    
    # Download model with safetensors
    cache_dir = snapshot_download(
        "allenai/scibert_scivocab_uncased",
        ignore_patterns=["*.bin"]  # Ignore .bin files, use .safetensors
    )
    
    tokenizer = AutoTokenizer.from_pretrained(cache_dir)
    model = AutoModel.from_pretrained(cache_dir)
else:
    tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
    model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')

print(f"âœ“ Loaded SciBERT")
print(f"  Vocab size: {tokenizer.vocab_size}")
print(f"  Hidden size: {model.config.hidden_size}")
print(f"  Num layers: {model.config.num_hidden_layers}")

Loading SciBERT...


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


.gitattributes:   0%|          | 0.00/437 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

flax_model.msgpack:   0%|          | 0.00/440M [00:00<?, ?B/s]

ValueError: Due to a serious vulnerability issue in `torch.load`, even with `weights_only=True`, we now require users to upgrade torch to at least v2.6 in order to use the function. This version restriction does not apply when loading files with safetensors.
See the vulnerability report here https://nvd.nist.gov/vuln/detail/CVE-2025-32434