In [1]:
# Standard library imports
import re
from dataclasses import dataclass
from typing import Optional

# Third-party imports
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import (
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    PreTrainedModel,
    PretrainedConfig
)

# library here 
from utils import * 

def retrieve_citations(
    model,
    query_text: str,
    target_texts: list,
    tokenizer,
    config,
    k: int = 5,
    device = None
):
    if device is None:
        device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
    
    model.eval()
    model = model.to(device)

    # First, process the query document similar to WikiProcessor._find_citations
    citations = []
    for match in re.finditer(r'\[\[(.*?)\]\]', query_text):
        citations.append((match.start(), match.end(), match.group(1)))
    
    # Process query text similar to tokenize_sources
    query_encoded = tokenizer.encode_plus(
        query_text,
        add_special_tokens=False,
        return_offsets_mapping=True,
        padding=False,
        return_tensors=None
    )
    
    # Create offset to index mapping
    offset_mapping = query_encoded["offset_mapping"]
    off2i = {s:i for i, (s,_) in enumerate(offset_mapping)}
    off2i.update({e:i+1 for i, (_,e) in enumerate(offset_mapping)})
    
    # Create citation tokens array similar to tokenize_sources
    input_ids = query_encoded["input_ids"]
    cite_tokens = np.zeros(len(input_ids), dtype=int)
    mask_tokens = np.zeros(len(input_ids), dtype=int)
    
    # Fill in citations
    citation_indices = []
    for i, j, _ in citations:
        s, e = off2i[i], off2i[j]
        cite_tokens[s] = 1  # Using 1 as a placeholder
        mask_tokens[s:e] = 1
        citation_indices.append(s)
    
    # Prepare source similar to collate function
    mask_tokens = np.where(np.isin(input_ids, tokenizer.convert_tokens_to_ids(['[',']'])), 1, mask_tokens)
    mask_tokens[cite_tokens == 1] = 0
    input_ids = np.array(input_ids)
    input_ids[cite_tokens == 1] = tokenizer.convert_tokens_to_ids(config.cite_token)
    source_ids = input_ids[mask_tokens == 0]
    
    # Pad or truncate source
    if len(source_ids) > config.source_len:
        source_ids = source_ids[:config.source_len]
    else:
        source_ids = np.pad(source_ids, 
                           (0, config.source_len - len(source_ids)),
                           'constant', 
                           constant_values=tokenizer.pad_token_id)
    
    # Process target texts
    target_encoded = []
    for target in target_texts:
        tokens = tokenizer.encode_plus(
            target,
            add_special_tokens=False,
            padding=False,
            return_tensors=None
        )["input_ids"]
        
        if len(tokens) >= config.target_len - 1:
            tokens = tokens[:config.target_len-1]
        tokens = np.append(tokens, tokenizer.convert_tokens_to_ids(config.ref_token))
        
        if len(tokens) < config.target_len:
            tokens = np.pad(tokens,
                          (0, config.target_len - len(tokens)),
                          'constant',
                          constant_values=tokenizer.pad_token_id)
        
        target_encoded.append(tokens)
    
    target_ids = torch.tensor(target_encoded, dtype=torch.long).to(device)
    source_ids = torch.tensor(source_ids, dtype=torch.long).unsqueeze(0).to(device)
    attention_mask = (source_ids != tokenizer.pad_token_id).to(device)
    target_attention_mask = (target_ids != tokenizer.pad_token_id).to(device)

    logit_scale = torch.clamp(model.logit_scale, 0, torch.log(torch.tensor(100.0)))
    # Get embeddings
    with torch.no_grad():
        # Get source embeddings
        source_outputs = model.transformer(
            input_ids=source_ids,
            attention_mask=attention_mask,
            return_dict=True
        )
        
        # Get target embeddings
        target_outputs = model.transformer(
            input_ids=target_ids,
            attention_mask=target_attention_mask,
            return_dict=True
        )
        
        # Extract citation and reference embeddings
        cite_mask = model.get_citation_masks(source_ids)
        cite_embeds = source_outputs.last_hidden_state[cite_mask]
        
        ref_mask = model.get_reference_masks(target_ids)
        ref_embeds = target_outputs.last_hidden_state[ref_mask]
        
        # Normalize embeddings
        cite_embeds = F.normalize(cite_embeds, p=2, dim=-1)
        ref_embeds = F.normalize(ref_embeds, p=2, dim=-1)

        logit_scale = model.logit_scale.exp()
        
        # Compute similarity scores
        logits = torch.matmul(cite_embeds, ref_embeds.t()) * logit_scale
        scores = F.softmax(logits, dim=-1)
        
        # Get top k for each citation
        top_k_scores, top_k_indices = torch.topk(scores, k=min(k, len(target_texts)), dim=1)
    
    # Prepare results
    results = []
    for i, (_, _, citation_text) in enumerate(citations):
        top_matches = []
        for j, idx in enumerate(top_k_indices[i]):
            top_matches.append({
                'text': target_texts[idx],
                'score': float(top_k_scores[i][j])
            })
        results.append({
            'citation_text': citation_text,
            'matches': top_matches
        })
    
    return results


# Load model and tokenizer
# Create model config
config = ExperimentConfig(collate_sample_size=50000,)
config.device = 'cuda:1'

experiment = Experiment(config)

tokenizer = experiment.get_tokenizer()

model = experiment.get_model()
# Initialize model
checkpoint = torch.load('./experiments/best_citation_model_4.pt')
model.load_state_dict(checkpoint['model_state_dict'])


  from .autonotebook import tqdm as notebook_tqdm
  checkpoint = torch.load('./experiments/best_citation_model_4.pt')


<All keys matched successfully>

In [8]:
checkpoint['validation_metrics']

KeyError: 'validation_metrics'

In [44]:
# Extended database of target texts and example queries for citation matching

def print_citation_results(results, max_preview_length=200):
    """
    Print the citation retrieval results in a readable format.
    
    Args:
        results: List of results from retrieve_citations
        max_preview_length: Maximum length of text preview to show
    """
    for i, result in enumerate(results, 1):
        print(f"\nCitation {i}: [[{result['citation_text']}]]")
        print("\nTop matches:")
        for j, match in enumerate(result['matches'], 1):
            preview = match['text'][:max_preview_length]
            if len(match['text']) > max_preview_length:
                preview += "...\n"
            # print(f"\n{j}. Score: {match['score']:.4f}")
            print(f"Preview: {preview}")
        print("\n" + "="*80)

target_texts = [
    # Original papers from input (keeping the most significant ones)
    """Attention Is All You Need introduces the transformer architecture, a novel sequence transduction model based entirely on attention mechanisms, dispensing with recurrence and convolutions entirely. The proposed model, called the Transformer, applies self-attention to compute representations of its input and output without using sequence-aligned recurrent neural networks (RNNs) or convolution. Experiments on translation tasks demonstrate superior quality while being more parallelizable and requiring significantly less time to train.""",
    
    """BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding presents a new language representation model that uses bidirectional training of Transformer, a popular attention model, to pre-train deep bidirectional representations from unlabeled text. BERT achieves state-of-the-art performance on eleven natural language processing tasks.""",
    
    """GPT-3: Language Models are Few-Shot Learners demonstrates that scaling up language models greatly improves task-agnostic, few-shot performance. Using a transformer architecture with 175 billion parameters, GPT-3 achieves strong performance on many NLP tasks and benchmarks without any fine-tuning, sometimes matching or exceeding state-of-the-art performance.""",

    # Additional NLP/Transformer papers
    """RoBERTa: A Robustly Optimized BERT Pretraining Approach presents key modifications to BERT training procedure, including training the model longer, with bigger batches, over more data; removing the next sentence prediction objective; training on longer sequences; and dynamically changing the masking pattern applied to the training data.""",
    
    """PaLM: Scaling Language Modeling with Pathways introduces a 540-billion parameter language model trained using the Pathways system, which enables efficient training across multiple TPU pods. The model demonstrates breakthrough performance on hundreds of language tasks and exhibits reasoning capabilities that approach human-level performance.""",
    
    """LLaMA: Open and Efficient Foundation Language Models presents a collection of foundation language models ranging from 7B to 65B parameters. These models outperform larger models like GPT-3 while using significantly less training compute, demonstrating the effectiveness of focused architectural choices and training procedures.""",

    # Vision Transformers and Multi-modal
    """ViT: An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale shows that a pure transformer applied directly to sequences of image patches can perform very well on image classification tasks. When pre-trained on large amounts of data, Vision Transformers (ViT) attain excellent results compared to state-of-the-art convolutional networks.""",
    
    """DALL·E: Creating Images from Text demonstrates the capability to generate images from text descriptions, leveraging a transformer that autoregressively models the text and image tokens as a single stream of data. The model can create images of objects in novel combinations not explicitly present in the training data.""",
    
    """Stable Diffusion: High-Resolution Image Synthesis with Latent Diffusion Models presents a computationally efficient approach to image generation using a two-stage model: a low-dimensional latent diffusion model and an autoencoder for high-resolution image synthesis, enabling both conditional and unconditional image generation.""",

    # Reinforcement Learning and Games
    """MuZero: Mastering Atari, Go, Chess and Shogi by Planning with a Learned Model presents a new algorithm that combines tree-based search with a learned model to achieve superhuman performance in a range of challenging domains without requiring knowledge of their underlying dynamics.""",
    
    """AlphaFold 2: Highly accurate protein structure prediction with AlphaFold describes a deep learning system that can predict protein structures with atomic accuracy, even for proteins whose structures had not been previously determined experimentally, representing a major advance in protein structure prediction.""",

    # Foundational ML/DL Papers
    """Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift introduces a technique to normalize each layer's inputs, allowing much higher learning rates and acting as a regularizer. This significantly reduces the number of training steps required to train deep networks.""",
    
    """Large Language Models Can Self-Improve explores how language models can improve their own responses through self-reflection and iteration, showing that models can generate better responses by decomposing problems, generating multiple candidate responses, and selecting the best ones.""",
    
    """Chain-of-Thought Prompting Elicits Reasoning in Large Language Models demonstrates that prompting language models to generate step-by-step reasoning before producing an answer significantly improves their performance on complex reasoning tasks.""",

    # Additional ML Applications
    """Wav2Vec: Unsupervised Pre-training for Speech Recognition presents a self-supervised approach to speech representation learning, which can be fine-tuned with limited labeled data to achieve strong performance on speech recognition tasks.""",
    
    """CLIP: Learning Transferable Visual Models From Natural Language Supervision demonstrates efficient learning of visual concepts from natural language supervision. The model can be applied to any visual classification benchmark by providing the names of the visual categories in natural language.""",
    
    """Codex: Evaluating Large Language Models Trained on Code explores the capabilities of language models trained on code, showing they can translate natural language to code, explain complex algorithms, and complete partial code snippets with reasonable accuracy."""
]

# Example queries with citation placeholders
example_queries_2 = [
    """The [[Stable Diffusion]] model revolutionized image generation by making it computationally efficient and accessible to the masses.""",
    
    """[[Chain-of-Thought Prompting]] demonstrated a crucial technique for improving language model reasoning capabilities.""",
    
    """The introduction of [[ViT]] showed that transformers could be effectively applied to computer vision tasks.""",
    
    """[[RoBERTa]] significantly improved upon BERT's performance by modifying its training procedure.""",
    
    """[[CLIP]] demonstrated how natural language supervision could be used to create versatile visual models.""",
    
    """[[PaLM]] showed remarkable reasoning capabilities approaching human-level performance in various tasks.""",
    
    """[[Wav2Vec]] introduced innovative self-supervised learning techniques for speech recognition.""",
    
    """[[Codex]] demonstrated the potential of large language models for code generation and understanding.""",
    
    """[[AlphaFold 2]] revolutionized protein structure prediction with unprecedented accuracy.""",
    
    """[[LLaMA]] proved that efficient training procedures could match larger models' performance with fewer parameters."""
]

# Example queries with citation placeholders - conceptual descriptions
example_queries = [
    """[[REF]] A breakthrough in image synthesis came with a two-stage diffusion approach using a low-dimensional latent space combined with an autoencoder, making high-quality image generation computationally feasible on consumer hardware.""",
    
    """[[REF]] The discovery that asking large language models to break down their reasoning process into steps before providing a final answer dramatically improved their problem-solving abilities.""",
    
    """[[REF]] By treating images as sequences of patches and applying transformer architectures directly to these sequences, researchers demonstrated that convolutional neural networks weren't necessary for computer vision.""",
    
    """[[REF]] Through careful optimization of training procedures, including longer training times, larger batches, and dynamic masking patterns, researchers significantly enhanced the performance of bidirectional transformer models.""",
    
    """[[REF]] A major advance in computer vision came through training models to match images with natural language descriptions, creating visual representations that could generalize to any visual classification task.""",
    
    """[[REF]] The 540-billion parameter model trained on the Pathways system marked a significant milestone in language model capabilities, approaching human-level performance across hundreds of tasks.""",
    
    """[[REF]] Self-supervised learning revolutionized speech recognition by creating robust representations from unlabeled audio data that could be fine-tuned with minimal labeled data.""",
    
    """[[REF]] The development of specialized language models trained on programming data demonstrated that AI could understand, generate, and translate between different programming languages.""",
    
    """[[REF]] A breakthrough in computational biology arrived with deep learning systems capable of predicting protein structures with atomic accuracy, even for previously unstudied proteins.""",
    
    """[[REF]] The development of foundation models ranging from 7B to 65B parameters showed that careful architecture design and training procedures could achieve state-of-the-art results with significantly reduced computational requirements.""",
    
    """[[REF]] The introduction of attention mechanisms that completely replaced recurrent and convolutional operations transformed the field of natural language processing.""",
    
    """[[REF]] A new approach to object detection reframed the problem as direct regression of bounding boxes and class probabilities, enabling real-time performance.""",
    
    """[[REF]] By training two models in opposition - one generating fake data and another detecting fakes - researchers created a framework for learning complex data distributions.""",
    
    """[[REF]] The discovery that randomly dropping out neurons during training could prevent neural networks from becoming overly dependent on specific features greatly improved generalization.""",
    
    """[[REF]] A method for efficient stochastic optimization that adapts learning rates based on first and second moments of the gradients became the de facto standard for training neural networks.""",
    
    """[[REF]] The introduction of skip connections allowed for the successful training of extremely deep neural networks, revolutionizing computer vision architectures.""",
    
    """[[REF]] A technique for normalizing layer inputs in neural networks dramatically accelerated training by reducing internal covariate shift.""",
    
    """[[REF]] The development of a pure reinforcement learning system that mastered Go without human knowledge demonstrated the potential of learning complex strategies from first principles.""",
    
    """[[REF]] A framework for examining the behavior of large language models as knowledge bases revealed their capacity to store and retrieve factual information.""",
    
    """[[REF]] The creation of a system that could solve competitive programming problems at a human-competitive level marked a significant advance in automated software development."""
]

# The rest of the code (target_texts and functions) remains the same


# Example usage:
if __name__ == "__main__":
    for query in example_queries:
    # query = example_queries[1]  # Use first example query
        print(query)
        results = retrieve_citations(
            model=model,
            query_text=query,
            target_texts=target_texts,
            tokenizer=tokenizer,
            config=config,
            k=5
        )
        print_citation_results(results)

[[REF]] A breakthrough in image synthesis came with a two-stage diffusion approach using a low-dimensional latent space combined with an autoencoder, making high-quality image generation computationally feasible on consumer hardware.

Citation 1: [[REF]]

Top matches:
Preview: Stable Diffusion: High-Resolution Image Synthesis with Latent Diffusion Models presents a computationally efficient approach to image generation using a two-stage model: a low-dimensional latent diffu...

Preview: ViT: An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale shows that a pure transformer applied directly to sequences of image patches can perform very well on image classificati...

Preview: DALL·E: Creating Images from Text demonstrates the capability to generate images from text descriptions, leveraging a transformer that autoregressively models the text and image tokens as a single str...

Preview: Attention Is All You Need introduces the transformer architecture, a novel seque