In [None]:
# Standard library imports
import re
from dataclasses import dataclass
from typing import Optional

# Third-party imports
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import (
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    PreTrainedModel,
    PretrainedConfig
)

# Project-local module; the star import is expected to supply names used below (e.g. Experiment)
from main import * 

def retrieve_citations(
    model,
    query_text: str,
    target_texts: list,
    tokenizer,
    config,
    k: int = 5,
    device = None
):
    """
    Rank candidate target texts for every ``[[...]]`` citation in a query.

    The query is tokenized, each ``[[...]]`` span is collapsed into a single
    ``config.cite_token``, and the hidden state at that token is compared
    (cosine similarity scaled by the model's logit scale, then softmax over
    all targets) with the hidden state at the ``config.ref_token`` appended
    to every target text.

    Args:
        model: Object exposing ``eval()``, ``to(device)``, ``transformer(...)``,
            ``get_citation_masks(ids)``, ``get_reference_masks(ids)`` and a
            ``logit_scale`` tensor (stored in log space, as it is exponentiated
            here).
        query_text: Text containing zero or more ``[[citation]]`` markers.
        target_texts: Candidate texts to rank for each citation.
        tokenizer: Tokenizer providing ``encode_plus`` (with offset mapping),
            ``convert_tokens_to_ids`` and ``pad_token_id``.
        config: Object providing ``source_len``, ``target_len``,
            ``cite_token`` and ``ref_token``.
        k: Maximum number of matches returned per citation.
        device: Torch device to run on. When ``None``, the second GPU is used
            if present, otherwise the first GPU, otherwise the CPU.

    Returns:
        One dict per citation found in ``query_text`` (in order of
        appearance) with keys ``'citation_text'`` and ``'matches'``;
        ``'matches'`` is a list of ``{'text', 'score'}`` dicts sorted by
        decreasing score. ``'matches'`` is empty when ``target_texts`` is
        empty or when no embedding was produced for that citation (for
        example because it fell beyond ``config.source_len``).

    Raises:
        KeyError: If a citation boundary does not coincide with a token
            boundary reported by the tokenizer's offset mapping.
    """
    # Locate the citation spans first: this needs neither the model nor the
    # tokenizer, so the degenerate cases below return without touching them.
    citations = [
        (match.start(), match.end(), match.group(1))
        for match in re.finditer(r'\[\[(.*?)\]\]', query_text)
    ]
    if not citations:
        return []
    if not target_texts:
        return [
            {'citation_text': citation_text, 'matches': []}
            for _, _, citation_text in citations
        ]

    if device is None:
        if torch.cuda.is_available():
            # Keep the historical preference for the second GPU, but fall back
            # to the first one on single-GPU hosts where "cuda:1" is invalid.
            gpu_index = 1 if torch.cuda.device_count() > 1 else 0
            device = torch.device(f"cuda:{gpu_index}")
        else:
            device = torch.device("cpu")

    model.eval()
    model = model.to(device)

    # Tokenize the raw query, keeping character offsets so citation spans can
    # be mapped onto token positions.
    query_encoded = tokenizer.encode_plus(
        query_text,
        add_special_tokens=False,
        return_offsets_mapping=True,
        padding=False,
        return_tensors=None
    )

    # Character offset -> token index. Start offsets map to the token itself,
    # end offsets map to the index just past the token (exclusive bound).
    offset_mapping = query_encoded["offset_mapping"]
    off2i = {s: i for i, (s, _) in enumerate(offset_mapping)}
    off2i.update({e: i + 1 for i, (_, e) in enumerate(offset_mapping)})

    input_ids = np.array(query_encoded["input_ids"])
    cite_tokens = np.zeros(len(input_ids), dtype=bool)  # first token of each citation
    mask_tokens = np.zeros(len(input_ids), dtype=bool)  # tokens to drop from the source

    for start, end, _ in citations:
        s, e = off2i[start], off2i[end]
        cite_tokens[s] = True
        mask_tokens[s:e] = True

    # Drop every bracket token and every token inside a citation span, except
    # the first token of each span, which is rewritten to the cite token.
    bracket_ids = tokenizer.convert_tokens_to_ids(['[', ']'])
    mask_tokens |= np.isin(input_ids, bracket_ids)
    mask_tokens[cite_tokens] = False
    input_ids[cite_tokens] = tokenizer.convert_tokens_to_ids(config.cite_token)
    source_ids = input_ids[~mask_tokens]

    # Pad or truncate the source to exactly config.source_len tokens.
    if len(source_ids) > config.source_len:
        source_ids = source_ids[:config.source_len]
    else:
        source_ids = np.pad(
            source_ids,
            (0, config.source_len - len(source_ids)),
            'constant',
            constant_values=tokenizer.pad_token_id
        )

    # Encode each target as: truncated tokens + ref token + padding.
    ref_token_id = tokenizer.convert_tokens_to_ids(config.ref_token)
    pad_token_id = tokenizer.pad_token_id
    target_encoded = []
    for target in target_texts:
        tokens = tokenizer.encode_plus(
            target,
            add_special_tokens=False,
            padding=False,
            return_tensors=None
        )["input_ids"]
        # Reserve the last slot for the ref token.
        tokens = list(tokens[:config.target_len - 1]) + [ref_token_id]
        tokens += [pad_token_id] * (config.target_len - len(tokens))
        target_encoded.append(tokens)

    target_ids = torch.tensor(target_encoded, dtype=torch.long).to(device)
    source_ids = torch.tensor(source_ids, dtype=torch.long).unsqueeze(0).to(device)
    attention_mask = source_ids != tokenizer.pad_token_id
    target_attention_mask = target_ids != tokenizer.pad_token_id

    with torch.no_grad():
        source_outputs = model.transformer(
            input_ids=source_ids,
            attention_mask=attention_mask,
            return_dict=True
        )
        target_outputs = model.transformer(
            input_ids=target_ids,
            attention_mask=target_attention_mask,
            return_dict=True
        )

        # Pick the hidden states at the cite / ref token positions.
        cite_mask = model.get_citation_masks(source_ids)
        cite_embeds = source_outputs.last_hidden_state[cite_mask]

        ref_mask = model.get_reference_masks(target_ids)
        ref_embeds = target_outputs.last_hidden_state[ref_mask]

        cite_embeds = F.normalize(cite_embeds, p=2, dim=-1)
        ref_embeds = F.normalize(ref_embeds, p=2, dim=-1)

        # Clamp the log-space scale to [0, log(100)] *before* exponentiating.
        # The previous code computed this clamp and then discarded it.
        logit_scale = torch.clamp(
            model.logit_scale, 0, float(np.log(100.0))
        ).exp()

        logits = torch.matmul(cite_embeds, ref_embeds.t()) * logit_scale
        scores = F.softmax(logits, dim=-1)

        top_k = max(0, min(k, len(target_texts)))
        top_k_scores, top_k_indices = torch.topk(scores, k=top_k, dim=1)

    score_rows = top_k_scores.tolist()
    index_rows = top_k_indices.tolist()

    if len(index_rows) < len(citations):
        # Fewer citation embeddings than citations found in the text, e.g.
        # when trailing citations were cut off by the source_len truncation.
        import warnings
        warnings.warn(
            f"Only {len(index_rows)} of {len(citations)} citations were "
            "scored; the remaining ones are returned with no matches."
        )

    results = []
    for i, (_, _, citation_text) in enumerate(citations):
        if i < len(index_rows):
            top_matches = [
                {'text': target_texts[idx], 'score': float(score)}
                for idx, score in zip(index_rows[i], score_rows[i])
            ]
        else:
            top_matches = []
        results.append({
            'citation_text': citation_text,
            'matches': top_matches
        })

    return results


# Display helper for retrieval results (the extended database of target texts and example queries for citation matching is defined in a later cell)

def print_citation_results(results, max_preview_length=500, score_cutoff=0.6):
    """
    Print the citation retrieval results in a readable format.

    For each citation the matches are printed in the order given. Printing
    for a citation stops early once the running sum of the printed scores
    exceeds ``score_cutoff``.

    Args:
        results: List of results from retrieve_citations; each item is a dict
            with 'citation_text' and 'matches' (dicts with 'text' and 'score').
        max_preview_length: Maximum length of text preview to show; longer
            texts are cut and suffixed with "...".
        score_cutoff: Cumulative score above which the remaining matches of a
            citation are skipped. Pass ``None`` to always print every match.
    """
    separator = "\n" + "=" * 80
    for i, result in enumerate(results, 1):
        print(f"\nCitation {i}: [[{result['citation_text']}]]")
        print("\nTop matches:")
        total_score = 0
        for j, match in enumerate(result['matches'], 1):
            text = match['text']
            preview = text[:max_preview_length]
            if len(text) > max_preview_length:
                preview += "..."
            print(f"\n{j}. Score: {match['score']:.4f}")
            print(f"Preview: {preview}")
            total_score += match['score']
            # Once enough probability mass has been shown, skip the tail.
            if score_cutoff is not None and total_score > score_cutoff:
                break
        print(separator)


In [None]:


# Initialize model
# Alternative checkpoints from other runs, kept for quick switching:
# checkpoint = torch.load('./checkpoints/citation-matching/decent-cherry-79/checkpoint-step-3000.pt')
# checkpoint = torch.load('./checkpoints/citation-matching/fresh-feather-80/checkpoint-step-2000.pt')
#
# NOTE: torch.load unpickles arbitrary Python objects (the checkpoint carries
# a 'config' object besides the weights), so only load trusted checkpoints.
# map_location="cpu" makes the checkpoint loadable regardless of which device
# it was saved from (including on CPU-only machines); retrieve_citations moves
# the model to the inference device afterwards.
checkpoint = torch.load(
    './checkpoints/citation-matching/wise-valley-96/checkpoint-step-5500.pt',
    map_location="cpu",
)

# The checkpoint stores the experiment configuration next to the weights.
config = checkpoint['config']

# Rebuild tokenizer and model from that configuration, then restore weights.
experiment = Experiment(config)

tokenizer = experiment.get_tokenizer()

model = experiment.get_model()

model.load_state_dict(checkpoint['model_state_dict'])


In [None]:
# Candidate pool for retrieval: one free-text summary per paper.
# The first fifteen entries correspond, in order, to the papers cited in the
# query lists defined below; the entries after them are not cited by those
# fifteen-item query lists and therefore act as additional candidates.
target_texts = [
    """Deep Residual Learning for Image Recognition introduces skip connections that allow training of very deep neural networks by addressing the vanishing gradient problem. The approach enables successful training of networks with hundreds of layers, leading to significant improvements in visual recognition tasks. The key innovation is the residual learning framework that explicitly reformulates layers as learning residual functions with reference to the layer inputs, rather than learning unreferenced functions. These skip connections allow gradients to flow directly through the network, enabling much deeper architectures to be trained effectively. Empirically, residual networks are easier to optimize and can gain accuracy from considerably increased depth, producing results substantially better than previous networks on the ImageNet dataset.""",

    """BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding presents a new language representation model that uses bidirectional training of Transformer to pre-train deep bidirectional representations from unlabeled text. Unlike previous models which looked at a text sequence either from left to right or combined left-to-right and right-to-left training, BERT jointly conditions on both left and right context in all layers. The pre-training tasks include masked language modeling where some percentage of input tokens are masked and the model attempts to predict them, and next sentence prediction where the model learns relationships between sentences. This approach enables BERT to achieve state-of-the-art performance on eleven natural language processing tasks including GLUE, MultiNLI, and SQuAD, without substantial task-specific architecture modifications.""",

    """ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators introduces a more sample-efficient pre-training task that predicts whether tokens have been replaced by a generator network. Instead of masking input tokens and training the network to reconstruct them like BERT, ELECTRA trains a discriminative model that predicts whether each token in the corrupted input was replaced by a generator network or not. This replaced token detection pre-training task is more computationally efficient than masked language modeling, requiring only a fraction of the compute to achieve similar performance. The method's efficiency comes from training the model on all input tokens rather than just a small masked subset, allowing it to learn from more positions per input.""",

    """AlphaFold 2: Highly accurate protein structure prediction presents a system that can predict protein structures with atomic accuracy, revolutionizing structural biology. The system uses a novel neural network architecture that incorporates physical and biological knowledge about protein structure, including an attention-based neural network operating over evolutionarily related sequences, and a novel equivariant attention network that builds a representation of atomic coordinates in 3D space. The model was trained end-to-end to optimize structure predictions by combining multiple objectives. It achieves unprecedented levels of accuracy on protein structure prediction, reaching accuracy competitive with experimental methods in many cases and far surpassing previous computational methods.""",

    """MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications presents a class of efficient models that use depth-wise separable convolutions to build lightweight deep neural networks. The architecture factorizes standard convolutions into depth-wise convolutions that filter input channels and point-wise convolutions that combine filtered outputs. This factorization dramatically reduces computation and model size. Additionally, the paper introduces width and resolution multipliers as parameters to further reduce computational cost. These innovations make MobileNets particularly suitable for mobile and embedded vision applications, achieving accuracy comparable to larger networks while being significantly more efficient in terms of size and computation.""",

    """Dropout: A Simple Way to Prevent Neural Networks from Overfitting proposes randomly dropping units during training as an efficient way to prevent co-adaptation of neurons. The technique temporarily removes random neurons along with their connections during training, forcing the network to learn more robust features that are useful in conjunction with many different random subsets of neurons. Each training case uses a different random configuration, equivalent to training a large ensemble of neural networks with extensive weight sharing. The method significantly reduces overfitting and provides major improvements over other regularization methods, proving particularly effective in large networks with many parameters.""",

    """Proximal Policy Optimization Algorithms presents a new family of policy gradient methods for reinforcement learning that alternate between sampling data through interaction with the environment and optimizing a surrogate objective function. The key innovation is a new objective function that enables multiple epochs of minibatch updates while preventing destructively large policy changes. PPO uses a clipped probability ratio in the objective function to penalize steps that would move the policy too far from the previous version. This approach provides the performance benefits of trust region policy optimization while being much simpler to implement and tune. The algorithm demonstrates robust performance across a wide range of tasks while requiring minimal hyperparameter tuning.""",

    """Adam: A Method for Stochastic Optimization presents an algorithm for first-order gradient-based optimization based on adaptive estimates of lower-order moments. The method computes individual adaptive learning rates for different parameters from estimates of first and second moments of the gradients. It combines the advantages of two popular methods: AdaGrad's ability to deal with sparse gradients and RMSProp's ability to deal with non-stationary objectives. Adam is computationally efficient, requires little memory, is invariant to diagonal rescaling of gradients, and is well-suited for problems that are large in terms of data and/or parameters. The paper provides proof of convergence and empirically demonstrates superior performance compared to other stochastic optimization methods.""",

    """ViT: An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale shows that a pure transformer applied directly to sequences of image patches can perform very well on image classification tasks. The approach divides an image into fixed-size patches, linearly embeds each patch, adds position embeddings, and feeds the resulting sequence of vectors to a standard Transformer encoder. When pre-trained on large amounts of data, Vision Transformers (ViT) attain excellent results compared to state-of-the-art convolutional networks while requiring substantially less computational resources to train. This work challenges the dominance of convolutional architectures in computer vision and suggests that transformer-based models might be sufficient as a general architecture for computer vision tasks.""",

    """Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift introduces a technique to normalize each layer's inputs by adjusting and scaling activations using running statistics computed during training. This addresses the internal covariate shift problem where the distribution of network activations changes during training as parameters of previous layers change. By normalizing activations to have specified mean and variance, batch normalization enables higher learning rates and acts as a regularizer. The technique dramatically reduces training time, improves performance, and in some cases eliminates the need for Dropout. It has become a standard component in most deep neural network architectures.""",

    """StyleGAN: A Style-Based Generator Architecture for Generative Adversarial Networks introduces an alternative generator architecture for GANs that enables automatic, unsupervised separation of high-level attributes and stochastic variation. The architecture builds on Progressive GANs but redesigns the generator to control image synthesis via scale-specific modifications of the latent code. This enables automatic, unsupervised separation of high-level attributes (e.g., pose, identity) from stochastic variation (e.g., freckles, hair) in the generated images. The method also introduces several improvements to the training process and introduces new ways to quantify GAN performance, leading to state-of-the-art results in terms of both quality and variation.""",

    """T5: Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer presents a unified framework that converts all text-based language problems into a text-to-text format. The approach systematically studies different pre-training objectives, architectures, unlabeled datasets, transfer approaches, and other factors on dozens of language understanding tasks. The resulting Text-to-Text Transfer Transformer (T5) framework achieves state-of-the-art results on many NLP benchmarks while being flexible enough to be fine-tuned for any text-based task. The study provides comprehensive empirical results on the relative importance of various components in the transfer learning pipeline.""",

    """DALL·E 2: Hierarchical Text-Conditional Image Generation with CLIP Latents demonstrates improved image generation capabilities through a two-stage model leveraging a frozen CLIP image encoder. The system first generates CLIP image embeddings conditioned on text descriptions, then decodes these embeddings into images. This approach enables high-quality, diverse image generation that maintains semantic consistency with input text while providing fine-grained control over generated content. The model demonstrates remarkable capabilities in combining concepts in novel ways, handling complex compositions, and maintaining artistic styles, representing a significant advance in text-to-image generation.""",

    """Rainbow: Combining Improvements in Deep Reinforcement Learning integrates multiple extensions to DQN into a single algorithm. The combined approach incorporates double Q-learning, prioritized replay, dueling networks, multi-step learning, distributional RL, and noisy nets. The paper systematically evaluates the contribution of each component and demonstrates that their combination leads to state-of-the-art performance on the Atari 2600 benchmark. This unified approach provides both improved performance and increased stability compared to previous methods, while offering insights into how different DQN improvements interact and complement each other.""",

    """EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks proposes a systematic approach to model scaling that uniformly scales network width, depth, and resolution using a compound coefficient. Unlike previous approaches that arbitrarily scaled these dimensions, EfficientNet uses neural architecture search to optimize the balance between them, leading to more efficient models. The baseline network (EfficientNet-B0) demonstrates strong performance, and when scaled up, the approach consistently outperforms previous ConvNets while using orders of magnitude fewer parameters. The method provides a new paradigm for designing efficient deep neural networks through principled scaling.""",

    # --- Entries below are shorter summaries not cited by the fifteen-item query lists ---
    """Transformer: Attention Is All You Need introduces the transformer architecture, which relies entirely on attention mechanisms to model dependencies in data sequences. By dispensing with recurrence and convolution, the architecture achieves greater parallelization and performance, laying the groundwork for advancements like BERT and GPT.""",

    """GPT-3: Language Models are Few-Shot Learners demonstrates that large-scale language models trained on diverse datasets can perform well in a wide range of tasks with minimal task-specific fine-tuning. The model’s autoregressive architecture and scale enable few-shot learning capabilities unprecedented in natural language processing.""",

    """RoBERTa: A Robustly Optimized BERT Pretraining Approach improves the BERT architecture by refining its pretraining procedure. Changes include removing the next sentence prediction task, using larger batch sizes, and training on longer sequences. These enhancements lead to better performance on downstream tasks.""",

    """Neural Ordinary Differential Equations proposes a new class of deep learning models that parameterize continuous transformations of data. These models interpret the computation graph as a discretization of an ODE solver, enabling continuous-depth architectures and applications in time-series modeling.""",

    """WaveNet: A Generative Model for Raw Audio presents a deep generative model for audio synthesis. The model uses dilated causal convolutions to model audio waveforms, achieving state-of-the-art performance in text-to-speech synthesis and music generation.""",

    """UNet: Convolutional Networks for Biomedical Image Segmentation introduces a fully convolutional network specifically designed for semantic segmentation in medical imaging. Its symmetric encoder-decoder architecture with skip connections enables precise localization and segmentation.""",

    """GAN: Generative Adversarial Networks introduces a framework for training generative models using a game-theoretic approach. Two networks—a generator and a discriminator—are trained simultaneously to produce data indistinguishable from real examples.""",

    """Pix2Pix: Image-to-Image Translation with Conditional Adversarial Networks demonstrates that conditional GANs can be used for a variety of image-to-image translation tasks, such as converting sketches to photographs or grayscale to color images.""",

    # NOTE(review): the title text in the next entry looks like that of the
    # original R-CNN paper rather than Fast R-CNN — verify before relying on it.
    """Fast R-CNN: Rich Feature Hierarchies for Accurate Object Detection and Semantic Segmentation improves object detection frameworks by introducing a Region of Interest (RoI) pooling layer. The method accelerates training and inference while improving accuracy.""",

    """YOLO: You Only Look Once proposes a unified architecture for real-time object detection that treats detection as a regression problem. By directly predicting bounding boxes and class probabilities from images, YOLO achieves state-of-the-art speed and accuracy.""",

    """DenseNet: Densely Connected Convolutional Networks introduces dense connectivity between layers in deep networks, enhancing feature reuse and gradient flow. This architecture achieves state-of-the-art performance on image classification tasks while being computationally efficient.""",

    """Swin Transformer: Hierarchical Vision Transformer Using Shifted Windows adapts the transformer architecture for computer vision by introducing a shifted window mechanism. This hierarchical structure enables efficient processing of high-resolution images.""",

    """FastText: Efficient Text Classification and Representation Learning introduces an efficient approach to learning word embeddings and performing text classification. It combines n-gram features and hierarchical softmax to achieve high-speed and high-accuracy text processing.""",

    # NOTE(review): the title text in the next entry may belong to a different
    # paper than SimCLR — verify before relying on it.
    """Self-Supervised Learning of Pretext-Invariant Representations (SimCLR) demonstrates that contrastive learning frameworks can achieve competitive performance in representation learning. SimCLR emphasizes data augmentations and a contrastive loss function to learn embeddings without labels.""",

    """VAE: Auto-Encoding Variational Bayes proposes a probabilistic generative model that learns latent variable representations by maximizing the evidence lower bound (ELBO). This approach enables unsupervised learning of compact and interpretable representations.""",

    """Graph Neural Networks (GNNs) propose a framework for learning representations of graph-structured data by iteratively aggregating information from neighboring nodes. Applications include social network analysis, molecular modeling, and recommendation systems.""",

    """XLNet: Generalized Autoregressive Pretraining for Language Understanding extends autoregressive language modeling to integrate bidirectional context. XLNet employs permutation-based training to improve upon BERT's pretraining approach, achieving state-of-the-art NLP results.""",

    """DeepDream: A Code Example for Visualizing Neural Networks popularizes techniques for visualizing the features learned by deep networks. The method involves enhancing patterns that activate specific neurons, creating dreamlike visualizations.""",

    """BigGAN: Large Scale GAN Training for High Fidelity Natural Image Synthesis demonstrates that increasing model and dataset size leads to significant improvements in GAN performance. BigGAN introduces training techniques for stability and scalability.""",

    """DeepLab: Semantic Image Segmentation with Deep Convolutional Nets, Atrous Convolutions, and Fully Connected CRFs proposes a new framework for semantic segmentation. Atrous convolutions allow dense feature extraction, while fully connected CRFs refine predictions.""",

    """LightGBM: A Highly Efficient Gradient Boosting Decision Tree introduces a scalable, efficient framework for gradient boosting. The algorithm employs histogram-based learning and leaf-wise tree growth to optimize for large datasets.""",

    """XGBoost: A Scalable Tree Boosting System introduces a highly efficient and scalable gradient boosting framework for classification and regression tasks. The framework includes novel features like weighted quantile sketch and sparsity-aware splits.""",

    """RLHF: Reinforcement Learning from Human Feedback proposes a framework for training AI systems using feedback from human evaluators. This approach enables the alignment of AI behavior with human preferences, improving safety and interpretability.""",

    """Contrastive Predictive Coding (CPC) introduces a self-supervised learning method that predicts future latent representations using contrastive losses. CPC learns representations from raw inputs without labels, achieving state-of-the-art results in various domains.""",

    """DQN: Playing Atari with Deep Reinforcement Learning presents the first application of deep Q-learning to Atari games. Using convolutional neural networks, the algorithm learns directly from raw pixel inputs to achieve human-level performance.""",

    """DeepWalk: Online Learning of Social Representations applies random walks and Skip-Gram models to graph data for unsupervised representation learning. It captures latent structures in networks, enabling tasks like node classification.""",

    # NOTE(review): T5 already has a (longer) entry earlier in this list. Since
    # retrieve_citations softmax-normalizes scores over all targets, two entries
    # for the same paper share probability mass — confirm this is intended.
    """Text-to-Text Transfer Transformer (T5) proposes a unifying framework for NLP tasks by treating all text-based problems as text-to-text transformations. T5 systematically compares pretraining objectives and achieves state-of-the-art results.""",

    """SqueezeNet: AlexNet-level Accuracy with 50x Fewer Parameters introduces a compact CNN architecture that achieves high accuracy with fewer parameters. It employs squeeze-and-expand modules to optimize network efficiency.""",

    """AdaBoost: A Boosting Algorithm for Combining Weak Learners demonstrates how weak learners can be combined iteratively to form a strong classifier. The algorithm adaptively focuses on difficult examples, improving performance.""",

    """NAS: Neural Architecture Search with Reinforcement Learning explores automated design of neural networks using reinforcement learning. The approach discovers high-performing architectures without manual tuning.""",

    """T-SNE: Stochastic Neighbor Embedding introduces a dimensionality reduction method optimized for visualization. By minimizing a KL divergence between high-dimensional and low-dimensional distributions, t-SNE creates interpretable visualizations.""",

    """DeepSVDD: Deep Support Vector Data Description applies deep learning to anomaly detection. The method trains a deep neural network to map data to a hypersphere, isolating normal samples and identifying anomalies.""",

    """OPT: Open Pretrained Transformers introduces a family of open-source language models designed to provide transparency in pretraining and evaluation. OPT models achieve performance comparable to proprietary systems like GPT.""",
]


# Example queries, one per paper, in the same order as the first fifteen
# entries of target_texts. Each query starts with the [[...]] citation marker
# and is followed by a high-level paraphrase of the cited work's contribution.
query_conceptual = [
    """[[Deep Residual Learning for Image Recognition]] A mathematical insight enabled researchers to create incredibly deep networks by adding simple bypass paths, solving a fundamental training obstacle.""",

    """[[BERT]] Scientists developed a way for computers to understand the meaning of words by examining both past and future context simultaneously in unmarked text.""",

    """[[ELECTRA]] By teaching machines to judge whether small text changes were valid or not, researchers created more efficient natural language understanding systems.""",

    """[[AlphaFold 2]] A breakthrough in scientific computing came when researchers developed a system that could deduce the 3D shape of proteins just from their chemical makeup.""",

    """[[MobileNets]] Engineers created smaller, faster neural networks by redesigning how information flows between layers, making advanced AI possible on smartphones.""",

    """[[Dropout]] Scientists discovered that temporarily deactivating random parts of a neural network during training helped it learn more robust and generalizable patterns.""",

    """[[Proximal Policy Optimization Algorithms]] A novel learning method allowed AI systems to improve at tasks like video games by repeatedly trying different strategies and measuring their success.""",

    """[[Adam]] Researchers found that adjusting the learning speed of individual parameters based on their historical behavior led to more stable and efficient training.""",

    """[[ViT]] A revolutionary approach to computer vision emerged when scientists treated images as a sequence of small squares rather than using traditional filtering techniques.""",

    """[[Batch Normalization]] By creating a standardized way to adjust the flow of information between neural network layers, researchers dramatically reduced the time needed for training.""",

    """[[StyleGAN]] Researchers created a system that could separate the fundamental characteristics of images from their random variations, enabling better control over generated content.""",

    """[[T5]] A unified approach transformed all language-related tasks into a simple format of converting one piece of text into another.""",

    """[[DALL·E 2]] By understanding the relationship between text descriptions and visual concepts, scientists created a system that could generate images matching complex descriptions.""",

    """[[Rainbow]] Combining multiple strategies for learning from experience into a single system led to dramatic improvements in game-playing AI.""",

    """[[EfficientNet]] A systematic study of neural network design revealed that carefully balancing different aspects of network architecture could achieve better results with fewer resources."""
]

# Example queries, one per paper, in the same order as the first fifteen
# entries of target_texts. Here the [[...]] citation marker appears inline,
# in the middle of a sentence that describes the cited work.
query_citation_style = [
   "The training of deep neural networks faced a significant barrier until researchers developed [[Deep Residual Learning for Image Recognition]], which introduced skip connections that fundamentally changed how we could build and train extremely deep architectures.",

   "Natural language processing underwent a revolutionary transformation with the introduction of [[BERT]], which enabled contextual understanding of text by processing it bidirectionally, setting new standards for language comprehension tasks.",

   "The efficiency of language model pre-training saw significant improvements with [[ELECTRA]], which demonstrated that training models to detect replaced tokens could achieve better results with substantially less computational resources.",

   "The field of structural biology experienced a paradigm shift when [[AlphaFold 2]] demonstrated the ability to predict protein structures with unprecedented accuracy, transforming our approach to understanding protein folding.",

   "Mobile device capabilities expanded dramatically through [[MobileNets]], which enabled efficient neural network implementations that maintained high accuracy while operating within the constraints of mobile hardware.",

   "Neural network training became more robust with the discovery of [[Dropout]], a surprisingly simple yet effective technique that prevents overfitting by randomly disabling neurons during the training process.",

   "Reinforcement learning algorithms achieved greater stability and efficiency through [[Proximal Policy Optimization Algorithms]], which introduced a more reliable approach to policy improvement in complex environments.",

   "The challenge of optimizing neural networks was significantly simplified by [[Adam]], which provided an adaptive learning rate method that has become the default choice for many deep learning applications.",

   "Computer vision architecture underwent a fundamental shift when [[ViT]] demonstrated that the transformer architecture, previously dominant in NLP, could be effectively applied to image processing by treating images as sequences of patches.",

   "The speed and stability of neural network training improved dramatically with [[Batch Normalization]], which addressed the internal covariate shift problem that had previously hindered deep network training.",

   "Image generation capabilities took a significant leap forward with [[StyleGAN]], which introduced a novel architecture that could separately control different aspects of generated images, from high-level attributes to fine details.",

   "Language model architecture was unified and simplified through [[T5]], which demonstrated that treating all NLP tasks as text-to-text transformations could lead to superior performance across a wide range of applications.",

   "The field of AI-driven image creation advanced significantly with [[DALL·E 2]], which demonstrated unprecedented capabilities in generating highly detailed and contextually accurate images from natural language descriptions.",

   "Reinforcement learning performance in complex environments improved substantially when [[Rainbow]] combined multiple algorithmic improvements into a single, more capable system.",

   "Neural network scaling efficiency was revolutionized when [[EfficientNet]] introduced a principled approach to balancing network dimensions, leading to more optimal resource utilization in deep learning systems."
]

# Evaluation queries, "trailing citation, no keywords" style (30 entries).
# Each sentence paraphrases a work's contribution and ends with the [[...]]
# marker. The first 15 entries cite the same works, in the same order, as
# query_citation_style; the remaining 15 cite additional works.
query_citation_style_no_keywords = [
    "Researchers introduced a method to connect layers in complex computational systems that addressed the issue of diminishing signals in deep configurations [[Deep Residual Learning for Image Recognition]].",

    "A novel framework enabled machines to better interpret written language by leveraging the context from both before and after each word during training [[BERT]].",

    "A new approach involved teaching models to recognize artificially modified segments in text, enabling them to learn with fewer computational resources [[ELECTRA]].",

    "Scientists made significant progress in predicting the spatial arrangement of molecular structures, setting a new standard for accuracy in biology [[AlphaFold 2]].",

    "Developers designed lightweight architectures that improved computational efficiency for applications on smaller devices without sacrificing accuracy [[MobileNets]].",

    "During training, a method involving selective suppression of random components helped systems improve their ability to generalize [[Dropout]].",

    "A game-changing algorithm allowed for controlled updates in decision-making models, preventing disruptive changes while improving outcomes [[Proximal Policy Optimization Algorithms]].",

    "An adaptive optimization technique made it easier to train systems efficiently by adjusting parameters based on previous updates and trends [[Adam]].",

    "Scientists approached visual data processing by breaking images into small segments and analyzing them as sequential elements, achieving remarkable accuracy [[ViT]].",

    "A solution to stabilize training of deep systems involved recalibrating the outputs of intermediate steps, which accelerated learning and reduced inconsistencies [[Batch Normalization]].",

    "Researchers developed a framework for generating images that could independently manipulate large-scale attributes and fine details, offering better control over the output [[StyleGAN]].",

    "A universal strategy was devised to reframe all textual problems as simple transformations from one text format to another, streamlining various applications [[T5]].",

    "Researchers created a system that could produce visual content based on descriptions, capturing intricate relationships between text and images [[DALL·E 2]].",

    "By merging several independent enhancements into one cohesive model, scientists significantly boosted performance in environments requiring learning from interaction [[Rainbow]].",

    "A structured approach to designing efficient systems revealed that adjusting specific aspects of model size proportionally could maximize performance while minimizing resource use [[EfficientNet]].",

    "A sequence-based architecture that discarded traditional recurrence methods transformed how dependencies are modeled in sequential data processing tasks [[Transformer: Attention Is All You Need]].",

    "Researchers demonstrated that significantly increasing the size of systems enabled them to perform a variety of tasks with minimal adjustment for individual applications [[GPT-3]].",

    "Improved pretraining methods resulted in a robust system for analyzing written language, optimized through changes like larger datasets and the removal of unnecessary tasks [[RoBERTa]].",

    "A mathematical model reformulated deep structures as continuous transformations, enabling a novel perspective on depth and applications in sequential data [[Neural Ordinary Differential Equations]].",

    "Engineers revolutionized audio generation by modeling data directly at the waveform level, creating a system that excelled in producing natural sound [[WaveNet]].",

    "A compact architecture was designed to perform precise segmentation of visual data by combining hierarchical encoding and decoding structures [[UNet]].",

    "A framework was introduced where two competing networks worked together to create realistic samples indistinguishable from actual data, opening a new era in generative modeling [[GAN]].",

    "By conditioning data generation on specific inputs, researchers created a system capable of translating one type of visual representation into another with high fidelity [[Pix2Pix]].",

    "A method for rapidly detecting objects redefined how visual data is processed, combining speed with accuracy through a single computational pass [[YOLO]].",

    "A system for dense information sharing between layers reduced inefficiencies in traditional designs, yielding better performance with fewer resources [[DenseNet]].",

    "Researchers introduced a hierarchical system for analyzing visual inputs, allowing for efficient and scalable processing of large and detailed data sets [[Swin Transformer]].",

    "A novel approach to self-supervised learning demonstrated how contrasting data augmentations could help systems understand structure without explicit labels [[SimCLR]].",

    "A model for learning compact representations from raw data introduced probabilistic reasoning into generating data from latent variables [[VAE]].",

    "A flexible architecture for analyzing interconnected systems provided solutions for problems involving networks of relationships, like social data and molecules [[Graph Neural Networks]].",

    "A new training framework leveraged context in multiple directions to achieve better understanding of sequential data by reordering how training examples are processed [[XLNet]]."
]



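# NOTE: target_texts and the helper functions used below (retrieve_citations,
# print_citation_results) are not redefined here; they are expected to come
# from earlier in this file.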


# Example usage:
# Runs only when this file is executed directly (not on import). Every query
# in query_citation_style_no_keywords is printed, passed to retrieve_citations
# together with the shared model/tokenizer/config and the candidate
# target_texts, and the returned results are printed.
if __name__ == "__main__":
    for query in query_citation_style_no_keywords:
        # NOTE: `query` and `results` are module-level names, so after the
        # loop they hold the values from the last iteration.
        print(query)
        results = retrieve_citations(
            model=model,
            query_text=query,
            target_texts=target_texts,
            tokenizer=tokenizer,
            config=config,
            k=30  # presumably the number of candidates to rank/return — TODO confirm
        )
        print_citation_results(results)

In [None]:
# Candidate texts, variant 1: short free-text descriptions of six animals.
# NOTE: this list is overwritten by the assignment right below before anything
# reads it, so it currently has no effect on the run at the end of this cell.
target_texts = [
     "horse is a large hoofed mammal known for its strength and speed. With powerful legs designed for running, horses can reach speeds of up to 55 mph (88 km/h). They are social animals that have been domesticated for over 6,000 years, serving humans in transportation, agriculture, and sport. " ,
     "snail is a slow-moving mollusk that carries its home, a spiral shell, on its back. These gastropods move by secreting mucus and using their muscular foot to glide along surfaces. Most land snails are herbivorous, feeding on plants, fungi, and algae. They can seal their shell opening with a layer of dried mucus to protect themselves from predators and prevent dehydration. ",
    "snake is a legless reptile that moves by flexing its body in a serpentine motion. They are carnivorous predators that can unhinge their jaws to swallow prey much larger than their head size. Snakes use their forked tongues to collect chemical information about their environment, which is processed by a special organ called the Jacobson's organ. ",
    "penguin is a flightless aquatic bird adapted perfectly for life in cold climates. Their dense feathers, which trap a layer of warm air next to their skin, and a thick layer of blubber help them survive in extreme Antarctic conditions. Despite being awkward on land, penguins are exceptional swimmers, using their wings as flippers to 'fly' through water at speeds up to 22 mph (35 km/h). ",
    "octopus is a highly intelligent cephalopod with eight arms lined with powerful suction cups. They are masters of camouflage, able to change both color and texture to match their surroundings in less than a second. Octopuses have three hearts and a distributed nervous system, with two-thirds of their neurons located in their arms. They are known for their problem-solving abilities, tool use, ",
    "giraffe is the world's tallest living land animal, with adult males reaching heights of up to 18 feet (5.5 meters). Their distinctive spotted coat pattern is unique to each individual, like human fingerprints. Despite their long necks, giraffes have the same number of neck vertebrae as humans - just seven - though each one can be over 10 inches (25 cm) long. ",
]

# Candidate texts, variant 2 (the one actually in effect): four single words,
# only one of which ("horse") is an animal.
target_texts = [
    "table",
    "chair",
    "pan",
    "horse",
]
# Query for this sanity check.
# NOTE(review): unlike the queries earlier in this file, this string contains
# no [[...]] marker, so the citation regex in retrieve_citations finds nothing
# — confirm that retrieve_citations handles a query without a citation.
query = "question: Which one is an animal  "

def display_question(question_data: dict) -> None:
    """Print a multiple-choice question, its choices and the correct answer.

    Args:
        question_data: Mapping with the keys read below:
            ``"source"`` (the question text), ``"targets"`` (a mapping of
            choice label to choice text) and ``"metadata"`` (a mapping that
            contains ``"correct_answer"``).

    Raises:
        KeyError: If any of the keys listed above is missing.
    """
    print("Question:")
    print(question_data["source"])
    print("\nChoices:")
    for choice, text in question_data["targets"].items():
        # Leading newline keeps one blank line between consecutive choices.
        print(f"\n{choice}) {text}")
    print("\nCorrect Answer:", question_data["metadata"]["correct_answer"])
    # The original ended with a stray, truncated `prin` token that raised
    # NameError after all output had been printed; it has been removed.

# Example usage:
# Runs only when this file is executed directly (not on import). Prints the
# single `query` defined above, passes it to retrieve_citations together with
# the current `target_texts`, and prints the returned results.
if __name__ == "__main__":
    print(query)
    results = retrieve_citations(
        model=model,
        query_text=query,
        target_texts=target_texts,
        tokenizer=tokenizer,
        config=config,
        # NOTE(review): k=30 is larger than the 4-entry target_texts list
        # assigned above — confirm retrieve_citations copes with k > len(targets).
        k=30
    )
    print_citation_results(results)