In [20]:
import os
import re
from glob import glob
from tqdm.auto import tqdm

def parse_tex_file(tex_file_path):
    try:
        with open(tex_file_path, 'r', encoding="utf-8") as file:
            tex_content = file.read()
    except UnicodeDecodeError:
        with open(tex_file_path, 'r', encoding="latin1") as file:
            tex_content = file.read()

    citation_sentences = re.findall(r'(.*?\\cite\{.*?\}.*?[\.!?])', tex_content)
    citation_sentences = [' '.join(sentence.split()) for sentence in citation_sentences]  # Remove excess whitespace
    return citation_sentences


def parse_bbl_file(bbl_file_path):
    try:
        with open(bbl_file_path, 'r', encoding="utf-8") as file:
            bbl_content = file.read()
    except UnicodeDecodeError:
        with open(bbl_file_path, 'r', encoding="latin1") as file:
            bbl_content = file.read()

    bib_entries = re.findall(r'\\bibitem\{(.*)\}\n(.*?)(?=(\\bibitem|$))', bbl_content, re.DOTALL)
    bib_entries_dict = {key: ' '.join(text.split()) for key, _, text in bib_entries}  # Remove excess whitespace
    return bib_entries_dict


def load_data_from_files(directory):
    tex_file_paths = glob(os.path.join(directory, '**/*.tex'), recursive=True)
    bbl_file_paths = glob(os.path.join(directory, '**/*.bbl'), recursive=True)

    corpus = []
    for tex_file_path in tqdm(tex_file_paths, desc='Parsing tex files'):
        corpus.extend(parse_tex_file(tex_file_path))

    bib_entries = {}
    for bbl_file_path in tqdm(bbl_file_paths, desc='Parsing bbl files'):
        bib_entries.update(parse_bbl_file(bbl_file_path))

    return corpus, bib_entries

def create_corpus_citation_masked(corpus):
    corpus_citation_masked = [re.sub(r'\\cite\{.*?\}', '<CITATION>', sentence) for sentence in tqdm(corpus)]
    return corpus_citation_masked

def create_padded_bib_entries(bib_entries):
    padded_bib_entries = {key: value + ' <BIB>' for key, value in tqdm(bib_entries.items())}
    return padded_bib_entries

# Example usage
directory_path = './sources/'
# corpus, bib_entries = load_data_from_files(directory_path)

corpus_citation_masked = create_corpus_citation_masked(corpus)
padded_bib_entries = create_padded_bib_entries(bib_entries)


  0%|          | 0/16509 [00:00<?, ?it/s]

  0%|          | 0/168 [00:00<?, ?it/s]

In [21]:
corpus_citation_masked[:5]

['In all experiments, we used $\\delta=0.01$ and a heuristic threshold $\\beta_{t,\\delta} = \\log(1/\\delta) + \\log(1+t)$ for all elimination rules and LLR stopping. This is slightly larger than the heuristic threshold proposed by <CITATION> and adopted in many recent works.',
 'We report the results on unstructured bandit instances (generated according to the procedure of Appendix \\ref{app:instances}) in Table \\ref{tab:uns_all}. The algorithm k-Learner is the unstructured variant of LinGame proposed by <CITATION>.',
 'We combined adaptive algorithms which are natively based on LLR stopping with our elimination stopping rules and, whenever possible, we extended their sampling rule to use elimination. The selected baselines are the following. For linear BAI, LinGapE \\citep{xu2018fully}, LinGame \\citep{degenne2020gamification}, Frank-Wolfe Sampling (FWS) \\citep{wang2021fast}, Lazy Track-and-Stop (TaS) \\citep{jedra2020optimal}, XY-Adaptive \\citep{soare2014best}, and RAGE \\citep{

In [None]:
from transformers import GPT2Tokenizer, GPT2Model
from torch.utils.data import Dataset, DataLoader
from torch.nn import CosineSimilarity
from torch.optim import Adam
from sklearn.model_selection import train_test_split
import torch


# Define the special tokens
special_tokens_dict = {'additional_special_tokens': ['<CITATION>', '<BIB>']}
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens(special_tokens_dict)

model = GPT2Model.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer))  # Resize the token embeddings

# Citation Dataset
class CitationDataset(Dataset):
    def __init__(self, corpus, bib_entries):
        self.corpus = corpus
        self.bib_entries = list(bib_entries.values())
        
    def __len__(self):
        return len(self.corpus)

    def __getitem__(self, idx):
        return self.corpus[idx], self.bib_entries[idx]

# Create datasets
corpus_train, corpus_val, bib_train, bib_val = train_test_split(corpus_citation_masked, padded_bib_entries, test_size=0.2)
train_dataset = CitationDataset(corpus_train, bib_train)
val_dataset = CitationDataset(corpus_val, bib_val)

# Create dataloaders
batch_size = 8  # adjust according to your GPU memory
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Optimizer
optimizer = Adam(model.parameters(), lr=1e-4)
cos = CosineSimilarity(dim=-1)

def train_model():
    model.train()
    for batch in train_loader:
        corpus_batch, bib_batch = batch
        corpus_encoded = tokenize_and_encode(corpus_batch)
        bib_encoded = tokenize_and_encode(bib_batch)

        loss = calculate_loss(corpus_encoded, bib_encoded)

        # Backward propagation
        loss.backward()

        # Update weights
        optimizer.step()

        # Zero gradients
        optimizer.zero_grad()

def evaluate_model():
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            corpus_batch, bib_batch = batch
            corpus_encoded = tokenize_and_encode(corpus_batch)
            bib_encoded = tokenize_and_encode(bib_batch)

            loss = calculate_loss(corpus_encoded, bib_encoded)
            total_loss += loss.item()

    return total_loss / len(val_loader)

# Define the training loop
num_epochs = 10
for epoch in range(num_epochs):
    train_model()
    val_loss = evaluate_model()
    print(f'Epoch {epoch+1}, Validation Loss: {val_loss}')
