# read bib files

In [6]:
# read .bib files 
import os
import pandas as pd
import tarfile



def read_bib_files(directory):
    bib_data = []
    for root, dirs, files in os.walk(directory):
        for file_name in files:
            if file_name.endswith('.bib'):
                print(file_name)
                file_path = os.path.join(root, file_name)
                with open(file_path, 'r') as file:
                    bib_data.append((file_path,file.read()))
    return bib_data

def create_dataframe(bib_data):
    entries = []
    for file_path,data in bib_data:
        entry = {}
        lines = data.split('\n')
        for line in lines:
            line = line.strip()
            if line.startswith('@'):
                if len(entry)>0:
                    entries.append(entry)
                entry = {}
                entry['file_path'] = file_path
                entry['type'] = line.split('{')[0]
                entry['key'] = line.split('{')[1].split(',')[0]
            elif '=' in line:
                key, value = line.split('=',1)
                entry[key.strip()] = value.strip('\t {,}')
        if len(entry)>0:
            entries.append(entry)
    return pd.DataFrame(entries)

directory_path = './sources/'
bib_data = read_bib_files(directory_path)
bib_data = create_dataframe(bib_data)

bib_data




reference-final.bib
bib.bib
SCGD.bib
reference-final.bib
bib.bib
SCGD.bib
references.bib
biblio.bib
egbib.bib
explainer.bib
ref.bib
aaai.bib
acmart.bib
main.bib
isit.bib
nips2022_conference.bib
fedsubavg.bib


Unnamed: 0,file_path,type,key,title,author,journal,volume,number,pages,year,...,primaryClass,shorttitle,urldate,copyright,bookTitle,jounal,date-added,date-modified,bdsk-url-1,chapter
0,./sources/2206.10870v1/reference-final.bib,@article,wiesemann2013pessimistic,Pessimistic bilevel optimization,"Wiesemann, Wolfram and Tsoukalas, Angelos and ...",SIAM Journal on Optimization,23,1,353--380,2013,...,,,,,,,,,,
1,./sources/2206.10870v1/reference-final.bib,@book,dempe2002foundations,Foundations of bilevel programming,"Dempe, Stephan",,,,,2002,...,,,,,,,,,,
2,./sources/2206.10870v1/reference-final.bib,@article,boob2019stochastic,Stochastic first-order methods for convex and ...,"Boob, Digvijay and Deng, Qi and Lan, Guanghui",arXiv preprint arXiv:1908.02734,,,,2019,...,,,,,,,,,,
3,./sources/2206.10870v1/reference-final.bib,@article,bertsimas2003robust,Robust discrete optimization and network flows,"Bertsimas, Dimitris and Sim, Melvyn",Mathematical programming,98,1,49--71,2003,...,,,,,,,,,,
4,./sources/2206.10870v1/reference-final.bib,@article,bertsimas2004price,The price of robustness,"Bertsimas, Dimitris and Sim, Melvyn",Operations research,52,1,35--53,2004,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5539,./sources/2109.07704v4/fedsubavg.bib,@inproceedings,batchnorm,Batch Normalization: Accelerating Deep Network...,Sergey Ioffe and,,,,448--456,2015,...,,,,,,,,,,
5540,./sources/2109.07704v4/fedsubavg.bib,@article,fasthessian,Fast Exact Multiplication by the Hessian,Barak A. Pearlmutter,Neural Comput.,,1,147--160,1994,...,,,,,,,,,,
5541,./sources/2109.07704v4/fedsubavg.bib,@inproceedings,afo,Adaptive Federated Optimization,"Reddi, Sashank J and Charles, Zachary and Zahe...",,,,,2020,...,,,,,,,,,,
5542,./sources/2109.07704v4/fedsubavg.bib,@article,warner_1965,Randomized response: A survey technique for el...,"Warner, Stanley L",J. Am. Stat. Assoc.,60,309,63--69,1965,...,,,,,,,,,,


In [71]:
import re


def parse_bbl_file(bbl_file_path):
    with open(bbl_file_path, 'r') as file:
        bbl_content = file.read()

    bib_entries = re.findall(r'\\bibitem\[([\s\S]*?)\]{(.*)}\s*([\s\S]+?)(?=\\bibitem|$)', bbl_content)

    parsed_entries = []
    for i,entry in enumerate(bib_entries):
        key = entry[1]
        bibitem = entry[0]
        entry_content = entry[2].strip()
        entry_content = entry_content.strip().split('\\newblock')
        authors = entry_content[0].strip()
        title = entry_content[1].strip()
        text = entry_content[2].strip()
        text = text.replace('\penalty0','')
        venue_match = re.search(r'\\emph{(.*?)},', text)
        venue = venue_match.group(1) if venue_match else None
        pages_match = re.search(r'pages\s+(.*?)\.', text)
        pages = pages_match.group(1) if pages_match else None
        journal_match = re.search(r'\.\s+(.*?)\,', text)
        journal = journal_match.group(1) if journal_match else None
        year_match = re.search(r'(\d{4})\.', text)
        year = year_match.group(1) if year_match else None

        parsed_entry = {
            'key': key,
            'bibitem': bibitem,
            'title': title,
            'authors': authors,
            "venue": venue,
            "pages": pages,
            "journal": journal,
            "year": year
        }
        parsed_entries.append(parsed_entry)

    return pd.DataFrame(parsed_entries)

# Example usage
bbl_file_path = './sources/2210.08367v1/paper.bbl'  # Specify the path to your .bbl file
parsed_entries = parse_bbl_file(bbl_file_path)

parsed_entries


Unnamed: 0,key,bibitem,title,authors,venue,pages,journal,year
0,agarwal2014taming,"Agarwal et~al.(2014)Agarwal, Hsu, Kale, Langfo...",Taming the monster: A fast and simple algorith...,"Alekh Agarwal, Daniel Hsu, Satyen Kale, John L...",International Conference on Machine Learning,1638--1646,PMLR,1646
1,anthony2002uniform,Anthony(2002),Uniform glivenko-cantelli theorems and concent...,Martin Anthony.,Research Report LSE-CDAM-2002--07,,,2002
2,ash2021gone,"Ash et~al.(2021)Ash, Goel, Krishnamurthy, and ...",Gone fishing: Neural active learning with fish...,"Jordan Ash, Surbhi Goel, Akshay Krishnamurthy,...",Advances in Neural Information Processing Systems,,,2021
3,ash2019deep,"Ash et~al.(2019)Ash, Zhang, Krishnamurthy, Lan...","Deep batch active learning by diverse, uncerta...","Jordan~T Ash, Chicheng Zhang, Akshay Krishnamu...",arXiv preprint arXiv:1906.03671,,,1906
4,audibert2007fast,Audibert and Tsybakov(2007),Fast learning rates for plug-in classifiers.,Jean-Yves Audibert and Alexandre~B Tsybakov.,The Annals of statistics,,,2007
...,...,...,...,...,...,...,...,...
59,wang2021neural,"Wang et~al.(2021)Wang, Awasthi, Dann, Sekhari,...",Neural active learning with performance guaran...,"Zhilei Wang, Pranjal Awasthi, Christoph Dann, ...",Advances in Neural Information Processing Systems,,,2021
60,yao1977probabilistic,Yao(1977),Probabilistic computations: Toward a unified m...,Andrew Chi-Chin Yao.,,222--227,IEEE Computer Society,1977
61,yarotsky2017error,Yarotsky(2017),Error bounds for approximations with deep relu...,Dmitry Yarotsky.,Neural Networks,,,2017
62,yarotsky2018optimal,Yarotsky(2018),Optimal approximation of continuous functions ...,Dmitry Yarotsky.,Conference on learning theory,639--649,PMLR,2018


# read citations

In [3]:
import re
from transformers import GPT2Tokenizer


def remove_tex_comments(tex_file_path, cleaned_file_path):
    with open(tex_file_path, 'r') as file:
        content = file.read()

    # Remove inline comments (single-line comments starting with %)
    content = re.sub(r'%.*', '', content)

    # Remove block comments (multi-line comments enclosed between %{ and %})
    content = re.sub(r'(?s)\\%{.*?%}', '', content)

    # Remove commented-out lines (lines starting with % but not preceded by \)
    content = re.sub(r'(?<!\\)%.*\n', '', content)

    with open(cleaned_file_path, 'w') as file:
        file.write(content)

def extract_citations(tex_file_path):
    with open(tex_file_path, 'r') as file:
        content = file.read()

    citations = re.findall(r'\\(cite|parencite|textcite|citet|citep){([^}]+)}', content)
    citations = [(citation_type, citation) for citation_type, citation in citations]
    return citations



def mark_citations(tex_file_path, tokenizer):
    with open(tex_file_path, 'r') as file:
        tex_text = file.read()
    # Find citations in the form of \cite{}, \parencite{}, \textcite{}, \citet{}, \citep{}
    citations = re.findall(r'\\(cite|parencite|textcite|citet|citep){([^}]+)}', tex_text)
    
    # Replace each citation with a special token
    for citation_type, citation_content in citations:
        if ',' in citation_content:
            all_content = citation_content.split(',')
        else:
            all_content = [citation_content]
        
        token = ''.join([f"<CITATION:{id.strip()}>" for id in all_content])
        tex_text = tex_text.replace(f"\\{citation_type}{{{citation_content}}}", token)
    
    # Tokenize the modified text using the GPT-2 tokenizer
    tokens = tokenizer.encode(tex_text)
    return tex_text, tokens




tex_file_path = './data/2203.15589/main.tex'
cleaned_file_path = './cleaned_output.tex'

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')


remove_tex_comments(tex_file_path, cleaned_file_path)
# citations = extract_citations(cleaned_file_path)

tex_text, tokens = mark_citations(cleaned_file_path, tokenizer)
tex_text[:100],tokenizer.decode(tokens[:100])



Token indices sequence length is longer than the specified maximum sequence length for this model (32733 > 1024). Running this sequence through the model will result in indexing errors


('\\documentclass[11pt]{article}\n\n\\usepackage{fullpage}\n\\usepackage[round]{natbib}\n\n\\usepackage{amsmath',
 '\\documentclass[11pt]{article}\n\n\\usepackage{fullpage}\n\\usepackage[round]{natbib}\n\n\\usepackage{amsmath,amsthm,amsfonts,amssymb}\n\\usepackage{amsmath}\n\\usepackage{hyperref}\n\\usepackage{color}\n\\usepackage{mathrsfs}\n\\usepackage{bm}\n\\usepackage{multirow}')