# read bib files

In [22]:
# read .bib files 
import os
import pandas as pd
import tarfile
from tqdm.auto import tqdm



def read_bib_files(directory):
    bib_data = []
    for root, dirs, files in os.walk(directory):
        for file_name in files:
            if file_name.endswith('.bib'):
                file_path = os.path.join(root, file_name)
                with open(file_path, 'r',errors='ignore') as file:
                    bib_data.append((file_path,file.read()))
    return bib_data

def create_dataframe(bib_data):
    entries = []
    for file_path,data in tqdm(bib_data):
        # print(file_path)
        entry = {}
        lines = data.split('\n')
        for line in lines:
            line = line.strip()
            if line.startswith('@'):
                if len(entry)>0:
                    entries.append(entry)
                entry = {
                    'file_path':file_path,
                }
                entry['type'] = line.split('{')[0]
                if len(line.split('{'))>1:
                    entry['key'] = line.split('{')[1].split(',')[0]
            elif '=' in line:
                key, value = line.split('=',1)
                entry[key.strip()] = value.strip('\t {,}')
        if len(entry)>0:
            entries.append(entry)
    return pd.DataFrame(entries)

directory_path = './sources/'
bib_data = read_bib_files(directory_path)
bib_data = create_dataframe(bib_data)

bib_data




  0%|          | 0/345 [00:00<?, ?it/s]

Unnamed: 0,file_path,type,key,title,author,booktitle,pages,year,organization,journal,...,document_type,source,collection,% organization,at,refid,lastaccessed,longbooktitle,issues,numbers
0,./sources/2205.10936v2/references.bib,@inproceedings,abbasi2018best,Best of both worlds: Stochastic \& adversarial...,"Abbasi-Yadkori, Yasin and Bartlett, Peter and ...",Conference on Learning Theory,918--949,2018,PMLR,,...,,,,,,,,,,
1,./sources/2205.10936v2/references.bib,@inproceedings,kaufmann2013information,Information complexity in bandit subset selection,"Kaufmann, Emilie and Kalyanakrishnan, Shivaram",Conference on Learning Theory,228--251,2013,PMLR,,...,,,,,,,,,,
2,./sources/2205.10936v2/references.bib,@inproceedings,wang2021fast,Fast Pure Exploration via Frank-Wolfe,"Wang, Po-An and Tzeng, Ruo-Chun and Proutiere,...",Thirty-Fifth Conference on Neural Information ...,,2021,,,...,,,,,,,,,,
3,./sources/2205.10936v2/references.bib,@inproceedings,audibert2010best,Best arm identification in multi-armed bandits.,"Audibert, Jean-Yves and Bubeck, S{\'e}bastien ...",COLT,41--53,2010,Citeseer,,...,,,,,,,,,,
4,./sources/2205.10936v2/references.bib,@article,tirinzoni2020asymptotically,An asymptotically optimal primal-dual incremen...,"Tirinzoni, Andrea and Pirotta, Matteo and Rest...",,1417--1427,2020,,Advances in Neural Information Processing Systems,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125491,./sources/2210.05593v2/reference.bib,@InProceedings,Ma_2021_ICCV,Partner-Assisted Learning for Few-Shot Image C...,"Ma, Jiawei and Xie, Hanchen and Han, Guangxing...",ICCV,,2021,,,...,,,,,,,,,,
125492,./sources/2210.05593v2/reference.bib,@InProceedings,Rectification,Prototype Rectification for Few-Shot Learning,"Liu, Jinlu and Song, Liang and Qin, Yongqiang",ECCV,,2022,,,...,,,,,,,,,,
125493,./sources/2210.05593v2/reference.bib,@InProceedings,Cui_2019_CVPR,Class-Balanced Loss Based on Effective Number ...,"Cui, Yin and Jia, Menglin and Lin, Tsung-Yi an...",CVPR,,2019,,,...,,,,,,,,,,
125494,./sources/2210.05593v2/reference.bib,@InProceedings,H3DNet,H3dnet: 3D Object Detection using Hybrid Geome...,"Zhang, Zaiwei and Sun, Bo and Yang, Haitao and...",ECCV,,2020,,,...,,,,,,,,,,


In [24]:
import re
import pandas as pd


def parse_bbl_file(bbl_file_path):
    # print(bbl_file_path)
    with open(bbl_file_path, 'rb') as file:
        bbl_content = file.read().decode('utf8', errors='ignore')

    bib_entries = re.findall(r'\\bibitem\[([\s\S]*?)\]{(.*)}\s*([\s\S]+?)(?=\\bibitem|$)', bbl_content)

    parsed_entries = []
    for i,entry in enumerate(bib_entries):
        try:
            key = entry[1]
            bibitem = entry[0]
            entry_content = entry[2].strip()
            entry_content = entry_content.strip().split('\\newblock')
            authors = entry_content[0].strip()

            parsed_entry = {
                'file_path': bbl_file_path,
                'key': key,
                'bibitem': bibitem,
                'authors': authors,
            }
            if len(entry_content)>=2:
                title = entry_content[1].strip()
                parsed_entry['title'] = title
            if len(entry_content)>=3:
                text = entry_content[2].strip()
                text = text.replace('\penalty0','')
                venue_match = re.search(r'\\emph{(.*?)},', text)
                venue = venue_match.group(1) if venue_match else None
                pages_match = re.search(r'pages\s+(.*?)\.', text)
                pages = pages_match.group(1) if pages_match else None
                journal_match = re.search(r'\.\s+(.*?)\,', text)
                journal = journal_match.group(1) if journal_match else None
                year_match = re.search(r'(\d{4})\.', text)
                year = year_match.group(1) if year_match else None
                # add more fields to parsed_entry by update
                parsed_entry.update({
                    "venue": venue,
                    "pages": pages,
                    "journal": journal,
                    "year": year
                })
        except:
            print('Error parsing entry:')
            print(entry)
            raise
        parsed_entries.append(parsed_entry)

    return parsed_entries


# # Example usage
# bbl_file_path = './sources/2210.08367v1/paper.bbl'  # Specify the path to your .bbl file
# parsed_entries = parse_bbl_file(bbl_file_path)

from glob import glob
file_list = glob('./sources/*/*.bbl',recursive=True)
bbl_entries = [entry for path in file_list for entry in parse_bbl_file(path)]
bbl_data = pd.DataFrame(bbl_entries)

bbl_data.head(3)


Unnamed: 0,file_path,key,bibitem,authors,title,venue,pages,journal,year
0,./sources/2209.08739v1/main.bbl,aneja2021ncpvae,"Aneja et~al.(2021)Aneja, Schwing, Kautz, and V...","Jyoti Aneja, Alexander Schwing, Jan Kautz, and...",A contrastive learning approach for training v...,Neural Information Processing Systems (NeurIPS),,,2021
1,./sources/2209.08739v1/main.bbl,bauer2019resampled,Bauer and Mnih(2019),Matthias Bauer and Andriy Mnih.,Resampled priors for variational autoencoders.,,66--75,PMLR,2019
2,./sources/2209.08739v1/main.bbl,bengio2013better,"Bengio et~al.(2013)Bengio, Mesnil, Dauphin, an...","Yoshua Bengio, Gr{\'e}goire Mesnil, Yann Dauph...",Better mixing via deep representations.,International conference on machine learning,552--560,PMLR,2013


In [50]:
bib_data.loc[bib_data.key=='sklearn'][['title','author','year','journal',]].iloc[0].to_dict()

{'title': 'Scikit-learn: Machine Learning in {P}ython',
 'author': 'Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.',
 'year': '2011',
 'journal': 'Journal of Machine Learning Research'}

# read citations

In [3]:
import re
from transformers import GPT2Tokenizer


def remove_tex_comments(tex_file_path, cleaned_file_path):
    with open(tex_file_path, 'r') as file:
        content = file.read()

    # Remove inline comments (single-line comments starting with %)
    content = re.sub(r'%.*', '', content)

    # Remove block comments (multi-line comments enclosed between %{ and %})
    content = re.sub(r'(?s)\\%{.*?%}', '', content)

    # Remove commented-out lines (lines starting with % but not preceded by \)
    content = re.sub(r'(?<!\\)%.*\n', '', content)

    with open(cleaned_file_path, 'w') as file:
        file.write(content)

def extract_citations(tex_file_path):
    with open(tex_file_path, 'r') as file:
        content = file.read()

    citations = re.findall(r'\\(cite|parencite|textcite|citet|citep){([^}]+)}', content)
    citations = [(citation_type, citation) for citation_type, citation in citations]
    return citations



def mark_citations(tex_file_path, tokenizer):
    with open(tex_file_path, 'r') as file:
        tex_text = file.read()
    # Find citations in the form of \cite{}, \parencite{}, \textcite{}, \citet{}, \citep{}
    citations = re.findall(r'\\(cite|parencite|textcite|citet|citep){([^}]+)}', tex_text)
    
    # Replace each citation with a special token
    for citation_type, citation_content in citations:
        if ',' in citation_content:
            all_content = citation_content.split(',')
        else:
            all_content = [citation_content]
        
        token = ''.join([f"<CITATION:{id.strip()}>" for id in all_content])
        tex_text = tex_text.replace(f"\\{citation_type}{{{citation_content}}}", token)
    
    # Tokenize the modified text using the GPT-2 tokenizer
    tokens = tokenizer.encode(tex_text)
    return tex_text, tokens




tex_file_path = './data/2203.15589/main.tex'
cleaned_file_path = './cleaned_output.tex'

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')


remove_tex_comments(tex_file_path, cleaned_file_path)
# citations = extract_citations(cleaned_file_path)

tex_text, tokens = mark_citations(cleaned_file_path, tokenizer)
tex_text[:100],tokenizer.decode(tokens[:100])



Token indices sequence length is longer than the specified maximum sequence length for this model (32733 > 1024). Running this sequence through the model will result in indexing errors


('\\documentclass[11pt]{article}\n\n\\usepackage{fullpage}\n\\usepackage[round]{natbib}\n\n\\usepackage{amsmath',
 '\\documentclass[11pt]{article}\n\n\\usepackage{fullpage}\n\\usepackage[round]{natbib}\n\n\\usepackage{amsmath,amsthm,amsfonts,amssymb}\n\\usepackage{amsmath}\n\\usepackage{hyperref}\n\\usepackage{color}\n\\usepackage{mathrsfs}\n\\usepackage{bm}\n\\usepackage{multirow}')