In [1]:
from PyPDF2 import PdfReader
reader = PdfReader('attention_paper.pdf')

pages = len(reader.pages)

In [2]:
import re
def adjust_bracket_spaces(s):
    def replace_comma_spaces(match):
        """Function to replace spaces around commas in the matched string."""
        return re.sub(r'\s*,\s*', ', ', match.group())



    # Remove spaces between brackets and single numbers
    s_modified = re.sub(r'\[\s*(\d+)\s*\]', r'[\1]', s)

    # Correct spaces around commas inside brackets using the helper function
    s_modified = re.sub(r'\[\d+(\s*,\s*\d+)+\]', replace_comma_spaces, s_modified)

    return s_modified

In [4]:
from nltk.tokenize import sent_tokenize


def parse_sentencewise(page):
    sentences = sent_tokenize(page)
    #remove newlines
    sentences = [s.replace('\n', ' ') for s in sentences]
    sentences = [adjust_bracket_spaces(s) for s in sentences]


    # # Remove spaces between brackets and single numbers
    # sentences = [re.sub(r'\[\s*(\d+)\s*\]', r'[\1]', s) for s in sentences]

    # # Correct spaces around commas inside brackets
    # sentences = [re.sub(r'(\[\d+)\s*,\s*(\d+)', r'\1, \2', s) for s in sentences]

    return sentences
page = reader.pages[1]
page = page.extract_text()
sentences = parse_sentencewise(page)
for s in sentences:
    print(s)

1 Introduction Recurrent neural networks, long short-term memory [13] and gated recurrent [7] neural networks in particular, have been firmly established as state of the art approaches in sequence modeling and transduction problems such as language modeling and machine translation [ 35,2,5].
Numerous efforts have since continued to push the boundaries of recurrent language models and encoder-decoder architectures [38, 24, 15].
Recurrent models typically factor computation along the symbol positions of the input and output sequences.
Aligning the positions to steps in computation time, they generate a sequence of hidden states ht, as a function of the previous hidden state ht−1and the input for position t. This inherently sequential nature precludes parallelization within training examples, which becomes critical at longer sequence lengths, as memory constraints limit batching across examples.
Recent work has achieved significant improvements in computational efficiency through factoriz

In [5]:
# save sentences that have [number] in them
import re
def contains_integer_list(s):
    return bool(re.search(r'\[\s*\d+(?:\s*,\s*\d+)+\s*\]', s))
def get_sentences_with_reference(sentences):
    sentences_with_reference = []
    for sentence in sentences:
        if contains_integer_list(sentence):
            sentences_with_reference.append(sentence)
            continue
                 
        elif re.search(r'\[\d+\]', sentence):
            sentences_with_reference.append(sentence)
    return sentences_with_reference
sourced_sentences = get_sentences_with_reference(sentences)
for s in sourced_sentences:
    print(s)

1 Introduction Recurrent neural networks, long short-term memory [13] and gated recurrent [7] neural networks in particular, have been firmly established as state of the art approaches in sequence modeling and transduction problems such as language modeling and machine translation [ 35,2,5].
Numerous efforts have since continued to push the boundaries of recurrent language models and encoder-decoder architectures [38, 24, 15].
Recent work has achieved significant improvements in computational efficiency through factorization tricks [21] and conditional computation [32], while also improving model performance in case of the latter.
Attention mechanisms have become an integral part of compelling sequence modeling and transduc- tion models in various tasks, allowing modeling of dependencies without regard to their distance in the input or output sequences [ 2,19].
In all but a few cases [27], however, such attention mechanisms are used in conjunction with a recurrent network.
2 Background

In [23]:
def create_reference_dict(sourced_sentences):
    reference_dict = {}
    for sentence in sourced_sentences:
        #print(sentence)
        # retrieve string between brackets
        reference = re.findall(r'\[(.*?)\]', sentence)
        #print(reference)
        removed_reference = re.sub(r'\[(.*?)\]', '', sentence)
        for r in reference:
            # split by comma
            r = r.split(',')
            # remove spaces
            r = [i.strip() for i in r]
            for i in r:
                if i not in reference_dict:
                    reference_dict[i] = [removed_reference]
                else:
                    reference_dict[i].append(removed_reference)
    return reference_dict

create_reference_dict(sourced_sentences)


{'13': ['1 Introduction Recurrent neural networks, long short-term memory  and gated recurrent  neural networks in particular, have been firmly established as state of the art approaches in sequence modeling and transduction problems such as language modeling and machine translation .'],
 '7': ['1 Introduction Recurrent neural networks, long short-term memory  and gated recurrent  neural networks in particular, have been firmly established as state of the art approaches in sequence modeling and transduction problems such as language modeling and machine translation .'],
 '35': ['1 Introduction Recurrent neural networks, long short-term memory  and gated recurrent  neural networks in particular, have been firmly established as state of the art approaches in sequence modeling and transduction problems such as language modeling and machine translation .',
  '3 Model Architecture Most competitive neural sequence transduction models have an encoder-decoder structure .'],
 '2': ['1 Introduct

In [24]:
def retrieve_reference_dicts(reader):
    reference_dicts = []
    for page in reader.pages:
        page = page.extract_text()
        sentences = parse_sentencewise(page)
        sourced_sentences = get_sentences_with_reference(sentences)
        reference_dict = create_reference_dict(sourced_sentences)
        reference_dicts.append(reference_dict)
    return reference_dicts

reference_dicts = retrieve_reference_dicts(reader)

In [28]:
print(reference_dicts[-10])

{'9': ['There are many choices of positional encodings, learned and fixed .', 'We also experimented with using learned positional embeddings  instead, and found that the two versions produced nearly identical results (see Table 3 row (E)).'], '12': ['The shorter these paths between any combination of positions in the input and output sequences, the easier it is to learn long-range dependencies .']}


In [6]:
from pdfminer.high_level import extract_text

def extract_references_from_pdf(pdf_path):
    # Extract text from the PDF
    text = extract_text(pdf_path)

    # Find the references section (this can be adjusted based on common patterns)
    references_start = re.search(r'\b(References|Bibliography|Works Cited)\b', text)
    
    if not references_start:
        print("References section not found.")
        return

    # Extract the references section (assuming the rest of the document after 'References' is the references section)
    references_text = text[references_start.start():]

    # Split references into individual entries (this is a basic split on numbers. You may need to adjust this regex)
    references = re.split(r'\n(?=\[?\d+\]?\.?)', references_text)

    return references[1:]  # the first item is the "References" title

pdf_path = "attention_paper.pdf"
references = extract_references_from_pdf(pdf_path)
print(references)

['[1] Jimmy Lei Ba, Jamie Ryan Kiros, and Geoffrey E Hinton. Layer normalization. arXiv preprint\n\narXiv:1607.06450, 2016.\n', '[2] Dzmitry Bahdanau, Kyunghyun Cho, and Yoshua Bengio. Neural machine translation by jointly\n\nlearning to align and translate. CoRR, abs/1409.0473, 2014.\n', '[3] Denny Britz, Anna Goldie, Minh-Thang Luong, and Quoc V. Le. Massive exploration of neural\n\nmachine translation architectures. CoRR, abs/1703.03906, 2017.\n', '[4] Jianpeng Cheng, Li Dong, and Mirella Lapata. Long short-term memory-networks for machine\n\nreading. arXiv preprint arXiv:1601.06733, 2016.\n', '10\n\n\x0c[5] Kyunghyun Cho, Bart van Merrienboer, Caglar Gulcehre, Fethi Bougares, Holger Schwenk,\nand Yoshua Bengio. Learning phrase representations using rnn encoder-decoder for statistical\nmachine translation. CoRR, abs/1406.1078, 2014.\n', '[6] Francois Chollet. Xception: Deep learning with depthwise separable convolutions. arXiv\n\npreprint arXiv:1610.02357, 2016.\n', '[7] Junyoung Ch

In [9]:
def get_references_only(references):
    references_only = []
    for reference in references:
        if '[' in reference:
            item = reference.split('[')[1]
            item = '[' + item
            references_only.append(item)
    return references_only

references_only = get_references_only(references)


In [10]:
#get item that looks like ****.****
def get_arxiv_number(reference):
    # Extract the arXiv number
    arxiv_number = re.search(r'arXiv:\d{4}\.\d{4,5}(v\d+)?', reference)
    # get rid of arxiv: in the string
    if arxiv_number:
        arxiv_number = arxiv_number.group(0)[6:]
        return arxiv_number
    ref_number = re.search(r'abs/\d{4}\.\d{4,5}(v\d+)?', reference)
    if ref_number:
        ref_number = ref_number.group(0)[4:]
        return ref_number
    return None

In [12]:
arxiv_numbers = [get_arxiv_number(reference) for reference in references_only]

def get_axiv_numbers_from_pdf(pdf_path):
    references = extract_references_from_pdf(pdf_path)
    references_only = get_references_only(references)
    arxiv_numbers = [get_arxiv_number(reference) for reference in references_only]
    return arxiv_numbers

arxiv_numbers = get_axiv_numbers_from_pdf("attention_paper.pdf")
print(arxiv_numbers)

['1607.06450', '1409.0473', '1703.03906', '1601.06733', '1406.1078', '1610.02357', '1412.3555', None, '1705.03122v2', '1308.0850', None, None, None, None, '1602.02410', None, None, '1610.10099v2', None, None, '1703.10722', '1703.03130', '1511.06114', '1508.04025', None, None, None, '1705.04304', None, '1608.05859', '1508.07909', '1701.06538', None, None, None, '1512.00567', None, '1609.08144', '1606.04199', None]
