A partir da saída da etapa anterior (arquivo sections-xxxx.pkl), carrega todos os artigos necessários em um mapa do tipo:

    papers = {
      "paperId": {
        "metadata": OBJETO_RETORNADO_PELO_SEMANTIC_SCHOLAR,
        "url": URL PARA O PDF DO ARTIGO,
        "title": TÍTULO,
        "abstract": ABSTRACT,
        "text": TEXTO
      }
    }


Observação: na prática, os dados de url/title/abstract podem ser acessados pelos metadados, mas vamos mantê-los também no primeiro nível para facilitar o acesso.

In [None]:
# Search query and year range shared by the pipeline artifacts. Defining the
# suffix once keeps the input and output file names in sync.
search_suffix = 'how to represent text for information retrieval-2010-2023'

# Input file from last stage
file_sections_structure = f'sections-{search_suffix}.pkl'
# Output file with the contents of each paper
file_papers_contents = f'papers_contents-{search_suffix}.pkl'

# Folder where the downloaded PDFs are stored. Keep the trailing slash: file
# paths are built by plain string concatenation with this value.
folder_papers = './papers_pdf/'

In [None]:
!pip install pypdfium2 -q

!mkdir -p {folder_papers}

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/2.9 MB[0m [31m7.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━[0m [32m1.9/2.9 MB[0m [31m27.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Map of paperId -> paper contents; starts empty and is filled in by later cells.

papers = dict()

In [None]:
import pickle
# NOTE(review): unpickling can execute arbitrary code, so only load files that
# were produced by a trusted run of the previous stage.
with open(file_sections_structure, "rb") as sections_file:
  sections = pickle.load(sections_file)

In [None]:
def download_pdf(paperId, url):
  """Download the PDF at `url` to `{folder_papers}{paperId}.pdf` using wget.

  wget is started with an argument list instead of a shell command line, so
  characters such as `&`, `?`, `;` or spaces inside the URL reach wget
  verbatim rather than being interpreted by the shell.

  Args:
    paperId: identifier used as the file name of the saved PDF.
    url: address of the PDF to fetch.

  Returns:
    True when wget exits with status 0, False otherwise (including when wget
    could not be started). Failures are printed, never raised, so a single
    broken link does not abort a batch of downloads.
  """
  import subprocess  # local import keeps this cell runnable on its own

  print(f"Downloading paper {paperId}: {url}")
  target = f'{folder_papers}{paperId}.pdf'
  # Same wget options as before: "Mozilla" user agent, one attempt, timeout of 5.
  command = ['wget', str(url), '-O', target, '--user-agent=Mozilla', '--tries=1', '-T', '5']
  try:
    result = subprocess.run(command, capture_output=True, text=True, errors='replace', check=False)
  except OSError as error:
    # Raised when the wget executable itself cannot be launched.
    print(f'***** Could not run wget for {paperId}: {error}')
    return False

  if result.returncode != 0:
    # Show only the tail of wget's log instead of the full transcript.
    log_lines = result.stderr.strip().splitlines()
    detail = log_lines[-1] if log_lines else 'no output'
    print(f'***** wget failed for {paperId} (exit code {result.returncode}): {detail}')
    return False
  return True

In [None]:
import pypdfium2 as pdfium

def _trim_to_body(raw_text):
  """Flatten line breaks and keep the span from 'introduction' up to the last 'reference'."""
  # Treat the document as one long line of text.
  flat = raw_text.replace('\r\n', ' ').replace('\n', ' ')
  lowered = flat.lower()

  # No 'introduction' found: keep the text from its very beginning.
  start = max(lowered.find('introduction'), 0)
  end = lowered.rfind('reference')
  # No 'reference' found (or only before the introduction): keep everything up
  # to the end instead of collapsing the slice to an empty string.
  if end <= start:
    end = len(flat)
  return flat[start:end]


def extract_text(paperId):
  """Extract the text of `{folder_papers}{paperId}.pdf` as a single string.

  The text of all pages is concatenated, line breaks are replaced by spaces,
  and the result is cut to start at the first occurrence of "introduction"
  and stop before the last occurrence of "reference" (case-insensitive).
  When one of the markers is missing, that side of the text is left untouched.

  Args:
    paperId: identifier of the paper; the PDF is looked up in `folder_papers`.

  Returns:
    The trimmed text, or '' if the PDF could not be opened or parsed.
  """
  pdf_file = f'{folder_papers}{paperId}.pdf'

  print(f"Extracting {paperId}")
  try:
    pdf = pdfium.PdfDocument(pdf_file)
    page_texts = [pdf[i].get_textpage().get_text_range() for i in range(len(pdf))]
    return _trim_to_body(''.join(page_texts))
  except Exception:
    # Best effort: a missing or corrupt PDF yields an empty text. Catching
    # Exception (not a bare except) lets Ctrl-C / kernel interrupt through.
    print(f'***** Problems with {pdf_file}. Ignoring...')
    return ''

In [None]:
def walk_sections_and_save_pdfs(list_of_sections):
  """Download and extract every paper of a section tree into the global `papers` map.

  Each section is visited once: its papers are processed first, then its
  subsections recursively. A paper already present in `papers` is skipped, so
  a paper listed in several sections is downloaded only once.

  Args:
    list_of_sections: iterable of sections; each one provides 'papers' (a list
      of paper records) and optionally 'subsections' (a list of sections).

  Side effects:
    Adds one entry per new paperId to `papers`, with the keys "metadata",
    "url", "title", "abstract" and "text"; downloads PDFs via `download_pdf`.
  """
  for section in list_of_sections:
    for paper in section['papers']:
      paperId = paper['paperId']
      if paperId in papers:
        continue

      # NOTE(review): presumably 'openAccessPdf' can be empty for papers
      # without an open-access copy; treat that as "no text" instead of crashing.
      open_access = paper['openAccessPdf']
      url = open_access['url'] if open_access else None

      if url:
        download_pdf(paperId, url)
        text = extract_text(paperId)
      else:
        print(f'***** No PDF url for {paperId}. Ignoring...')
        text = ''

      papers[paperId] = {
        "metadata": paper,
        "url": url,
        "title": paper['title'],
        "abstract": paper['abstract'],
        "text": text
      }

    # Recurse once per section, regardless of whether it contributed any new
    # paper; otherwise subsections of such sections would never be visited.
    try:
      subsections = section['subsections'] or []
    except KeyError:
      subsections = []
    walk_sections_and_save_pdfs(subsections)

In [None]:
%%time

# Rebind `papers` to an empty map so that re-running this cell rebuilds it from
# scratch, then download and extract the papers of every section.
papers = {}
walk_sections_and_save_pdfs(sections)

Downloading paper a5ac8ff1f9a5ae7a9a6650e1908c7f44614b1d23: http://hal.inria.fr/docs/00/72/18/06/PDF/MoDRE2012_Sannier_Baudry_Multilevel_Requirements_Traceability_Using_MDE_and_IR-cr3.pdf
--2023-06-27 10:52:37--  http://hal.inria.fr/docs/00/72/18/06/PDF/MoDRE2012_Sannier_Baudry_Multilevel_Requirements_Traceability_Using_MDE_and_IR-cr3.pdf
Resolving hal.inria.fr (hal.inria.fr)... 193.48.96.10
Connecting to hal.inria.fr (hal.inria.fr)|193.48.96.10|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://inria.hal.science/docs/00/72/18/06/PDF/MoDRE2012_Sannier_Baudry_Multilevel_Requirements_Traceability_Using_MDE_and_IR-cr3.pdf [following]
--2023-06-27 10:52:39--  https://inria.hal.science/docs/00/72/18/06/PDF/MoDRE2012_Sannier_Baudry_Multilevel_Requirements_Traceability_Using_MDE_and_IR-cr3.pdf
Resolving inria.hal.science (inria.hal.science)... 193.48.96.10
Connecting to inria.hal.science (inria.hal.science)|193.48.96.10|:443... connected.
HTTP re

In [None]:
# Report how many papers ended up with a non-empty extracted text.
has_text = sum(1 for entry in papers.values() if entry['text'] != '')
print(f'Extracted {has_text}/{len(papers.keys())}')

Extracted 28/37


In [None]:
# Persist the populated `papers` map to the output file.
with open(file_papers_contents, 'wb') as contents_file:
  pickle.dump(papers, contents_file)

In [None]:
# Bundle the downloaded PDFs into a single archive. The directory name is
# hard-coded here, so it must be kept in sync with `folder_papers`.
!zip -r papers.zip papers_pdf

  adding: papers_pdf/ (stored 0%)
  adding: papers_pdf/2c40b968cf53d4a82f6d603253a16f413b82c3e8.pdf (deflated 18%)
  adding: papers_pdf/20381cd41140b547d54899b69424b69d0de567e0.pdf (deflated 10%)
  adding: papers_pdf/e7d7dc15e7327d0b9b01d2987c63383c53325d50.pdf (stored 0%)
  adding: papers_pdf/ab9a2d43b3e7d99a717003f763f96b10e2c16ca1.pdf (deflated 9%)
  adding: papers_pdf/a246d0d4093c798d300ea96a5645b2aa21c022a0.pdf (stored 0%)
  adding: papers_pdf/95ee6dbaf9bb4ef6a9808366e552b400ad48dbc0.pdf (deflated 1%)
  adding: papers_pdf/f38237fa10da8bac34fa84fd63aa3dc254812ff6.pdf (deflated 3%)
  adding: papers_pdf/9c07a128b5b5e4d8edebfc36f384acf0f48296b7.pdf (deflated 14%)
  adding: papers_pdf/a5ac8ff1f9a5ae7a9a6650e1908c7f44614b1d23.pdf (deflated 10%)
  adding: papers_pdf/73068d13d6e53876c374ebd4c862ec01351c9f39.pdf (deflated 2%)
  adding: papers_pdf/972672719e67bd097954c36da4c2ad7c17246b18.pdf (deflated 15%)
  adding: papers_pdf/01657e073dbb39d5c0ffeccb85e5b239c0e35ff1.pdf (deflated 19%)
  ad

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')
#!cp papers.zip '/content/drive/My Drive/papers_2015_2023.zip'