# Generating Markdown with `Nougat`

In [2]:
%pip install -q pymupdf python-Levenshtein nltk datasets transformers torch pillow kaggle nougat-ocr

Note: you may need to restart the kernel to use updated packages.


### If on Google Colab:

Comment to code below if not on Google Colab. This code brings Kaggle secrets into Google Colab

In [2]:
# from google.colab import files
# files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"dulloa6310","key":"6d92840966e000c00b675c1fa08ff7c5"}'}

Wait for `kaggle.json` to load...

In [3]:
# !mkdir ~/.kaggle
# !mv ./kaggle.json ~/.kaggle
# !chmod 600 ~/.kaggle/kaggle.json

## Accessing Papers

In order for the `kaggle` command to work, you need to create a folder called `.kaggle` in the root directory of your machine, generate an api key on kaggle.com (called `kaggle.json`), and place this file in the .kaggle folder.

In [4]:
!kaggle datasets download -d Cornell-University/arxiv

Downloading arxiv.zip to /content
100% 1.26G/1.26G [00:48<00:00, 28.1MB/s]
100% 1.26G/1.26G [00:48<00:00, 28.1MB/s]


The command:

`kaggle datasets download -d Cornell-University/arxiv`

downloads a .zip file, which we need to unzip to get the json file

In [5]:
!unzip ./arxiv.zip

Archive:  ./arxiv.zip
  inflating: arxiv-metadata-oai-snapshot.json  


## Processing Arxiv Data

In [11]:
import json
import requests
import os


def process_json_file(filename):
    dictionaries = []
    with open(filename, 'r') as file:
        for line in file:
            try:
                data = json.loads(line)
                dictionaries.append(data)
            except json.JSONDecodeError:
                print(f"Error decoding line: {line}")  # Handle invalid JSON lines

    return dictionaries

In [12]:
data = process_json_file('./arxiv-metadata-oai-snapshot.json')

In [13]:
import pandas as pd
print(data[0].keys)
df = pd.DataFrame(data)[['id', 'categories', 'doi', 'update_date']]
df['categories_split'] = df['categories'].apply(lambda x: x.split())
df.head()

<built-in method keys of dict object at 0x107ef7040>


Unnamed: 0,id,categories,doi,update_date,categories_split
0,704.0001,hep-ph,10.1103/PhysRevD.76.013009,2008-11-26,[hep-ph]
1,704.0002,math.CO cs.CG,,2008-12-13,"[math.CO, cs.CG]"
2,704.0003,physics.gen-ph,,2008-01-13,[physics.gen-ph]
3,704.0004,math.CO,,2007-05-23,[math.CO]
4,704.0005,math.CA math.FA,,2013-10-15,"[math.CA, math.FA]"


### Filter By Category

In [14]:
wanted_categories = set(['cs.AI','cs.AR','cs.CC','cs.CE','cs.CG','cs.CL','cs.CR','cs.CV',
'cs.CY','cs.DB','cs.DC','cs.DL','cs.DM','cs.DS','cs.ET','cs.FL','cs.GL','cs.GR','cs.GT','cs.HC',
'cs.IR','cs.IT','cs.LG','cs.LO','cs.MA','cs.MM','cs.MS','cs.NA','cs.NE','cs.NI','cs.OH','cs.OS',
'cs.PF','cs.PL','cs.RO','cs.SC','cs.SD','cs.SE','cs.SI','cs.SY'])

def filter_categories(row):
  has_cs = False
  have_categories = row['categories_split']
  for category in have_categories:
    if category == 'cs.AI':
      return True
  return False

only_cs = df[df.apply(filter_categories, axis=1)]
print(len(only_cs))
only_cs.head()

84174


Unnamed: 0,id,categories,doi,update_date,categories_split
46,704.0047,cs.NE cs.AI,,2009-09-29,"[cs.NE, cs.AI]"
49,704.005,cs.NE cs.AI,,2007-05-23,"[cs.NE, cs.AI]"
303,704.0304,cs.IT cs.AI math.IT q-bio.PE,10.1007/978-3-642-18003-3_10,2013-04-05,"[cs.IT, cs.AI, math.IT, q-bio.PE]"
984,704.0985,cs.NE cs.AI,,2007-05-23,"[cs.NE, cs.AI]"
1027,704.1028,cs.LG cs.AI cs.NE,,2007-05-23,"[cs.LG, cs.AI, cs.NE]"


### Sort Articles by Date

In [15]:
def sort_by_update_date(df):
  """Sorts a pandas DataFrame by the 'update-date' column.

  Args:
      df: A pandas DataFrame.

  Returns:
      A new pandas DataFrame sorted by the 'update-date' column.
  """

  # Ensure 'update-date' is treated as a datetime column
  df['update_date'] = pd.to_datetime(df['update_date'])

  # Sort in place (modifies the original DataFrame)
  df.sort_values(by='update_date', inplace=True, ascending = False)

  return df

sorted = sort_by_update_date(only_cs.copy())
print(len(sorted))
sorted.head()

84174


Unnamed: 0,id,categories,doi,update_date,categories_split
1616530,2203.03668,cs.LG cs.AI cs.HC,,2024-03-15,"[cs.LG, cs.AI, cs.HC]"
2026327,2403.09209,cs.CR cs.AI cs.LG,,2024-03-15,"[cs.CR, cs.AI, cs.LG]"
2026345,2403.09227,cs.RO cs.AI,,2024-03-15,"[cs.RO, cs.AI]"
2026350,2403.09232,cs.AI,,2024-03-15,[cs.AI]
2026367,2403.09249,cs.AI,,2024-03-15,[cs.AI]


### Collect Reference Count

In [74]:
import time

def get_citation_count(papers):
  """
  Input:
    papers: List of strings of the format "ARXIV:XXXXX.XXXXX", where the value after of ARXIV: is the arxiv id. As an example: "ARXIV:2106.15928".
  """
  too_many_request = False

  result = []
  while not too_many_request:
    response = requests.post(
      'https://api.semanticscholar.org/graph/v1/paper/batch',
      params={'fields': 'influentialCitationCount,externalIds,citationCount'},
      json={"ids": papers}
    )

    if response.status_code == 200:
      data = response.json()

      # Handle Arxiv id not found by API
      for article in data:
        if article is not None:
          cite_count = article['citationCount']
          inf_cite_count = article['influentialCitationCount']
          arxiv_id = article['externalIds']['ArXiv']
          result.append({"citation_count" : cite_count, "id": arxiv_id, "inf_cite_count" : inf_cite_count})

      return result
    elif response.status_code == 429:
      too_many_request = True
      time.sleep(10)
    else:
      print(f"Error: {response.status_code}")
      return []
# https://api.semanticscholar.org/api-docs/graph#tag/Paper-Data/operation/post_graph_get_papers, get paper details, 500 at a time

In [77]:
SLICE_SIZE = 500

slices = len(sorted) // SLICE_SIZE
if len(sorted) % SLICE_SIZE != 0:
  slices += 1

all_papers = []
for i in range(slices):
  print(f"Processing slice {i + 1}/{slices}")
  start = i * SLICE_SIZE
  end = (i + 1) * SLICE_SIZE
  formatted_ids = list(sorted.iloc[start:end]['id'].apply(lambda x : f"ARXIV:{x}"))

  result = get_citation_count(formatted_ids)
  all_papers.extend(result)

all_papers = pd.DataFrame(all_papers)
all_papers = all_papers.sort_values('inf_cite_count', ascending = False)
all_papers.to_csv("all_papers.csv", index = False)

print(len(all_papers))
all_papers.head()

Processing slice 1/169
Processing slice 2/169
Processing slice 3/169
Processing slice 4/169
Processing slice 5/169
Processing slice 6/169
Processing slice 7/169
Processing slice 8/169
Processing slice 9/169
Processing slice 10/169
Processing slice 11/169
Processing slice 12/169
Processing slice 13/169
Processing slice 14/169
Processing slice 15/169
Processing slice 16/169
Processing slice 17/169
Processing slice 18/169
Processing slice 19/169
Processing slice 20/169
Processing slice 21/169
Processing slice 22/169
Processing slice 23/169
Processing slice 24/169
Processing slice 25/169
Processing slice 26/169
Processing slice 27/169
Processing slice 28/169
Processing slice 29/169
Processing slice 30/169
Processing slice 31/169
Processing slice 32/169
Processing slice 33/169
Processing slice 34/169
Processing slice 35/169
Processing slice 36/169
Processing slice 37/169
Processing slice 38/169
Processing slice 39/169
Processing slice 40/169
Processing slice 41/169
Processing slice 42/169
P

Unnamed: 0,citation_count,id,inf_cite_count
53219,18144,2010.11929,3017
71978,14320,1710.10903,2748
81207,20644,1106.1813,2367
73483,9359,1703.034,2199
75888,16830,1605.08695,1915


## Downloading PDFs

In [79]:
def download_pdf(arxiv_id, filename="paper.pdf"):
    """Downloads a PDF from arXiv given its ID.

    Args:
        arxiv_id (str): The arXiv ID of the paper.
        filename (str, optional): The desired filename for the downloaded PDF.
                                  Defaults to "paper.pdf".
    """

    url = f"https://arxiv.org/pdf/{arxiv_id}"

    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()  # Check for HTTP errors

        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:  # Filter out keep-alive chunks
                    f.write(chunk)

        print(f"PDF downloaded successfully as '{filename}'")

    except requests.exceptions.HTTPError as err:
        print(f"HTTP Error: {err}")
    except requests.exceptions.RequestException as err:
        print(f"An error occurred: {err}")

!mkdir papers
to_download = list(all_papers.iloc[:500]['id'])
for id in to_download:
    filename = id
    if "/" in id:
        filename = id.split('/')[-1]
    download_pdf(id, f"papers/{filename}.pdf")

PDF downloaded successfully as 'papers/2010.11929.pdf'
PDF downloaded successfully as 'papers/1710.10903.pdf'
PDF downloaded successfully as 'papers/1106.1813.pdf'
PDF downloaded successfully as 'papers/1703.03400.pdf'
PDF downloaded successfully as 'papers/1605.08695.pdf'
PDF downloaded successfully as 'papers/1610.02391.pdf'
PDF downloaded successfully as 'papers/1705.07874.pdf'
PDF downloaded successfully as 'papers/1602.04938.pdf'
PDF downloaded successfully as 'papers/1801.01290.pdf'
PDF downloaded successfully as 'papers/1502.01852.pdf'
PDF downloaded successfully as 'papers/1405.4053.pdf'
PDF downloaded successfully as 'papers/1411.1784.pdf'
PDF downloaded successfully as 'papers/1512.03012.pdf'
PDF downloaded successfully as 'papers/1612.00796.pdf'
PDF downloaded successfully as 'papers/1909.11942.pdf'
PDF downloaded successfully as 'papers/1806.07366.pdf'
PDF downloaded successfully as 'papers/1802.09477.pdf'
PDF downloaded successfully as 'papers/1602.07360.pdf'
PDF downloade

KeyboardInterrupt: 

## Running `Nougat`

In [45]:
!mkdir papers
!mkdir markdown
# for id in df['id'][:10]:
#     download_pdf(id, f"papers/{id}.pdf")

!nougat ./papers -o ./markdown -m 0.1.0-base

mkdir: cannot create directory ‘papers’: File exists
downloading nougat checkpoint version 0.1.0-base to path /root/.cache/torch/hub/nougat-0.1.0-base
config.json: 100% 560/560 [00:00<00:00, 3.41Mb/s]
pytorch_model.bin: 100% 1.31G/1.31G [00:25<00:00, 55.4Mb/s]
special_tokens_map.json: 100% 96.0/96.0 [00:00<00:00, 619kb/s]
tokenizer.json: 100% 2.04M/2.04M [00:00<00:00, 15.9Mb/s]
tokenizer_config.json: 100% 106/106 [00:00<00:00, 663kb/s]
INFO:root:Found 1 files.
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
  return torch.var(self.values, 1) / self.values.shape[1]
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
INFO:root:Processing file papers/raptor.pdf with 23 pages
100% 6/6 [03:00<00:00, 30.01s/it]
-> Cannot close object, library is destroyed. This may cause a memory leak!
