# Download files


In [None]:
# Fetch the arXiv source archive for paper 1906.05433.
# --fail: exit with an error on an HTTP failure instead of saving the error
#         page as the .tar.gz (which would only surface later, in tar).
# --location: follow redirects, should the server issue any.
!curl --fail --location https://arxiv.org/e-print/1906.05433 --output /content/1906.05433.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 4366k  100 4366k    0     0  1085k      0  0:00:04  0:00:04 --:--:-- 1092k


In [None]:
# Unpack the downloaded archive into the current working directory.
!tar --extract --file=/content/1906.05433.tar.gz

# Parse TeX files

In [None]:
# TexSoup is pinned to 0.3.1, the release the parsing code below is written against.
!pip install -qqq texsoup==0.3.1

In [None]:
import pathlib

import TexSoup

In [None]:
def parse_tex(tex_tree):
    """
    Walk an iterable of TexSoup nodes/tokens and collect the document text.

    Handling per node type, checked in this order:
      * TexNamedEnv    -> its children (``.all``) are parsed recursively and
                          spliced into the result.
      * TexText        -> kept as-is, unless it is exactly "\n".
      * TexGroup       -> its ``.value`` is re-parsed with TexSoup and added as
                          ONE nested entry ``["{", [...], "}"]``.
      * TexMathModeEnv -> its ``.string`` is kept.
      * TexCmd / Token -> dropped.
      * anything else  -> reported with ``print`` and dropped.

    Returns a list whose items are strings or, for groups, nested lists.

    Adapted from:
    https://github.com/alvinwan/TexSoup/blob/master/examples/list_everything.py
    """
    collected = []
    for node in tex_tree:
        if isinstance(node, TexSoup.data.TexNamedEnv):
            collected.extend(parse_tex(node.all))
            continue

        if isinstance(node, TexSoup.data.TexText):
            if node != "\n":
                collected.append(node)
            continue

        if isinstance(node, TexSoup.data.TexGroup):
            group_children = TexSoup.TexSoup(node.value).expr.all
            collected.append(["{", parse_tex(group_children), "}"])
            continue

        if isinstance(node, TexSoup.data.TexMathModeEnv):
            collected.append(node.string)
            continue

        # Commands and raw tokens carry no prose; skip them silently.
        if isinstance(node, TexSoup.data.TexCmd):
            continue
        if isinstance(node, TexSoup.utils.Token):
            continue

        print(f"unable to parse {node} [type={type(node)}]")

    return collected

def _flatten_chunks(chunks):
    """Yield the leaf items of an arbitrarily nested list/tuple in document order."""
    for chunk in chunks:
        if isinstance(chunk, (list, tuple)):
            yield from _flatten_chunks(chunk)
        else:
            yield chunk


def parse_document(filepath: str) -> str:
    """
    Read a TeX file and return its text content as a single-line string.

    The file is parsed with TexSoup, reduced to text chunks by ``parse_tex``
    and joined. Newlines are then turned into spaces and every backslash is
    removed.

    Args:
        filepath: Path of the ``.tex`` file to read.

    Returns:
        The cleaned-up text of the document.
    """
    with open(filepath) as f:
        data = f.read()

    soup = TexSoup.TexSoup(data)

    text_chunks = parse_tex(soup.expr.all)

    # parse_tex represents a brace group as a nested ["{", [...], "}"] entry;
    # str.join only accepts strings, so flatten before joining.
    result = "".join(_flatten_chunks(text_chunks))

    # Clean up some weird artifacts.
    result = result.replace("\n", " ")
    # Dropping every backslash also drops doubled ones ("\\\\"), so one pass suffices.
    result = result.replace("\\", "")

    return result


In [None]:
# Maps section name -> cleaned text of the corresponding sections/<name>.tex file.
sections = {}

path = pathlib.Path("sections")
for tex_file in path.iterdir():
    # Only regular files whose name ends in ".tex" are parsed.
    if not (tex_file.is_file() and tex_file.name.endswith(".tex")):
        continue

    # The section key is the file name up to its first dot.
    section_name = tex_file.name.split(".")[0]
    if section_name == "acknowledgments":
        continue

    print(f"parsing {tex_file.name}")
    sections[section_name] = parse_document(str(tex_file.absolute()))

parsing societalImpacts.tex
parsing toolsIndividuals.tex
parsing electricitySystems.tex
parsing geoengineering.tex
parsing intro.tex
parsing climateModels.tex
parsing transportation.tex
parsing conclusion.tex
parsing finance.tex
parsing toolsSociety.tex
parsing ccs.tex
parsing buildingsCities.tex
parsing agricultureForestryLand.tex
parsing industry.tex
parsing education.tex


# Split text into smaller chunks

In [None]:
# transformers provides the GPT-2 tokenizer used to measure chunk sizes.
!pip install -qqq transformers

In [None]:
from typing import List

import nltk
import transformers
 
# nltk.sent_tokenize needs the Punkt sentence models. Recent NLTK releases
# look them up under the "punkt_tab" resource, older ones under "punkt";
# NLTK is not pinned in this notebook, so fetch both.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
def text_to_chunks(
    input_text: str,
    tokenizer: "transformers.PreTrainedTokenizer",
    max_token_sz: int = 1024,
    overlapping_sentences: int = 10,
) -> List[str]:
    """
    Split text into sentence-aligned chunks that fit a token budget.

    Sentences (from ``nltk.sent_tokenize``) are packed greedily into a chunk
    until adding the next one would exceed ``max_token_sz`` tokens, measured
    as ``len(tokenizer.encode(sentence))``. Each following chunk starts
    ``overlapping_sentences`` sentences before the end of the previous one.

    Args:
        input_text: Text to split.
        tokenizer: Object whose ``encode(str)`` returns a sized sequence of tokens.
        max_token_sz: Token budget per chunk (sum of per-sentence counts).
        overlapping_sentences: Number of trailing sentences of a chunk that
            are repeated at the start of the next chunk.

    Returns:
        The chunks, each a string of sentences joined by single spaces.
        Empty input yields an empty list.

    Notes:
        * A single sentence larger than ``max_token_sz`` is emitted as a chunk
          of its own (over budget) rather than stalling the loop.
        * Every chunk starts at least one sentence after the previous chunk's
          start, so the requested overlap is reduced when a chunk holds no
          more than ``overlapping_sentences`` sentences. Without this the
          start index would stall or go negative and the loop never ended.
    """
    sentences = nltk.sent_tokenize(input_text)
    # Tokenize every sentence once; overlapping windows reuse the counts.
    token_counts = [len(tokenizer.encode(sentence)) for sentence in sentences]
    total = len(sentences)

    chunks = []
    first_sentence = 0
    while first_sentence < total:
        last_sentence = first_sentence  # exclusive end of the current chunk
        chunk_size = 0
        while (
            last_sentence < total
            and chunk_size + token_counts[last_sentence] <= max_token_sz
        ):
            chunk_size += token_counts[last_sentence]
            last_sentence += 1

        if last_sentence == first_sentence:
            # Even one sentence is over budget: emit it alone to keep moving.
            last_sentence += 1

        chunks.append(" ".join(sentences[first_sentence:last_sentence]))

        if last_sentence >= total:
            break

        # Step back for the overlap, but always advance by at least one sentence.
        first_sentence = max(last_sentence - overlapping_sentences, first_sentence + 1)

    return chunks

In [None]:
# GPT-2 tokenizer, used only to count tokens while chunking.
tokenizer = transformers.GPT2TokenizerFast.from_pretrained("gpt2")

# Maps section name -> list of text chunks (default size and overlap settings).
section_chunks = {
    name: text_to_chunks(text, tokenizer) for name, text in sections.items()
}

In [None]:
# Report how many chunks each section was split into.
for name, pieces in section_chunks.items():
    print(f"{name}: {len(pieces)} chunks.")

societalImpacts: 4 chunks.
toolsIndividuals: 2 chunks.
electricitySystems: 6 chunks.
geoengineering: 2 chunks.
intro: 2 chunks.
climateModels: 3 chunks.
transportation: 6 chunks.
conclusion: 1 chunks.
finance: 1 chunks.
toolsSociety: 4 chunks.
ccs: 1 chunks.
buildingsCities: 5 chunks.
agricultureForestryLand: 3 chunks.
industry: 3 chunks.
education: 1 chunks.


# Summarize chunks

In [47]:
# The summarization cells call openai.Completion.create, which the 1.x
# releases of the openai package no longer provide, so stay on the 0.x series.
!pip install "openai<1" tenacity -qqq

In [77]:
from collections import defaultdict

from google.colab import drive
import openai

from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)

In [None]:
# Mount Google Drive under /content/drive; the OpenAI key file is read from there.
drive.mount('/content/drive', force_remount=True)

In [None]:
# The key lives in a Drive file rather than in the notebook; surrounding
# whitespace (e.g. a trailing newline) is stripped.
key_file = pathlib.Path('/content/drive/MyDrive/openai.key')
openai.api_key = key_file.read_text().strip()

In [80]:
@retry(
    wait=wait_random_exponential(min=1, max=60),
    stop=stop_after_attempt(5),
)
def completion_with_backoff(**kwargs):
    """
    Call ``openai.Completion.create`` with the given keyword arguments.

    The call is wrapped by tenacity's ``retry`` using
    ``wait_random_exponential(min=1, max=60)`` between attempts and
    ``stop_after_attempt(5)`` as the stop condition.

    Returns:
        Whatever ``openai.Completion.create`` returns.
    """
    create_completion = openai.Completion.create
    return create_completion(**kwargs)


def summarize_chunk(
    chunk: str,
    max_tokens: int = 512,
    temperature: float = 0,
    model: str = "text-davinci-002",
) -> str:
    """
    Ask the completion endpoint for a detailed summary of one text chunk.

    Args:
        chunk: Text excerpt to summarize; it is embedded verbatim in the prompt.
        max_tokens: Upper bound on the completion length, forwarded as ``max_tokens``.
        temperature: Sampling temperature, forwarded as ``temperature``.
        model: Name of the completion model, forwarded as ``model``.

    Returns:
        The text of the first returned choice, stripped of surrounding whitespace.
    """
    # The prompt ends with "-" so the completion continues from that dash.
    prompt = (
        "Using scientific language, provide a long and detailed summary of the excerpt below."
        f"\n###\nExcerpt:{chunk}\n###\n-"
    )

    response = completion_with_backoff(
        model=model,
        prompt=prompt,
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )

    return response['choices'][0]['text'].strip()

In [85]:
# Maps section name -> list of chunk summaries, in chunk order.
summaries = defaultdict(list)
for section_name, pieces in section_chunks.items():
    for piece in pieces:
        section_summaries = summaries[section_name]
        section_summaries.append(summarize_chunk(piece))

In [92]:
# Output directory for the per-section summary files. Only the last path
# component is created, so /content itself must already exist.
out_path = pathlib.Path('/content') / 'summaries'
out_path.mkdir(exist_ok=True)

In [104]:
# Write one <section>.txt per section, with one chunk summary per line.
for section_name, section_summaries in summaries.items():
    summary_file = out_path / f"{section_name}.txt"
    summary_file.write_text("\n".join(section_summaries))