In [1]:
import os
import asyncio
import nest_asyncio

nest_asyncio.apply()

import pandas as pd
import logging
import openai
from openai import AzureOpenAI
import time
from glob import glob
from rex import init_logger

from elm.pdf import PDFtoTXT
from elm.embed import ChunkAndEmbed
from elm.osti import OstiList

from openai import AzureOpenAI

from glob import glob
import certifi

# Point SSL_CERT_FILE at the path returned by certifi.where() (certifi's
# bundled CA certificates) so HTTPS clients that honor this variable use it.
os.environ['SSL_CERT_FILE'] = certifi.where()

# Logger for this notebook's own messages.
logger = logging.getLogger(__name__)
# Initialize the notebook logger at DEBUG and the 'elm' package logger at
# INFO via rex.init_logger.
init_logger(__name__, log_level='DEBUG')
init_logger('elm', log_level='INFO')


<Logger elm (INFO)>

In [2]:
# --- Azure OpenAI connection settings --------------------------------------
openai.api_base = 'https://stratus-embeddings-south-central.openai.azure.com/'

# SECURITY: the API key must never be committed to the notebook. It is read
# from the environment instead; export AZURE_OPENAI_API_KEY before starting
# the kernel. Any key that was previously hardcoded here should be treated
# as leaked and rotated.
openai.api_key = os.environ.get('AZURE_OPENAI_API_KEY')
if not openai.api_key:
    raise RuntimeError('Set the AZURE_OPENAI_API_KEY environment variable '
                       'before running this notebook.')
openai.api_type = 'azure'
openai.api_version = '2024-02-15-preview'

# --- Embedding deployment ---------------------------------------------------
# The request URL is derived from the endpoint, deployment name and API
# version above so the three values cannot drift apart.
ChunkAndEmbed.EMBEDDING_MODEL = 'text-embedding-ada-002-2'
ChunkAndEmbed.EMBEDDING_URL = (f"{openai.api_base.rstrip('/')}"
                               '/openai/deployments/'
                               f'{ChunkAndEmbed.EMBEDDING_MODEL}/embeddings?'
                               f'api-version={openai.api_version}')
# Both header styles are sent, as in the original configuration.
ChunkAndEmbed.HEADERS = {"Content-Type": "application/json",
                         "Authorization": f"Bearer {openai.api_key}",
                         "api-key": openai.api_key}

# --- Local working directories (relative to the notebook) -------------------
PDF_DIR = './pdfs/'
TXT_DIR = './txt/'
EMBED_DIR = './embed/'

# --- OSTI query: NREL technical reports with full text, published in 2023 ---
URL = ('https://www.osti.gov/api/v1/records?'
       'research_org=NREL'
       '&sort=publication_date%20desc'
       '&product_type=Technical%20Report'
       '&has_fulltext=true'
       '&publication_date_start=01/01/2023'
       '&publication_date_end=12/31/2023')

async def generate_embeddings(chunker=None):
    """Run the asynchronous embedding request for one document.

    Parameters
    ----------
    chunker : object, optional
        An already-constructed ``ChunkAndEmbed`` instance (or any object
        exposing an awaitable ``run_async(rate_limit=...)``). Passing it
        explicitly avoids chunking the text a second time and removes the
        dependency on notebook globals. When omitted, the function falls
        back to the original behavior and builds a ``ChunkAndEmbed`` from
        the module-level ``text`` and ``row`` variables, which must then
        exist in the notebook namespace.

    Returns
    -------
    embeddings
        Whatever ``chunker.run_async`` returns, after verifying that none
        of its elements is ``None``.

    Raises
    ------
    RuntimeError
        If any element of the returned embeddings is ``None``.
    """
    if chunker is None:
        # Backward-compatible path: relies on the globals `row` and `text`
        # set by the calling cell's loop.
        tag = f"Title: {row['title']}\nAuthors: {row['authors']}"
        chunker = ChunkAndEmbed(text, tag=tag, tokens_per_chunk=500,
                                overlap=1)

    embeddings = await chunker.run_async(rate_limit=3e4)
    if any(e is None for e in embeddings):
        raise RuntimeError('Embeddings are None!')
    return embeddings



In [3]:
if __name__ == '__main__':
    # Pipeline: download OSTI PDFs -> convert to text -> chunk + embed ->
    # validate the saved embedding files. Text and embedding outputs are
    # cached on disk, so re-running skips documents already processed.
    os.makedirs(PDF_DIR, exist_ok=True)
    os.makedirs(TXT_DIR, exist_ok=True)
    os.makedirs(EMBED_DIR, exist_ok=True)

    osti = OstiList(URL, n_pages=1)
    osti.download(PDF_DIR)

    meta = osti.meta.copy()
    meta['osti_id'] = meta['osti_id'].astype(str)
    meta = meta.drop_duplicates(subset=['osti_id'])
    meta['fp'] = PDF_DIR + meta['fn']
    meta.to_csv('./meta.csv', index=False)

    # Keep only the records whose PDF is actually present on disk. A boolean
    # mask is row-exact, unlike dropping by index label.
    pdf_exists = meta['fp'].map(os.path.exists).astype(bool)
    meta = meta.loc[pdf_exists]

    # Enumerate positions explicitly: the DataFrame index labels are no
    # longer a contiguous 0..N-1 range after the filtering above, so they
    # cannot be used as a progress counter.
    for n_doc, (_, row) in enumerate(meta.iterrows(), start=1):
        fp = os.path.join(PDF_DIR, row['fn'])
        txt_fp = os.path.join(TXT_DIR, row['fn'].replace('.pdf', '.txt'))
        embed_fp = os.path.join(EMBED_DIR, row['fn'].replace('.pdf', '.json'))

        assert fp.endswith('.pdf')
        assert os.path.exists(fp)

        if os.path.exists(txt_fp):
            # Reuse previously extracted text.
            with open(txt_fp, 'r') as f:
                text = f.read()
        else:
            pdf_obj = PDFtoTXT(fp)
            text = pdf_obj.clean_poppler(layout=True)
            # Re-extract with layout=False when the PDF is detected as
            # double-column (presumably layout mode interleaves columns --
            # confirm against elm.pdf.PDFtoTXT).
            if pdf_obj.is_double_col():
                text = pdf_obj.clean_poppler(layout=False)
            text = pdf_obj.clean_headers(char_thresh=0.6, page_thresh=0.8,
                                         split_on='\n',
                                         iheaders=[0, 1, 3, -3, -2, -1])
            with open(txt_fp, 'w') as f:
                f.write(text)
            logger.info(f'Saved: {txt_fp}')

        if not os.path.exists(embed_fp):
            logger.info('Embedding {}/{}: "{}"'
                        .format(n_doc, len(meta), row['title']))
            tag = f"Title: {row['title']}\nAuthors: {row['authors']}"
            obj = ChunkAndEmbed(text, tag=tag, tokens_per_chunk=500, overlap=1)
            # Embed with the same object whose chunks are saved below, so
            # the text is chunked only once and chunks/embeddings are
            # guaranteed to come from the same instance.
            embeddings = asyncio.run(obj.run_async(rate_limit=3e4))
            if any(e is None for e in embeddings):
                raise RuntimeError('Embeddings are None!')

            df = pd.DataFrame({'text': obj.text_chunks.chunks,
                               'embedding': embeddings,
                               'osti_id': row['osti_id']})
            df.to_json(embed_fp, indent=2)
            logger.info('Saved: {}'.format(embed_fp))
            # Pause between documents (presumably to stay clear of API rate
            # limits -- adjust if the deployment quota changes).
            time.sleep(5)

    # Final validation: every saved embedding file must be free of nulls.
    bad = []
    fps = glob(EMBED_DIR + '*.json')
    for fp in fps:
        data = pd.read_json(fp)
        if data['embedding'].isna().any():
            bad.append(fp)
    assert not bad, f'Bad output: {bad}'

    logger.info('Finished!')


INFO - 2024-06-12 17:30:14,224 [osti.py:247] : Downloading 20 records to: ./pdfs/
INFO - 2024-06-12 17:30:14,225 [osti.py:257] : Finished download!
INFO - 2024-06-12 17:30:14,241 [1875023688.py:45] : Embedding 1/20: "Abbreviated Final Technical Report for the Energy Resilience Cost and Performance Tool: The Value of Solar Energy"
INFO - 2024-06-12 17:30:14,573 [embed.py:138] : Embedding 6 text chunks...
INFO - 2024-06-12 17:30:20,570 [embed.py:164] : Finished all embeddings.
INFO - 2024-06-12 17:30:20,573 [1875023688.py:57] : Saved: ./embed/2278804.json
INFO - 2024-06-12 17:30:25,586 [1875023688.py:45] : Embedding 2/20: "2022 Cost of Wind Energy Review [Slides]"
INFO - 2024-06-12 17:30:25,958 [embed.py:138] : Embedding 43 text chunks...
INFO - 2024-06-12 17:30:42,657 [embed.py:164] : Finished all embeddings.
INFO - 2024-06-12 17:30:42,672 [1875023688.py:57] : Saved: ./embed/2278805.json
INFO - 2024-06-12 17:30:47,691 [pdf.py:71] : Loading PDF: ./pdfs/2278633.pdf
INFO - 2024-06-12 17:30



INFO - 2024-06-12 17:36:03,469 [embed.py:138] : Embedding 17 text chunks...
INFO - 2024-06-12 17:36:09,319 [embed.py:164] : Finished all embeddings.
INFO - 2024-06-12 17:36:09,325 [1875023688.py:57] : Saved: ./embed/2229754.json
INFO - 2024-06-12 17:36:14,341 [pdf.py:71] : Loading PDF: ./pdfs/2315708.pdf
INFO - 2024-06-12 17:36:14,609 [pdf.py:89] : Finished loading PDF.
INFO - 2024-06-12 17:36:14,662 [parse.py:166] : Cleaning headers
INFO - 2024-06-12 17:36:14,669 [1875023688.py:42] : Saved: ./txt/2315708.txt
INFO - 2024-06-12 17:36:14,669 [1875023688.py:45] : Embedding 16/20: "Identifying and Estimating Project Development Costs [Slides]"
INFO - 2024-06-12 17:36:14,738 [embed.py:138] : Embedding 7 text chunks...
INFO - 2024-06-12 17:36:20,722 [embed.py:164] : Finished all embeddings.
INFO - 2024-06-12 17:36:20,727 [1875023688.py:57] : Saved: ./embed/2315708.json
INFO - 2024-06-12 17:36:25,745 [pdf.py:71] : Loading PDF: ./pdfs/2280958.pdf
INFO - 2024-06-12 17:36:26,171 [pdf.py:89] : Fi

# Run APP   