In [5]:
import os
import asyncio
import nest_asyncio

nest_asyncio.apply()

import pandas as pd
import logging
import openai
from openai import AzureOpenAI
import time
from glob import glob
from rex import init_logger

from elm.pdf import PDFtoTXT
from elm.embed import ChunkAndEmbed
from elm.osti import OstiList

from openai import AzureOpenAI

from glob import glob
import certifi
from docx import Document  # Import to handle .docx files


os.environ['SSL_CERT_FILE'] = certifi.where()

logger = logging.getLogger(__name__)
init_logger(__name__, log_level='DEBUG')
init_logger('elm', log_level='INFO')


<Logger elm (INFO)>

In [6]:
openai.api_base = 'https://stratus-embeddings-south-central.openai.azure.com/'
openai.api_key = 'eccaf77417c541a3ab38e6c550b56259'
openai.api_type = 'azure'
openai.api_version = '2024-02-15-preview'

ChunkAndEmbed.EMBEDDING_MODEL = 'text-embedding-ada-002-2'
ChunkAndEmbed.EMBEDDING_URL = ('https://stratus-embeddings-south-central.'
                               'openai.azure.com/openai/deployments/'
                               'text-embedding-ada-002-2/embeddings?'
                               f'api-version={openai.api_version}')
ChunkAndEmbed.HEADERS = {"Content-Type": "application/json",
                         "Authorization": f"Bearer {openai.api_key}",
                         "api-key": f"{openai.api_key}"}

PDF_DIR = './pdfs/'
DOCX_DIR = './docx/'  # Directory for .docx files
TXT_DIR = './txt/'
EMBED_DIR = './embed/'

URL = ('https://www.osti.gov/api/v1/records?'
       'research_org=NREL'
       '&sort=publication_date%20desc'
       '&product_type=Technical%20Report'
       '&has_fulltext=true'
       '&publication_date_start=01/01/2023'
       '&publication_date_end=12/31/2023')

# Function to read text from .docx file
def read_docx(file_path):
    doc = Document(file_path)
    return '\n'.join([para.text for para in doc.paragraphs])

# Function to generate embeddings asynchronously
async def generate_embeddings(text, tag):
    obj = ChunkAndEmbed(text, tag=tag, tokens_per_chunk=500, overlap=1)
    embeddings = await obj.run_async(rate_limit=3e4)
    if any(e is None for e in embeddings):
        raise RuntimeError('Embeddings are None!')
    return embeddings



In [8]:
if __name__ == '__main__':
    # Ensure directories exist
    os.makedirs(DOCX_DIR, exist_ok=True)
    os.makedirs(TXT_DIR, exist_ok=True)
    os.makedirs(EMBED_DIR, exist_ok=True)

    # Get list of local .docx files
    docx_files = glob(os.path.join(DOCX_DIR, '*.docx'))

    # Metadata to store file info
    meta_data = []

    for i, docx_fp in enumerate(docx_files):
        # Clean filename: replace spaces with underscores
        clean_filename = os.path.basename(docx_fp).replace(' ', '_')

        # Derive paths for the txt and embedding files
        txt_fp = os.path.join(TXT_DIR, clean_filename.replace('.docx', '.txt'))
        embed_fp = os.path.join(EMBED_DIR, clean_filename.replace('.docx', '.json'))

        # Read or generate text from .docx
        if os.path.exists(txt_fp):
            with open(txt_fp, 'r') as f:
                text = f.read()
        else:
            # Convert .docx to text
            text = read_docx(docx_fp)

            # Save the extracted text
            with open(txt_fp, 'w') as f:
                f.write(text)
            logger.info(f'Saved: {txt_fp}')

        # Check if embeddings already exist
        if not os.path.exists(embed_fp):
            # Generate embeddings
            logger.info(f'Embedding {i+1}/{len(docx_files)}: "{clean_filename}"')

            # Ensure the tag is correctly formatted
            tag = f"Filename: \"{clean_filename}\""

            # Run the embedding function
            embeddings = asyncio.run(generate_embeddings(text, tag))

            # Save embeddings
            if any(e is None for e in embeddings):
                raise RuntimeError('Embeddings are None!')
            else:
                # Instead of using the full text, use the text chunks for each embedding
                df = pd.DataFrame({
                    'text': ChunkAndEmbed(text).text_chunks.chunks,  # Get the text chunks
                    'embedding': embeddings  # Embeddings for each chunk
                })

                df.to_json(embed_fp, indent=2)
                logger.info(f'Saved: {embed_fp}')

            # Rate limiting to avoid overloading the API
            time.sleep(5)

        # Add metadata for the file
        meta_data.append({
            'filename': clean_filename,
            'txt_fp': txt_fp,
            'embed_fp': embed_fp
        })

    # Convert metadata to DataFrame and save
    meta_df = pd.DataFrame(meta_data)
    meta_df.to_csv('./meta.csv', index=False)

    # Validation: Check for any bad outputs (missing embeddings)
    bad = []
    for fp in glob(EMBED_DIR + '*.json'):
        data = pd.read_json(fp)
        if data['embedding'].isna().any():
            bad.append(fp)
    assert not any(bad), f'Bad output: {bad}'

    logger.info('Finished!')


INFO - 2024-09-04 11:15:21,508 [3506957178.py:32] : Saved: ./txt/02_C2C_Expert_Match_Summary_New_York_City,_NY.txt
INFO - 2024-09-04 11:15:21,509 [3506957178.py:37] : Embedding 1/43: "02_C2C_Expert_Match_Summary_New_York_City,_NY.docx"
INFO - 2024-09-04 11:15:21,531 [embed.py:138] : Embedding 1 text chunks...
INFO - 2024-09-04 11:15:22,201 [embed.py:164] : Finished all embeddings.
INFO - 2024-09-04 11:15:22,221 [3506957178.py:56] : Saved: ./embed/02_C2C_Expert_Match_Summary_New_York_City,_NY.json
INFO - 2024-09-04 11:15:27,242 [3506957178.py:32] : Saved: ./txt/28_C2C_Expert_Match_Summary_Free_Union_Pine_Way_NC.txt
INFO - 2024-09-04 11:15:27,243 [3506957178.py:37] : Embedding 2/43: "28_C2C_Expert_Match_Summary_Free_Union_Pine_Way_NC.docx"
INFO - 2024-09-04 11:15:27,264 [embed.py:138] : Embedding 1 text chunks...
INFO - 2024-09-04 11:15:27,869 [embed.py:164] : Finished all embeddings.
INFO - 2024-09-04 11:15:27,891 [3506957178.py:56] : Saved: ./embed/28_C2C_Expert_Match_Summary_Free_Unio