In [3]:
from openai import AzureOpenAI

# # Load the JSON data from the file
# with open("test_files.json", "r") as file:
#     data = json.load(file
import os

import warnings

# Azure OpenAI configuration.
# Each setting is read from the environment first and falls back to the literal
# below. The API key literal is only a placeholder: supply the real key through
# the AZURE_OPENAI_API_KEY environment variable instead of saving it in the notebook.
_PLACEHOLDER_API_KEY = "OPENIA_API_KEY"
AZURE_OPENAI_API_KEY = _PLACEHOLDER_API_KEY
endpoint = os.getenv("ENDPOINT_URL", "https://rtw-accommodations.openai.azure.com/")
deployment = os.getenv("DEPLOYMENT_NAME", "RTW-Accommodation")
# `or` (rather than a getenv default) so an empty environment variable is
# treated the same as an unset one.
subscription_key = os.getenv("AZURE_OPENAI_API_KEY") or AZURE_OPENAI_API_KEY

if subscription_key == _PLACEHOLDER_API_KEY:
    # Surface the misconfiguration here instead of as an authentication error
    # on the first request several cells later.
    warnings.warn(
        "AZURE_OPENAI_API_KEY is not set; the client is using a placeholder key "
        "and requests to Azure OpenAI will be rejected."
    )

client = AzureOpenAI(
    azure_endpoint=endpoint,
    api_key=subscription_key,
    api_version="2024-05-01-preview",
)

In [16]:
import fitz
import docx2txt
import os

def extract_text_from_pdf(file_path):
    """Return the text of the PDF at ``file_path``.

    The text of each page is collected in page order and the pages are
    separated by a form-feed character.
    """
    page_texts = []
    with fitz.open(file_path) as pdf:
        for pdf_page in pdf:
            page_texts.append(pdf_page.get_text())
    return "\f".join(page_texts)

def extract_text_from_docx(file_path):
    """Return the text of the DOCX file at ``file_path`` as produced by docx2txt."""
    return docx2txt.process(file_path)

def extract_text(file_path):
    """Extract text from a PDF or DOCX file, chosen by its extension.

    The extension is compared case-insensitively. ``.pdf`` files go to
    ``extract_text_from_pdf`` and ``.docx`` files to ``extract_text_from_docx``.

    Raises:
        ValueError: if the extension is neither ``.pdf`` nor ``.docx``.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    # Reject anything we cannot handle before dispatching.
    if suffix not in ('.pdf', '.docx'):
        raise ValueError("Unsupported file type. Please provide a PDF or DOCX file.")

    if suffix == '.pdf':
        return extract_text_from_pdf(file_path)
    return extract_text_from_docx(file_path)

# Walk every file in a folder and extract its text, instead of one file at a time.
extracted_data = []

# Directory containing the source documents.
directory = "./downloads"

# Loop through every entry in the directory.
for file in os.listdir(directory):
    # File name without its extension.
    file_name = os.path.splitext(os.path.basename(file))[0]

    # Extract text from the file. extract_text() raises ValueError for anything
    # that is not a PDF or DOCX (sub-folders, hidden files, other formats);
    # skip those entries so one stray file does not abort the whole batch.
    try:
        extracted_text = extract_text(os.path.join(directory, file))
    except ValueError as error:
        print(f"Skipping '{file}': {error}")
        continue

    # Record for this document; "link" is left empty here.
    file_data = {
        "name": file_name,
        "text": extracted_text,
        "extension": os.path.splitext(file)[1].lower(),
        "link": ''
    }

    extracted_data.append(file_data)

# Print the extracted records to confirm accuracy.
for part in extracted_data:
    print(part)

{'name': 'Accommodations - Construction - Material Lifters (WSIB Newsletter)', 'text': ' \n \n  \nThe Accommodations SME’s are accepting referrals! \n \nACCOMMODATIONS NEWS \nVolume 2/ Issue 16 (April 21, 2023) \n \n   \n \n \n \n \n \n \nDrywall Panel Lifters \nDrywall lifters eliminate the need for lifting \nand holding heavy sheets of drywall \noverhead and allow one person to complete \nthis task. \n \nConstruction – Material lifters \nThis week’s construction edition will focus on lifting/carrying heavy and awkward construction materials.  We have all seen \noverhead cranes, forklifts and pallet jacks, but here are some ideas for handling more awkward items when on a worksite. \n \n  \n \n \n \n \n \n                                                  \n$30- $50 \nManual Jacks \nThis manual jack can be used to raise materials and \nlevel them before securing in place, eliminating the need \nfor a worker to lift and hold the materials. This is another \ntool that allows an IIP to wor

In [24]:
from datetime import datetime
import json
# Closed vocabularies the model must choose from. They are tuples interpolated
# as-is, so they render in the prompt as a parenthesised, quoted list; the quotes
# matter because several entries contain commas themselves.
INJURY_LOCATIONS = (
    "Body systems", "Multiple body parts", "Cranial region, including skull", "Leg(s)",
    "Lower back (lumbar, sacral, coccygeal regions)", "Shoulder", "Ankle(s)",
    "Finger(s), fingernail(s)", "Arm(s)", "Wrist(s)", "Not Coded",
    "Foot (feet), except toe(s)", "Chest, including ribs, internal organs",
    "Pelvic region", "Upper extremities, unspecified, NEC", "Multiple trunk locations",
    "Multiple lower extremities locations", "Hand(s), except finger(s)",
    "Upper back (cervical, thoracic regions)", "Multiple back regions", "Abdomen",
    "Back, unspecified, NEC", "Head, unspecified, NEC", "Eye(s)", "Face",
    "Toe(s), toenail(s)", "Ear(s)", "Multiple head locations",
    "Lower extremities, unspecified, NEC", "Trunk, unspecified, NEC",
    "Other body parts including unclassified, NEC",
)
INDUSTRIES = (
    "Agriculture, forestry, fishing, and hunting",
    "Mining, quarrying, and oil and gas extraction", "Utilities", "Construction",
    "Manufacturing", "Wholesale trade", "Retail trade", "Transportation and warehousing",
    "Information and cultural industries", "Finance and insurance",
    "Real estate and rental and leasing",
    "Professional, scientific, and technical services",
    "Management of companies and enterprises",
    "Administrative and support, waste management, and remediation services",
    "Educational services", "Health care and social assistance",
    "Arts, entertainment, and recreation", "Accommodation and food services",
    "Other services (except public administration)", "Public administration",
)


def _parse_model_json(content):
    """Parse a model reply into a dict, or return None if that is not possible.

    Tolerates a reply wrapped in a Markdown code fence (``` or ```json).
    Returns None for an empty/None reply, for text that is not valid JSON
    (for example a reply cut off by the token limit), and for JSON whose top
    level is not an object.
    """
    if not content:
        return None
    payload = content.strip()
    if payload.startswith("```"):
        # Drop the opening fence line and everything from the closing fence on.
        payload = payload.split("\n", 1)[1] if "\n" in payload else ""
        payload = payload.rsplit("```", 1)[0]
    try:
        parsed = json.loads(payload)
    except json.JSONDecodeError:
        return None
    return parsed if isinstance(parsed, dict) else None


# One entry per successfully parsed document, so the enriched results remain
# available after the loop instead of only being printed.
extraction_results = []

# Loop through all extracted documents.
for doc in extracted_data:
    name = doc["name"]
    text = doc["text"]
    prompt = f"""
    You will receive a text containing information about multiple accommodations. Extract each accommodation from the text with the following details:
    - Accommodation Name: The specific tool or method being used to accommodate workers. Retrieve directly from the text; do not create or alter names;
    - Description: A detailed explanation of the accommodation. Get this info from the text directly. Retrieve directly from the text; do not create or alter description;
    - Injury Location Name: Choosing only from the list: {INJURY_LOCATIONS} identify the part of the body that the accommodation aims to protect or assist;
    - Industry Name: Choosing only from the list: {INDUSTRIES} identify the industry in which the accommodation is used (e.g., Construction). If there is no industry specified say "Multiple";
    - Activity Name: Identify the physical tasks associated with or accommodated by the accommodation.
    - Summary: please create a summary of the document so that any one who wishes to know what the document is about can get a brief overview. Please take ideas directly from the document only.

    Format the extracted data as a JSON object, with an array if multiple are mentioned in the text. Use the following structure:
    {{
      "accommodations": [
        {{
         "accommodation_name": "",
          "accommodation_description": "",
          "injury_location_name": "",
          "industry_name": "",
          "activity_name": ""
        }}
      ], 
      "document_description": ""
    }}

    Title: {name}
    Text: {text}
    """

    response = client.chat.completions.create(
        model=deployment,
        messages=[
            {"role": "system", "content": "You are a helpful assistant designed to extract data from text and format it as JSON object."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=2000,
        temperature=0.7,
        top_p=0.95,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
        stream=False
    )

    # Parse the reply defensively: one unusable reply should cost one document,
    # not the whole batch.
    extracted_json = _parse_model_json(response.choices[0].message.content)
    if extracted_json is None:
        print(f"Skipping '{name}': the model did not return a valid JSON object.")
        print("\n" + "=" * 50 + "\n")
        continue

    # Add verified and date_created fields to each accommodation.
    current_date = datetime.now().strftime("%Y-%m-%d")  # Current date only
    # `or []` covers both a missing key and an explicit null.
    for accommodation in extracted_json.get('accommodations') or []:
        accommodation['verified'] = False
        accommodation['date_created'] = current_date

    extraction_results.append({"name": name, "extraction": extracted_json})

    print(f"Response for '{name}':")
    print(json.dumps(extracted_json, indent=2))
    print("\n" + "=" * 50 + "\n")

Response for 'Accommodations - Construction - Material Lifters (WSIB Newsletter)':
{
  "accommodations": [
    {
      "accommodation_name": "Drywall Panel Lifters",
      "accommodation_description": "Drywall lifters eliminate the need for lifting and holding heavy sheets of drywall overhead and allow one person to complete this task.",
      "injury_location_name": "Arm(s)",
      "industry_name": "Construction",
      "activity_name": "Lifting and holding heavy sheets of drywall overhead",
      "verified": false,
      "date_created": "2024-09-17"
    },
    {
      "accommodation_name": "Manual Jacks",
      "accommodation_description": "This manual jack can be used to raise materials and level them before securing in place, eliminating the need for a worker to lift and hold the materials. This is another tool that allows an IIP to work independently and in a safe manner.",
      "injury_location_name": "Arm(s)",
      "industry_name": "Construction",
      "activity_name": "Raisi

In [21]:
from azure.storage.blob import BlobServiceClient

# Replace with your Azure Blob Storage credentials.
# These literals are placeholders; do not save real credentials in the notebook.
account_name = 'account_name'
account_key = 'key' 
container_name = 'container_name'

# Create a BlobServiceClient object using the account URL and account key
blob_service_client = BlobServiceClient(
    account_url=f"https://{account_name}.blob.core.windows.net",
    credential=account_key
)

# Create a ContainerClient object
container_client = blob_service_client.get_container_client(container_name)


# Upload each source document and record the resulting blob URL on its entry.
for doc in extracted_data:
    # The blob is named after the file name without its extension.
    blob_client = container_client.get_blob_client(doc["name"])
    file_path = os.path.join(directory, f"{doc['name']}{doc['extension']}")

    # For a real implementation we would need some serious error handling here.

    if os.path.exists(file_path):
        with open(file_path, "rb") as data:
            # overwrite=True keeps the cell re-runnable: without it the upload
            # raises once the blob already exists and the link is never set.
            blob_client.upload_blob(data, overwrite=True)
        doc["link"] = blob_client.url
    else:
        print(f"File not found: {file_path}")
        


https://rtwblobwsib.blob.core.windows.net/rtwblobs/Accommodations%20-%20Construction%20-%20Material%20Lifters%20%28WSIB%20Newsletter%29
https://rtwblobwsib.blob.core.windows.net/rtwblobs/agriculture
