In [7]:
from pydantic import BaseModel
from openai import AzureOpenAI
import os
import json
import pandas as pd
from azure.storage.blob import BlobServiceClient
from dotenv import load_dotenv
import re
from azure.cosmos import CosmosClient, exceptions, PartitionKey
from dotenv import load_dotenv
import os

In [8]:
# Load environment variables from a .env file so that the os.getenv() lookups
# used by the helpers in this notebook (e.g. STORAGE_CONNECTION_STRING) can find them.
load_dotenv()

def read_json_files_from_blob(folder_path, container_name="data"):
    """Download and parse the first ``.json`` blob found under a name prefix.

    Despite the plural name, only ONE document is returned: blobs are visited
    in the order the listing yields them and the function returns as soon as
    the first blob whose name ends with ``.json`` has been parsed.

    Args:
        folder_path: Prefix passed to ``list_blobs(name_starts_with=...)``;
            only blobs whose names start with it are considered.
        container_name: Name of the blob container to search. Defaults to
            ``"data"``, the container this notebook has always used.

    Returns:
        The object produced by ``json.loads`` for the first matching blob,
        or ``None`` when no blob under the prefix ends with ``.json``.

    Raises:
        ValueError: If the ``STORAGE_CONNECTION_STRING`` environment variable
            is not set.
    """
    # Retrieve the connection string from the environment variables
    connection_string = os.getenv('STORAGE_CONNECTION_STRING')

    # Fail early with a clear message instead of letting the SDK choke on None
    if connection_string is None:
        raise ValueError("The connection string environment variable is not set.")

    blob_service_client = BlobServiceClient.from_connection_string(connection_string)
    container_client = blob_service_client.get_container_client(container_name)

    for blob in container_client.list_blobs(name_starts_with=folder_path):
        # Skip anything that is not a JSON file
        if not blob.name.endswith('.json'):
            continue
        blob_data = container_client.get_blob_client(blob.name).download_blob().readall()
        # Return the first JSON document found; later matches are not read
        return json.loads(blob_data)

    # No JSON blob under the prefix: make the "nothing found" result explicit
    return None

In [9]:
# Parsed JSON of the first .json blob whose name starts with "houseloan"
# (None if the helper finds no such blob).
houseloan = read_json_files_from_blob("houseloan")

In [10]:
def clean_json_data(json_data):
    """Flatten a parsed document JSON into one space-separated text string.

    Text is collected in two passes and concatenated in this order:
    every entry of ``json_data["paragraphs"]`` first, then every line of
    every page in ``json_data["pages"]``. Each piece is whitespace-stripped
    before joining; a missing ``"text"`` key contributes an empty string.

    The resulting string is also printed as a side effect.

    Args:
        json_data: Mapping that may contain ``"paragraphs"`` (list of dicts
            with ``"text"``) and ``"pages"`` (list of dicts with ``"lines"``,
            each line a dict with ``"text"``). Missing keys are treated as
            empty lists.

    Returns:
        The joined plain-text string.
    """
    # Paragraph-level text
    paragraph_texts = [
        entry.get("text", "").strip()
        for entry in json_data.get("paragraphs", [])
    ]

    # Line-level text, page by page
    line_texts = [
        row.get("text", "").strip()
        for sheet in json_data.get("pages", [])
        for row in sheet.get("lines", [])
    ]

    # Single string with one space between consecutive pieces
    plain_text_content = " ".join(paragraph_texts + line_texts)

    print(plain_text_content)

    return plain_text_content

# Flatten the house-loan JSON into a single plain-text string
# (the helper also prints the text as a side effect).
houseloan_structured = clean_json_data(houseloan)

Contoso Bank - House Loan Terms and Conditions 1. Introduction These terms and conditions govern the house loans provided by Contoso Bank (referred to as "the Bank") to customers (referred to as "Borrower"). By applying for and accepting a house loan, the Borrower agrees to the terms and conditions outlined herein. 2. Loan Amount and Purpose · The loan is granted exclusively for the purpose of purchasing a residential property, refinancing an existing mortgage, or for approved home improvement projects. . The maximum loan amount will be determined by the Bank based on the Borrower's financial profile, creditworthiness, and property value. 3. Interest Rates . Fixed Rate: The interest rate remains constant throughout the loan term. . Variable Rate: The interest rate may fluctuate based on market conditions and will be tied to a publicly available index. Changes in the interest rate will affect the Borrower's monthly payments. . Interest rates are disclosed at the time of loan approval an

In [11]:
# Load environment variables from .env file
# (load_dotenv() was already called in an earlier cell of this notebook).
load_dotenv()

# Cosmos DB connection details from environment variables.
# These module-level names are read by upload_text_to_cosmos_db() when it
# builds its CosmosClient; os.getenv returns None if a variable is not set.
endpoint = os.getenv("COSMOS_ENDPOINT")
key = os.getenv("COSMOS_KEY")

def upload_text_to_cosmos_db(text_content, container_name, document_id="10002", database_name="ContosoDB"):
    """Store ``text_content`` as a single document in a Cosmos DB container.

    The database and container are created if they do not exist yet (the
    container is partitioned on ``/id`` with 400 RU/s). The payload is wrapped
    as ``{'id': <document_id>, 'content': text_content}`` and inserted with
    ``create_item``.

    Uses the module-level ``endpoint`` and ``key`` values for the connection.

    Args:
        text_content: Payload stored under the ``'content'`` key. Any falsy
            value (empty string, empty dict, ``None``) is treated as
            "nothing to upload".
        container_name: Id of the target container.
        document_id: Id given to the created document; converted with
            ``str()``. Defaults to ``"10002"``, the id this notebook
            previously hard-coded.
        database_name: Id of the target database. Defaults to ``"ContosoDB"``.

    Returns:
        The document dict that was uploaded, or ``None`` when there was
        nothing to upload, when the database/container could not be
        created, or when the insert itself failed.
    """
    # Nothing to do for empty payloads
    if not text_content:
        print("The text content is empty. No data to upload.")
        return None

    # Initialize the Cosmos client
    cosmos_client = CosmosClient(endpoint, key)

    try:
        # Create or get the database
        database = cosmos_client.create_database_if_not_exists(id=database_name)

        # Create or get the container
        container = database.create_container_if_not_exists(
            id=container_name,
            partition_key=PartitionKey(path="/id"),
            offer_throughput=400
        )
    except exceptions.CosmosHttpResponseError as e:
        print(f"An error occurred while creating the database or container: {e.message}")
        return None

    # The document id doubles as the partition key value (container is partitioned on /id)
    document = {
        'id': str(document_id),
        'content': text_content,  # Store the payload as 'content'
    }

    # Upload the document to the container
    try:
        container.create_item(body=document)
    except exceptions.CosmosHttpResponseError as e:
        print(f"An error occurred while uploading the document: {e.message}")
        # Do not hand back a document that was never stored
        return None

    print(f"Text content uploaded successfully with ID '{document['id']}' in Cosmos DB.")
    return document

In [12]:
# Store the flattened plain text in the "HouseLoan" container; as the last
# expression of the cell, the returned document is echoed as the cell output.
upload_text_to_cosmos_db(houseloan_structured, "HouseLoan")

Text content uploaded successfully with ID '10002' in Cosmos DB.


{'id': '10002',
 'content': 'Contoso Bank - House Loan Terms and Conditions 1. Introduction These terms and conditions govern the house loans provided by Contoso Bank (referred to as "the Bank") to customers (referred to as "Borrower"). By applying for and accepting a house loan, the Borrower agrees to the terms and conditions outlined herein. 2. Loan Amount and Purpose · The loan is granted exclusively for the purpose of purchasing a residential property, refinancing an existing mortgage, or for approved home improvement projects. . The maximum loan amount will be determined by the Bank based on the Borrower\'s financial profile, creditworthiness, and property value. 3. Interest Rates . Fixed Rate: The interest rate remains constant throughout the loan term. . Variable Rate: The interest rate may fluctuate based on market conditions and will be tied to a publicly available index. Changes in the interest rate will affect the Borrower\'s monthly payments. . Interest rates are disclosed 

In [14]:
from pydantic import BaseModel
from openai import AzureOpenAI
from typing import List

# Azure OpenAI client; the endpoint and API key are read from the environment
# (AZURE_OPENAI_ENDPOINT / AZURE_OPENAI_KEY), the API version is pinned here.
client = AzureOpenAI(
  azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT"),
  api_key=os.getenv("AZURE_OPENAI_KEY"),
  api_version= "2024-08-01-preview"
)

# One numbered section of the loan document, as requested from the model.
# (Documented with comments rather than a docstring so the schema derived
# from this class stays exactly as it was.)
class Article(BaseModel):
    article_title: str  # section heading, e.g. '1. Introduction'
    content: str  # body text of the section, including any bullet points

# Top-level structure used as the response_format of the completion call:
# a document title plus its ordered list of articles.
class Document(BaseModel):
    title: str  # title of the whole document
    items: List[Article]  # extracted sections

# Ask the model to split the flattened loan text into titled articles,
# with the response parsed against the Document schema.
completion = client.beta.chat.completions.parse(
    model="gpt-4o", # replace with the model deployment name of your gpt-4o 2024-08-06 deployment
    messages=[
        {"role": "system", "content": "Extract the list of information about house loan document's article title (i.e. '1. Title A', '2. Title B') and content including bullet point lists"},
        {"role": "user", "content": houseloan_structured},
    ],
    response_format=Document,
)

# Serialize the WHOLE completion object (id, choices, ...), not just the
# extracted Document; formatted_data_cleaning() later digs the payload out.
finaljsonstr = completion.model_dump_json(indent=2)
print(finaljsonstr)

{
  "id": "chatcmpl-Ae0PkDG4stWdXsTs6npoPvhSFZ46G",
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "{\"title\":\"Contoso Bank - House Loan Terms and Conditions\",\"items\":[{\"article_title\":\"1. Introduction\",\"content\":\"These terms and conditions govern the house loans provided by Contoso Bank (referred to as \\\"the Bank\\\") to customers (referred to as \\\"Borrower\\\"). By applying for and accepting a house loan, the Borrower agrees to the terms and conditions outlined herein.\"},{\"article_title\":\"2. Loan Amount and Purpose\",\"content\":\"· The loan is granted exclusively for the purpose of purchasing a residential property, refinancing an existing mortgage, or for approved home improvement projects.\\n· The maximum loan amount will be determined by the Bank based on the Borrower's financial profile, creditworthiness, and property value.\"},{\"article_title\":\"3. Interest Rates\",\"conte

In [15]:
def formatted_data_cleaning(json_string, document_id="10003"):
    """Extract the structured payload from a serialized chat completion.

    Takes the JSON text of a completion (e.g. the output of
    ``completion.model_dump_json()``), reads the first choice's message and
    returns its structured content as a flat dict with an ``"id"`` key added.

    The ``"parsed"`` field is preferred. If it is missing or ``null``, the
    function falls back to decoding the JSON string held in the message's
    ``"content"`` field instead of failing.

    Args:
        json_string: JSON text of a chat completion containing
            ``choices[0].message``.
        document_id: Value stored under ``"id"`` in the result. Defaults to
            ``"10003"``, the id this notebook previously hard-coded.

    Returns:
        ``{"id": document_id, **payload}``. Because the payload is unpacked
        after ``"id"``, a payload key named ``"id"`` would take precedence.

    Raises:
        ValueError: If the message has neither a parsed payload nor any
            content to decode.
        json.JSONDecodeError: If ``json_string`` (or the fallback content)
            is not valid JSON.
    """
    # Load the JSON string into a Python dictionary
    data = json.loads(json_string)
    message = data["choices"][0]["message"]

    # Prefer the already-structured payload
    parsed_info = message.get("parsed")
    if parsed_info is None:
        # Fall back to the raw JSON text of the answer
        raw_content = message.get("content")
        if not raw_content:
            raise ValueError(
                "The completion message contains neither a parsed payload nor JSON content."
            )
        parsed_info = json.loads(raw_content)

    # Create a new dictionary with only the parsed information plus an id
    return {"id": document_id, **parsed_info}

# Reduce the serialized completion to just the extracted document fields
# (plus the "id" key added by the helper) and show the result.
result_json = formatted_data_cleaning(finaljsonstr)
print(result_json)

{'id': '10003', 'title': 'Contoso Bank - House Loan Terms and Conditions', 'items': [{'article_title': '1. Introduction', 'content': 'These terms and conditions govern the house loans provided by Contoso Bank (referred to as "the Bank") to customers (referred to as "Borrower"). By applying for and accepting a house loan, the Borrower agrees to the terms and conditions outlined herein.'}, {'article_title': '2. Loan Amount and Purpose', 'content': "· The loan is granted exclusively for the purpose of purchasing a residential property, refinancing an existing mortgage, or for approved home improvement projects.\n· The maximum loan amount will be determined by the Bank based on the Borrower's financial profile, creditworthiness, and property value."}, {'article_title': '3. Interest Rates', 'content': "· Fixed Rate: The interest rate remains constant throughout the loan term.\n· Variable Rate: The interest rate may fluctuate based on market conditions and will be tied to a publicly availabl

In [18]:
# Store the structured result in the "HouseLoan" container.
# NOTE(review): the helper nests this whole dict under 'content' and assigns
# its own document id, so the "id" inside result_json is not used as the
# Cosmos document id (see the cell output) — confirm this is intended.
upload_text_to_cosmos_db(result_json, "HouseLoan")

Text content uploaded successfully with ID '10002' in Cosmos DB.


{'id': '10002',
 'content': {'id': '10003',
  'title': 'Contoso Bank - House Loan Terms and Conditions',
  'items': [{'article_title': '1. Introduction',
    'content': 'These terms and conditions govern the house loans provided by Contoso Bank (referred to as "the Bank") to customers (referred to as "Borrower"). By applying for and accepting a house loan, the Borrower agrees to the terms and conditions outlined herein.'},
   {'article_title': '2. Loan Amount and Purpose',
    'content': "· The loan is granted exclusively for the purpose of purchasing a residential property, refinancing an existing mortgage, or for approved home improvement projects.\n· The maximum loan amount will be determined by the Bank based on the Borrower's financial profile, creditworthiness, and property value."},
   {'article_title': '3. Interest Rates',
    'content': "· Fixed Rate: The interest rate remains constant throughout the loan term.\n· Variable Rate: The interest rate may fluctuate based on market 