### Parse PDF documents and generate structured output

In [1]:
%pip install instructor
%pip install pydantic
%pip install openai
%pip install deep-translator
%pip install python-dotenv


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;4

In [2]:
import os
import re
import json
import requests
import pandas as pd
from enum import Enum
from bs4 import BeautifulSoup
import instructor
from dotenv import load_dotenv
from openai import OpenAI
from pydantic import BaseModel, Field
from deep_translator import GoogleTranslator

load_dotenv()


True

In [3]:
class DocumentType(Enum):
    """English labels for the document-nature codes found in the dataset.

    Member names are the nature codes upper-cased with "." replaced by "_"
    (this is how ``get_document_meaning`` looks them up, e.g. 'dlao.plu' ->
    ``DLAO_PLU``); member values are the human-readable English labels.

    Two labels are used twice ("Minutes" and "Agenda"). ``Enum`` treats a
    repeated value as an alias of the first member that defined it, so the
    later names still resolve by name to the same label but do not appear
    when iterating over the enum.
    """

    # Acts
    ACTE_DELIB = "Deliberation"
    ACTE_ARRETE = "Decree"
    ACTE_RAA = "Collection of administrative acts"
    # Minutes / meeting material
    PV_FULL = "Minutes"
    PV_CR = "Minutes"  # same value as PV_FULL -> alias of PV_FULL
    PV_ODJ = "Agenda"
    PV_VIDEO = "Video"
    PROJET = "Draft act"
    # Planning documents
    DLAO_PLU = "Local Urban Plan"
    DLAO_PCAET = "Territorial Climate-Air-Energy Plan"
    DLAO_SCOT = "Territorial Coherence Scheme"
    DLAO_AUTRES = "Other planning document"
    # Public procurement
    MP_ANNONCE = "Public procurement announcement"
    MP_AVENANT = "Public procurement amendment"
    MP_REGLEMENT = "Public procurement regulations"
    COMM = "Other communication"
    COMM_ORG = "Organizational chart"
    RAPPORT = "Report"
    BDJ = "Budget"
    BDJ_ANNEXES = "Annexed budget document"
    COMM_AGENDA = "Agenda"  # same value as PV_ODJ -> alias of PV_ODJ
    COMM_CONTACT = "Contact"
    COMM_CONCERT = "Consultation"
    COMM_DEMARCHES = "Procedures"
    COMM_EMPLOI = "Employment"
    COMM_PRESENT = "Presentation"
    COMM_PROJECT = "Project"
    COMM_MAGAZINE = "Periodical"
    COMM_ELECTION = "Elections"
    COMM_SERVICES = "Services"
    # Fallback label returned by get_document_meaning for unknown codes
    OTHER = "Other"

In [4]:
def get_document_meaning(code):
    """Return the English label for a document-nature code.

    Args:
        code: Nature code such as ``'dlao.plu'``. Dots are replaced by
            underscores and the result is upper-cased to form the
            ``DocumentType`` member name. Surrounding whitespace is ignored.

    Returns:
        The matching ``DocumentType`` value, or the ``OTHER`` label when the
        code is unknown or is not a string.
    """
    # A missing cell read by pandas arrives as float NaN (or None), which has
    # no .replace(); treat any non-string as an unknown code instead of crashing.
    if not isinstance(code, str):
        return DocumentType.OTHER.value
    try:
        return DocumentType[code.strip().replace(".", "_").upper()].value
    except KeyError:
        return DocumentType.OTHER.value

In [5]:
# Binary Filter Pydantic Model

class GeothermalProjectDetection(BaseModel):
    # Structured verdict of the binary "is this about geothermal?" filter.
    #
    # chain_of_thought is declared BEFORE the verdict on purpose: the schema
    # lists fields in declaration order, so the model writes its reasoning
    # first and the verdict can depend on it rather than being justified
    # after the fact.
    chain_of_thought: str = Field(
        description="""Logical Steps that were taken to derive
        the final concluding statement"""
    )
    is_geothermal: bool = Field(description="Whether the document concerns a geothermal project")

In [6]:
# Stage Classifier Pydantic Model

class StageClassifier(BaseModel):
    # Structured result of the project-stage classification.
    #
    # chain_of_thought is declared BEFORE the stage on purpose: the schema
    # lists fields in declaration order, so the model writes its reasoning
    # first and the stage can depend on it rather than being justified
    # after the fact.
    chain_of_thought: str = Field(
        description="""Logical Steps that were taken to derive
        the final concluding statement"""
    )
    # Free-form string; the allowed stage names are only listed in the system prompt.
    stage: str = Field(description="The stage of the geothermal project")

In [7]:
# Pydantic models for the individual values extracted from a document

class InitialBudget(BaseModel):
    # Initial budget extracted from a document.
    #
    # chain_of_thought is declared BEFORE the answer on purpose: the schema
    # lists fields in declaration order, so the model writes its reasoning
    # first and the answer can depend on it rather than being justified
    # after the fact.
    chain_of_thought: str = Field(
        description="""Logical Steps that were taken to derive
        the final concluding statement"""
    )
    # NOTE(review): the field is a required float, so a document without a
    # budget cannot be reported as "missing" — confirm how that case should be handled.
    answer: float = Field(description="Initial budget of the project")

class FinalCost(BaseModel):
    # Final cost extracted from a document.
    #
    # chain_of_thought is declared BEFORE the answer on purpose: the schema
    # lists fields in declaration order, so the model writes its reasoning
    # first and the answer can depend on it rather than being justified
    # after the fact.
    chain_of_thought: str = Field(
        description="""Logical Steps that were taken to derive
        the final concluding statement"""
    )
    answer: float = Field(description="Final cost of the project")

class EstimatedDuration(BaseModel):
    # Estimated project duration (in days) extracted from a document.
    #
    # chain_of_thought is declared BEFORE the answer on purpose: the schema
    # lists fields in declaration order, so the model writes its reasoning
    # first and the answer can depend on it rather than being justified
    # after the fact.
    chain_of_thought: str = Field(
        description="""Logical Steps that were taken to derive
        the final concluding statement"""
    )
    answer: int = Field(description="Estimated duration in days")

class ActualDuration(BaseModel):
    # Actual project duration (in days) extracted from a document.
    #
    # chain_of_thought is declared BEFORE the answer on purpose: the schema
    # lists fields in declaration order, so the model writes its reasoning
    # first and the answer can depend on it rather than being justified
    # after the fact.
    chain_of_thought: str = Field(
        description="""Logical Steps that were taken to derive
        the final concluding statement"""
    )
    answer: int = Field(description="Actual duration in days")

In [8]:
# The API key is read from the environment (load_dotenv() above loads a local
# .env file), so no secret is stored in the notebook itself.
API_KEY = os.getenv("HYPERBOLIC_API_KEY")
if not API_KEY:
    # Fail fast with an actionable message. Passing api_key=None would let the
    # OpenAI client fall back to its own default key lookup (see the
    # openai-python docs), which is not the key intended for this endpoint.
    raise RuntimeError(
        "HYPERBOLIC_API_KEY is not set; define it in the environment or in a .env file"
    )

# OpenAI-compatible client pointed at the Hyperbolic endpoint, wrapped by
# instructor so that calls can take a `response_model` and return parsed objects.
client = instructor.from_openai(
    OpenAI(
        base_url="https://api.hyperbolic.xyz/v1",
        api_key=API_KEY,
    ),
    mode=instructor.Mode.JSON,
)

In [9]:
# Load the dataset
df = pd.read_csv('../dataset.csv')

# Load a single record but should scale to process multiple records.
# random_state is fixed so that "Restart Kernel -> Run All" processes the same
# record every time; change or remove it to draw a different sample.
df = df.sample(1, random_state=42)


# TO-DO : Replace with logic that converts pdf to markdown
def extract_text(row, timeout=30):
    """Download a document's text and prefix it with its metadata.

    Args:
        row: Mapping-like record (a DataFrame row) providing the keys
            'fulltext' (URL of the document text), 'nature', 'published',
            'entity_name', 'entity_type' and 'geo_path'.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A single string: a metadata sentence followed by the downloaded text
        with HTML tags removed and all whitespace runs collapsed to one space.

    Raises:
        requests.exceptions.RequestException: if the request times out, fails,
            or the server answers with an HTTP error status.
    """
    nature = get_document_meaning(row['nature'])
    print(f"The meaning of '{row['nature']}' is '{nature}'")

    # Without a timeout a stalled server would block the notebook forever.
    response = requests.get(row['fulltext'], timeout=timeout)
    # Stop on 4xx/5xx instead of feeding an error page to the LLM as if it
    # were the document.
    response.raise_for_status()

    # Strip markup, then collapse newlines/tabs/repeated spaces.
    text = BeautifulSoup(response.text, 'html.parser').get_text()
    text = re.sub(r'\s+', ' ', text).strip()

    return (
        f" Article Published on {row['published']} by {row['entity_name']} "
        f"of type {row['entity_type']} in {row['geo_path']} "
        f"with nature {nature} has the following text: {text}"
    )

# Build the metadata-prefixed document text for every row (one HTTP download per row).
df['extracted_text'] = df.apply(extract_text, axis='columns')

The meaning of 'dlao.plu' is 'Local Urban Plan'


In [10]:
def translate_text(doc_id, french_text):
    """Translate French text to English and save both versions as text files.

    The files are written to ``processed_text/<id>_french.txt`` and
    ``processed_text/<id>_english.txt``, where ``<id>`` is ``doc_id`` with
    every non-word character removed.

    This is best-effort: any error during translation or writing is printed
    and swallowed, and the two paths are returned regardless, so the files
    are not guaranteed to exist.

    Args:
        doc_id: Document identifier; converted to ``str`` before sanitising.
        french_text: The French source text.

    Returns:
        Tuple ``(french_path, english_path)``.
    """
    output_dir = 'processed_text'

    # Remove special characters from document id (str() so numeric ids work too)
    doc_id = re.sub(r'\W+', '', str(doc_id))
    french_path = f'{output_dir}/{doc_id}_french.txt'
    english_path = f'{output_dir}/{doc_id}_english.txt'

    try:
        # Initialize translator and translate
        translator = GoogleTranslator(source='fr', target='en')

        # Handle long text by splitting into chunks
        chunk_size = 4500  # Google Translate limit
        chunks = [french_text[i:i + chunk_size] for i in range(0, len(french_text), chunk_size)]

        # Translate chunks and combine; `or ''` guards against a chunk for
        # which the translator returns None, which would break join().
        translated_text = ' '.join(translator.translate(chunk) or '' for chunk in chunks)

        # The output directory is not guaranteed to exist on a fresh checkout.
        os.makedirs(output_dir, exist_ok=True)

        # Save original and translated text for validation. UTF-8 is explicit
        # so accented characters are written identically on every platform.
        with open(french_path, 'w', encoding='utf-8') as f:
            f.write(french_text)

        with open(english_path, 'w', encoding='utf-8') as f:
            f.write(translated_text)

    except Exception as e:
        print(f"Translation error: {str(e)}")

    return french_path, english_path

In [11]:
# System prompt for the binary filter (geothermal vs. unrelated).
# TO-DO : Set more realistic criteria for classification of geothermal projects in bullet points
geothermal_detection_message = dict(
    role="system",
    content="""
                You are a helpful AI Question Answerer. You are about to
                be passed a project description text in French by a User.

                Your task is to determine if the text concerns a geothermal project
                or is unrelated to a geothermal project. Consider mentions of:
                - Geothermal energy installations
                - Geothermal studies or surveys
                - Heat pump systems using geothermal energy
                - Ground source heating/cooling
                - Drilling for geothermal resources
                - Keywords like "géothermie", "géothermique"
                - Nature of the document

                Make sure to generate a series of logical steps and reason
                about the problem before generating a solution.
                """,
)

In [12]:
# System prompt for the stage classifier (lists the five allowed stages).
stage_classifier_message = dict(
    role="system",
    content="""
                You are a helpful AI Question Answerer. You are about to
                be passed a project description text in French by a User.

                Your task is to determine the stage of the geothermal project
                based on the text. Classify the project into one of the following:
                - idea/wish
                - preliminary studies
                - budget voted for the definitive project
                - implementation in progress
                - installation completed

                Make sure to generate a series of logical steps and reason
                about the problem before generating a solution.
                """,
)

In [13]:
def _extraction_system_message(target):
    """Build the system prompt asking the model to extract ``target`` from a document.

    Args:
        target: The quantity to extract, inserted verbatim into the prompt
            (e.g. ``"initial budget"``).

    Returns:
        A chat message dict with ``role`` "system" and the prompt as ``content``.
    """
    return {
        "role": "system",
        "content": f"""
                You are a helpful AI Question Answerer. You are about to
                be passed a project description text in French by a User.

                Your task is to extract the {target} of the project.
                Make sure to generate a series of logical steps and reason
                about the problem before generating a solution.
                """,
    }


# The four extraction prompts differ only in the quantity they ask for.
initial_budget_message = _extraction_system_message("initial budget")
final_cost_message = _extraction_system_message("final cost")
estimated_duration_message = _extraction_system_message("estimated duration")
actual_duration_message = _extraction_system_message("actual duration")


In [15]:


def _run_structured_query(model, system_message, user_message, response_model):
    """Send one system + user message pair to the LLM and return the parsed ``response_model``."""
    return client.chat.completions.create(
        model=model,
        messages=[system_message, user_message],
        max_tokens=512,
        temperature=0.1,
        top_p=0.9,
        response_model=response_model,
    )


# Define a function to extract project details
def extract_project_details(doc_id, text):
    """Run the full extraction pipeline on one document.

    The text is first passed to ``translate_text`` (which saves the original
    and an English translation to disk); the original ``text`` — not the
    translation — is then sent to the LLM six times, once per question:
    geothermal detection, project stage, initial budget, final cost,
    estimated duration and actual duration. Each answer and its chain of
    thought is printed as it arrives.

    Args:
        doc_id: Identifier of the document (used for logging and file names).
        text: Document text sent to the LLM as the user message.

    Returns:
        dict with the two file paths from ``translate_text`` plus, for every
        question, the answer and its ``*_chain_of_thought``.
    """
    print(f"Extracting project details for document {doc_id}")
    model = "deepseek-ai/DeepSeek-V3"
    user_message = {"role": "user", "content": text}

    # Execute translation
    original_text_path, english_translation_path = translate_text(doc_id, text)

    # Execute Geothermal Detection
    geothermal = _run_structured_query(
        model, geothermal_detection_message, user_message, GeothermalProjectDetection
    )
    print("Is the project geothermal: ", geothermal.is_geothermal)
    print("The chain of thought is: ", geothermal.chain_of_thought)

    # Execute Stage Classifier
    stage = _run_structured_query(model, stage_classifier_message, user_message, StageClassifier)
    print("The stage of the project is: ", stage.stage)
    print("The chain of thought is: ", stage.chain_of_thought)

    initial_budget = _run_structured_query(model, initial_budget_message, user_message, InitialBudget)
    print("The initial budget is: ", initial_budget.answer)
    print("The chain of thought is: ", initial_budget.chain_of_thought)

    final_cost = _run_structured_query(model, final_cost_message, user_message, FinalCost)
    print("The final cost is: ", final_cost.answer)
    print("The chain of thought is: ", final_cost.chain_of_thought)

    estimated_duration = _run_structured_query(
        model, estimated_duration_message, user_message, EstimatedDuration
    )
    print("The estimated duration is: ", estimated_duration.answer)
    print("The chain of thought is: ", estimated_duration.chain_of_thought)

    actual_duration = _run_structured_query(model, actual_duration_message, user_message, ActualDuration)
    print("The actual duration is: ", actual_duration.answer)
    print("The chain of thought is: ", actual_duration.chain_of_thought)

    return {
        "original_text_path": original_text_path,
        "english_translation_path": english_translation_path,
        "is_geothermal": geothermal.is_geothermal,
        "geothermal_chain_of_thought": geothermal.chain_of_thought,
        # The stage result used to be printed only; it is now part of the
        # returned record so it reaches the saved JSON.
        "stage": stage.stage,
        "stage_chain_of_thought": stage.chain_of_thought,
        "initial_budget": initial_budget.answer,
        "initial_budget_chain_of_thought": initial_budget.chain_of_thought,
        "final_cost": final_cost.answer,
        "final_cost_chain_of_thought": final_cost.chain_of_thought,
        "estimated_duration": estimated_duration.answer,
        "estimated_duration_chain_of_thought": estimated_duration.chain_of_thought,
        "actual_duration": actual_duration.answer,
        "actual_duration_chain_of_thought": actual_duration.chain_of_thought,
    }

# Run the extraction pipeline on each sampled record; yields one result dict per row.
project_details_list = df.apply(
    lambda record: extract_project_details(record['doc_id'], record['extracted_text']),
    axis=1,
)

Extracting project details for document 34755/84947b29159ed87b5eda940797aca0d3c772ef39_1-Rapport-de-Pre
Is the project geothermal:  True
The chain of thought is:  The document mentions 'géothermie basse enthalpie' (low-enthalpy geothermal energy) and discusses the potential for geothermal energy exploitation in the commune of Neydens. It also references the use of geothermal heat pumps and the favorable conditions for geothermal energy in the area. These references indicate that the document is related to a geothermal project.
The stage of the project is:  preliminary studies
The chain of thought is:  The text discusses the Local Urban Plan (PLU) for Neydens, which includes detailed environmental assessments, studies on biodiversity, water management, and energy potential. These elements indicate that the project is in the stage of preliminary studies, where various aspects of the project are being analyzed and planned before any implementation can begin.
The initial budget is:  0.0
Th

In [16]:
# Persist the per-record results as indented JSON so they can be checked by hand.
with open('project_details.json', 'w') as json_file:
    json_file.write(json.dumps(project_details_list.to_dict(), indent=4))