In [2]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_deepseek import ChatDeepSeek
from langchain.prompts import PromptTemplate
import os



In [20]:
# make sure to be able to import utils functions
import sys
import os

project_root = os.path.abspath(os.path.join(os.getcwd(), "../../..")) 
if project_root not in sys.path:
    sys.path.append(project_root)

## Sparse and embedding the document

In [48]:
def build_pdf_retriever(
    pdf_path: str,
    embedding_model_name: str = "BAAI/bge-base-en-v1.5",
    chunk_size: int = 1000,
    chunk_overlap: int = 150,
    top_k: int = 5,
):
    """
    Load a PDF, split into chunks, embed with specified model, build FAISS retriever.

    Args:
        pdf_path (str): Path to the PDF document.
        embedding_model_name (str): HF model name for embeddings.
        chunk_size (int): Max tokens/characters per chunk.
        chunk_overlap (int): Overlap size between chunks.
        top_k (int): Number of top similar chunks to retrieve.

    Returns:
        retriever: LangChain retriever for semantic search.
    """

    # Load PDF pages as documents
    loader = PyPDFLoader(pdf_path)
    pages = loader.load()

    # Split into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    docs = text_splitter.split_documents(pages)

    # Initialize embedding model
    embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)

    # Build FAISS vector store from document chunks
    vectorstore = FAISS.from_documents(docs, embedding_model)

    # Create retriever with similarity search
    retriever = vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={"k": top_k}
    )

    return retriever

In [49]:
retriever = build_pdf_retriever(
    "data/1526439.pdf",
    embedding_model_name="BAAI/bge-base-en-v1.5"
)

## Initilize and test LLM

In [21]:
from ccai9012 import llm_utils

# Initialize LLM
api_key = llm_utils.get_deepseek_api_key()
llm = llm_utils.initialize_llm()

Enter your DEEPSEEK_API_KEY:  Â·Â·Â·Â·Â·Â·Â·Â·


In [22]:
# Test connection
test = ["Is 9.9 or 9.11 bigger?"]
llm_utils.ask_llm(test)


ðŸ“Œ Prompt:
['Is 9.9 or 9.11 bigger?']

To determine which number is larger between **9.9** and **9.11**, let's compare them step by step.

1. **Align the Decimal Places:**
   
   - **9.9** can be written as **9.90** to have the same number of decimal places as **9.11**.
   
2. **Compare Digit by Digit from Left to Right:**
   
   - **Units Place:** Both numbers have **9**.
   - **Tenths Place:** 
     - **9.90** has **9** in the tenths place.
     - **9.11** has **1** in the tenths place.
   
   Since **9** (from 9.90) is greater than **1** (from 9.11), we can conclude that **9.90** is larger than **9.11**.

3. **Final Comparison:**
   
   \[
   9.90 > 9.11
   \]
   
   Therefore, **9.9** is greater than **9.11**.

\[
\boxed{9.9 \text{ is bigger}}
\]



## Summarize the document

In [42]:
def run_qa_chain(
    query: str,
    retriever,
    llm,
    prompt_template: PromptTemplate = None,
    return_sources: bool = False,
    save_path: str = None,
):
    """
    Run a retrieval-based QA chain with optional prompt template, source printing, and saving.

    Args:
        query (str): The question to ask.
        retriever: A retriever object from LangChain.
        llm: The LLM to use.
        prompt_template (PromptTemplate, optional): Custom prompt template.
        return_sources (bool): Whether to print the source documents.
        save_path (str, optional): File path to save the result as a .txt file.

    Returns:
        str: The final result from the QA chain.
    """
    chain_kwargs = {}
    if prompt_template is not None:
        chain_kwargs["prompt"] = prompt_template

    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        chain_type="stuff",
        chain_type_kwargs=chain_kwargs,
        return_source_documents=return_sources,
    )

    response = qa_chain.invoke(query)

    print("\n--- Final Answer ---")
    print(response["result"])

    if return_sources:
        for i, doc in enumerate(response["source_documents"]):
            print(f"\n-------------------- Document {i+1} --------------------")
            print(doc.page_content)

    if save_path is not None:
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        with open(save_path, "w", encoding="utf-8") as f:
            f.write(response["result"])
        print(f"\n Answer saved to: {save_path}")

    return response["result"]

In [50]:
summary = run_qa_chain(
    query="Please summarize the location, main objectives, actions, and stakeholders described in this energy plan document.",
    retriever=retriever,
    llm=llm,
    return_sources=True,
    save_path="output/summary.txt"
)


--- Final Answer ---
Hereâ€™s a summary of the **Aniak Energy Action Plan 2019** based on the provided context:

### **Location**  
- **Aniak, Alaska** â€“ The project focuses on tribal buildings in the Aniak community.

### **Main Objectives**  
1. **Improve energy efficiency** in tribal buildings through audits, retrofits, and maintenance plans.  
2. **Reduce energy consumption** by identifying cost-effective measures.  
3. **Develop a long-term Energy Action Plan** to guide future efficiency projects.  
4. **Enhance data monitoring** to track energy use and occupant comfort.  
5. **Secure funding and training** for retrofits and maintenance.  

### **Key Actions**  
1. **Energy Audits** â€“ Conducted by Energy Audits of Alaska to assess tribal buildings and recommend efficiency measures.  
2. **Data Monitoring Plan** â€“ Track energy use and comfort metrics in participating buildings.  
3. **Maintenance Plan** â€“ Provide checklists for staff to implement low-cost energy-saving mea

## Ask specific question

In [45]:
answer = run_qa_chain(
    query="Give detailed description of the responsibility of Cold Climate Housing Research Center (CCHRC)",
    retriever=retriever,
    llm=llm,
    return_sources=True,
    save_path="output/question.txt"
)


--- Final Answer ---
The **Cold Climate Housing Research Center (CCHRC)** plays a key role in supporting energy efficiency and renewable energy projects in cold climates, particularly in communities like Aniak, Alaska. Based on the provided context, here are the detailed responsibilities of CCHRC in the **Aniak Energy Action Plan 2019**:

### **Primary Responsibilities:**
1. **Project Coordination & Planning**  
   - Collaborates with **Energy Audits of Alaska** to develop an **Energy Action Plan** for the Aniak Tribe.  
   - Helps the Tribe meet grant requirements (e.g., writing quarterly progress reports, final reports, and outreach materials).  

2. **Energy Audits & Assessments**  
   - Works with **Energy Audits of Alaska** to conduct **on-site energy assessments** of buildings in Aniak.  
   - Collects **baseline data** (energy use, building conditions, occupant comfort) before audits.  

3. **Data Management & Reporting**  
   - Stores **baseline data** on CCHRCâ€™s server and 

## Extract information form multiple documents for comparison

In [51]:
structured_prompt = PromptTemplate.from_template(
"""
Given the following document text, extract key information and output a markdown table with columns:

| Location | Main Objectives | Key Actions | Stakeholders | Timeline |
|----------|-----------------|-------------|--------------|----------|

Use exact information from the text; if any info is missing, write "N/A".

Context:
{context}

Question:
{question}
"""
)

In [54]:
import pandas as pd

folder_path = "data"
save_csv_path = "output/multiple_comparison.csv"
query_text = "Extract key information from this document."
results = []

for fname in sorted(os.listdir(folder_path)):
    if not fname.lower().endswith(".pdf"):
        continue
    pdf_path = os.path.join(folder_path, fname)
    print(f"Processing {pdf_path} ...")

    # Build retriever for this pdf
    retriever = build_pdf_retriever(pdf_path)

    # Run QA chain for structured extraction
    extracted_text = run_qa_chain(
        query=query,
        retriever=retriever,
        llm=llm,
        prompt_template=structured_prompt,
        return_sources=False
    )

    results.append({
        "pdf_path": fname,
        "extracted_text": extracted_text,
    })

df = pd.DataFrame(results)
os.makedirs(os.path.dirname(save_csv_path), exist_ok=True)
df.to_csv(save_csv_path, index=False, encoding="utf-8-sig")
print(f"\n Saved results to {save_csv_path}")

Processing data/1526439.pdf ...

--- Final Answer ---
Here is the extracted key information in the requested markdown table format:

| Location | Main Objectives | Key Actions | Stakeholders | Timeline |
|----------|-----------------|-------------|--------------|----------|
| Aniak | 1. Summarize energy audit recommendations<br>2. Guide future retrofit projects<br>3. Reduce energy consumption<br>4. Improve occupant comfort | 1. Conduct energy audits of tribal buildings<br>2. Develop data monitoring plan (tracking energy use/comfort)<br>3. Create maintenance checklists<br>4. Identify training/funding opportunities<br>5. Submit quarterly reports to DOE<br>6. Coordinate with regional energy projects | 1. Aniak Traditional Council<br>2. CCHRC (Cold Climate Housing Research Center)<br>3. Energy Audits of Alaska<br>4. Nuvista Electric Light & Power/CEMAI<br>5. Department of Energy<br>6. Local staff/contractors | 2017-2018 (with quarterly reports submitted Oct-Dec 2017 and Jan-Mar 2018) |

No

## Combine the result extracted from the single dcoument

In [60]:
from io import StringIO
import re

def parse_markdown_table(md_text: str) -> pd.DataFrame:
    """
    Parse a markdown table from a string, handling <br> line breaks within cells.
    """
    # Extract markdown table (start from the first '|')
    table_lines = [line for line in md_text.splitlines() if line.strip().startswith("|")]

    if len(table_lines) < 2:
        raise ValueError("No valid markdown table found.")

    # Extract header and rows
    header_line = table_lines[0]
    column_names = [col.strip() for col in header_line.strip().strip('|').split('|')]

    # Parse rows
    data_rows = []
    for line in table_lines[2:]:  # skip header and separator
        cells = [re.sub(r'<br\s*/?>', '; ', cell.strip(), flags=re.IGNORECASE) for cell in line.strip().strip('|').split('|')]
        # If row is shorter than columns, pad
        while len(cells) < len(column_names):
            cells.append("")
        data_rows.append(cells)

    df = pd.DataFrame(data_rows, columns=column_names)
    return df

In [63]:
merged_df_list = []

for i, md in enumerate(df["extracted_text"]):
    try:
        parsed = parse_markdown_table(md)
        parsed["source_doc"] = df.loc[i, "pdf_path"] if "pdf_path" in df.columns else f"doc_{i+1}"
        merged_df_list.append(parsed)
    except Exception as e:
        print(f"Failed to parse doc {i+1}: {e}")

final_df = pd.concat(merged_df_list, ignore_index=True)
final_df.to_csv("output/merged_energy_policy.csv", index=False)

In [65]:
final_df

Unnamed: 0,Location,Main Objectives,Key Actions,Stakeholders,Timeline,source_doc
0,Aniak,1. Summarize energy audit recommendations; 2. ...,1. Conduct energy audits of tribal buildings; ...,1. Aniak Traditional Council; 2. CCHRC (Cold C...,2017-2018 (with quarterly reports submitted Oc...,1526439.pdf
1,Kwigillingok,"Improve energy efficiency, reduce energy costs...","Conduct energy audits, create maintenance plan...","CCHRC, Energy Audits of Alaska, Kwigillingok T...",Quarterly reports for Oct-Dec 2017 and Jan-Mar...,1526994.pdf
2,"Atmautluak, Alaska",Summarize energy audit recommendations and gui...,- Create Energy Action Plan; - Communicate wit...,- Atmautluak Traditional Council; - Project st...,February 2019 (Final review); October 2017 - M...,1527001.pdf
3,Akiachak (Tribal buildings),"Reduce energy costs, improve building safety a...",- Conduct energy audits ; - Implement monthly...,- Cold Climate Housing Research Center (CCHRC)...,2017-2018 (with ongoing maintenance plans),1527003.pdf


## Is LLM able to further analysis the combined summary?

In [66]:
summary_prompt = PromptTemplate.from_template(
    """
    Below are extracted policy tables from multiple documents.

    Please compare and summarize:
    - What are the common energy policy goals?
    - Which regions have the most comprehensive plans?
    - Are there unique or innovative actions mentioned?
    - Summarize the main differences in stakeholders and timelines.

    Keep the answer concise and structured in paragraphs with bullet points.

    Tables:
    {context}
    """
)

from langchain.chains import LLMChain

all_tables = "\n\n".join(df["extracted_text"].dropna().tolist())

llm_chain = LLMChain(prompt=summary_prompt, llm=llm)
summary = llm_chain.invoke({"context": all_tables})

print(summary["text"])
with open("output/energy_policy_summary.txt", "w", encoding="utf-8") as f:
    f.write(summary["text"])

  llm_chain = LLMChain(prompt=summary_prompt, llm=llm)


The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.
### **Comparison and Summary of Energy Policies**  

#### **Common Energy Policy Goals**  
- **Reduce energy consumption** and costs across tribal buildings.  
- **Improve occupant comfort** and building safety.  
- **Guide future retrofit projects** through energy audits and data monitoring.  
- **Enhance community engagement** via outreach (presentations, flyers, videos).  
- **Secure funding/training** for long-term sustainability.  

#### **Most Comprehensive Plans**  
- **Aniak** and **Akiachak** stand out for detailed actions:  
  - Both include energy audits, maintenance checklists, training, and quarterly reporting.  
  - Akiachak explicitly seeks financing for retrofits, while Aniak emphasizes coordination with regional projects.  

#### **Unique or Innovative Actions**  
- **Atmautluak**: Interviews with experts to refine en