In [4]:
import os
from concurrent.futures import ThreadPoolExecutor
from typing import List
from langchain_community.llms import Ollama
from langchain_core.documents import Document

# --- SETTINGS ---
# Directory that is walked recursively for ".txt" chunk files.
CHUNKS_FOLDER = "chunks"
# Upper bound on the number of threads used to read chunk files in parallel.
MAX_WORKERS = 8
# Model name passed to the Ollama LLM wrapper.
LLM_MODEL = "llama3.2:1b"


# --- LOAD TEXT CHUNKS CONCURRENTLY ---
def load_chunks() -> List[Document]:
    """Read every ``.txt`` file under ``CHUNKS_FOLDER`` into a ``Document``.

    Files are read concurrently on up to ``MAX_WORKERS`` threads. Each file is
    decoded as UTF-8, falling back to Latin-1 when UTF-8 decoding fails. Files
    that are empty after stripping whitespace, or that cannot be read, are
    skipped.

    Returns:
        One ``Document`` per non-empty readable file, ordered by file path,
        with the path stored under the ``"source"`` metadata key.
    """
    # Sort so the document order (and any prompt built from it) does not
    # depend on the filesystem's directory-listing order.
    txt_paths = sorted(
        os.path.join(root, fn)
        for root, _, files in os.walk(CHUNKS_FOLDER)
        for fn in files
        if fn.endswith(".txt")
    )

    def read_file(path: str):
        """Return a Document for ``path``, or None if it is empty or unreadable."""
        try:
            try:
                with open(path, "r", encoding="utf-8") as f:
                    content = f.read().strip()
            except UnicodeDecodeError:
                # Latin-1 maps every byte to a character, so this retry cannot
                # fail on decoding; only I/O errors remain possible.
                with open(path, "r", encoding="latin-1") as f:
                    content = f.read().strip()
        except OSError as exc:
            # Best effort: one unreadable file must not abort the whole load,
            # but say so instead of dropping it silently.
            print(f"[load_chunks] Skipping {path}: {exc}")
            return None
        return Document(page_content=content, metadata={"source": path}) if content else None

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # executor.map yields results in the order of txt_paths.
        results = list(executor.map(read_file, txt_paths))

    docs = [d for d in results if d]
    print(f"[load_chunks] Loaded {len(docs)} chunks.")
    return docs


# --- RUN LLM DIRECTLY ON CONCATENATED TEXT ---
def run_llm_direct(query: str) -> str:
    """Answer ``query`` by sending every loaded chunk to the LLM in one prompt.

    The text of all chunks is joined with blank lines, the question is
    appended after it, and the resulting prompt is sent to the Ollama model
    named by ``LLM_MODEL``. When no chunks are loaded, a fixed notice string
    is returned and the model is not called.
    """
    chunks = load_chunks()
    if not chunks:
        return "[run_llm] No documents found!"

    # Single context string: every chunk's text, blank-line separated.
    context = "\n\n".join(chunk.page_content for chunk in chunks)
    full_prompt = "{}\n\nAnswer this: {}".format(context, query)

    model = Ollama(model=LLM_MODEL)
    return model.invoke(full_prompt)


# --- USAGE ---
# Runs only when this code is executed directly rather than imported.
if __name__ == "__main__":
    # Fixed demo question; run_llm_direct answers it using every loaded chunk.
    query = "Summarize the key ideas"
    answer = run_llm_direct(query)
    print("Answer:", answer)


[load_chunks] Loaded 7166 chunks.
Answer: The text discusses various aspects of solar cells, including:

1. **Passivation techniques**: The importance of passivation in improving the efficiency and durability of solar cells.
2. **Contact technologies**: Different types of contacts used in solar cells, such as screen-printed, fired metallization, and tunnel oxide passivated contact (i-TOPCon).
3. **Screen-printed solar cells**: Advances in screen printing technology for large-area industrial silicon solar cells with i-TOPCon design.
4. **Nanostructured surfaces**: The use of nanostructures to improve the efficiency of solar cells, including thin-film solar cells and nano-wire arrays.

Some key findings from the research include:

* Screen-printed solar cells can be used to fabricate large-area industrial silicon solar cells with i-TOPCon design.
* Nanostructured surfaces, such as nano-wire arrays, can improve the efficiency of solar cells.
* The tunnel oxide passivated contact (i-TOPCon

In [None]:
import os
import json
import pandas as pd
from pydantic import BaseModel, Field, field_validator, ValidationError

from langchain_ollama import OllamaLLM  # updated import for latest Ollama LLM wrapper
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate

# --- SETTINGS ---
# Directory walked recursively for ".txt" chunk files.
# NOTE: this rebinds the CHUNKS_FOLDER name also assigned in an earlier cell,
# so functions from that cell will read this folder if called afterwards in
# the same kernel session.
CHUNKS_FOLDER = "chunkTest"
# Model name passed to the OllamaLLM wrapper.
LLM_MODEL = "llama3.2:1b"
# CSV file that successful extraction results are appended to.
OUTPUT_CSV = "pv_extraction_results_ollama.csv"

# --- SCHEMA ---
class PVArticleData(BaseModel):
    """Flat record of the facts extracted from one photovoltaic article.

    Every field is a string that defaults to "N/A", so a record can be built
    from a partial JSON object. Incoming values of any type are coerced to
    strings before validation (see ``convert_to_string``).
    """

    # --- Bibliographic information ---
    title: str = Field("N/A")
    last_name: str = Field("N/A")
    year: str = Field("N/A")
    doi: str = Field("N/A")
    research_focus: str = Field("N/A")
    key_findings: str = Field("N/A")

    # --- Device description; each "*_term_used" field pairs with the field
    # of the same name without that suffix ---
    device_type: str = Field("N/A")
    absorber_material: str = Field("N/A")
    absorber_material_term_used: str = Field("N/A")
    absorber_dopant_material: str = Field("N/A")
    absorber_dopant_material_term_used: str = Field("N/A")
    absorber_dopant_polarity: str = Field("N/A")
    absorber_dopant_polarity_term_used: str = Field("N/A")
    front_surface_morphology: str = Field("N/A")
    front_surface_morphology_term_used: str = Field("N/A")
    rear_surface_morphology: str = Field("N/A")
    rear_surface_morphology_term_used: str = Field("N/A")
    front_surface_passivation_material: str = Field("N/A")
    front_surface_passivation_material_term_used: str = Field("N/A")
    rear_surface_passivation_material: str = Field("N/A")
    rear_surface_passivation_material_term_used: str = Field("N/A")
    negative_metallization_material: str = Field("N/A")
    negative_metallization_material_term_used: str = Field("N/A")
    positive_metallization_material: str = Field("N/A")
    positive_metallization_material_term_used: str = Field("N/A")

    # --- Performance figures (kept as strings, units encoded in the name) ---
    efficiency_percent: str = Field("N/A")
    cell_area_cm2: str = Field("N/A")
    short_circuit_current_a: str = Field("N/A")
    short_circuit_current_density_ma_cm2: str = Field("N/A")
    open_circuit_voltage_v: str = Field("N/A")
    fill_factor_percent: str = Field("N/A")

    @field_validator("*", mode="before")
    @classmethod
    def convert_to_string(cls, v):
        """Coerce any supplied value to ``str``; map ``None`` to "N/A".

        Runs before the ``str`` type check on every field, so numbers (and
        any other JSON value) returned by the model are accepted.
        """
        return str(v) if v is not None else "N/A"

# --- LOAD CHUNKS ---
def load_all_chunks() -> str:
    full_text = ""
    for root, _, files in os.walk(CHUNKS_FOLDER):
        for file in sorted(files):
            if file.endswith(".txt"):
                path = os.path.join(root, file)
                try:
                    with open(path, "r", encoding="utf-8") as f:
                        content = f.read().strip()
                except UnicodeDecodeError:
                    with open(path, "r", encoding="latin-1") as f:
                        content = f.read().strip()
                if content:
                    full_text += f"\n\n--- {file} ---\n{content}"
    return full_text.strip()

# --- PROMPT TEMPLATE ---
# Prompt sent to the LLM. It has two placeholders that are filled in by
# extract_data_from_text: {text} (the article text) and {format_instructions}
# (the instructions produced by the output parser).
# NOTE(review): the recorded run below shows the model replying with a
# "properties" object rather than extracted values — worth checking whether
# the wording/order of this prompt contributes to that.
prompt_template = PromptTemplate.from_template("""
Extract structured data from the following academic article text on photovoltaic cells.
Focus only on the most efficient cell mentioned.

Return only a JSON object matching this schema exactly (no explanations or markdown):

TEXT:
{text}

Schema:
{format_instructions}
""")

# --- EXTRACT DATA ---
def _parse_llm_json(raw_output: str):
    """Decode the JSON value in ``raw_output``, tolerating text around it.

    Tries the whole string first. If that fails, decodes the JSON value that
    starts at the first ``{`` and ignores whatever precedes or follows it
    (chatty preambles, markdown code fences, trailing remarks).

    Raises:
        json.JSONDecodeError: If no JSON value can be decoded.
    """
    try:
        return json.loads(raw_output)
    except json.JSONDecodeError:
        start = raw_output.find("{")
        if start == -1:
            raise
        # raw_decode stops at the end of the first complete value, so text
        # after the closing brace does not matter.
        value, _ = json.JSONDecoder().raw_decode(raw_output, start)
        return value


def extract_data_from_text(text: str) -> dict | str:
    """Ask the LLM to extract ``PVArticleData`` fields from ``text``.

    Args:
        text: Article text that is inserted into ``prompt_template``.

    Returns:
        On success, the validated record as a dict plus a ``"source"`` key set
        to ``"ALL_CHUNKS"``. On failure, a dict with ``"error_type"`` (one of
        ``"empty"``, ``"invalid_json"``, ``"validation_error"``,
        ``"exception"``), ``"details"`` and, where available,
        ``"raw_output"``. No code path returns a plain string; the ``str`` in
        the annotation is kept for interface compatibility.
    """
    try:
        llm = OllamaLLM(model=LLM_MODEL)
        # The parser is used only to generate the format instructions.
        parser = PydanticOutputParser(pydantic_object=PVArticleData)

        prompt = prompt_template.format(
            text=text,
            format_instructions=parser.get_format_instructions()
        )

        print("DEBUG PROMPT (first 1000 chars):\n", prompt[:1000])

        raw_output = llm.invoke(prompt).strip()

        print("\n📝 Raw LLM Output:\n", raw_output[:2000])  # show first 2000 chars max

        if not raw_output or raw_output.lower().startswith(("error", "none", "no data")):
            return {"error_type": "empty", "details": "LLM returned empty or irrelevant output."}

        # Models often wrap the JSON in prose or code fences despite the
        # prompt, so parse leniently instead of json.loads on the raw text.
        try:
            parsed = _parse_llm_json(raw_output)
        except json.JSONDecodeError as je:
            return {
                "error_type": "invalid_json",
                "details": f"JSON parsing failed: {je.msg} (line {je.lineno}, column {je.colno})",
                "raw_output": raw_output
            }

        if not isinstance(parsed, dict):
            return {
                "error_type": "invalid_json",
                "details": f"Expected a JSON object but got {type(parsed).__name__}.",
                "raw_output": raw_output
            }

        # Every schema field has a default, so an object with no matching key
        # (e.g. an echoed schema) would otherwise validate to an all-"N/A"
        # record and be saved as if extraction had succeeded.
        if not any(key in PVArticleData.model_fields for key in parsed):
            return {
                "error_type": "validation_error",
                "details": "JSON object contains none of the schema fields.",
                "raw_output": parsed
            }

        try:
            record = PVArticleData(**parsed)
        except ValidationError as ve:
            return {
                "error_type": "validation_error",
                "details": ve.errors(),
                "raw_output": parsed
            }

        result = record.model_dump()
        result["source"] = "ALL_CHUNKS"
        return result

    except Exception as e:
        # Anything else (model unreachable, unexpected response type, ...) is
        # reported to the caller rather than raised.
        return {"error_type": "exception", "details": str(e)}

# --- MAIN ---
# Runs only when this code is executed directly rather than imported.
if __name__ == "__main__":
    full_context = load_all_chunks()
    print("DEBUG: Loaded text length =", len(full_context))
    print("DEBUG: Loaded text snippet:\n", full_context[:500])

    if not full_context:
        print("⚠️ No content found in any .txt files.")
    else:
        result = extract_data_from_text(full_context)

        # A result without an "error_type" key is a successfully extracted record.
        if isinstance(result, dict) and "error_type" not in result:
            df_new = pd.DataFrame([result])
            # Append to the existing CSV (if any) by re-reading it and
            # rewriting the combined table.
            if os.path.exists(OUTPUT_CSV):
                df_existing = pd.read_csv(OUTPUT_CSV)
                df_combined = pd.concat([df_existing, df_new], ignore_index=True)
            else:
                df_combined = df_new
            df_combined.to_csv(OUTPUT_CSV, index=False)
            print(f"✅ Saved extracted data to {OUTPUT_CSV}")
        else:
            # Failure: report the error dict returned by extract_data_from_text.
            print(f"\n⚠️ Extraction failed:")
            print(f"🔍 Error Type: {result['error_type']}")
            print(f"📄 Details: {result['details']}")
            if "raw_output" in result:
                print(f"📝 Raw Output:\n{result['raw_output']}")


  llm = Ollama(model=LLM_MODEL)



📝 Raw LLM Output:

Here is the extracted and formatted JSON instance according to the provided schema:
```
{
  "properties": {
    "title": "Title",
    "last_name": "Last Name",
    "year": "Year",
    "doi": "DOI",
    "research_focus": "Research Focus",
    "key_findings": "Key Findings",
    "device_type": "Device Type",
    "absorber_material": "Absorber Material",
    "absorber_material_term_used": "Absorber Material Term Used",
    "absorber_dopant_material": "Absorber Dopant Material",
    "absorber_dopant_material_term_used": "Absorber Dopant Material Term Used",
    "absorber_dopant_polarity": "Absorber Dopant Polarity",
    "absorber_dopant_polarity_term_used": "Absorber Dopant Polarity Term Used",
    "front_surface_morphology": "Front Surface Morphology",
    "front_surface_morphology_term_used": "Front Surface Morphology Term Used",
    "rear_surface_morphology": "Rear Surface Morphology",
    "rear_surface_morphology_term_used": "Rear Surface Morphology Term Used",
    

In [6]:
import requests
from bs4 import BeautifulSoup

def fetch_doc_text(url: str, timeout: float = 30.0) -> str:
    """
    Fetches the text content of a Google Doc from its editable or published URL.

    For /edit URLs, "/edit" and everything after it (query string, fragment)
    is replaced with the export?format=txt endpoint and the plain-text body
    is returned.
    For /pub URLs, the HTML is parsed and the text of all <p> elements is
    returned, one paragraph per line.

    Args:
        url: Google Doc link containing "/edit" or "/pub".
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The document text.

    Raises:
        ValueError: If the URL contains neither "/edit" nor "/pub".
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
    """
    if "/edit" in url:
        # Cut at "/edit" instead of str.replace: share links usually end in
        # "/edit?usp=sharing" or "/edit#heading=...", and keeping that suffix
        # would glue it after "format=txt" and corrupt the export URL.
        export_url = url.split("/edit", 1)[0] + "/export?format=txt"
        response = requests.get(export_url, timeout=timeout)
        response.raise_for_status()
        return response.text

    if "/pub" in url:
        # Published docs are HTML; collect the text of every paragraph.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        return "\n".join(p.get_text() for p in soup.find_all("p"))

    raise ValueError("Unsupported URL format. Please provide a published or editable Google Doc link.")

def parse_triplets(text: str):
    """
    Extracts (x, y, char) triplets from the document text.

    Blank lines are dropped and the remaining lines are stripped. Everything
    before the first line that parses as an integer is skipped; from there on
    the lines are consumed in groups of three laid out as x, character, y.
    A group whose x or y is not an integer is ignored.
    """
    rows = [stripped for stripped in map(str.strip, text.splitlines()) if stripped]

    def _is_int(value: str) -> bool:
        try:
            int(value)
        except ValueError:
            return False
        return True

    # Index of the first integer line (the first x coordinate); when there is
    # none, start past the end so the loop below does nothing.
    first = next((idx for idx, row in enumerate(rows) if _is_int(row)), len(rows))

    triplets = []
    # Only start positions that still have two more lines after them.
    for base in range(first, len(rows) - 2, 3):
        raw_x, symbol, raw_y = rows[base:base + 3]
        try:
            triplets.append((int(raw_x), int(raw_y), symbol))
        except ValueError:
            continue

    return triplets

def build_grid(triplets):
    """
    Builds a 2D grid (list of rows) from the triplets, filling unspecified cells with spaces.

    Args:
        triplets: Iterable of (x, y, char) tuples; x is the column and y the
            row, both counted from 0.

    Returns:
        A list of max_y + 1 rows, each a list of max_x + 1 cells, or an empty
        list when there are no triplets.

    Raises:
        ValueError: If any coordinate is negative. A negative index would
            otherwise silently place the character on the opposite side of
            the grid.
    """
    # Materialize once so one-shot iterables survive the multiple passes below.
    triplets = list(triplets)
    if not triplets:
        return []

    for x, y, _ in triplets:
        if x < 0 or y < 0:
            raise ValueError(f"Grid coordinates must be non-negative, got ({x}, {y}).")

    width = max(x for x, _, _ in triplets) + 1
    height = max(y for _, y, _ in triplets) + 1

    # Independent row lists, each pre-filled with spaces.
    grid = [[" "] * width for _ in range(height)]

    for x, y, char in triplets:
        grid[y][x] = char

    return grid

def print_grid(grid):
    """
    Writes the grid to standard output, one row per line, with the cells of
    each row concatenated without separators.
    """
    for line in map("".join, grid):
        print(line)

def decode_google_doc(url: str):
    """
    Entry point: downloads the document at `url`, extracts its triplets,
    lays them out on a grid and prints the result.
    """
    # Pipeline: fetch -> parse -> build -> print.
    print_grid(build_grid(parse_triplets(fetch_doc_text(url))))

# Example usage:
# Runs only when this code is executed directly rather than imported.
if __name__ == "__main__":
    # Blocks until the user types a URL, so this cell cannot run unattended.
    doc_url = input("Enter published Google Doc URL: ").strip()
    decode_google_doc(doc_url)


██░    ███░ ██████░    ███████░     ██░     ██░     ██████████░ ████████░    ████████░  
██░  ███░     ██░    ███░    ██░   ████░   ████░    ██░         ██░     ██░  ██░     ██░
██░███░       ██░   ███░           ██░██░ ██░██░    ██░         ██░      ██░ ██░     ██░
████░         ██░   ██░           ███░ ██░██░ ██░   ████████░   ██░      ██░ ████████░  
██░███░       ██░   ███░          ██░  █████░ ███░  ██░         ██░      ██░ ██░     ██░
██░  ███░     ██░    ███░    ██░ ███░   ███░   ██░  ██░         ██░     ██░  ██░     ██░
██░    ███░ ██████░    ███████░  ██░           ███░ ██████████░ ████████░    ████████░  
