# 1.Google GenAI Provider

In [25]:
# providers/google_genai_provider.py
from google import genai

class GoogleGenAIProvider:
    def __init__(self, model="gemini-1.5-pro", api_key=None):
        self.client = genai.Client(api_key=api_key)
        self.model = model

    def extract(self, prompt: str) -> str:
        response = self.client.models.generate_content(
            model=self.model,
            contents=prompt
        )
        return response.text


# 2.Local LLM Provider

In [2]:
# providers/local_llm_provider.py
import subprocess

class LocalLLMProvider:
    def __init__(self, model="llama3"):
        self.model = model

    def extract(self, prompt: str) -> str:
        result = subprocess.run(
            ["ollama", "run", self.model, prompt],
            capture_output=True,
            text=True
        )
        return result.stdout.strip()


# 3.Docling Adapter

In [4]:
%pip install docling

Collecting docling
  Downloading docling-2.48.0-py3-none-any.whl.metadata (10 kB)
Collecting docling-core<3.0.0,>=2.42.0 (from docling-core[chunking]<3.0.0,>=2.42.0->docling)
  Downloading docling_core-2.46.0-py3-none-any.whl.metadata (6.5 kB)
Collecting docling-parse<5.0.0,>=4.2.2 (from docling)
  Downloading docling_parse-4.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.5 kB)
Collecting docling-ibm-models<4,>=3.9.0 (from docling)
  Downloading docling_ibm_models-3.9.0-py3-none-any.whl.metadata (6.7 kB)
Collecting filetype<2.0.0,>=1.2.0 (from docling)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting pypdfium2!=4.30.1,<5.0.0,>=4.30.0 (from docling)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting easyocr<2.0,>=1.7 (from docling)
  Downloading eas

In [5]:
# utils/docling_adapter.py
from docling.document_converter import DocumentConverter

class DoclingAdapter:
    def __init__(self):
        self.converter = DocumentConverter()

    def convert(self, file_path: str) -> str:
        doc = self.converter.convert(file_path)
        return doc.document.export_to_markdown()


# 4.Integration Pipeline

In [17]:
from google.colab import files

uploaded = files.upload()   # this will open a file picker


Saving report.pdf to report.pdf


In [28]:
file_path = "report.pdf"


In [36]:
# 1️⃣ Install required packages
!pip install docling transformers sentencepiece spacy --quiet
!python -m spacy download en_core_web_sm --quiet

# 2️⃣ Imports
from docling.document_converter import DocumentConverter
from transformers import pipeline
import spacy

# 3️⃣ PDF → text conversion
file_path = "report.pdf"  # make sure the file is uploaded

converter = DocumentConverter()
doc = converter.convert(file_path)
text = doc.document.export_to_markdown()

print("✅ Text extracted from PDF (first 500 chars):\n")
print(text[:500], "...")

# 4️⃣ Setup summarizer
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# 5️⃣ Chunking function
def chunk_text(text, chunk_size=1000):
    """Split text into chunks of at most chunk_size characters."""
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

# 6️⃣ Summarize chunks
chunks = chunk_text(text, chunk_size=1000)
summaries = []

for i, chunk in enumerate(chunks):
    print(f"⏳ Summarizing chunk {i+1}/{len(chunks)}...")
    s = summarizer(chunk, max_length=200, min_length=50, do_sample=False)
    summaries.append(s[0]['summary_text'])

final_summary = " ".join(summaries)
print("\n✅ Final Summary:\n")
print(final_summary)

# 7️⃣ NER / entity extraction using spaCy
nlp = spacy.load("en_core_web_sm")
doc_nlp = nlp(text)

entities = []
for ent in doc_nlp.ents:
    entities.append({"text": ent.text, "label": ent.label_})

print("\n✅ Extracted Entities:\n")
for e in entities:
    print(e)


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m44.7 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
✅ Text extracted from PDF (first 500 chars):

## Press Release

2 nd April, 2025 Accra

## Government of Ghana convenes National Landscape Forum on Natural Resource Management and Forest Economy for inclusive and sustainable growth

The Government of Ghana (GoG), with support from the World Bank, has organized a 3-day National Landscape Forum, a programme which aimed to bring together professionals and interested persons from  public,  private  and  non-governmental  agencies  working  in  the  areas  of  

Device set to use cpu


⏳ Summarizing chunk 1/8...
⏳ Summarizing chunk 2/8...


Your max_length is set to 200, but your input_length is only 191. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=95)


⏳ Summarizing chunk 3/8...


Your max_length is set to 200, but your input_length is only 190. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=95)


⏳ Summarizing chunk 4/8...


Your max_length is set to 200, but your input_length is only 197. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=98)


⏳ Summarizing chunk 5/8...
⏳ Summarizing chunk 6/8...


Your max_length is set to 200, but your input_length is only 176. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=88)


⏳ Summarizing chunk 7/8...


Your max_length is set to 200, but your input_length is only 156. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=78)


⏳ Summarizing chunk 8/8...

✅ Final Summary:

The Government of Ghana (GoG), with support from the World Bank, has organized a 3-day National Landscape Forum. The forum aimed to bring together professionals and interested persons from  public,  private  and  non-governmental  agencies. Main environmental challenges in Ghana include deforestation, land and landscape degradation, pollution, coastal degradation, and depletion of fish. The GoG has rolled out several programs to support the sustainable management of its natural resources and environmental restoration. The World Bank has supported the Government in developing the land accounts, the ecosystem extent accounts, and the ecosystem services accounts. These accounts present changes in land and ecosystem cover and provide physical and monetary value estimates. Ghana is one of the first countries globally to receive carbon payments from the World Bank for emissions reductions generated in the cocoa forest mosaic landscape. Ghana's in