## Import Library

In [16]:
import io
import os
import zipfile
import requests
import frontmatter
import numpy as np

from typing import List, Any
from minsearch import Index, VectorSearch
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm

import google.generativeai as genai

from pydantic_ai import Agent
from pydantic_ai.models.google import GoogleModel
from pydantic_ai.providers.google import GoogleProvider


## Define Function

In [2]:
def read_repo_data(repo_owner, repo_name):
    """
    Download and parse all markdown files from a GitHub repository.
    
    Args:
        repo_owner: GitHub username or organization
        repo_name: Repository name
    
    Returns:
        List of dictionaries containing file content and metadata
    """
    prefix = 'https://codeload.github.com' 
    url = f'{prefix}/{repo_owner}/{repo_name}/zip/refs/heads/main'
    resp = requests.get(url)
    
    if resp.status_code != 200:
        raise Exception(f"Failed to download repository: {resp.status_code}")

    repository_data = []
    zf = zipfile.ZipFile(io.BytesIO(resp.content))
    
    for file_info in zf.infolist():
        filename = file_info.filename
        filename_lower = filename.lower()

        if not (filename_lower.endswith('.md') 
            or filename_lower.endswith('.mdx')):
            continue
    
        try:
            with zf.open(file_info) as f_in:
                content = f_in.read().decode('utf-8', errors='ignore')
                post = frontmatter.loads(content)
                data = post.to_dict()
                data['filename'] = filename
                repository_data.append(data)
        except Exception as e:
            print(f"Error processing {filename}: {e}")
            continue
    
    zf.close()
    return repository_data

def sliding_window(seq, size, step):
    if size <= 0 or step <= 0:
        raise ValueError("size and step must be positive")

    n = len(seq)
    result = []
    for i in range(0, n, step):
        chunk = seq[i:i+size]
        result.append({'start': i, 'chunk': chunk})
        if i + size >= n:
            break

    return result

## Ingest and Index Data

In [3]:
qgis_web = read_repo_data('qgis', 'QGIS-Website')

qgis_chunks = []

for doc in qgis_web:
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')
    chunks = sliding_window(doc_content, 2000, 1000)
    for chunk in chunks:
        chunk.update(doc_copy)
    qgis_chunks.extend(chunks)

Error processing QGIS-Website-main/content/project/case-studies/italy_cesena.md: while parsing a quoted scalar
  in "<unicode string>", line 2, column 8
found unknown escape character
  in "<unicode string>", line 2, column 36


## Search

In [4]:
query = "How to load raster layer?"

### Text Search

In [5]:
index = Index(
    text_fields=["chunk", "title", "description", "filename"],
    keyword_fields=[]
)

index.fit(qgis_chunks)

<minsearch.minsearch.Index at 0x1fb05f48e30>

In [6]:
qgis_text_search = index.search(query)
qgis_text_search

[{'start': 5000,
  'chunk': '\n-   The GRASS tool v.to.rast.attribute converts contour elevation lines to raster, taking the contour shapefile, the name of the z field and the raster resolution as input;\n-   The GRASS tool r.surf.contour generates the elevation model taking as input the rasterized temporary output from previous step and the raster resolution;\n-   The GDAL tool "gdaldem" generates the slope expressed as degrees from the elevation model;\n-   The GRASS tool r.mapcalculator is used to generate a 1 bit raster identifying areas with slope greater than 15 degrees (this value is coded in the microzonation guidelines, and so it is fixed), using the expression:\n\nif(A\\>15,1,null())\n\nwhere A is the temporary slope raster generated by gdaldem;\n\n-   The GDAL tool "gdal_polygonize" converts the 1 bit raster to polygons;\n-   The QGIS tool "Intersection" is used to overlay the areas with slope greater than 15 degrees with the chosen intersection layer.\n\nThe result is a pol

### Vector Search

In [7]:
embedding_model = SentenceTransformer('multi-qa-distilbert-cos-v1')
qgis_embeddings = []

for d in tqdm(qgis_chunks):
    v = embedding_model.encode(d['chunk'])
    qgis_embeddings.append(v)

qgis_embeddings = np.array(qgis_embeddings)

qgis_vindex = VectorSearch()
qgis_vindex.fit(qgis_embeddings, qgis_chunks)

  0%|          | 0/3002 [00:00<?, ?it/s]

<minsearch.vector.VectorSearch at 0x1fabef3b440>

In [8]:
q = embedding_model.encode(query)
qgis_vector_results = qgis_vindex.search(q, num_results=5)
qgis_vector_results

[{'start': 57000,
  'chunk': 'is/QGIS/pull/45267)    | [PR #45273](https://github.com/qgis/QGIS/pull/45273)   \n| QGIS crashes if network connexion is lost and a raster layer is loaded                                                                           | [#45293](https://github.com/qgis/QGIS/issues/45293) | [GDAL PR 4560](https://github.com/OSGeo/gdal/pull/4560) | N/A                                                    \n\nThese bug fixes were funded by [QGIS.ORG (through donations and sustaining memberships)](https://www.qgis.org/)\n\nBugs fixed by [Even Rouault](https://www.spatialys.com/)\n\n### Bug fixes by Alessandro Pasotti\n\n| Bug Title                                                                                                               | URL issues.qgis.org (if reported)                   | URL Commit (Github)                                  | 3.16 backport commit (GitHub)                        | Remark\n|----|----|----|----|----|\n| Data Source Manager - ArcGIS

## Build Agents 

### Using `genai`

In [17]:
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

In [18]:
model = genai.GenerativeModel(model_name='gemini-2.5-flash')
user_prompt = "How to load raster layer in QGIS?"
chat_messages = [
    {"role": "user", "parts": [user_prompt]}
]

response = model.generate_content(chat_messages) 
response.text

'Loading raster layers in QGIS is a fundamental task, and there are several ways to do it, depending on your data source and preference.\n\nHere are the most common methods:\n\n---\n\n## 1. Using the Data Source Manager (Recommended for precision)\n\nThis is the most versatile method and recommended for adding individual or multiple rasters, or for connecting to various data sources.\n\n1.  **Open the Data Source Manager:**\n    *   Go to `Layer` > `Add Layer` > `Add Raster Layer...`\n    *   OR click the `Open Data Source Manager` icon (looks like a database symbol with a plus sign, or multiple stacked squares) on the `Data Source Manager Toolbar`.\n\n2.  **Select the "Raster" Tab:** In the Data Source Manager window, ensure the `Raster` tab is selected on the left panel.\n\n3.  **Browse for your Raster File(s):**\n    *   Click the `...` (Browse) button next to the "Source" field.\n    *   Navigate to the directory where your raster file is located.\n    *   Select your raster file (

### Function Call

In [19]:
def gemini_search_tool(query: str) -> str:
    print(f'query: {query}')

available_tools = {
    "text_search": gemini_search_tool,
}

text_search_tool_genai = genai.protos.Tool(
    function_declarations=[
        genai.protos.FunctionDeclaration(
            name="text_search",
            description="Search the FAQ database about QGIS", 
            parameters=genai.protos.Schema(
                type=genai.protos.Type.OBJECT,
                properties={
                    "query": genai.protos.Schema(
                        type=genai.protos.Type.STRING,
                        description="Search query text to look up in QGIS FAQ."
                    )
                },
                required=["query"]
            )
        )
    ]
)

system_prompt = """
You are a helpful assistant for QGIS software course. 

Use the search tool to find relevant information from the course materials before answering questions.

If you can find specific information through search, use it to provide accurate answers.
If the search doesn't return relevant results, let the user know and provide general guidance.
"""

model = genai.GenerativeModel(
    model_name="gemini-2.5-flash",
    tools=[text_search_tool_genai],
    system_instruction=system_prompt
)

chat = model.start_chat(enable_automatic_function_calling=False)
response = chat.send_message(user_prompt)
response

response:
GenerateContentResponse(
    done=True,
    iterator=None,
    result=protos.GenerateContentResponse({
      "candidates": [
        {
          "content": {
            "parts": [
              {
                "function_call": {
                  "name": "text_search",
                  "args": {
                    "query": "load raster layer QGIS"
                  }
                }
              }
            ],
            "role": "model"
          },
          "finish_reason": "STOP",
          "index": 0
        }
      ],
      "usage_metadata": {
        "prompt_token_count": 127,
        "candidates_token_count": 19,
        "total_token_count": 234
      },
      "model_version": "gemini-2.5-flash"
    }),
)

In [20]:
part = response.candidates[0].content.parts[0]
function_call = part.function_call
tool_name = function_call.name
tool_args_struct = function_call.args
tool_function = available_tools.get(tool_name)
tool_args = {key: value for key, value in tool_args_struct.items()}
tool_result = tool_function(**tool_args)
response = chat.send_message(
            genai.protos.Part(
                function_response=genai.protos.FunctionResponse(
                    name=tool_name,
                    response={'result': tool_result}
                )
            )
        )
response.candidates[0].content.parts[0]

query: load raster layer QGIS


text: "I\'m sorry, I couldn\'t find specific instructions on how to load a raster layer in QGIS within the course materials. However, I can give you general guidance.\n\nTypically, in QGIS, you can load a raster layer by going to:\n1.  **Layer Menu**\n2.  **Add Layer**\n3.  **Add Raster Layer...**\n\nFrom there, you would navigate to your raster file (e.g., a GeoTIFF, JPEG2000, etc.) and add it to your project."

### Using Pydantic AI

In [21]:
def text_search(query: str) -> List[Any]:
    return index.search(query, num_results=5)

In [22]:
provider = GoogleProvider(api_key=os.getenv('GOOGLE_API_KEY'))
model = GoogleModel('gemini-2.5-flash', provider=provider)

agent = Agent(
    name="faq_agent",
    instructions=system_prompt,
    tools=[text_search],
    model=model
)

In [24]:
result = await agent.run(user_prompt=user_prompt)
result

AgentRunResult(output='To load a raster layer in QGIS, you typically use the "Layer" menu. Here\'s a general approach:\n\n1.  Open QGIS.\n2.  Go to the **Layer** menu in the top toolbar.\n3.  Select **Add Layer**.\n4.  Choose **Add Raster Layer...** from the submenu.\n5.  A new window will open, allowing you to browse your computer and select the raster file you wish to load.\n\nIn newer versions of QGIS (like 3.0 and later), there\'s a unified "Add Layer" dialog which might simplify the process further, allowing you to add various data formats from a single interface.')

In [25]:
result.new_messages()

[ModelRequest(parts=[UserPromptPart(content='How to load raster layer in QGIS?', timestamp=datetime.datetime(2025, 10, 30, 22, 35, 8, 201983, tzinfo=datetime.timezone.utc))], instructions="You are a helpful assistant for QGIS software course. \n\nUse the search tool to find relevant information from the course materials before answering questions.\n\nIf you can find specific information through search, use it to provide accurate answers.\nIf the search doesn't return relevant results, let the user know and provide general guidance."),
 ModelResponse(parts=[ThinkingPart(content='', signature='Cs4BAdHtim9IdmhtO3erhsqDilzoQJAh6o1/SSCWz11oGGmPDIz3/8qYAG89yUAPVS8tM+e1CVgb1QQ9MS8xhpestN3EjrFbrQiHXROf/xH+2xigfg/G6bOcGSk6fTczJeXvEUJbyxPnEIfR5BmwurQ60++O6ISocGwJM5IPmvjAuqz5A0GmxtT9ULyWGCSGv0ajmLjtmJ3WtdsqfUrgHq9+EcuZkebBOpXGtFS4bs/T/C0H+qyTPG7zPcNiVcJPiNqmQIK+lHvWkZCWPNwnUwU=', provider_name='google-gla'), ToolCallPart(tool_name='text_search', args={'query': 'load raster layer QGIS'}, tool_call