In [5]:
import io
import zipfile
import requests
import frontmatter

def read_repo_data(repo_owner, repo_name, branch='main', timeout=60):
    """
    Download and parse all markdown files from a GitHub repository.

    The repository is fetched as one zip archive from codeload.github.com and
    read entirely in memory; nothing is written to disk.

    Args:
        repo_owner: GitHub username or organization
        repo_name: Repository name
        branch: Branch whose archive is downloaded (defaults to 'main')
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection raises instead of blocking forever
            (defaults to 60)

    Returns:
        List of dictionaries, one per ``.md`` / ``.mdx`` file that could be
        parsed. Each dictionary is the result of ``post.to_dict()`` with an
        extra 'filename' key holding the file's path inside the archive.

    Raises:
        Exception: If the download does not answer with HTTP status 200.
    """
    prefix = 'https://codeload.github.com'
    url = f'{prefix}/{repo_owner}/{repo_name}/zip/refs/heads/{branch}'
    resp = requests.get(url, timeout=timeout)

    if resp.status_code != 200:
        raise Exception(f"Failed to download repository: {resp.status_code}")

    repository_data = []

    # The context manager closes the archive even if iterating it fails.
    with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
        for file_info in zf.infolist():
            filename = file_info.filename

            # Extension check is case-insensitive; str.endswith accepts a tuple.
            if not filename.lower().endswith(('.md', '.mdx')):
                continue

            try:
                with zf.open(file_info) as f_in:
                    # Undecodable bytes are dropped rather than failing the file.
                    content = f_in.read().decode('utf-8', errors='ignore')
                post = frontmatter.loads(content)
                data = post.to_dict()
                data['filename'] = filename
                repository_data.append(data)
            except Exception as e:
                # Best effort: report the file that could not be parsed and
                # keep going with the rest of the archive.
                print(f"Error processing {filename}: {e}")

    return repository_data

def sliding_window(seq, size, step):
    """
    Cut a sequence into fixed-size windows taken at regular offsets.

    Args:
        seq: Any sliceable sequence with a length (e.g. a string or a list)
        size: Maximum number of items per window; must be positive
        step: Distance between the starts of consecutive windows; must be
            positive

    Returns:
        List of dictionaries ``{'start': offset, 'chunk': seq[offset:offset+size]}``.
        The last window may be shorter than ``size``. An empty sequence
        yields an empty list.

    Raises:
        ValueError: If ``size`` or ``step`` is not positive.
    """
    if size <= 0 or step <= 0:
        raise ValueError("size and step must be positive")

    total = len(seq)
    windows = []
    offset = 0

    while offset < total:
        windows.append({'start': offset, 'chunk': seq[offset:offset + size]})
        # Stop as soon as a window reaches the end of the sequence, so no
        # further window covering only already-seen items is produced.
        if offset + size >= total:
            break
        offset += step

    return windows

def text_search(query, index=None, num_results=5):
    """
    Run a text search for ``query`` and return the matching documents.

    Args:
        query: Search string
        index: Object exposing ``search(query, num_results=...)``. When
            omitted, the notebook-level ``faq_index`` is used, which must
            exist by the time of the call.
        num_results: Maximum number of hits to request (defaults to 5)

    Returns:
        Whatever ``index.search`` returns for the query.
    """
    if index is None:
        # Looked up at call time so existing one-argument callers are unchanged.
        index = faq_index
    return index.search(query, num_results=num_results)

def vector_search(query, vindex=None, model=None, num_results=5):
    """
    Embed ``query`` and run a vector search with the resulting embedding.

    Args:
        query: Search string
        vindex: Object exposing ``search(vector, num_results=...)``. When
            omitted, the notebook-level ``faq_vindex`` is used.
        model: Object exposing ``encode(text)``. When omitted, the
            notebook-level ``embedding_model`` is used.
        num_results: Maximum number of hits to request (defaults to 5)

    Returns:
        Whatever ``vindex.search`` returns for the encoded query.
    """
    # Globals are resolved at call time, in the same order as before, so
    # existing one-argument callers behave exactly as they did.
    encoder = embedding_model if model is None else model
    query_vector = encoder.encode(query)
    target_index = faq_vindex if vindex is None else vindex
    return target_index.search(query_vector, num_results=num_results)

def hybrid_search(query):
    """
    Combine text-search and vector-search hits for ``query``.

    Text hits come first, followed by vector hits that were not already
    returned; the relative order inside each group is kept.

    Args:
        query: Search string

    Returns:
        List of result dictionaries without duplicates. Two results count as
        the same when they share both 'filename' and 'start'; results that
        have no 'start' key are compared on 'filename' alone.
    """
    text_results = text_search(query)
    vector_results = vector_search(query)

    seen_keys = set()
    combined_results = []

    for result in text_results + vector_results:
        # Windows produced by sliding_window share their file's name and
        # differ only in 'start', so the filename alone would wrongly merge
        # distinct chunks of the same file.
        key = (result['filename'], result.get('start'))
        if key in seen_keys:
            continue
        seen_keys.add(key)
        combined_results.append(result)

    return combined_results


In [2]:
# Fetch the markdown sources of both repositories (one download each).
evidently_docs = read_repo_data('evidentlyai', 'docs')
qgis_web = read_repo_data('qgis', 'QGIS-Website')

# Cut every QGIS page into windows of up to 2000 characters, started every
# 1000 characters, and attach the page's remaining fields to each window.
qgis_chunks = []

for doc in qgis_web:
    # Everything except the body text is shared by all windows of the page.
    doc_meta = {key: value for key, value in doc.items() if key != 'content'}
    for chunk in sliding_window(doc['content'], 2000, 1000):
        chunk.update(doc_meta)
        qgis_chunks.append(chunk)

Error processing QGIS-Website-main/content/project/case-studies/italy_cesena.md: while parsing a quoted scalar
  in "<unicode string>", line 2, column 8
found unknown escape character
  in "<unicode string>", line 2, column 36


In [7]:
from minsearch import Index
# Text index over the QGIS chunks: the four fields below are registered as
# text fields and no keyword fields are configured.
# NOTE(review): 'title' and 'description' are presumably front matter keys;
# confirm that every chunk actually carries them.
index = Index(
    text_fields=["chunk", "title", "description", "filename"],
    keyword_fields=[]
)

# Kept as the cell's last expression, so its return value is what gets displayed.
index.fit(qgis_chunks)

<minsearch.minsearch.Index at 0x229f673a210>

In [9]:
# Sample question; the vector-search cell further down reuses `query`.
query = "How to load raster layer?"
# No num_results is passed, so the index's own default result count applies.
qgis_text_search = index.search(query)
# Bare last expression: displays the hits as the cell output.
qgis_text_search

[{'start': 5000,
  'chunk': '\n-   The GRASS tool v.to.rast.attribute converts contour elevation lines to raster, taking the contour shapefile, the name of the z field and the raster resolution as input;\n-   The GRASS tool r.surf.contour generates the elevation model taking as input the rasterized temporary output from previous step and the raster resolution;\n-   The GDAL tool "gdaldem" generates the slope expressed as degrees from the elevation model;\n-   The GRASS tool r.mapcalculator is used to generate a 1 bit raster identifying areas with slope greater than 15 degrees (this value is coded in the microzonation guidelines, and so it is fixed), using the expression:\n\nif(A\\>15,1,null())\n\nwhere A is the temporary slope raster generated by gdaldem;\n\n-   The GDAL tool "gdal_polygonize" converts the 1 bit raster to polygons;\n-   The QGIS tool "Intersection" is used to overlay the areas with slope greater than 15 degrees with the chosen intersection layer.\n\nThe result is a pol

In [11]:
from sentence_transformers import SentenceTransformer
from minsearch import VectorSearch

from tqdm.auto import tqdm
import numpy as np

# Sentence-embedding model; vector_search() reads this notebook-level name.
embedding_model = SentenceTransformer('multi-qa-distilbert-cos-v1')

# Encode every chunk text in one batched call instead of one encode() call
# per chunk; the result is a single array with one row per entry of
# qgis_chunks, in the same order.
qgis_embeddings = embedding_model.encode(
    [d['chunk'] for d in qgis_chunks],
    show_progress_bar=True,
)

qgis_vindex = VectorSearch()
# Kept as the cell's last expression, so its return value is what gets displayed.
qgis_vindex.fit(qgis_embeddings, qgis_chunks)

  0%|          | 0/3001 [00:00<?, ?it/s]

<minsearch.vector.VectorSearch at 0x229fd53a2a0>

In [12]:
# Embed the same `query` string that was used for the text search.
q = embedding_model.encode(query)
# Ask the vector index for the five nearest chunks to the query embedding.
qgis_vector_results = qgis_vindex.search(q, num_results=5)
# Bare last expression: displays the hits as the cell output.
qgis_vector_results

[{'start': 57000,
  'chunk': 'is/QGIS/pull/45267)    | [PR #45273](https://github.com/qgis/QGIS/pull/45273)   \n| QGIS crashes if network connexion is lost and a raster layer is loaded                                                                           | [#45293](https://github.com/qgis/QGIS/issues/45293) | [GDAL PR 4560](https://github.com/OSGeo/gdal/pull/4560) | N/A                                                    \n\nThese bug fixes were funded by [QGIS.ORG (through donations and sustaining memberships)](https://www.qgis.org/)\n\nBugs fixed by [Even Rouault](https://www.spatialys.com/)\n\n### Bug fixes by Alessandro Pasotti\n\n| Bug Title                                                                                                               | URL issues.qgis.org (if reported)                   | URL Commit (Github)                                  | 3.16 backport commit (GitHub)                        | Remark\n|----|----|----|----|----|\n| Data Source Manager - ArcGIS

In [13]:
# Plain concatenation: text hits first, then vector hits. Nothing is
# de-duplicated or re-ranked here, so a chunk returned by both searches
# appears twice in the list.
final_results = qgis_text_search + qgis_vector_results
final_results

[{'start': 5000,
  'chunk': '\n-   The GRASS tool v.to.rast.attribute converts contour elevation lines to raster, taking the contour shapefile, the name of the z field and the raster resolution as input;\n-   The GRASS tool r.surf.contour generates the elevation model taking as input the rasterized temporary output from previous step and the raster resolution;\n-   The GDAL tool "gdaldem" generates the slope expressed as degrees from the elevation model;\n-   The GRASS tool r.mapcalculator is used to generate a 1 bit raster identifying areas with slope greater than 15 degrees (this value is coded in the microzonation guidelines, and so it is fixed), using the expression:\n\nif(A\\>15,1,null())\n\nwhere A is the temporary slope raster generated by gdaldem;\n\n-   The GDAL tool "gdal_polygonize" converts the 1 bit raster to polygons;\n-   The QGIS tool "Intersection" is used to overlay the areas with slope greater than 15 degrees with the chosen intersection layer.\n\nThe result is a pol