## Getting documents

In [2]:
!uv add requests

[2mResolved [1m111 packages[0m [2min 2.45s[0m[0m
[2mAudited [1m107 packages[0m [2min 0.10ms[0m[0m


In [1]:
import requests

In [4]:
repo_owner = 'evidentlyai'
repo_name = 'docs'
branch_name = 'main'

zip_url = f'https://github.com/{repo_owner}/{repo_name}/archive/refs/heads/{branch_name}.zip'
zip_response = requests.get(zip_url)

In [5]:
len(zip_response.content)

17545668

In [6]:
import io
import zipfile

zip_archive = zipfile.ZipFile(io.BytesIO(zip_response.content))

In [10]:
filenames = zip_archive.namelist()
filenames[20:30]

['docs-main/docs/library/report.mdx',
 'docs-main/docs/library/synthetic_data_api.mdx',
 'docs-main/docs/library/tags_metadata.mdx',
 'docs-main/docs/library/tests.mdx',
 'docs-main/docs/platform/',
 'docs-main/docs/platform/alerts.mdx',
 'docs-main/docs/platform/dashboard_add_panels.mdx',
 'docs-main/docs/platform/dashboard_add_panels_ui.mdx',
 'docs-main/docs/platform/dashboard_overview.mdx',
 'docs-main/docs/platform/dashboard_panel_types.mdx']

In [30]:
filename = 'docs-main/docs/platform/alerts.mdx'

mdx_file = zip_archive.open(filename)
mdx_content = mdx_file.read().decode('utf8')

In [22]:
!uv add python-frontmatter

[2mResolved [1m112 packages[0m [2min 399ms[0m[0m
[2mInstalled [1m1 package[0m [2min 126ms[0m[0m
 [32m+[39m [1mpython-frontmatter[0m[2m==1.1.0[0m


In [23]:
import frontmatter

post = frontmatter.loads(mdx_content)

<Check>
  Built-in alerting is a Pro feature available in the **Evidently Cloud** and **Evidently En


In [25]:
post.metadata

{'title': 'Alerts', 'description': 'How to set up alerts.'}

In [28]:
print(post.content[:100])

<Check>
  Built-in alerting is a Pro feature available in the **Evidently Cloud** and **Evidently En


In [39]:
_, filename_corrected = filename.split('/', maxsplit=1)
print(filename_corrected)

docs/platform/alerts.mdx


In [43]:
doc = {
    'content': post.content,
    'title': post.metadata.get('title'),
    'description': post.metadata.get('description'),
    'filename': filename_corrected
}

In [45]:
documents = []
with zipfile.ZipFile(io.BytesIO(zip_response.content)) as zip_ref:
    for file_path in zip_ref.namelist():
        if not file_path.endswith(('.md', '.mdx')):
            continue
        with zip_ref.open(file_path) as file:
            content = file.read().decode('utf-8')
            post = frontmatter.loads(content)
            doc = {
                'content': post.content,
                'title': post.metadata.get('title'),
                'description': post.metadata.get('description'),
                'filename': file_path.split('/', 1)[-1]
            }
            documents.append(doc)

In [46]:
len(documents)

95

In [47]:
!uv add gitsource

[2mResolved [1m113 packages[0m [2min 1.49s[0m[0m
[2mInstalled [1m1 package[0m [2min 83ms[0m[0m
 [32m+[39m [1mgitsource[0m[2m==0.0.4[0m


In [48]:
from gitsource import GithubRepositoryDataReader

reader = GithubRepositoryDataReader(
    repo_owner="evidentlyai",
    repo_name="docs",
    allowed_extensions={"md", "mdx"},
)

files = reader.read()

print(f"Loaded {len(files)} documents")


Loaded 95 documents


In [52]:
md_file = files[10]

In [56]:
documents = [f.parse() for f in files]

In [57]:
len(documents)

95

In [58]:
documents[10]

{'title': 'Output formats',
 'description': 'How to export the evaluation results.',
 'content': 'You can view or export Reports in multiple formats.\n\n**Pre-requisites**:\n\n* You know how to [generate Reports](/docs/library/report).\n\n## Log to Workspace\n\nYou can save the computed Report in Evidently Cloud or your local workspace.\n\n```python\nws.add_run(project.id, my_eval, include_data=False)\n```\n\n<Info>\n  **Uploading evals**. Check Quickstart examples [for ML](/quickstart_ml) or [for LLM](/quickstart_llm) for a full workflow.\n</Info>\n\n## View in Jupyter notebook\n\nYou can directly render the visual summary of evaluation results in interactive Python environments like Jupyter notebook or Colab.\n\nAfter running the Report, simply call the resulting Python object:\n\n```python\nmy_report\n```\n\nThis will render the HTML object directly in the notebook cell.\n\n## HTML\n\nYou can also save this interactive visual Report as an HTML file to open in a browser:\n\n```python

## Search

In [59]:
query = 'LLM as a Judge'

In [60]:
!uv add minsearch

[2mResolved [1m120 packages[0m [2min 1.45s[0m[0m
[2mInstalled [1m7 packages[0m [2min 6.13s[0m[0m
 [32m+[39m [1mjoblib[0m[2m==1.5.3[0m
 [32m+[39m [1mminsearch[0m[2m==0.0.7[0m
 [32m+[39m [1mnumpy[0m[2m==2.4.1[0m
 [32m+[39m [1mpandas[0m[2m==3.0.0[0m
 [32m+[39m [1mscikit-learn[0m[2m==1.8.0[0m
 [32m+[39m [1mscipy[0m[2m==1.17.0[0m
 [32m+[39m [1mthreadpoolctl[0m[2m==3.6.0[0m


In [61]:
from minsearch import Index

In [63]:
index = Index(
    text_fields=["title", "description", "content"],
    keyword_fields=["filename"]
)
index.fit(documents)

<minsearch.minsearch.Index at 0x2e03554e490>

In [67]:
results = index.search(query, num_results=5)

In [68]:
len(results)

5

In [80]:
len(results[0]['content'])

21834

## Chunking

In [101]:
doc_sizes = [(doc.filename, len(doc.content)) for doc in files]
doc_sizes.sort(key=lambda x: x[1], reverse=True)

for filename, size in doc_sizes[:5]:
    print(f"{filename}: {size} characters")

metrics/all_metrics.mdx: 55085 characters
metrics/all_descriptors.mdx: 31976 characters
docs/platform/dashboard_panel_types.mdx: 31647 characters
docs/library/leftover_content.mdx: 28742 characters
metrics/customize_llm_judge.mdx: 26847 characters


rag:

1. search <-- 5 docs + chunkin
2. prompt <-- 5 x 20k = 100k
3. llm 

In [102]:
document = list(range(0, 100))

In [110]:
window_size = 10
start = 0
step = 5

chunks = []

while start < len(document):
    end = start + window_size
    chunk = document[start:end]
    if len(chunk) < window_size:
        break
    chunks.append(chunk)
    print(chunk)
    start = start + step

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
[10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
[15, 16, 17, 18, 19, 20, 21, 22, 23, 24]
[20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
[25, 26, 27, 28, 29, 30, 31, 32, 33, 34]
[30, 31, 32, 33, 34, 35, 36, 37, 38, 39]
[35, 36, 37, 38, 39, 40, 41, 42, 43, 44]
[40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
[45, 46, 47, 48, 49, 50, 51, 52, 53, 54]
[50, 51, 52, 53, 54, 55, 56, 57, 58, 59]
[55, 56, 57, 58, 59, 60, 61, 62, 63, 64]
[60, 61, 62, 63, 64, 65, 66, 67, 68, 69]
[65, 66, 67, 68, 69, 70, 71, 72, 73, 74]
[70, 71, 72, 73, 74, 75, 76, 77, 78, 79]
[75, 76, 77, 78, 79, 80, 81, 82, 83, 84]
[80, 81, 82, 83, 84, 85, 86, 87, 88, 89]
[85, 86, 87, 88, 89, 90, 91, 92, 93, 94]
[90, 91, 92, 93, 94, 95, 96, 97, 98, 99]


In [111]:
def sliding_window(text, size=1000, step=500):
    chunks = []
    start = 0
    text_length = len(text)

    while start < text_length:
        end = start + size
        chunk = text[start:end]
        chunks.append({'start': start, 'content': chunk})
        
        start = end - step
        
        if end >= text_length:
            break
    
    return chunks


In [114]:
len(sliding_window(results[0]['content'], size=3000, step=2500))

39

In [137]:
document_chunks = []

for doc in documents:
    if not doc.get('content'):
        continue
    copy = doc.copy()
    content = copy.pop('content')

    chunks = sliding_window(content, size=3000, step=1500)
    
    for i, chunk in enumerate(chunks):
        chunk.update(copy)
        chunk['chunk_id'] = i
        document_chunks.append(chunk)

In [138]:
document_chunks[10]

{'start': 9000,
 'content': 'cation=[BinaryClassification(\n        target="target",\n        prediction_labels="prediction")],\n    categorical_columns=["target", "prediction"])\n```\n\nAvailable options and defaults:\n\n```python\n    target: str = "target"\n    prediction_labels: Optional[str] = None\n    prediction_probas: Optional[str] = "prediction" #if probabilistic classification\n    pos_label: Label = 1 #name of the positive label\n    labels: Optional[Dict[Label, str]] = None\n```\n\n### Ranking\n\n#### RecSys\n\nTo evaluate recommender systems performance, you must map the columns with:\n\n- Prediction: this could be predicted score or rank.\n- Target: relevance labels (e.g., this could be an interaction result like user click or upvote, or a true relevance label)\n\nThe **target** column can contain either:\n\n- a binary label (where `1` is a positive outcome)\n- any scores (positive values, where a higher value corresponds to a better match or a more valuable user action)

In [139]:
chunk_index = Index(
    text_fields=["title", "description", "content"],
    keyword_fields=["filename"]
)
chunk_index.fit(document_chunks)

<minsearch.minsearch.Index at 0x2e00f830190>

In [141]:
results = chunk_index.search(query)

In [142]:
from gitsource import chunk_documents

In [145]:
document_chunks = chunk_documents(documents, size=3000, step=1500)