In [1]:
import io
import zipfile
import requests
import frontmatter

## Day 1 : Ingest and Index Your Data

In [2]:
def read_repo_data(repo_owner, repo_name):
    """
    Download and parse all markdown files from a GitHub repository.
    
    Args:
        repo_owner: GitHub username or organization
        repo_name: Repository name
    
    Returns:
        List of dictionaries containing file content and metadata
    """
    prefix = 'https://codeload.github.com' 
    url = f'{prefix}/{repo_owner}/{repo_name}/zip/refs/heads/main'
    resp = requests.get(url)
    
    if resp.status_code != 200:
        raise Exception(f"Failed to download repository: {resp.status_code}")

    repository_data = []
    zf = zipfile.ZipFile(io.BytesIO(resp.content))
    
    for file_info in zf.infolist():
        filename = file_info.filename
        filename_lower = filename.lower()

        if not (filename_lower.endswith('.md') 
            or filename_lower.endswith('.mdx')):
            continue
    
        try:
            with zf.open(file_info) as f_in:
                content = f_in.read().decode('utf-8', errors='ignore')
                post = frontmatter.loads(content)
                data = post.to_dict()
                data['filename'] = filename
                repository_data.append(data)
        except Exception as e:
            print(f"Error processing {filename}: {e}")
            continue
    
    zf.close()
    return repository_data

In [3]:
code_collab_docs = read_repo_data('YannPhamVan', 'code-collab-studio')
snake_arena_docs = read_repo_data('YannPhamVan', 'snake-arena')

print(f"Code Collab documents: {len(code_collab_docs)}")
print(f"Snake Arena documents: {len(snake_arena_docs)}")

Code Collab documents: 5
Snake Arena documents: 49


## Day 2 : chunking

In [4]:
def sliding_window(seq, size, step):
    if size <= 0 or step <= 0:
        raise ValueError("size and step must be positive")

    n = len(seq)
    result = []
    for i in range(0, n, step):
        chunk = seq[i:i+size]
        result.append({'start': i, 'chunk': chunk})
        if i + size >= n:
            break

    return result

In [5]:
snake_chunks = []

for doc in snake_arena_docs:
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')
    chunks = sliding_window(doc_content, 2000, 1000)
    for chunk in chunks:
        chunk.update(doc_copy)
    snake_chunks.extend(chunks)

In [6]:
len(snake_chunks)

411

In [7]:
import re
text = snake_arena_docs[45]['content']
paragraphs = re.split(r"\n\s*\n", text.strip())

In [8]:
import re

def split_markdown_by_level(text, level=2):
    """
    Split markdown text by a specific header level.
    
    :param text: Markdown text as a string
    :param level: Header level to split on
    :return: List of sections as strings
    """
    # This regex matches markdown headers
    # For level 2, it matches lines starting with "## "
    header_pattern = r'^(#{' + str(level) + r'} )(.+)$'
    pattern = re.compile(header_pattern, re.MULTILINE)

    # Split and keep the headers
    parts = pattern.split(text)
    
    sections = []
    for i in range(1, len(parts), 3):
        # We step by 3 because regex.split() with
        # capturing groups returns:
        # [before_match, group1, group2, after_match, ...]
        # here group1 is "## ", group2 is the header text
        header = parts[i] + parts[i+1]  # "## " + "Title"
        header = header.strip()

        # Get the content after this header
        content = ""
        if i+2 < len(parts):
            content = parts[i+2].strip()

        if content:
            section = f'{header}\n\n{content}'
        else:
            section = header
        sections.append(section)
    
    return sections

In [9]:
sections = split_markdown_by_level(text, level=2)

In [10]:
snake_chunks = []

for doc in snake_arena_docs:
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')
    sections = split_markdown_by_level(doc_content, level=2)
    for section in sections:
        section_doc = doc_copy.copy()
        section_doc['section'] = section
        snake_chunks.append(section_doc)

In [11]:
len(snake_chunks)

255

## Day 3 : Add Search
### 1. Refaire le chunking simple pour Day 3

In [24]:
snake_chunks_simple = []

for doc in snake_arena_docs:
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')
    chunks = sliding_window(doc_content, 2000, 1000)
    for chunk in chunks:
        chunk.update(doc_copy)
    snake_chunks_simple.extend(chunks)

print(f"Total chunks for search: {len(snake_chunks_simple)}")

Total chunks for search: 411


### 2. Install minsearch if not already installed
#### uv add minsearch  # ex√©cuter dans terminal/console

In [25]:
from minsearch import Index, VectorSearch
from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm.auto import tqdm

### 3. Text search

In [26]:
index = Index(
    text_fields=["chunk", "title", "filename"],
    keyword_fields=[]
)
index.fit(snake_chunks_simple)

<minsearch.minsearch.Index at 0x1621a5aa990>

In [27]:
query = "How do I start a new game?"
text_results = index.search(query, num_results=5)
print("Text search results:")
for r in text_results:
    print(r['filename'], "-", r['chunk'][:100], "...")

Text search results:
snake-arena-main/frontend/README.md - # Neon Snake üêç

A retro-futuristic Snake game with a cyberpunk aesthetic, featuring two game modes,  ...
snake-arena-main/frontend/README.md - un tests in watch mode
```

## Game Controls

- **Arrow Keys** or **WASD**: Move snake
- **Space**:  ...
snake-arena-main/frontend/README.md - # Live games viewer
‚îú‚îÄ‚îÄ lib/
‚îÇ   ‚îú‚îÄ‚îÄ mockBackend.ts      # Centralized mock API
‚îÇ   ‚îú‚îÄ‚îÄ gameLogic.ts ...
snake-arena-main/README.md - ![Snake Arena Banner](./assets/banner.png)

**Snake Arena** is a modern, web-based implementation of ...
snake-arena-main/node_modules/concurrently/README.md - ne of these statuses, kill other commands.
    Can be an array containing the strings `success` (sta ...


### 4. Vector search

In [28]:
embedding_model = SentenceTransformer('multi-qa-distilbert-cos-v1')

In [29]:
# Create embeddings for all chunks
snake_embeddings = []
for d in tqdm(snake_chunks_simple):
    v = embedding_model.encode(d['chunk'])
    snake_embeddings.append(v)
snake_embeddings = np.array(snake_embeddings)

  0%|          | 0/411 [00:00<?, ?it/s]

In [30]:
vector_index = VectorSearch()
vector_index.fit(snake_embeddings, snake_chunks_simple)

<minsearch.vector.VectorSearch at 0x1621a473e00>

In [31]:
v_query = embedding_model.encode(query)
vector_results = vector_index.search(v_query, num_results=5)
print("\nVector search results:")
for r in vector_results:
    print(r['filename'], "-", r['chunk'][:100], "...")


Vector search results:
snake-arena-main/frontend/README.md - # Live games viewer
‚îú‚îÄ‚îÄ lib/
‚îÇ   ‚îú‚îÄ‚îÄ mockBackend.ts      # Centralized mock API
‚îÇ   ‚îú‚îÄ‚îÄ gameLogic.ts ...
snake-arena-main/frontend/README.md - un tests in watch mode
```

## Game Controls

- **Arrow Keys** or **WASD**: Move snake
- **Space**:  ...
snake-arena-main/README.md - ![Snake Arena Banner](./assets/banner.png)

**Snake Arena** is a modern, web-based implementation of ...
snake-arena-main/README.md - sh
   git clone https://github.com/YannPhamVan/snake-arena.git
   cd snake-arena
   ```

2. **Start  ...
snake-arena-main/README.md - bile devices.

## üöÄ Tech Stack

### Frontend
- **Framework**: React 18, Vite
- **Language**: TypeScr ...


### 5. Hybrid search

In [33]:
def hybrid_search(query, num_results=5):
    # Text search
    text_res = index.search(query, num_results=num_results)
    # Vector search
    v_query = embedding_model.encode(query)
    vector_res = vector_index.search(v_query, num_results=num_results)
    
    # Combine and deduplicate
    seen_files = set()
    combined = []
    for r in text_res + vector_res:
        if r['filename'] not in seen_files:
            seen_files.add(r['filename'])
            combined.append(r)
    return combined

In [34]:
hybrid_results = hybrid_search(query)
print("\nHybrid search results:")
for r in hybrid_results:
    print(r['filename'], "-", r['chunk'][:100], "...")


Hybrid search results:
snake-arena-main/frontend/README.md - # Neon Snake üêç

A retro-futuristic Snake game with a cyberpunk aesthetic, featuring two game modes,  ...
snake-arena-main/README.md - ![Snake Arena Banner](./assets/banner.png)

**Snake Arena** is a modern, web-based implementation of ...
snake-arena-main/node_modules/concurrently/README.md - ne of these statuses, kill other commands.
    Can be an array containing the strings `success` (sta ...
