Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion aider/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from packaging import version

__version__ = "0.88.17.dev"
__version__ = "0.88.18.dev"
safe_version = __version__

try:
Expand Down
9 changes: 9 additions & 0 deletions aider/helpers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""Utility functions for aider."""

from .similarity import cosine_similarity, create_bigram_vector, normalize_vector

__all__ = [
"cosine_similarity",
"create_bigram_vector",
"normalize_vector",
]
98 changes: 98 additions & 0 deletions aider/helpers/similarity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
import numpy as np


def normalize_vector(vector):
    """Scale a vector to unit length under the L2 norm.

    Args:
        vector (np.ndarray or list): Input vector

    Returns:
        np.ndarray: Unit-length vector, or the (converted) input unchanged
            when its magnitude is zero.
    """
    arr = np.asarray(vector, dtype=np.float64)
    norm = np.linalg.norm(arr)
    # A zero vector cannot be normalized; hand it back as-is.
    return arr if norm == 0 else arr / norm


def cosine_similarity(vector1, vector2):
    """Calculate cosine similarity between two vectors.

    Args:
        vector1 (np.ndarray or list): First vector
        vector2 (np.ndarray or list): Second vector

    Returns:
        float: Cosine similarity between the vectors (range: -1 to 1);
            0.0 when either vector has zero magnitude.

    Raises:
        ValueError: If the vectors differ in length.
    """
    v1 = np.asarray(vector1, dtype=np.float64)
    v2 = np.asarray(vector2, dtype=np.float64)

    if len(v1) != len(v2):
        raise ValueError("Vectors must have the same length")

    # Zero-magnitude vectors have no direction; define similarity as 0.
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    if norm_product == 0:
        return 0.0

    return np.dot(v1, v2) / norm_product


def create_bigram_vector(texts):
    """Create a bigram frequency vector over the lowercase ASCII alphabet.

    Each bigram of adjacent characters maps to a fixed slot
    (0 for 'aa', 1 for 'ab', ..., 675 for 'zz'); any pair containing a
    character outside a-z (after lowercasing) is ignored.

    Note: the previous implementation used ``np.core.defchararray.add``,
    which was removed along with ``np.core`` in NumPy 2.0.  This version
    computes slot indices directly from character ordinals, which is both
    portable and faster (no per-call dict build, no per-element dict lookups).

    Args:
        texts (tuple): Tuple of strings to process

    Returns:
        np.ndarray: Vector of 26*26 bigram frequencies (dtype int32)
    """
    vector = np.zeros(26 * 26, dtype=np.int32)
    a_ord, z_ord = ord("a"), ord("z")

    for text in texts:
        text_lower = text.lower()
        if len(text_lower) < 2:
            continue

        # Slot = (first letter offset) * 26 + (second letter offset).
        # The ordinal range check drops any pair with a non a-z character,
        # matching the old isalpha + a-z lookup-table filter.
        indices = [
            (ord(c1) - a_ord) * 26 + (ord(c2) - a_ord)
            for c1, c2 in zip(text_lower, text_lower[1:])
            if a_ord <= ord(c1) <= z_ord and a_ord <= ord(c2) <= z_ord
        ]

        if indices:
            vector += np.bincount(indices, minlength=26 * 26).astype(np.int32)

    return vector
4 changes: 2 additions & 2 deletions aider/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -806,7 +806,7 @@ def get_io(pretty):
return await main_async(argv, input, output, right_repo_root, return_coder=return_coder)

if args.just_check_update:
update_available = check_version(io, just_check=True, verbose=args.verbose)
update_available = await check_version(io, just_check=True, verbose=args.verbose)
analytics.event("exit", reason="Just checking update")
return 0 if not update_available else 1

Expand All @@ -821,7 +821,7 @@ def get_io(pretty):
return 0 if success else 1

if args.check_update:
check_version(io, verbose=args.verbose)
await check_version(io, verbose=args.verbose)

if args.verbose:
show = format_settings(parser, args)
Expand Down
90 changes: 89 additions & 1 deletion aider/repomap.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@
from tqdm import tqdm

from aider.dump import dump
from aider.helpers.similarity import (
cosine_similarity,
create_bigram_vector,
normalize_vector,
)
from aider.special import filter_important_files
from aider.tools.tool_utils import ToolError

Expand Down Expand Up @@ -183,6 +188,12 @@ def __init__(
self.map_processing_time = 0
self.last_map = None

# Initialize cache for mentioned identifiers similarity
self._last_mentioned_idents = None
self._last_mentioned_idents_vector = None
self._has_last_mentioned_idents = False
self._mentioned_ident_similarity = 0.8

if self.verbose:
self.io.tool_output(
f"RepoMap initialized with map_mul_no_files: {self.map_mul_no_files}"
Expand Down Expand Up @@ -807,6 +818,15 @@ def get_ranked_tags_map(
mentioned_idents=None,
force_refresh=False,
):
if not other_fnames:
other_fnames = list()
if not max_map_tokens:
max_map_tokens = self.max_map_tokens
if not mentioned_fnames:
mentioned_fnames = set()
if not mentioned_idents:
mentioned_idents = set()

# Create a cache key
cache_key = [
tuple(sorted(chat_fnames)) if chat_fnames else None,
Expand All @@ -815,11 +835,15 @@ def get_ranked_tags_map(
]

if self.refresh == "auto":
# Handle mentioned_fnames normally
cache_key += [
tuple(sorted(mentioned_fnames)) if mentioned_fnames else None,
tuple(sorted(mentioned_idents)) if mentioned_idents else None,
]

# Handle mentioned_idents with similarity check
cache_key_component = self._get_mentioned_idents_cache_component(mentioned_idents)
cache_key.append(cache_key_component)

cache_key = hash(str(tuple(cache_key)))

use_cache = False
Expand Down Expand Up @@ -1005,6 +1029,70 @@ def to_tree(self, tags, chat_rel_fnames):

return output

def _get_mentioned_idents_cache_component(self, mentioned_idents):
    """
    Determine the cache key component for mentioned_idents using similarity comparison.

    Compares the current mentioned_idents with the previously seen set using
    cosine similarity of their bigram frequency vectors.  If the similarity
    is high enough, the previous cache key component is returned so the
    overall cache key is unchanged (a cache hit).  Otherwise the stored
    state is updated and the current identifiers are used.  The threshold
    adapts: it tightens (up to 0.9) on consecutive hits and loosens
    (down to 0.5) on misses.

    Args:
        mentioned_idents (set): Current set of mentioned identifiers

    Returns:
        tuple or None: Cache key component for mentioned_idents
    """
    if not mentioned_idents:
        # Reset comparison state so the next non-empty set starts fresh.
        self._last_mentioned_idents = None
        self._last_mentioned_idents_vector = None
        self._has_last_mentioned_idents = False
        return None

    # Sort for determinism: set iteration order is arbitrary, and this tuple
    # feeds a hashed cache key.  Without sorting, two identical ident sets
    # could produce different cache keys (spurious cache misses).  This also
    # matches the tuple(sorted(...)) convention used for the other cache key
    # components in this file.
    current_mentioned_idents = tuple(sorted(mentioned_idents))

    if self._has_last_mentioned_idents:
        # Vectorize the current identifiers for comparison.
        current_vector = create_bigram_vector(current_mentioned_idents)
        current_vector_norm = normalize_vector(current_vector)

        similarity = cosine_similarity(self._last_mentioned_idents_vector, current_vector_norm)
        if similarity >= self._mentioned_ident_similarity:
            # Close enough: reuse the previous component to keep the cache hit.
            cache_key_component = self._last_mentioned_idents

            # Make similarity more strict the more consecutive cache hits,
            # so small drifts cannot accumulate indefinitely.
            self._mentioned_ident_similarity = min(
                0.9, self._mentioned_ident_similarity + 0.025
            )
        else:
            # Similarity is too low: key on the current identifiers.
            cache_key_component = current_mentioned_idents

            # Update stored values for the next comparison.
            self._last_mentioned_idents = current_mentioned_idents
            self._last_mentioned_idents_vector = current_vector_norm

            # Make similarity less strict the more consecutive cache misses.
            self._mentioned_ident_similarity = max(
                0.5, self._mentioned_ident_similarity - 0.025
            )
    else:
        # First time or no previous value: use current mentioned_idents.
        cache_key_component = current_mentioned_idents
        self._last_mentioned_idents = current_mentioned_idents
        self._last_mentioned_idents_vector = normalize_vector(
            create_bigram_vector(current_mentioned_idents)
        )

    self._has_last_mentioned_idents = True
    return cache_key_component


def truncate_long_lines(text, max_length):
    """Clip each line of *text* to at most *max_length* characters.

    Returns the clipped lines joined with newlines, with a trailing newline.
    """
    clipped = (line[:max_length] for line in text.splitlines())
    return "\n".join(clipped) + "\n"
Expand Down
4 changes: 2 additions & 2 deletions aider/versioncheck.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ async def install_upgrade(io, latest_version=None):
return


def check_version(io, just_check=False, verbose=False):
async def check_version(io, just_check=False, verbose=False):
if not just_check and VERSION_CHECK_FNAME.exists():
day = 60 * 60 * 24
since = time.time() - os.path.getmtime(VERSION_CHECK_FNAME)
Expand Down Expand Up @@ -109,5 +109,5 @@ def check_version(io, just_check=False, verbose=False):
if not is_update_available:
return False

install_upgrade(io, latest_version)
await install_upgrade(io, latest_version)
return True
Loading