diff --git a/aider/__init__.py b/aider/__init__.py
index 9be6a040faa..2486d09481e 100644
--- a/aider/__init__.py
+++ b/aider/__init__.py
@@ -1,6 +1,6 @@
 from packaging import version
 
-__version__ = "0.88.17.dev"
+__version__ = "0.88.18.dev"
 safe_version = __version__
 
 try:
diff --git a/aider/helpers/__init__.py b/aider/helpers/__init__.py
new file mode 100644
index 00000000000..abcf8f9c2c7
--- /dev/null
+++ b/aider/helpers/__init__.py
@@ -0,0 +1,9 @@
+"""Utility functions for aider."""
+
+from .similarity import cosine_similarity, create_bigram_vector, normalize_vector
+
+__all__ = [
+    "cosine_similarity",
+    "create_bigram_vector",
+    "normalize_vector",
+]
diff --git a/aider/helpers/similarity.py b/aider/helpers/similarity.py
new file mode 100644
index 00000000000..4f725d7aeae
--- /dev/null
+++ b/aider/helpers/similarity.py
@@ -0,0 +1,98 @@
+import numpy as np
+
+
+def normalize_vector(vector):
+    """Normalize a vector to unit length (L2 norm).
+
+    Args:
+        vector (np.ndarray or list): Input vector
+
+    Returns:
+        np.ndarray: Normalized vector with length 1 (zero vectors unchanged)
+    """
+    vector = np.asarray(vector, dtype=np.float64)
+    magnitude = np.linalg.norm(vector)
+    if magnitude == 0:
+        return vector  # A zero vector cannot be normalized
+    return vector / magnitude
+
+
+def cosine_similarity(vector1, vector2):
+    """Calculate the cosine similarity between two vectors.
+
+    Args:
+        vector1 (np.ndarray or list): First vector
+        vector2 (np.ndarray or list): Second vector
+
+    Returns:
+        float: Cosine similarity between the vectors (range: -1 to 1)
+    """
+    vector1 = np.asarray(vector1, dtype=np.float64)
+    vector2 = np.asarray(vector2, dtype=np.float64)
+
+    if len(vector1) != len(vector2):
+        raise ValueError("Vectors must have the same length")
+
+    # Use NumPy's optimized dot product and norm functions
+    dot_product = np.dot(vector1, vector2)
+    magnitude1 = np.linalg.norm(vector1)
+    magnitude2 = np.linalg.norm(vector2)
+
+    if magnitude1 == 0 or magnitude2 == 0:
+        return 0.0  # Return 0 if either vector is zero
+
+    return dot_product / (magnitude1 * magnitude2)
+
+
+def create_bigram_vector(texts):
+    """Create a 26x26 bigram frequency vector for a collection of strings.
+
+    Uses pre-computed bigram indices and NumPy's bincount to keep
+    the counting efficient across many strings.
+
+    Args:
+        texts (tuple): Tuple of strings to process
+
+    Returns:
+        np.ndarray: Vector of bigram frequencies
+    """
+    # Pre-compute bigram indices (0 for 'aa', 1 for 'ab', ..., 675 for 'zz')
+    bigram_indices = {}
+    idx = 0
+    for i in range(ord("a"), ord("z") + 1):
+        for j in range(ord("a"), ord("z") + 1):
+            bigram = chr(i) + chr(j)
+            bigram_indices[bigram] = idx
+            idx += 1
+
+    # Initialize frequency vector
+    vector = np.zeros(26 * 26, dtype=np.int32)
+
+    # Process all texts
+    for text in texts:
+        text_lower = text.lower()
+        if len(text_lower) < 2:
+            continue
+
+        # Convert the string to a character array so consecutive
+        # characters can be paired with cheap slicing
+        chars = np.array(list(text_lower))
+
+        # Create bigrams by concatenating consecutive characters
+        bigrams = np.char.add(chars[:-1], chars[1:])
+
+        # Keep only alphabetic bigrams
+        mask = np.array([bg.isalpha() for bg in bigrams])
+        valid_bigrams = bigrams[mask]
+
+        # Count bigrams using bincount with the pre-computed indices
+        indices = []
+        for bg in valid_bigrams:
+            if bg in bigram_indices:
+                indices.append(bigram_indices[bg])
+
+        if indices:
+            counts = np.bincount(indices, minlength=26 * 26)
+            vector += counts
+
+    return vector
diff --git a/aider/main.py b/aider/main.py
index f4bd9b8c827..11813bea7fc 100644
--- a/aider/main.py
+++ b/aider/main.py
@@ -806,7 +806,7 @@ def get_io(pretty):
         return await main_async(argv, input, output, right_repo_root, return_coder=return_coder)
 
     if args.just_check_update:
-        update_available = check_version(io, just_check=True, verbose=args.verbose)
+        update_available = await check_version(io, just_check=True, verbose=args.verbose)
         analytics.event("exit", reason="Just checking update")
         return 0 if not update_available else 1
 
@@ -821,7 +821,7 @@ def get_io(pretty):
         return 0 if success else 1
 
     if args.check_update:
-        check_version(io, verbose=args.verbose)
+        await check_version(io, verbose=args.verbose)
 
     if args.verbose:
         show = format_settings(parser, args)
diff --git a/aider/repomap.py b/aider/repomap.py
index 2408ebcd58a..020f51c1029 100644
--- a/aider/repomap.py
+++ b/aider/repomap.py
@@ -17,6 +17,11 @@
 from tqdm import tqdm
 
 from aider.dump import dump
+from aider.helpers.similarity import (
+    cosine_similarity,
+    create_bigram_vector,
+    normalize_vector,
+)
 from aider.special import filter_important_files
 from aider.tools.tool_utils import ToolError
 
@@ -183,6 +188,12 @@ def __init__(
         self.map_processing_time = 0
         self.last_map = None
 
+        # Initialize cache for mentioned identifiers similarity
+        self._last_mentioned_idents = None
+        self._last_mentioned_idents_vector = None
+        self._has_last_mentioned_idents = False
+        self._mentioned_ident_similarity = 0.8
+
         if self.verbose:
             self.io.tool_output(
                 f"RepoMap initialized with map_mul_no_files: {self.map_mul_no_files}"
@@ -807,6 +818,15 @@ def get_ranked_tags_map(
         mentioned_idents=None,
         force_refresh=False,
     ):
+        if not other_fnames:
+            other_fnames = list()
+        if not max_map_tokens:
+            max_map_tokens = self.max_map_tokens
+        if not mentioned_fnames:
+            mentioned_fnames = set()
+        if not mentioned_idents:
+            mentioned_idents = set()
+
         # Create a cache key
         cache_key = [
             tuple(sorted(chat_fnames)) if chat_fnames else None,
@@ -815,11 +835,15 @@ def get_ranked_tags_map(
         ]
 
         if self.refresh == "auto":
+            # Handle mentioned_fnames normally
             cache_key += [
                 tuple(sorted(mentioned_fnames)) if mentioned_fnames else None,
-                tuple(sorted(mentioned_idents)) if mentioned_idents else None,
             ]
 
+            # Handle mentioned_idents with a similarity check
+            cache_key_component = self._get_mentioned_idents_cache_component(mentioned_idents)
+            cache_key.append(cache_key_component)
+
         cache_key = hash(str(tuple(cache_key)))
 
         use_cache = False
@@ -1005,6 +1029,70 @@ def to_tree(self, tags, chat_rel_fnames):
 
         return output
 
+    def _get_mentioned_idents_cache_component(self, mentioned_idents):
+        """
+        Determine the cache key component for mentioned_idents using a similarity comparison.
+
+        Compares the current mentioned_idents with the previous ones using
+        cosine similarity. If the similarity is high enough, the previous
+        cache key component is returned to maintain cache hits. Otherwise,
+        the stored values are updated and the current mentioned_idents returned.
+
+        Args:
+            mentioned_idents (set): Current set of mentioned identifiers
+
+        Returns:
+            tuple or None: Cache key component for mentioned_idents
+        """
+        if not mentioned_idents:
+            self._last_mentioned_idents = None
+            self._last_mentioned_idents_vector = None
+            self._has_last_mentioned_idents = False
+            return None
+
+        current_mentioned_idents = tuple(mentioned_idents)
+
+        # Check if we have a previous cached value to compare against
+        if self._has_last_mentioned_idents:
+            # Create a vector for the current mentioned_idents
+            current_vector = create_bigram_vector(current_mentioned_idents)
+            current_vector_norm = normalize_vector(current_vector)
+
+            # Calculate cosine similarity
+            similarity = cosine_similarity(self._last_mentioned_idents_vector, current_vector_norm)
+            # If the similarity is high enough, use the previous cache key component
+            if similarity >= self._mentioned_ident_similarity:
+                # Reuse the previous mentioned_idents in the cache key to keep the cache hit
+                cache_key_component = self._last_mentioned_idents
+
+                # Tighten the threshold with each consecutive cache hit
+                self._mentioned_ident_similarity = min(
+                    0.9, self._mentioned_ident_similarity + 0.025
+                )
+            else:
+                # Similarity is too low, use the current mentioned_idents
+                cache_key_component = current_mentioned_idents
+
+                # Update stored values
+                self._last_mentioned_idents = current_mentioned_idents
+                self._last_mentioned_idents_vector = current_vector_norm
+
+                # Loosen the threshold with each consecutive cache miss
+                self._mentioned_ident_similarity = max(
+                    0.5, self._mentioned_ident_similarity - 0.025
+                )
+        else:
+            # First call or no previous value, use the current mentioned_idents
+            cache_key_component = current_mentioned_idents
+            current_vector = create_bigram_vector(current_mentioned_idents)
+
+            # Store for future comparisons
+            self._last_mentioned_idents = current_mentioned_idents
+            self._last_mentioned_idents_vector = normalize_vector(current_vector)
+
+        self._has_last_mentioned_idents = True
+        return cache_key_component
+
 
 def truncate_long_lines(text, max_length):
     return "\n".join([line[:max_length] for line in text.splitlines()]) + "\n"
diff --git a/aider/versioncheck.py b/aider/versioncheck.py
index c994e1c76ff..119c96911fb 100644
--- a/aider/versioncheck.py
+++ b/aider/versioncheck.py
@@ -61,7 +61,7 @@ async def install_upgrade(io, latest_version=None):
         return
 
 
-def check_version(io, just_check=False, verbose=False):
+async def check_version(io, just_check=False, verbose=False):
     if not just_check and VERSION_CHECK_FNAME.exists():
         day = 60 * 60 * 24
         since = time.time() - os.path.getmtime(VERSION_CHECK_FNAME)
@@ -109,5 +109,5 @@ def check_version(io, just_check=False, verbose=False):
     if not is_update_available:
         return False
 
-    install_upgrade(io, latest_version)
+    await install_upgrade(io, latest_version)
     return True
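
A minimal usage sketch of the new helpers, assuming the aider.helpers.similarity
module from this patch is importable; the identifier tuples below are illustrative
inputs, and 0.8 mirrors the starting value of RepoMap._mentioned_ident_similarity:

    from aider.helpers.similarity import (
        cosine_similarity,
        create_bigram_vector,
        normalize_vector,
    )

    # Two largely overlapping sets of mentioned identifiers (illustrative)
    idents_a = ("get_ranked_tags_map", "cache_key", "mentioned_idents")
    idents_b = ("get_ranked_tags_map", "cache_key", "mentioned_fnames")

    vec_a = normalize_vector(create_bigram_vector(idents_a))
    vec_b = normalize_vector(create_bigram_vector(idents_b))

    # Both vectors are unit length, so the cosine reduces to a dot product;
    # a value at or above the threshold reuses the previous cache key component
    similarity = cosine_similarity(vec_a, vec_b)
    print(f"similarity={similarity:.3f}, reuse cached map={similarity >= 0.8}")

Because the threshold tightens by 0.025 toward 0.9 on each consecutive hit and
loosens toward 0.5 on each miss, repeated near-duplicate identifier mentions keep
reusing the cached repo map, while a genuinely new set of identifiers forces a
recompute.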