diff --git a/src/commoncode/resource.py b/src/commoncode/resource.py index 64dbe7c..d19e2da 100644 --- a/src/commoncode/resource.py +++ b/src/commoncode/resource.py @@ -389,7 +389,7 @@ def _get_resource_cache_location(self, path, create_dirs=False): path = clean_path(path) # for the cached file name, we use an md5 of the path to avoid things being too long - resid = str(md5(path.encode("utf-8")).hexdigest()) + resid = str(md5(path.encode("utf-8", "surrogateescape")).hexdigest()) cache_sub_dir, cache_file_name = resid[-2:], resid parent = join(self.cache_dir, cache_sub_dir) diff --git a/tests/test_resource.py b/tests/test_resource.py index 9711b66..6249ebb 100644 --- a/tests/test_resource.py +++ b/tests/test_resource.py @@ -695,6 +695,11 @@ def test_codebase_cache_default(self): child_2 = codebase.get_resource(path=child.path) assert child_2 == child + def test_codebase_cache_handles_non_utf8_path(self): + test_codebase = self.get_test_loc("resource/cache2") + codebase = Codebase(test_codebase) + codebase._get_resource_cache_location("resource/cache2/\udce9", create_dirs=True) + def test_codebase_cache_all_in_memory(self): test_codebase = self.get_test_loc("resource/cache2") codebase = Codebase(test_codebase, max_in_memory=0)