From cba595bd21fd0731e37898dbd6c5b55f65fd8aea Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 22 Mar 2012 14:53:24 -0400 Subject: [PATCH 1/4] drop casts from users EMPTY_TREE_SHA1_BIN This macro already evaluates to the correct type, as it casts the string literal to "unsigned char *" itself (and callers who want the literal can use the _LITERAL form). Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/diff.c | 2 +- merge-recursive.c | 2 +- sequencer.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/builtin/diff.c b/builtin/diff.c index 424c815f9bc2ca..9069dc41be33a3 100644 --- a/builtin/diff.c +++ b/builtin/diff.c @@ -327,7 +327,7 @@ int cmd_diff(int argc, const char **argv, const char *prefix) add_head_to_pending(&rev); if (!rev.pending.nr) { struct tree *tree; - tree = lookup_tree((const unsigned char*)EMPTY_TREE_SHA1_BIN); + tree = lookup_tree(EMPTY_TREE_SHA1_BIN); add_pending_object(&rev, &tree->object, "HEAD"); } break; diff --git a/merge-recursive.c b/merge-recursive.c index 6479a60cd112c5..318d32e223242b 100644 --- a/merge-recursive.c +++ b/merge-recursive.c @@ -1914,7 +1914,7 @@ int merge_recursive(struct merge_options *o, /* if there is no common ancestor, use an empty tree */ struct tree *tree; - tree = lookup_tree((const unsigned char *)EMPTY_TREE_SHA1_BIN); + tree = lookup_tree(EMPTY_TREE_SHA1_BIN); merged_common_ancestors = make_virtual_commit(tree, "ancestor"); } diff --git a/sequencer.c b/sequencer.c index a37846a594f5a2..4307364b261bcb 100644 --- a/sequencer.c +++ b/sequencer.c @@ -164,7 +164,7 @@ static void write_message(struct strbuf *msgbuf, const char *filename) static struct tree *empty_tree(void) { - return lookup_tree((const unsigned char *)EMPTY_TREE_SHA1_BIN); + return lookup_tree(EMPTY_TREE_SHA1_BIN); } static int error_dirty_index(struct replay_opts *opts) From f8582cad8d11cae15e73fe5213b6180ee4b9fcce Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 22 Mar 2012 14:53:39 -0400 Subject: [PATCH 2/4] make is_empty_blob_sha1 available everywhere The read-cache implementation defines this static function, but it is a generally useful concept in git. Let's give the empty blob the same treatment as the empty tree, providing both hex and binary forms of the sha1. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- cache.h | 13 +++++++++++++ read-cache.c | 10 ---------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/cache.h b/cache.h index e5e1aa4e15a336..52805748012f97 100644 --- a/cache.h +++ b/cache.h @@ -708,6 +708,19 @@ static inline void hashclr(unsigned char *hash) #define EMPTY_TREE_SHA1_BIN \ ((const unsigned char *) EMPTY_TREE_SHA1_BIN_LITERAL) +#define EMPTY_BLOB_SHA1_HEX \ + "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" +#define EMPTY_BLOB_SHA1_BIN_LITERAL \ + "\xe6\x9d\xe2\x9b\xb2\xd1\xd6\x43\x4b\x8b" \ + "\x29\xae\x77\x5a\xd8\xc2\xe4\x8c\x53\x91" +#define EMPTY_BLOB_SHA1_BIN \ + ((const unsigned char *) EMPTY_BLOB_SHA1_BIN_LITERAL) + +static inline int is_empty_blob_sha1(const unsigned char *sha1) +{ + return !hashcmp(sha1, EMPTY_BLOB_SHA1_BIN); +} + int git_mkstemp(char *path, size_t n, const char *template); int git_mkstemps(char *path, size_t n, const char *template, int suffix_len); diff --git a/read-cache.c b/read-cache.c index 274e54b4f31da6..6c8f3958369b0a 100644 --- a/read-cache.c +++ b/read-cache.c @@ -157,16 +157,6 @@ static int ce_modified_check_fs(struct cache_entry *ce, struct stat *st) return 0; } -static int is_empty_blob_sha1(const unsigned char *sha1) -{ - static const unsigned char empty_blob_sha1[20] = { - 0xe6,0x9d,0xe2,0x9b,0xb2,0xd1,0xd6,0x43,0x4b,0x8b, - 0x29,0xae,0x77,0x5a,0xd8,0xc2,0xe4,0x8c,0x53,0x91 - }; - - return !hashcmp(sha1, empty_blob_sha1); -} - static int ce_match_stat_basic(struct cache_entry *ce, struct stat *st) { unsigned int changed = 0; From 90d43b07687fdc51d1f2fc14948df538dc45584b Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 22 Mar 2012 18:52:13 -0400 Subject: [PATCH 3/4] teach diffcore-rename to optionally ignore empty content Our rename detection is a heuristic, matching pairs of removed and added files with similar or identical content. It's unlikely to be wrong when there is actual content to compare, and we already take care not to do inexact rename detection when there is not enough content to produce good results. However, we always do exact rename detection, even when the blob is tiny or empty. It's easy to get false positives with an empty blob, simply because it is an obvious content to use as a boilerplate (e.g., when telling git that an empty directory is worth tracking via an empty .gitignore). This patch lets callers specify whether or not they are interested in using empty files as rename sources and destinations. The default is "yes", keeping the original behavior. It works by detecting the empty-blob sha1 for rename sources and destinations. One more flexible alternative would be to allow the caller to specify a minimum size for a blob to be "interesting" for rename detection. But that would catch small boilerplate files, not large ones (e.g., if you had the GPL COPYING file in many directories). A better alternative would be to allow a "-rename" gitattribute to allow boilerplate files to be marked as such. I'll leave the complexity of that solution until such time as somebody actually wants it. The complaints we've seen so far revolve around empty files, so let's start with the simple thing. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- diff.c | 5 +++++ diff.h | 2 +- diffcore-rename.c | 6 ++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/diff.c b/diff.c index 377ec1ea4cd905..0b70aadc4f973d 100644 --- a/diff.c +++ b/diff.c @@ -3136,6 +3136,7 @@ void diff_setup(struct diff_options *options) options->rename_limit = -1; options->dirstat_permille = diff_dirstat_permille_default; options->context = 3; + DIFF_OPT_SET(options, RENAME_EMPTY); options->change = diff_change; options->add_remove = diff_addremove; @@ -3506,6 +3507,10 @@ int diff_opt_parse(struct diff_options *options, const char **av, int ac) } else if (!strcmp(arg, "--no-renames")) options->detect_rename = 0; + else if (!strcmp(arg, "--rename-empty")) + DIFF_OPT_SET(options, RENAME_EMPTY); + else if (!strcmp(arg, "--no-rename-empty")) + DIFF_OPT_CLR(options, RENAME_EMPTY); else if (!strcmp(arg, "--relative")) DIFF_OPT_SET(options, RELATIVE_NAME); else if (!prefixcmp(arg, "--relative=")) { diff --git a/diff.h b/diff.h index cb687436a0ddb9..dd48eca71ddf1a 100644 --- a/diff.h +++ b/diff.h @@ -60,7 +60,7 @@ typedef struct strbuf *(*diff_prefix_fn_t)(struct diff_options *opt, void *data) #define DIFF_OPT_SILENT_ON_REMOVE (1 << 5) #define DIFF_OPT_FIND_COPIES_HARDER (1 << 6) #define DIFF_OPT_FOLLOW_RENAMES (1 << 7) -/* (1 << 8) unused */ +#define DIFF_OPT_RENAME_EMPTY (1 << 8) /* (1 << 9) unused */ #define DIFF_OPT_HAS_CHANGES (1 << 10) #define DIFF_OPT_QUICK (1 << 11) diff --git a/diffcore-rename.c b/diffcore-rename.c index f639601c762ebb..216a7a4bbcab18 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -512,9 +512,15 @@ void diffcore_rename(struct diff_options *options) else if (options->single_follow && strcmp(options->single_follow, p->two->path)) continue; /* not interested */ + else if (!DIFF_OPT_TST(options, RENAME_EMPTY) && + is_empty_blob_sha1(p->two->sha1)) + continue; else locate_rename_dst(p->two, 1); } + else if (!DIFF_OPT_TST(options, RENAME_EMPTY) && + is_empty_blob_sha1(p->one->sha1)) + continue; else if (!DIFF_PAIR_UNMERGED(p) && !DIFF_FILE_VALID(p->two)) { /* * If the source is a broken "delete", and From 4f7cb99ada26be5d86402a6e060f3ee16a672f16 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 22 Mar 2012 18:52:24 -0400 Subject: [PATCH 4/4] merge-recursive: don't detect renames of empty files Merge-recursive detects renames so that if one side modifies "foo" and the other side moves it to "bar", the modification is applied to "bar". However, our rename detection is based on content analysis, it can be wrong (i.e., two files were not intended as a rename, but just happen to have the same or similar content). This is quite rare if the files actually contain content, since two unrelated files are unlikely to have exactly the same content. However, empty files present a problem, in that there is nothing to analyze. An uninteresting placeholder file with zero bytes may or may not be related to a placeholder file with another name. The result is that adding content to an empty file may cause confusion if the other side of a merge removed it; your content may end up in another random placeholder file that was added. Let's err on the side of caution and not consider empty files as renames. This will cause a modify/delete conflict on the merge, which will let the user sort it out themselves. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- merge-recursive.c | 1 + t/t6022-merge-rename.sh | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/merge-recursive.c b/merge-recursive.c index 318d32e223242b..0fb174359ac12e 100644 --- a/merge-recursive.c +++ b/merge-recursive.c @@ -485,6 +485,7 @@ static struct string_list *get_renames(struct merge_options *o, renames = xcalloc(1, sizeof(struct string_list)); diff_setup(&opts); DIFF_OPT_SET(&opts, RECURSIVE); + DIFF_OPT_CLR(&opts, RENAME_EMPTY); opts.detect_rename = DIFF_DETECT_RENAME; opts.rename_limit = o->merge_rename_limit >= 0 ? o->merge_rename_limit : o->diff_rename_limit >= 0 ? o->diff_rename_limit : diff --git a/t/t6022-merge-rename.sh b/t/t6022-merge-rename.sh index 9d8584e957a26c..1104249182074c 100755 --- a/t/t6022-merge-rename.sh +++ b/t/t6022-merge-rename.sh @@ -884,4 +884,20 @@ test_expect_success 'no spurious "refusing to lose untracked" message' ' ! grep "refusing to lose untracked file" errors.txt ' +test_expect_success 'do not follow renames for empty files' ' + git checkout -f -b empty-base && + >empty1 && + git add empty1 && + git commit -m base && + echo content >empty1 && + git add empty1 && + git commit -m fill && + git checkout -b empty-topic HEAD^ && + git mv empty1 empty2 && + git commit -m rename && + test_must_fail git merge empty-base && + >expect && + test_cmp expect empty2 +' + test_done