/
detect-duplicate-files
executable file
·384 lines (346 loc) · 16.7 KB
/
detect-duplicate-files
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
#!/usr/bin/python
import array
import glob
import hashlib
import os
import pipes
import subprocess #check_output was introduced in 2.7, and would be preferable, but for compatibility reasons we'll use Popen
import sys
#import time
from optparse import OptionParser
from stat import *
def check_output(command):
    '''Run command (an argument list) and return what it wrote to standard output.

    Stand-in for subprocess.check_output, which only exists from Python 2.7 on.
    The exit status of the command is not inspected.'''
    process = subprocess.Popen(command, stdout=subprocess.PIPE)
    # communicate() waits for the process to finish and returns (stdout, stderr);
    # stderr is not piped, so only the first element carries data
    captured = process.communicate()
    return captured[0]
# ---- command line interface ----
# Declares every option, parses the command line into `options` (flags) and
# `filespecs` (positional arguments), then derives a few convenience attributes
# (options.sort, options.c/C/cc/CC) that the rest of the script reads.
parser = OptionParser()
parser.add_option("-v", "--verbose", action="store_true", dest="verbose", default=True, help="print status messages to stdout (DEFAULT)")
parser.add_option("-V", "--very-verbose", action="store_true", dest="veryverbose", default=False, help="print every filename to stdout")
parser.add_option("-q", "--quiet", action="store_false", dest="verbose", default=True, help="don't print status messages to stdout")
parser.add_option("-R", "-r", "--recursive", action="store_true", dest="recursive", default=False, help="search recursively (include files in subdirectories)")
parser.add_option("--max-depth", dest="maxdepth", type="int", default=None, help="if searching recursively, only descend N levels deep", metavar="N")
parser.add_option("-m", "--mount", action="store_true", dest="mount", default=False, help="search mounted file systems in the path")
parser.add_option("--keep-as-is", action="store_true", default=False, help="keep the first file given on the command line when two files are the same (default)")
parser.add_option("--keep-oldest", action="store_true", default=False, help="keep the oldest file when two files are the same")
parser.add_option("--keep-newest", action="store_true", default=False, help="keep the newest file when two files are the same")
parser.add_option("--keep-first", action="store_true", default=False, help="keep the first file (sorted by path and file name) when two files are the same")
parser.add_option("--keep-last", action="store_true", default=False, help="keep the last file (sorted by path and file name) when two files are the same")
parser.add_option("-l", "--list", action="store_true", default=False, help="Display a list of duplicates and which files they are duplicates of (default)")
parser.add_option("-L", "--list1", action="store_true", default=False, help="Display a list of as above, but each duplicate on a separate line (if a file has many duplicates)")
parser.add_option("--rm", action="store_true", default=False, help="Print a set of rm commands: rm {duplicate} # duplicate of {original}")
parser.add_option("--RM", action="store_true", default=False, help="Print a set of rm commands: rm {original} # duplicate(s): {duplicate(s)}")
parser.add_option("--mv", type="string", default=None, help="Print a set of mv commands: mv {duplicate} {dest} # duplicate of {original}", metavar="{dest}")
parser.add_option("--MV", type="string", default=None, help="Print a set of mv commands: mv {original} {dest} # duplicate(s): {duplicate(s)}", metavar="{dest}")
parser.add_option("--no-comment", dest="add_comment", action="store_false", default=True, help="omits the comment on --rm, --RM, --mv or --MV")
parser.add_option("-1", "--each-file", action="store_true", default=False, help="Print each duplicate on a separate line on --list, --rm or --mv")
parser.add_option("--c1", type="string", default="", help="Custom command: {1} {duplicate} {2}", metavar="{1}")
parser.add_option("--c2", type="string", default="", help="Custom command: {1} {duplicate} {2}", metavar="{2}")
parser.add_option("--C1", type="string", default="", help="Custom command: {1} {original} {2}", metavar="{1}")
parser.add_option("--C2", type="string", default="", help="Custom command: {1} {original} {2}", metavar="{2}")
parser.add_option("--cc1", type="string", default="", help="Custom command: {1} {duplicate} {2} {original} {3}", metavar="{1}")
parser.add_option("--cc2", type="string", default="", help="Custom command: {1} {duplicate} {2} {original} {3}", metavar="{2}")
parser.add_option("--cc3", type="string", default="", help="Custom command: {1} {duplicate} {2} {original} {3}", metavar="{3}")
parser.add_option("--CC1", type="string", default="", help="Custom command: {1} {original} {2} {duplicate} {3}", metavar="{1}")
parser.add_option("--CC2", type="string", default="", help="Custom command: {1} {original} {2} {duplicate} {3}", metavar="{2}")
parser.add_option("--CC3", type="string", default="", help="Custom command: {1} {original} {2} {duplicate} {3}", metavar="{3}")
(options, filespecs) = parser.parse_args()

# a depth limit of 0 is how "not recursive" is expressed to the directory walk
if not options.recursive:
    options.maxdepth = 0

# very verbose implies verbose, even when -q was given as well
if options.veryverbose:
    options.verbose = True

# collapse the --keep-* flags into one setting; the first flag set in this order wins,
# and "asis" (command line order) is the fallback
if options.keep_oldest:
    options.sort = "oldest"
elif options.keep_newest:
    options.sort = "newest"
elif options.keep_first:
    options.sort = "first"
elif options.keep_last:
    options.sort = "last"
else:
    options.sort = "asis"

# a custom command counts as requested as soon as any of its parts is non-empty
options.c = bool(options.c1 or options.c2)
options.C = bool(options.C1 or options.C2)
options.cc = bool(options.cc1 or options.cc2 or options.cc3)
options.CC = bool(options.CC1 or options.CC2 or options.CC3)

# set default action
requested_actions = (
    options.list, options.list1,
    options.rm, options.RM,
    options.mv, options.MV,
    options.c, options.C, options.cc, options.CC,
)
if not any(requested_actions):
    options.list = True

# default to all files in current directory if no argument given - analogous to "du"
# (must be a list: the filespecs are iterated one by one, and iterating a plain
# string would yield its characters instead)
if not filespecs:
    filespecs = ["*"]
def walktree(root, filespec, callback, maxdepth=None, depth=0):
    '''Recursively descend the directory tree from root,
    calling the callback function for each regular file matching the filespec.

    root     -- directory (or glob pattern) to start in; "" means the current directory
    filespec -- glob pattern for the file names; "" is treated as "*"
    callback -- called with the path of every match that is a regular file
    maxdepth -- number of directory levels to descend; 0 stays in root,
                None (the default) means no limit
    depth    -- current level, used by the recursion itself

    Mount points are only entered when the global options.mount is set.'''
    if root and not root.endswith("/"):
        root += "/"
    if not filespec:
        filespec = "*"

    # process files in this dir
    for filename in glob.iglob(root + filespec):
        if os.path.isfile(filename):
            callback(filename)

    # stop here once the depth limit is reached; None is tested first because
    # comparing an int with None is not valid on every Python version
    if maxdepth is not None and depth >= maxdepth:
        return

    # find directories to descend into
    # NOTE(review): os.path.isdir follows symbolic links, so linked directories are
    # entered too - a link cycle would presumably recurse until the path is rejected
    for dirname in glob.iglob(root + "*"):
        if not os.path.isdir(dirname):
            continue
        if os.path.ismount(dirname) and not options.mount:
            continue  # don't descend into mount points unless mount option is on
        walktree(dirname, filespec, callback, maxdepth, depth + 1)
# number of files added by add_to_files() since the counter was last reset
num_files = 0
# one dict per file found so far: {"name": path, "size": size in bytes}
files = []


def add_to_files(filename):
    '''Append a name/size record for filename to the global files list
    and count it in the global num_files.'''
    global num_files
    record = {"name": filename, "size": os.path.getsize(filename)}
    files.append(record)
    num_files = num_files + 1
# length of the counter text most recently produced by progress_info()
progress_info_last_len = 0


def progress_info(current, total):
    '''Return the text that replaces the previous "current/total" counter on the terminal.

    The returned string always starts by wiping the previous counter. After that:
      two ints   -- the new "current/total" counter follows
      None, None -- a newline follows (only if a counter was showing)
      "", ""     -- nothing follows; the cursor is back where the counter began
    Nothing is printed here; the caller writes the returned string.'''
    global progress_info_last_len
    width = progress_info_last_len
    # backspace, overwrite with spaces, then backspace again
    wipe = "".join(["\b" * width, " " * width, "\b" * width])

    if isinstance(current, int) and isinstance(total, int):
        counter = "%s/%s" % (current, total)
        progress_info_last_len = len(counter)
        return wipe + counter

    if width > 0:
        if current == None and total == None:
            # end of progress info - go to next line
            progress_info_last_len = 0
            return wipe + "\n"
        if current == "" and total == "":
            # counter removed, nothing else to add
            progress_info_last_len = 0
            return wipe

    return wipe
# fill the files array with files from all given filespecs
for dir_filespec in filespecs:
num_files = 0 # this var is set by add_to_files()
if os.path.isdir(dir_filespec):
# path/to/a/dir
rootdir = dir_filespec
filespec = "*"
else:
# path/to/a/dir/*.txt
lastslash = dir_filespec.rfind("/") + 1
rootdir = dir_filespec[:lastslash]
filespec = dir_filespec[lastslash:]
if options.verbose:
print "Searching",
if options.recursive:
if options.maxdepth == None:
print "recursively",
else:
print "recursively, down to", options.maxdepth, "level," if options.maxdepth == 1 else "levels,",
print "for", filespec,
if len(rootdir) > 0:
print "in", rootdir,
walktree(rootdir, filespec, add_to_files, options.maxdepth)
if options.verbose:
txt = "- " + str(num_files) + " file"
if num_files != 1:
txt += "s"
print txt + "."
if options.sort == "first" or options.sort == "last":
if options.veryverbose:
print "Sorting by path and filename...",
files = sorted(files, key=lambda file: ("0" if file["name"].count("/") == 0 else "1") + "/" + file["name"]) # sort by path and filename. The count thing is for sorting files in root before subdirs
if options.sort == "last":
files.reverse()
if options.veryverbose:
print progress_info("", "")
elif options.sort == "oldest" or options.sort == "newest":
if options.verbose:
print "Sorting by last modification time...",
for file_num, file in enumerate(files):
if options.verbose:
sys.stdout.write(progress_info(file_num + 1, len(files)))
files[file_num]["time"] = os.path.getmtime(file["name"])
files = sorted(files, key=lambda file: file["time"])
if options.sort == "newest":
files.reverse()
if options.verbose:
print progress_info("", "")
# remove duplicates, symlinks, and files with same inode as other files in the list
removed_duplicates = 0
removed_same_inode = 0
removed_symlink = 0
# duplicates first, otherwise statistics looks weird (duplicate symlinks are removed before duplicate entries)
if options.veryverbose:
print "removing duplicates..."
for file_num, file in enumerate(files):
if options.veryverbose:
sys.stdout.write(progress_info(file_num + 1, len(files)))
for other_file in files[file_num + 1:]:
if file["name"] == other_file["name"]:
if options.veryverbose:
print progress_info("", "") + "Removing", files.index(other_file, file_num + 1), other_file["name"], "(duplicate to", file_num, file["name"] + ")"
removed_duplicates += 1
del files[files.index(other_file, file_num + 1)] # second param is start position - this deletes the second occurrance, not the first - this is to make sure we don't mess up the loop.
if options.veryverbose:
sys.stdout.write(progress_info("", "")) #remove progress status
# now symlinks and ientical inodes
if options.veryverbose:
print "removing symlinks and files linked to the same inode..."
for file_num, file in enumerate(files):
if options.veryverbose:
sys.stdout.write(progress_info(file_num + 1, len(files)))
# remove it right away if it's a symlink
if os.path.islink(file["name"]):
if options.veryverbose:
print progress_info("", "") + "Removing", files.index(file), file["name"], "(symbolic link)"
removed_symlink += 1
del files[files.index(file)]
for other_file in files[file_num + 1:]:
remove_this = False
if os.path.islink(other_file["name"]):
if options.veryverbose:
print progress_info("", "") + "Removing", files.index(other_file), other_file["name"], "(symbolic link)"
removed_symlink += 1
remove_this = True
elif os.path.samefile(file["name"], other_file["name"]):
if options.veryverbose:
print progress_info("", "") + "Removing", files.index(other_file), other_file["name"], "(same file as", file_num, file["name"] + ")"
removed_same_inode += 1
remove_this = True
if remove_this:
del files[files.index(other_file)]
if options.veryverbose:
sys.stdout.write(progress_info("", "")) # remove progress info
if options.verbose:
if removed_duplicates > 0:
print "Removed", removed_duplicates, "duplicates from list."
if removed_same_inode > 0:
print "Removed", removed_symlink, "symbolic links."
if removed_same_inode > 0:
print "Removed", removed_same_inode, "files linking to the same inode as another file."
if options.verbose:
print "Total number of files:", len(files), "files."
# find files with same size
if options.verbose:
print "finding files with the same size...",
same_size = {}
for file_num, file in enumerate(files):
if options.veryverbose:
sys.stdout.write(progress_info(file_num + 1, len(files)))
for other_file_num, other_file in enumerate(files[file_num + 1:]):
if file["size"] == other_file["size"]:
same_size[file_num] = same_size.get(file_num, []) + [file_num + 1 + other_file_num]
if options.veryverbose:
sys.stdout.write(progress_info("", "")) # remove satus, but do not move to next line
if options.verbose:
print len(same_size), "file(s)"
# check the sha256 checksum of files with the same size
if options.verbose and len(same_size) > 0:
print "Compiling checksums..."
same_checksum = {}
list_of_duplicates = [] # contains all files that have been detected as duplicates of another
for num, other_nums in same_size.iteritems():
if options.verbose:
sys.stdout.write(progress_info(num, len(same_size)))
file_checksum = check_output(["sha256sum", files[num]["name"]])
file_checksum = file_checksum[0:file_checksum.index(" ")]
for other_num in other_nums:
other_file_checksum = check_output(["sha256sum", files[other_num]["name"]])
other_file_checksum = other_file_checksum[0:other_file_checksum.index(" ")]
if options.veryverbose:
print progress_info("", "") + "Comparing", files[num]["name"], "to", files[other_num]["name"], "-->",
if file_checksum == other_file_checksum:
if other_num not in list_of_duplicates:
print "DUPLICATE - added to list"
else:
print "duplicate - but already added to list"
else:
print "Files are different"
if file_checksum == other_file_checksum and other_num not in list_of_duplicates:
same_checksum[num] = same_checksum.get(num, []) + [other_num]
list_of_duplicates += [other_num]
sorted_dups = sorted(same_checksum, key=lambda file: ("0" if files[file]["name"].count("/") == 0 else "1") + "/" + files[file]["name"]) # sort by path and filename, files in rootdir first
if options.verbose:
print progress_info("", "") # clear status
print "Found", len(sorted_dups), "file(s) with duplicates."
if len(sorted_dups) == 0:
exit(0)
# --------------------------------------------------------------------------------------------------------
# printing output
if options.list:
if options.each_file:
if options.verbose:
print "\n# list of duplicates and which files they are duplicates of (one duplicate per line):"
for num in sorted_dups:
other_nums = same_checksum[num]
for other_num in other_nums:
print "#", files[other_num]["name"], "(identical to", files[num]["name"] + ")"
else:
if options.verbose:
print "\n# list of duplicates and which files they are duplicates of:"
for num in sorted_dups:
other_nums = same_checksum[num]
print "#",
for other_num in other_nums:
print files[other_num]["name"],
print "( identical to", files[num]["name"] + ")"
if options.rm:
if options.each_file:
if options.verbose:
print "\n# rm {duplicate} # duplicate of {original} (one duplicate per line):"
for num in sorted_dups:
other_nums = same_checksum[num]
for other_num in other_nums:
print "rm", files[other_num]["name"],
if options.add_comment:
print " # duplicate of", files[num]["name"]
else:
print
else:
if options.verbose:
print "\n# rm {duplicate(s)} # duplicate of {original}:"
for num in sorted_dups:
other_nums = same_checksum[num]
print "rm",
for other_num in other_nums:
print files[other_num]["name"],
if options.add_comment:
print " # duplicate of", files[num]["name"]
else:
print
if options.RM:
if options.verbose:
print "\n# rm {original} # duplicate(s): {duplicate(s)}:"
for num in sorted_dups:
other_nums = same_checksum[num]
print "rm", files[num]["name"],
if options.add_comment:
print" # duplicate(s):",
for other_num in other_nums:
print files[other_num]["name"],
print
if options.mv:
if options.each_file:
if options.verbose:
print "\n# mv {duplicate} {dest}",
if options.add_comment:
print " # duplicate of {original} (one duplicate per line):",
print
for num in sorted_dups:
other_nums = same_checksum[num]
for other_num in other_nums:
print "rm", files[other_num]["name"],
if options.add_comment:
print " # duplicate of", files[num]["name"]
else:
print
else:
if options.verbose:
print "\n# rm {duplicate(s)} # duplicate of {original}:"
for num in sorted_dups:
other_nums = same_checksum[num]
print "rm",
for other_num in other_nums:
print files[other_num]["name"],
if options.add_comment:
print " # duplicate of", files[num]["name"]
else:
print
if options.RM:
if options.verbose:
print "\n# rm {original} # duplicate(s): {duplicate(s)}:"
for num in sorted_dups:
other_nums = same_checksum[num]
print "rm", files[num]["name"],
if options.add_comment:
print" # duplicate(s):",
for other_num in other_nums:
print files[other_num]["name"],
print