matchPhotos.py — 346 lines (287 loc) · 13.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
#! /usr/local/bin/python
import exifread
import datetime
import subprocess
import git
import os
import sys
import time
from bisect import bisect_left
import argparse
import cPickle as pickle
import shutil
#Also best to have leveldb if you're running it more than once.
#Some local settings:
def takeClosest(myList, myNumber):
    """
    Return the element of the sorted list *myList* that lies closest to
    *myNumber*; ties are broken toward the smaller element.
    Adapted from http://stackoverflow.com/questions/12141150/from-list-of-integers-get-number-closest-to-a-given-value
    """
    idx = bisect_left(myList, myNumber)
    # Off either end of the list: the nearest element is the end itself.
    if idx <= 0:
        return myList[0]
    if idx >= len(myList):
        return myList[-1]
    lower, upper = myList[idx - 1], myList[idx]
    # Strict < keeps the tie-break on the smaller neighbour.
    return upper if upper - myNumber < myNumber - lower else lower
class GitRepo(object):
    """Thin wrapper around a git repository holding markdown pages."""

    def __init__(self, basedir):
        self.repo = git.repo.base.Repo(basedir)
        self.commits = dict()
        self.basedir = basedir

    def edits(self, pageList):
        """
        Blame every page in pageList on the active branch.

        Stores and returns one blame result per page, in pageList order.
        """
        branch = self.repo.active_branch
        self.editList = [self.repo.blame(branch, page) for page in pageList]
        return self.editList
class observedDict(object):
    """
    Maintains a leveldb database that stores pickled exif information,
    keyed by file path.
    Mimics dict methods mostly for convenience, but also so you can run
    it once without having leveldb installed.
    """
    def __init__(self, location="exifCache"):
        # Deferred import so the module loads even without leveldb.
        import leveldb
        self.dbm = leveldb.LevelDB(location)

    def __getitem__(self, item):
        # leveldb raises KeyError on a miss, matching dict semantics.
        return self.dbm.Get(item)

    def __setitem__(self, key, value):
        self.dbm.Put(key, value)

    def keys(self):
        """Return every key currently stored in the database."""
        return [key for key, value in self.dbm.RangeIter()]

    def resetSeen(self):
        """
        Clear knowledge about what documents have been written to.
        Only for highly arcane script testing stuff.
        """
        keys = self.keys()
        for key in keys:
            # Fixed: original called lowercase `picture(key)`, a NameError --
            # the class defined in this file is `Picture`.
            tmp = Picture(key)
            tmp.document = None
            tmp.Save()
class Picture(object):
    """
    Stores relevant information about a picture: its exif tag information,
    time, whether it's already been assigned into a document, etc.
    Includes methods to save into an observedDict class item, so knowledge
    about what's been seen is persistent and you don't have to keep parsing
    the exif information over and over again, which can be time-consuming.
    """
    def __init__(self, string):
        # string: full path to the image file on disk.
        self.filename = os.path.basename(string)
        self.location = string
        self.document = None
        try:
            # Prefer cached exif data; Load raises KeyError on a cache miss.
            self.Load()
        except KeyError:
            print("processing " + string)  # fixed: original omitted the space
            self.parseExif()

    def parseExif(self):
        """
        Parse exif data from the file on disk and persist it to the cache.
        Raises KeyError if the exif block lacks a DateTime or thumbnail tag.
        """
        # `with` guarantees the handle is closed (the original leaked it).
        with open(self.location, 'rb') as source:
            self.tags = exifread.process_file(source)
        self.time = self.tags['Image DateTime'].printable
        self.time = datetime.datetime.strptime(self.time, "%Y:%m:%d %H:%M:%S")
        # Seconds since the epoch, for comparison with git commit times.
        self.epoch = time.mktime(self.time.timetuple())
        self.thumbnail = self.tags['JPEGThumbnail']
        self.Save()

    def Load(self):
        """
        Restore attributes from the cache; raises KeyError on a miss or on
        a corrupt entry (so __init__ falls back to re-parsing the exif).
        """
        cached = cache[self.location]
        try:
            tmp_dict = pickle.loads(cached)
            self.__dict__.update(tmp_dict)
        except AttributeError:
            # Fixed: original did `del cached[self.location]`, which is a
            # TypeError on a pickled byte string. Treat the corrupt entry
            # as a miss instead, forcing a re-parse.
            raise KeyError(self.location)

    def Save(self):
        """Persist the whole instance state into the module-level cache."""
        cache[self.location] = pickle.dumps(self.__dict__)

    def write_thumbnail_and_file(self, destination_dir):
        """
        Copy the photo and write its exif thumbnail into destination_dir.
        Each write is skipped when the target file already exists.
        """
        self.thumbLocation = destination_dir + "/" + self.filename + ".thumb.jpg"
        self.destination_location = destination_dir + "/" + self.filename
        if not os.path.exists(self.destination_location):
            print("copying from %s to %s" % (self.location, self.destination_location))
            shutil.copy(self.location, self.destination_location)
        if not os.path.exists(self.thumbLocation):
            # `with` closes the handle promptly (the original never closed it).
            with open(self.thumbLocation, 'wb') as out:
                out.write(self.thumbnail)

    def markdown_string(self, label="An archival photo", photoLinkPrefix="archivalPhotos/img"):
        """
        Return a markdown image link: a thumbnail that links to the
        full-size photo, intended to occupy its own line.
        NOTE(review): the prefix is concatenated directly to the filename,
        so callers normally need a trailing slash; the default here lacks
        one -- confirm against callers before changing it.
        """
        thumbnailSite = photoLinkPrefix + self.filename + ".thumb.jpg"
        fullSizeSite = photoLinkPrefix + self.filename
        return "[![%s](%s)](%s)" % (label, thumbnailSite, fullSizeSite)

    def tagAsIncorporatedIn(self, document):
        """Mark this photo as placed in `document` and persist that fact."""
        self.document = document
        self.Save()
def parse_args():
    """
    Build and evaluate the command-line interface for the script.

    Returns the argparse namespace of settings.
    """
    parser = argparse.ArgumentParser(description="Parser")
    # Cache photos and metadata, report what would change, touch nothing else.
    parser.add_argument(
        '--dry-run', dest='dryRun', action='store_true', default=False,
        help="Cache photos and metadata, but do not write to Markdown files.")
    parser.add_argument(
        '--markdown-dir',
        default="/Users/bschmidt/Dropbox/gitit/wikidata/",
        help="The directory where notes are kept in markdown format. This must be a git repository.")
    parser.add_argument(
        '--import-photo-dir',
        default="/Volumes/NO NAME/DCIM/100OLYMP/",
        help="The directory to import photos from: can be a camera SD card, for example.")
    parser.add_argument(
        '--dest-photo-dir',
        default="/Users/bschmidt/Dropbox/gitit/static/img/archivalPhotos",
        help="The directory to save photos and thumbnails into")
    parser.add_argument(
        '--ignore-past-age', default=float("Inf"), type=float,
        help="Ignore photos more than this many days old from the directory: useful if you have other photos on your camera you don't want to waste time caching information and thumbnails for")
    parser.add_argument(
        '--photo-link-prefix', default="/img/archivalPhotos/",
        help="What to prefix the thumbnail name in the html: useful for displaying in gitit or other HTML formats.")
    parser.add_argument(
        '--markdown-suffix', default=".page",
        help="What markdown files inside the repo end in: usually this would be '.md', but it is '.page' by default since I use gitit.")
    # The suffix your camera uses for photos.
    parser.add_argument(
        '--picture-suffix', default=".JPG",
        help="Your camera's image extension. Possibly case sensitive.")
    return parser.parse_args()
# A cache of seen photos. Initialized here so it is visible to functions
# that run before main() replaces it with a persistent observedDict.
# Fixed: the original placeholder was an empty tuple, so any lookup raised
# TypeError rather than the KeyError that Picture's callers handle; a dict
# matches observedDict's mapping semantics.
cache = {}
def main():
    """
    Match photos from a camera directory against the git commit times of a
    markdown wiki, copy the photos and thumbnails into place, and splice
    thumbnail links into the pages whose edits are nearest in time.
    """
    global cache
    args = parse_args()
    markdownDirectory = args.markdown_dir
    photodir = args.import_photo_dir
    dryRun = args.dryRun
    pageSuffix = args.markdown_suffix
    pictureSuffix = args.picture_suffix
    dest_photo_dir = args.dest_photo_dir
    photoLinkPrefix = args.photo_link_prefix
    # Swap the module-level placeholder for the persistent leveldb-backed cache.
    cache = observedDict()
    #cache.resetSeen()
    #Get the files we'll be working with: markdown documents and jpg files, most likely.
    photoList = []
    for pict in os.listdir(photodir):
        mtime = os.path.getmtime(photodir + "/" + pict)
        # Skip photos older (in days) than the --ignore-past-age cutoff.
        if ((time.time() - mtime)/60/60/24) > args.ignore_past_age:
            continue
        if pict.startswith("."):
            continue
        if pict.endswith(pictureSuffix):
            # Obviously something particular to my corpus.
            if not pict.endswith('.thumb.jpg'):
                try:
                    try:
                        # NOTE(review): this key prefixes markdownDirectory, but
                        # Picture.Save keys entries by photodir + "/" + name
                        # alone -- this lookup probably always misses; confirm.
                        cached = pickle.loads(cache[markdownDirectory + photodir + "/" + pict])
                    except KeyError:
                        # NOTE(review): identical to the lookup above, so it
                        # re-raises the same KeyError -- likely a copy/paste
                        # bug; a different key was presumably intended.
                        cached = pickle.loads(cache[markdownDirectory + photodir + "/" + pict])
                    # if it's not None, it's already been assigned in this corpus, and we don't need to
                    # do so again.
                    if cached.document is None:
                        photoList.append(pict)
                except:
                    # NOTE(review): bare except -- any failure (including the
                    # KeyError from the miss above) treats the photo as unseen.
                    photoList.append(pict)
    pageList = [page for page in os.listdir(markdownDirectory) if page.endswith(pageSuffix)]
    pictures = []
    #Set up a git repo.
    repol = GitRepo(markdownDirectory)
    #Get the list of edits associated with that directory.
    edits = repol.edits(pageList)
    timeLookup = dict()
    #Store edits by the second they happened in. Not a real way to store commits. For each second, we'll keep a tuple of document, commit number, and line number that can be used as a key to specify exactly which line. We'll do one for the first line modified by that commit, and one for the last.
    editNumber = -1
    #First, initialize the list of edits.
    for editList in edits:
        editNumber +=1
        commitNumber = -1
        for commit in editList:
            # commit is a blame entry: (Commit object, list of lines).
            commitNumber +=1
            eTime = commit[0].committed_date
            if eTime not in timeLookup:
                timeLookup[eTime] = dict()
                #The first line is only updated once.
                # NOTE(review): this stores the same last-line index as
                # 'lastLine' below; index 0 was presumably intended -- confirm.
                timeLookup[eTime]['firstLine'] = (editNumber,
                                                  commitNumber,
                                                  len(commit[1])-1)
            #The last line in the commit gets updated for each subsequent edit
            #If multiple documents are edited in the same commit,
            #behavior is undefined but hopefully reasonable. (Probably the last edit
            # is tagged to the alphabetically last document, but no guarantees).
            timeLookup[eTime]['lastLine'] = (editNumber,
                                             commitNumber,
                                             len(commit[1])-1)
    #Then initialize the photos
    for name in photoList:
        try:
            pictures.append(Picture(photodir + "/" + name))
        except IndexError:
            print "couldn't get full data for " + name + ", skipping"
        except TypeError:
            print "couldn't get full data for " + name + ", skipping"
    """
    ChangesToMake is a dict whose keys are a tuple consisting of an editnumber, a commitnumber, and a line number: and whose values are an array of tuples with markdown strings to add and whether they go before or after.
    Obviously that's too complicated, and I'm doing something inelegant.
    So if the item changesToMake[(3,4,2)] equals [("[a.jpg]()","firstLine"),("[b.jpg]()",lastLine)],
    that means the second line of the fourth commit of the third file should have those two links added to it, the first in front and the second behind. (Well, really, the third line of the fifth commit of the fourth file, since we're zero-indexed)
    """
    changesToMake = dict()
    #The times of the edits will be matched against the times the pictures are edited:
    editTimes = timeLookup.keys()
    # NOTE(review): sorting dict.keys() in place is Python 2 only; Python 3
    # keys() is a view with no .sort().
    editTimes.sort()
    for myPict in pictures:
        #Write a thumbnail (only done if it doesn't exist)
        myPict.write_thumbnail_and_file(args.dest_photo_dir)
        #find the nearest edit
        nearestEdit = takeClosest(editTimes,myPict.epoch)
        # Should it go near the first or the last element?
        # Well, was the picture before or after the typing?
        putNear = 'lastLine'
        if myPict.epoch < nearestEdit:
            putNear = 'firstLine'
        if abs(myPict.epoch - nearestEdit) > 60*60*2:
            # Too far from any commit: remember the decision so we never retry.
            print "Skipping %s, it's over two hours from the nearest commit" % (markdownDirectory + photodir + myPict.filename)
            myPict.document = "skipping"
            myPict.Save()
            continue
        whereToPlace = timeLookup[nearestEdit][putNear]
        if myPict.document is None:
            if not dryRun:
                # Persist the (document index) assignment so reruns skip it.
                myPict.tagAsIncorporatedIn(whereToPlace[0])
        else:
            # Already assigned on a previous run: nothing to do.
            continue
        try:
            changesToMake[whereToPlace] += [(myPict.markdown_string(photoLinkPrefix = photoLinkPrefix),putNear)]
        except KeyError:
            changesToMake[whereToPlace] = [(myPict.markdown_string(photoLinkPrefix = photoLinkPrefix),putNear)]
    """
    finally, loop through the whole diff again, this time writing the files out witheir changes.
    """
    editNumber = -1
    for editList in edits:
        editNumber +=1
        commitNumber = -1
        #We actually open up the file and write it completely anew. Only do this for
        #files which have actually changed, to avoid messing with the timestamps.
        anythingHasChanged = False
        alteredText = []
        for commit in editList:
            commitNumber +=1
            lineNumber = -1
            for line in commit[1]:
                lineNumber+=1
                try:
                    picturesToAdd=changesToMake[(editNumber,commitNumber,lineNumber)]
                    # NOTE(review): newLines is never read again -- dead code.
                    newLines = [line]
                    anythingHasChanged = True
                except:
                    #Most of the time, there won't be a change designated for the line.
                    alteredText += [line]
                    continue
                addBefore = [p[0] for p in picturesToAdd if p[1]=="firstLine"]
                addAfter = [p[0] for p in picturesToAdd if p[1]=="lastLine"]
                #Newline at the end so the last picture doesn't enjamb against the
                #next line. The goal is get one series of photos on the same line,
                #but not do anything too ugly.
                alteredText = alteredText + ["\n"] + addBefore + ["\n",line,"\n"] + addAfter + ["\n"]
        if anythingHasChanged:
            if dryRun:
                print "CHANGES TO " + markdownDirectory + "/" + pageList[editNumber] + "\n"*5
                output = sys.stdout
                #output = open("/dev/null","a")
            else:
                # NOTE(review): output is never closed or flushed explicitly.
                output = open(markdownDirectory + "/" + pageList[editNumber],'w')
            for line in alteredText:
                output.write(line + "\n")
# Entry point: run only when executed as a script, not when imported.
if __name__=="__main__":
    main()