-
Notifications
You must be signed in to change notification settings - Fork 1
/
get-open-calls.py
444 lines (362 loc) · 12.7 KB
/
get-open-calls.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
import os
import sys
import requests
from tinydb import TinyDB, Query
import re
import time
from datetime import datetime
from feedgen.feed import FeedGenerator
from git import Repo
# Get the directory that contains this script (not the process's current working directory)
def getWorkingDir():
    """Return the absolute path of the directory that contains this file."""
    script_path = os.path.abspath(__file__)
    script_dir, _script_name = os.path.split(script_path)
    return script_dir
def getTimestampString():
    """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    current = datetime.now()
    return current.strftime(timestamp_format)
# NOTE: the feeds directory is created at import time in the CODE section below
# Reference: https://stackoverflow.com/questions/1112012/replace-url-with-a-link-using-regex-in-python
# Matches a URL located at the very start of the string or directly after a
# space or newline. Group 1 is that leading delimiter, group 2 the whole URL,
# group 3 the scheme-and-host part, group 4 the optional path.
# Compiled once at import time instead of on every call.
_URL_REGEX = re.compile(
    r"(^|[\n ])(([\w]+?://[\w\#$%&~.\-;:=,?@\[\]+]*)(/[\w\#$%&~/.\-;:=,?@\[\]+]*)?)",
    re.IGNORECASE | re.DOTALL,
)


def wrapLinks(string):
    """Wrap every URL found in ``string`` in an HTML anchor tag.

    The anchor's ``href`` is the full URL, while the visible link text is
    only the scheme-and-host part (the path is left out of the label).

    Args:
        string: Text to scan for URLs.

    Returns:
        The text with URLs replaced by ``<a ... target="_blank">`` anchors.
        If ``string`` is not a str, a message is printed and the input is
        returned unchanged.
    """
    try:
        return _URL_REGEX.sub(r'\1<a href="\2" target="_blank">\3</a>', string)
    except TypeError:
        # Only non-string input can fail here; the pattern itself is constant.
        print("Could not wrap links properly")
        return string
# ========== USERPARAMS ============
# Title to save this document; used as the feed title and as the base name of
# the generated .xml file
saveTitle = "open-calls-7"
# Url to get the listings (one POST request is sent to it per results page)
listingsURL = "https://www.nyfa.org/Opportunities/Search"
# Create a database if one doesn't exist; it is stored as db.json next to this script
db = TinyDB(getWorkingDir() + "/" + "db.json")
# Number of seconds between the start of one refresh run and the start of the next
refreshSec = 60*60 # every hr
# Number of last seconds of data to include in every feed file
includeSec = 60 * 60 * 24 * 30 * 2 # 60 days
# Base URL that the feed file name is appended to when building the feed's id and link
githubRepoURL = "https://raw.githubusercontent.com/bensnell/art-opp/master/"
# Directory name of the local git repository, opened as "../<repoName>"
repoName = "art-opp"
# =========== CODE =============
# Directory (next to this script) that receives the generated feed files
saveFolderPath = getWorkingDir() + "/" + "feeds"
# exist_ok=True creates the directory when it is missing and does nothing when
# it is already there, without the race between a separate existence check
# and the creation call.
os.makedirs(saveFolderPath, exist_ok=True)
def getShortDate(dateString):
    """Build the short date prefix placed in front of a feed entry title.

    Args:
        dateString: Text whose first two "/"-separated fields are integers,
            e.g. "03/05/2020" (presumably month/day/year; only the first two
            fields are read).

    Returns:
        The first two fields without leading zeros, joined by "/" and
        followed by " - " (e.g. "3/5 - "), or an empty string when the input
        cannot be parsed that way.
    """
    try:
        fields = dateString.split("/")
        return str(int(fields[0])) + "/" + str(int(fields[1])) + " - "
    except (AttributeError, IndexError, TypeError, ValueError):
        # AttributeError/TypeError: input is not a string;
        # IndexError: fewer than two fields; ValueError: non-numeric field.
        print("Could not parse out the simplified date for the title")
        return ""
# Try to get the url of the listings
def getListings(URL, pageNumber, timeout=30):
    """Request one page of listings and return its "Regular_Listings" entries.

    Args:
        URL: Endpoint that receives the POST request.
        pageNumber: Page index sent as the "pageNumber" form field.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller forever.

    Returns:
        The value stored under "Regular_Listings" in the JSON response, or
        None when the request fails, the response is not the expected JSON,
        or that value is empty.
    """
    try:
        # Reference: https://apitester.com/
        payload = {"pageNumber": pageNumber}
        response = requests.post(URL, data=payload, timeout=timeout)
        data = response.json()["Regular_Listings"]
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # RequestException: network/timeout failure; ValueError: body is not
        # JSON; KeyError/TypeError: JSON does not have the expected shape.
        print("Could not retrieve the url " + URL)
        return None
    # An empty (or null) result is reported as None, which callers use as the
    # signal that there are no more pages.
    return data if data else None
def uniqueListOfDicts(tag, array):
    """Return the dicts of ``array`` with duplicates (by ``tag``) removed.

    Two dicts count as duplicates when they hold the same value under the
    key ``tag``. Of a group of duplicates the last dict is the one kept.
    """
    by_tag = {}
    for entry in array:
        # A later entry with the same tag value overwrites the earlier one.
        by_tag[entry[tag]] = entry
    return list(by_tag.values())
def getAllListings(URL, maxPages):
    """Collect the listings of pages 1..maxPages, without duplicate IDs.

    Pages are requested in order through getListings; collection stops at
    the first page for which getListings returns None. The combined list is
    de-duplicated on the "ID" key before it is returned.
    """
    collected = []
    page = 1
    while page <= maxPages:
        batch = getListings(URL, page)
        if batch is None:
            # No (more) data available; keep what has been gathered so far.
            break
        collected = collected + batch
        page += 1
    return uniqueListOfDicts("ID", collected)
# Parse the information for opportunities; returns a list of ID's
def parseListings(listings, types):
    """Return the IDs of the listings whose "OppType" is one of ``types``.

    Args:
        listings: Iterable of listing dicts with "ID" and "OppType" keys.
        types: Collection of accepted "OppType" values.

    Returns:
        List of the "ID" values of the matching listings, in input order.
        Listings that lack a required key (or are not dicts at all) are
        reported and skipped.
    """
    out = []
    # Find all of the specified type(s)
    for item in listings:
        thisID = ""
        try:
            thisID = item["ID"]
            if item["OppType"] in types:
                out.append(thisID)
        except (KeyError, TypeError, IndexError):
            # str() keeps the report itself from failing on a non-string ID.
            print("Error while parsing type in post with ID " + str(thisID))
    return out
# Compare these IDs with those in our database to determine which are new
def getNewListings(ids):
    """Return the IDs from ``ids`` that are not yet stored in the database.

    Args:
        ids: Iterable of listing IDs to look up in the module-level ``db``.

    Returns:
        List of the IDs for which the database holds no record with a
        matching "ID" field. Nothing is written to the database here. An ID
        whose lookup fails is reported and left out of the result.
    """
    out = []
    Q = Query()
    for thisID in ids:
        try:
            if not db.search(Q.ID == thisID):
                # This is a new post
                out.append(thisID)
                # Don't add to db yet
        except Exception:
            # Report the ID that was being looked up (the original handler
            # referenced an undefined name and raised NameError instead).
            print("Error while searching db for ID " + str(thisID))
    return out
# Get the html text from a listing
def getListingHTMLText(url, timeout=30):
    """Download a listing page and return its body as text.

    Args:
        url: Address of the page to fetch with a GET request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller forever.

    Returns:
        The response body as a string, or None when the request fails.
    """
    try:
        return requests.get(url, timeout=timeout).text
    except requests.RequestException:
        print("Could not get text from url " + url)
        return None
def getTitle(opp, text):
    """Extract the listing title from the page HTML into ``opp["Title"]``.

    Args:
        opp: Listing dict that is updated in place; its "ID" entry (when
            present) is used in the failure message.
        text: HTML source of the listing page.

    Returns:
        True when the title was found. False otherwise, in which case the
        page text is printed for inspection and ``opp["Title"]`` is set to a
        single space as a placeholder.
    """
    try:
        match = re.search( r'<p class=\"contentTitle contentTitleMarginTop\">\s*(.*?)\s*</p>', text, re.M|re.I|re.S)
        opp["Title"] = match.group(1).strip().replace("\n","<br/>").replace("\r","")
    except (AttributeError, TypeError):
        # AttributeError: no match (re.search returned None);
        # TypeError: text is not a string.
        print("Could not retrieve title information for " + str(opp.get("ID", "")))
        print(text)
        opp["Title"] = " "
        return False
    return True
def getOrganization(opp, text):
    """Extract the "Organization" row of the page into ``opp["Organization"]``.

    Args:
        opp: Listing dict that is updated in place; its "ID" entry (when
            present) is used in the failure message.
        text: HTML source of the listing page.

    Returns:
        True when the value was found; False otherwise, in which case
        ``opp["Organization"]`` is set to a single space as a placeholder.
    """
    try:
        match = re.search( r'Organization</div><div class=\"info-right-column mobile-width-100-center\">\s*(.*?)\s*</div>', text, re.M|re.I|re.S)
        opp["Organization"] = match.group(1).strip().replace("\n","<br/>").replace("\r","")
    except (AttributeError, TypeError):
        # AttributeError: no match (re.search returned None);
        # TypeError: text is not a string.
        print("Could not retrieve organization information for " + str(opp.get("ID", "")))
        opp["Organization"] = " "
        return False
    return True
def getWebsite(opp, text):
    """Extract the link target of the "Website" row into ``opp["Website"]``.

    Args:
        opp: Listing dict that is updated in place; its "ID" entry (when
            present) is used in the failure message.
        text: HTML source of the listing page.

    Returns:
        True when the link's href was found; False otherwise, in which case
        ``opp["Website"]`` is set to a single space as a placeholder.
    """
    try:
        match = re.search( r'Website</div><div class=\"info-right-column mobile-width-100-center\">\s*<a href=\"\s*(.*?)\s*\" target=\"_blank\"', text, re.M|re.I|re.S)
        opp["Website"] = match.group(1).strip().replace("\n","<br/>").replace("\r","")
    except (AttributeError, TypeError):
        # AttributeError: no match (re.search returned None);
        # TypeError: text is not a string.
        print("Could not retrieve website information for " + str(opp.get("ID", "")))
        opp["Website"] = " "
        return False
    return True
def getCountry(opp, text):
    """Extract the "Country" row of the page into ``opp["Country"]``.

    Args:
        opp: Listing dict that is updated in place; its "ID" entry (when
            present) is used in the failure message.
        text: HTML source of the listing page.

    Returns:
        True when the value was found; False otherwise, in which case
        ``opp["Country"]`` is set to a single space as a placeholder.
    """
    try:
        match = re.search( r'Country</div><div class=\"info-right-column mobile-width-100-center\">\s*(.*?)\s*</div>', text, re.M|re.I|re.S)
        opp["Country"] = match.group(1).strip().replace("\n","<br/>").replace("\r","")
    except (AttributeError, TypeError):
        # AttributeError: no match (re.search returned None);
        # TypeError: text is not a string.
        print("Could not retrieve country information for " + str(opp.get("ID", "")))
        opp["Country"] = " "
        return False
    return True
def getLocation(opp, text):
    """Extract the "Location" row of the page into ``opp["Location"]``.

    Line breaks inside the value are turned into ``<br/>`` and carriage
    returns are dropped.

    Args:
        opp: Listing dict that is updated in place; its "ID" entry (when
            present) is used in the failure message.
        text: HTML source of the listing page.

    Returns:
        True when the value was found; False otherwise, in which case
        ``opp["Location"]`` is set to a single space as a placeholder.
    """
    try:
        match = re.search( r'Location</div><div class=\"info-right-column mobile-width-100-center\">\s*(.*?)\s*</div>', text, re.M|re.I|re.S)
        opp["Location"] = match.group(1).strip().replace("\n","<br/>").replace("\r","")
    except (AttributeError, TypeError):
        # AttributeError: no match (re.search returned None);
        # TypeError: text is not a string.
        print("Could not retrieve location information for " + str(opp.get("ID", "")))
        opp["Location"] = " "
        return False
    return True
def getOpportunityType(opp, text):
    """Extract the "Opportunity Type" row into ``opp["Opportunity Type"]``.

    Args:
        opp: Listing dict that is updated in place; its "ID" entry (when
            present) is used in the failure message.
        text: HTML source of the listing page.

    Returns:
        True when the value was found; False otherwise, in which case
        ``opp["Opportunity Type"]`` is set to a single space as a placeholder.
    """
    try:
        match = re.search( r'Opportunity Type</div><div class=\"info-right-column mobile-width-100-center\">\s*(.*?)\s*</div>', text, re.M|re.I|re.S)
        opp["Opportunity Type"] = match.group(1).strip().replace("\n","<br/>").replace("\r","")
    except (AttributeError, TypeError):
        # AttributeError: no match (re.search returned None);
        # TypeError: text is not a string.
        print("Could not retrieve opportunity type information for " + str(opp.get("ID", "")))
        opp["Opportunity Type"] = " "
        return False
    return True
def getOpportunityDiscipline(opp, text):
    """Extract the "Opportunity Discipline" row into ``opp``.

    The value is stored under ``opp["Opportunity Discipline"]``.

    Args:
        opp: Listing dict that is updated in place; its "ID" entry (when
            present) is used in the failure message.
        text: HTML source of the listing page.

    Returns:
        True when the value was found; False otherwise, in which case
        ``opp["Opportunity Discipline"]`` is set to a single space as a
        placeholder.
    """
    try:
        match = re.search( r'Opportunity Discipline</div><div class=\"info-right-column mobile-width-100-center\">\s*(.*?)\s*</div>', text, re.M|re.I|re.S)
        opp["Opportunity Discipline"] = match.group(1).strip().replace("\n","<br/>").replace("\r","")
    except (AttributeError, TypeError):
        # AttributeError: no match (re.search returned None);
        # TypeError: text is not a string.
        print("Could not retrieve opportunity discipline information for " + str(opp.get("ID", "")))
        opp["Opportunity Discipline"] = " "
        return False
    return True
def getApplicationFee(opp, text):
    """Extract the "Application Fee" row into ``opp["Application Fee"]``.

    Args:
        opp: Listing dict that is updated in place; its "ID" entry (when
            present) is used in the failure message.
        text: HTML source of the listing page.

    Returns:
        True when the value was found; False otherwise, in which case
        ``opp["Application Fee"]`` is set to a single space as a placeholder.
    """
    try:
        match = re.search( r'Application Fee</div><div class=\"info-right-column mobile-width-100-center\">\s*(.*?)\s*</div>', text, re.M|re.I|re.S)
        opp["Application Fee"] = match.group(1).strip().replace("\n","<br/>").replace("\r","")
    except (AttributeError, TypeError):
        # AttributeError: no match (re.search returned None);
        # TypeError: text is not a string.
        print("Could not retrieve application fee information for " + str(opp.get("ID", "")))
        opp["Application Fee"] = " "
        return False
    return True
def getApplicationDeadline(opp, text):
    """Extract the "Application Deadline" row into ``opp``.

    The value is stored under ``opp["Application Deadline"]``.

    Args:
        opp: Listing dict that is updated in place; its "ID" entry (when
            present) is used in the failure message.
        text: HTML source of the listing page.

    Returns:
        True when the value was found; False otherwise, in which case
        ``opp["Application Deadline"]`` is set to a single space as a
        placeholder.
    """
    try:
        match = re.search( r'Application Deadline</div><div class=\"info-right-column mobile-width-100-center\">\s*(.*?)\s*</div>', text, re.M|re.I|re.S)
        opp["Application Deadline"] = match.group(1).strip().replace("\n","<br/>").replace("\r","")
    except (AttributeError, TypeError):
        # AttributeError: no match (re.search returned None);
        # TypeError: text is not a string.
        print("Could not retrieve application deadline information for " + str(opp.get("ID", "")))
        opp["Application Deadline"] = " "
        return False
    return True
# Issue: https://stackoverflow.com/questions/20056306/match-linebreaks-n-or-r-n
def getDescription(opp, text):
    """Extract the "Description" section into ``opp["Description"]``.

    URLs in the section are wrapped in anchor tags by wrapLinks, line breaks
    are turned into ``<br/>`` and carriage returns are dropped.

    Args:
        opp: Listing dict that is updated in place; its "ID" entry (when
            present) is used in the failure message.
        text: HTML source of the listing page.

    Returns:
        True when the section was found; False otherwise, in which case
        ``opp["Description"]`` is set to a single space as a placeholder.
    """
    try:
        match = re.search( r'Description</h2>\s*<div class=\"projectDetailsDiv text-justify text-pre-wrap\">\s*(.*?)\s*</div>', text, re.M|re.I|re.S)
        # Link the URLs while the newlines are still in place: wrapLinks only
        # recognizes a URL at the start of the text or after a space/newline,
        # so converting newlines to <br/> first would leave every URL that
        # begins a line unlinked.
        linked = wrapLinks(match.group(1).strip())
        opp["Description"] = linked.replace("\n","<br/>").replace("\r","")
    except (AttributeError, TypeError):
        # AttributeError: no match (re.search returned None);
        # TypeError: text is not a string.
        print("Could not retrieve description information for " + str(opp.get("ID", "")))
        opp["Description"] = " "
        return False
    return True
def getApplicationInstructions(opp, text):
    """Extract the application instructions section into ``opp``.

    The section headed "Application Instructions / Public Contact
    Information" is stored under ``opp["Application Instructions"]``. URLs in
    it are wrapped in anchor tags by wrapLinks, line breaks are turned into
    ``<br/>`` and carriage returns are dropped.

    Args:
        opp: Listing dict that is updated in place; its "ID" entry (when
            present) is used in the failure message.
        text: HTML source of the listing page.

    Returns:
        True when the section was found; False otherwise, in which case
        ``opp["Application Instructions"]`` is set to a single space as a
        placeholder.
    """
    try:
        match = re.search( r'Application Instructions / Public Contact Information</h2>\s*<div class=\"projectDetailsDiv text-justify text-pre-wrap\">\s*(.*?)\s*</div>', text, re.M|re.I|re.S)
        # Link the URLs while the newlines are still in place: wrapLinks only
        # recognizes a URL at the start of the text or after a space/newline,
        # so converting newlines to <br/> first would leave every URL that
        # begins a line unlinked.
        linked = wrapLinks(match.group(1).strip())
        opp["Application Instructions"] = linked.replace("\n","<br/>").replace("\r","")
    except (AttributeError, TypeError):
        # AttributeError: no match (re.search returned None);
        # TypeError: text is not a string.
        print("Could not retrieve application instructions information for " + str(opp.get("ID", "")))
        opp["Application Instructions"] = " "
        return False
    return True
# Return a dictionary with attributes describing this listing
# If we couldn't retrieve information, None is returned
def getListingAttributes(ID):
    """Download one listing page and collect its attributes in a dict.

    The result holds the listing "ID", the page "url", one entry per
    extracted field, and a "timestamp" recording when the data was
    retrieved. Fields that cannot be extracted are left at the placeholder
    set by their extractor. None is returned when the page itself cannot be
    downloaded.
    """
    page_url = "https://www.nyfa.org/Opportunities/Show/" + ID

    html = getListingHTMLText(page_url)
    if html is None:
        return None

    record = {"ID": ID, "url": page_url}

    # Each extractor fills in its own key of the record; the order below
    # determines the key order of the resulting dict.
    extractors = (
        getTitle,
        getOrganization,
        getWebsite,
        getCountry,
        getLocation,
        getOpportunityType,
        getOpportunityDiscipline,
        getApplicationFee,
        getApplicationDeadline,
        getApplicationInstructions,
        getDescription,
    )
    for extract in extractors:
        extract(record, html)

    # Remember when this data was retrieved
    record["timestamp"] = getTimestampString()
    return record
# Get all listing attributes for a group of IDs
def getAllListingsAttributes(ids):
    """Return the attribute dicts of all listings in ``ids``.

    Every ID is passed to getListingAttributes; IDs for which that call
    returns None are left out of the result.
    """
    return [
        attributes
        for attributes in map(getListingAttributes, ids)
        if attributes is not None
    ]
# Insert every item of the list into the database
def saveToDB(_db, array):
    """Insert every element of ``array`` into ``_db``, one insert per element."""
    for record in array:
        _db.insert(record)
# Get all items within a timeframe
def getLastItems(_db, _lastSec):
    """Return the stored items retrieved less than ``_lastSec`` seconds ago.

    Args:
        _db: Iterable of stored item dicts. Each usable item has an "ID" and
            a "timestamp" in 'YYYY-MM-DD HH:MM:SS' format.
        _lastSec: Maximum age, in seconds, of the items to return.

    Returns:
        List of the items whose timestamp is younger than ``_lastSec``, in
        iteration order. Items without an "ID" are skipped silently; items
        whose timestamp is missing or malformed are reported and skipped, so
        that one bad record does not abort the whole run.
    """
    out = []
    # One reference time for the whole scan keeps the cut-off identical for
    # every item.
    now = datetime.now()
    for item in _db:
        if "ID" not in item:
            continue
        # Get this item's time
        try:
            itemTime = datetime.strptime(item["timestamp"], '%Y-%m-%d %H:%M:%S')
        except (KeyError, TypeError, ValueError):
            print("Skipping item " + str(item["ID"]) + " with a missing or invalid timestamp")
            continue
        # Keep the item only if it is recent enough
        if (now - itemTime).total_seconds() < _lastSec:
            out.append(item)
    return out
# Format a post for the rss feed
def getHtmlFormattedListing(post):
    """Render one listing dict as the HTML body of its feed entry.

    The body starts with a summary paragraph (organization, location,
    discipline, deadline, fee and an "Apply" link to the website), followed
    by the description and the application instructions, each under its own
    heading. The title is not part of the body. A fee equal to a single
    space (the placeholder for a missing value) is shown as "unknown fee".
    """
    pieces = ["<p>", post["Organization"], "<br/>"]
    pieces += ["📍 ", post["Location"], ", ", post["Country"], "<br/>"]
    pieces += ["🎨 ", post["Opportunity Discipline"], "<br/>"]
    pieces += ["📅 ", post["Application Deadline"], " deadline", "<br/>"]

    fee = post["Application Fee"]
    fee_label = "unknown fee" if fee == " " else fee + " fee"
    pieces += ["💰 ", fee_label, "<br/>"]

    pieces += ["➡ ", "<a href=\"", post["Website"], "\">", "Apply", "</a>", "</p>"]

    pieces += ["<br/><p><strong>", "====== DESCRIPTION ======", "</strong></p>"]
    pieces += ["<p>", post["Description"], "</p>"]

    pieces += ["<br/><p><strong>", "===== INSTRUCTIONS =====", "</strong></p>"]
    pieces += ["<p>", post["Application Instructions"], "</p>"]

    return "".join(pieces)
# Save a list of items to an rss xml feed file
def saveFeed(listings, title, path):
    """Write the given listings to a feed file in Atom format.

    Args:
        listings: Iterable of listing dicts; each needs the "ID", "url",
            "Title" and "Application Deadline" entries plus everything
            getHtmlFormattedListing reads.
        title: Feed title; also the file name (without ".xml") appended to
            githubRepoURL to form the feed's id and link.
        path: Location of the file the feed is written to.
    """
    url = githubRepoURL + title + ".xml"

    # Create a feed generator and fill in the feed-level metadata
    fg = FeedGenerator()
    fg.id(url)
    fg.title(title)
    fg.author({'name':'Ben Snell'})
    fg.description("Art Show Open Call Opportunities")
    fg.link(href=url, rel='alternate')
    fg.language('en')

    # The feed dates must carry timezone information. Use the machine's real
    # local UTC offset instead of a hard-coded "-05:00", which is wrong
    # whenever the local offset differs (e.g. during daylight saving time).
    generated = datetime.now().astimezone().replace(microsecond=0)
    fg.pubDate(generated)
    fg.updated(generated)

    for item in listings:
        e = fg.add_entry()
        e.id(item["ID"])
        # Prefix the title with the short deadline date for a clearer title
        e.title(getShortDate(item["Application Deadline"]) + item["Title"])
        e.link(href=item["url"])
        e.content(type="html", content=getHtmlFormattedListing(item))
        # NOTE: per-entry pubDate/updated are not set; an earlier attempt to
        # set them did not work.

    fg.atom_file(path)
def process():
    """Run one refresh: fetch new listings, store them, rebuild and push the feed.

    Steps: download up to 10 pages of listings, keep the IDs of type
    "Call for Entry/Open Call" that are not in the database yet, download
    and store their attributes, write the feed from all items retrieved
    within the last includeSec seconds, then commit the feed file and the
    database file to the git repository and push to its "origin" remote.
    """
    listings = getAllListings(listingsURL, 10)
    if listings is None:
        return

    # IDs of the wanted listing type(s) that the database does not hold yet
    wantedTypes = ["Call for Entry/Open Call"]
    newIDs = getNewListings(parseListings(listings, wantedTypes))

    # Download the attributes of the new listings and store them
    saveToDB(db, getAllListingsAttributes(newIDs))

    # Rebuild the feed from everything observed within the last includeSec
    recentItems = getLastItems(db, includeSec)
    feedPath = getWorkingDir() + "/feeds/" + saveTitle + ".xml"
    saveFeed(recentItems, saveTitle, feedPath)

    # Upload the feed and the database to github
    databasePath = getWorkingDir() + "/" + "db.json"
    repo = Repo("../" + repoName)
    repo.index.add([feedPath, databasePath])
    repo.index.commit("Updated feeds")
    repo.remote('origin').push()
def main():
    """Call process() in an endless loop, one run per refreshSec seconds."""
    while True:
        print("Starting Process ------------------------")

        # Run code and measure how long it took
        started = time.time()
        process()
        elapsed = time.time() - started

        print("Ending Process ------------------------Waiting...")

        # Sleep for what is left of the refresh interval; do not wait at all
        # when the run itself took longer than refreshSec
        remaining = refreshSec - elapsed
        time.sleep(remaining if remaining > 0 else 0)

        print("... Done waiting")
# Start the refresh loop only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == "__main__":
    main()