Skip to content

Commit 3b3e78d

Browse files
authored
Before and After methods (#175)
* Added before and after functions * add tests * formatting
1 parent 0202efd commit 3b3e78d

File tree

2 files changed

+118
-0
lines changed

2 files changed

+118
-0
lines changed

tests/test_cdx_api.py

Lines changed: 36 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -176,3 +176,39 @@ def test_near() -> None:
176176
filters=["statuscode:200"],
177177
)
178178
cdx.near(unix_timestamp=1286705410)
179+
180+
181+
def test_before() -> None:
    """Check that ``before`` returns the expected snapshot for google.com.

    The query is restricted to ``statuscode:200`` records and asks for the
    archive preceding Wayback Machine timestamp ``20160731235949``.

    NOTE(review): this appears to query the live CDX server, so the expected
    timestamp depends on the archive's real contents -- confirm.
    """
    agent = (
        "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
    )
    api = WaybackMachineCDXServerAPI(
        url="http://www.google.com/",
        user_agent=agent,
        filters=["statuscode:200"],
    )

    snapshot = api.before(wayback_machine_timestamp=20160731235949)

    # The returned record must be the capture taken just before the reference.
    assert "20160731233347" in snapshot.timestamp
    # Every URL-like field of the record should still point at google.
    assert "google" in snapshot.urlkey
    assert "google.com" in snapshot.original
    assert "google.com" in snapshot.archive_url
197+
198+
199+
def test_after() -> None:
    """Check that ``after`` returns the expected snapshot for google.com.

    The query is restricted to ``statuscode:200`` records and asks for the
    archive following Wayback Machine timestamp ``20160731235949``.

    NOTE(review): this appears to query the live CDX server, so the expected
    timestamp depends on the archive's real contents -- confirm.
    """
    agent = (
        "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
    )
    api = WaybackMachineCDXServerAPI(
        url="http://www.google.com/",
        user_agent=agent,
        filters=["statuscode:200"],
    )

    snapshot = api.after(wayback_machine_timestamp=20160731235949)

    # On failure, show the timestamp that was actually returned.
    assert "20160801000917" in snapshot.timestamp, snapshot.timestamp
    # Every URL-like field of the record should still point at google.
    assert "google" in snapshot.urlkey
    assert "google.com" in snapshot.original
    assert "google.com" in snapshot.archive_url

waybackpy/cdx_api.py

Lines changed: 82 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -191,6 +191,88 @@ def add_payload(self, payload: Dict[str, str]) -> None:
191191

192192
payload["url"] = self.url
193193

194+
def before(
    self,
    year: Optional[int] = None,
    month: Optional[int] = None,
    day: Optional[int] = None,
    hour: Optional[int] = None,
    minute: Optional[int] = None,
    unix_timestamp: Optional[int] = None,
    wayback_machine_timestamp: Optional[Union[int, str]] = None,
) -> CDXSnapshot:
    """
    Gets the nearest archive before the given datetime.

    The reference timestamp is chosen in this order:

    1. ``unix_timestamp``, if truthy, converted with
       ``unix_timestamp_to_wayback_timestamp``.
    2. ``wayback_machine_timestamp``, if truthy, converted with ``str``.
    3. Otherwise a timestamp built by ``wayback_timestamp`` from ``year``,
       ``month``, ``day``, ``hour`` and ``minute``; any component left as
       ``None`` is taken from the current UTC time.

    Side effects: overwrites ``self.closest``, ``self.sort`` and
    ``self.limit`` on the instance before iterating ``self.snapshots()``.

    Returns the first snapshot yielded by ``self.snapshots()`` whose
    ``timestamp`` compares strictly less than the reference timestamp.

    Raises ``NoCDXRecordFound`` when no yielded snapshot satisfies that
    comparison.
    """
    # NOTE(review): truthiness checks mean a unix_timestamp of 0 (or an
    # empty wayback_machine_timestamp) falls through to the next branch.
    if unix_timestamp:
        timestamp = unix_timestamp_to_wayback_timestamp(unix_timestamp)
    elif wayback_machine_timestamp:
        timestamp = str(wayback_machine_timestamp)
    else:
        now = datetime.utcnow().timetuple()
        timestamp = wayback_timestamp(
            year=now.tm_year if year is None else year,
            month=now.tm_mon if month is None else month,
            day=now.tm_mday if day is None else day,
            hour=now.tm_hour if hour is None else hour,
            minute=now.tm_min if minute is None else minute,
        )

    # Configure the query: sort by closeness to the reference timestamp.
    # Presumably the large limit lets the loop skip past records on the
    # wrong side of the reference -- confirm against CDX server behaviour.
    self.closest = timestamp
    self.sort = "closest"
    self.limit = 25000

    # Compared with ``<`` as-is; this is a string comparison when both
    # sides are strings, as in the wayback_machine_timestamp path.
    for snapshot in self.snapshots():
        if snapshot.timestamp < timestamp:
            return snapshot

    # If a snapshot isn't returned, then none were found.
    raise NoCDXRecordFound(
        "No records were found before the given date for the query. "
        "Either there are no archives before the given date,"
        " the URL may not have any archives, or the URL may have been"
        " recently archived and is still not available on the CDX server."
    )
234+
235+
def after(
    self,
    year: Optional[int] = None,
    month: Optional[int] = None,
    day: Optional[int] = None,
    hour: Optional[int] = None,
    minute: Optional[int] = None,
    unix_timestamp: Optional[int] = None,
    wayback_machine_timestamp: Optional[Union[int, str]] = None,
) -> CDXSnapshot:
    """
    Gets the nearest archive after the given datetime.

    The reference timestamp is chosen in this order:

    1. ``unix_timestamp``, if truthy, converted with
       ``unix_timestamp_to_wayback_timestamp``.
    2. ``wayback_machine_timestamp``, if truthy, converted with ``str``.
    3. Otherwise a timestamp built by ``wayback_timestamp`` from ``year``,
       ``month``, ``day``, ``hour`` and ``minute``; any component left as
       ``None`` is taken from the current UTC time.

    Side effects: overwrites ``self.closest``, ``self.sort`` and
    ``self.limit`` on the instance before iterating ``self.snapshots()``.

    Returns the first snapshot yielded by ``self.snapshots()`` whose
    ``timestamp`` compares strictly greater than the reference timestamp.

    Raises ``NoCDXRecordFound`` when no yielded snapshot satisfies that
    comparison.
    """
    # NOTE(review): truthiness checks mean a unix_timestamp of 0 (or an
    # empty wayback_machine_timestamp) falls through to the next branch.
    if unix_timestamp:
        timestamp = unix_timestamp_to_wayback_timestamp(unix_timestamp)
    elif wayback_machine_timestamp:
        timestamp = str(wayback_machine_timestamp)
    else:
        now = datetime.utcnow().timetuple()
        timestamp = wayback_timestamp(
            year=now.tm_year if year is None else year,
            month=now.tm_mon if month is None else month,
            day=now.tm_mday if day is None else day,
            hour=now.tm_hour if hour is None else hour,
            minute=now.tm_min if minute is None else minute,
        )

    # Configure the query: sort by closeness to the reference timestamp.
    # Presumably the large limit lets the loop skip past records on the
    # wrong side of the reference -- confirm against CDX server behaviour.
    self.closest = timestamp
    self.sort = "closest"
    self.limit = 25000

    # Compared with ``>`` as-is; this is a string comparison when both
    # sides are strings, as in the wayback_machine_timestamp path.
    for snapshot in self.snapshots():
        if snapshot.timestamp > timestamp:
            return snapshot

    # If a snapshot isn't returned, then none were found.
    raise NoCDXRecordFound(
        "No records were found after the given date for the query. "
        "Either there are no archives after the given date,"
        " the URL may not have any archives, or the URL may have been"
        " recently archived and is still not available on the CDX server."
    )
275+
194276
def near(
195277
self,
196278
year: Optional[int] = None,

0 commit comments

Comments
 (0)