Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP

Comparing changes

Choose two branches to see what's changed or to start a new pull request. If you need to, you can also compare across forks.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks.
  • 3 commits
  • 2 files changed
  • 0 commit comments
  • 2 contributors
Showing with 69 additions and 7 deletions.
  1. +32 −1 warc/arc.py
  2. +37 −6 warc/tests/test_arc.py
View
33 warc/arc.py
@@ -147,6 +147,36 @@ def __init__(self, header = None, payload = None, headers = {}):
self.header = header or ARCHeader(**headers)
self.payload = payload
+ @classmethod
+ def from_string(cls, string, version):
+ """
+ Constructs an ARC record from a string and returns it.
+
+ TODO: It might be best to merge this with the _read_arc_record
+ function rather than reimplement the functionality here.
+ """
+ if string[0] == '\n': # Drop the initial newline
+ string = string[1:]
+ header, payload = string.split("\n",1)
+ if payload[0] == '\n': # There's an extra newline separating the header from the payload
+ payload = payload[1:]
+
+ if int(version) == 1:
+ url, ip_address, date, content_type, length = header.split()
+ headers = dict(url = url, ip_address = ip_address,
+ date = date, content_type = content_type,
+ length = length)
+ arc_header = ARCHeader(**headers)
+ elif int(version) == 2:
+ url, ip_address, date, content_type, result_code, checksum, location, offset, filename, length = header.split()
+ headers = dict(url = url, ip_address = ip_address, date = date,
+ content_type = content_type, result_code = result_code,
+ checksum = checksum, location = location, offset = offset,
+ filename = filename, length = length)
+ arc_header = ARCHeader(**headers)
+
+ return cls(header = arc_header, payload = payload)
+
def write_to(self, f, version = 2):
f.write("\n")
self.header.write_to(f, version)
@@ -306,7 +336,6 @@ def _read_arc_record(self):
# if r == "":
# return None
# header = self.fileobj.readline()
- # self.fileobj.readline() # Drop the separator newline
# Strip the initial new lines and read first line
header = self.fileobj.readline()
@@ -316,6 +345,8 @@ def _read_arc_record(self):
if header == "":
return None
+ self.fileobj.readline() # Drop the separator newline
+
if self.version == 1:
url, ip_address, date, content_type, length = header.split()
headers = dict(url = url, ip_address = ip_address,
View
43 warc/tests/test_arc.py
@@ -87,9 +87,9 @@ def test_arc_v1_record_creation():
filename = "sample.arc.gz")
record_v1 = arc.ARCRecord(header, "BlahBlah")
f = StringIO.StringIO()
- record_v1.write_to(f)
+ record_v1.write_to(f, 1)
record_v1_string = f.getvalue()
- assert record_v1_string == "\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nBlahBlah"
+ assert record_v1_string == "\nhttp://archive.org 127.0.0.1 20120301093000 text/html 500\n\nBlahBlah"
def test_arc_v2_record_creation():
"Validate ARC V2 record creation"
@@ -103,11 +103,11 @@ def test_arc_v2_record_creation():
location = "http://www.archive.org",
offset = "300",
filename = "sample.arc.gz")
- record_v1 = arc.ARCRecord(payload = "BlahBlah", headers = header)
+ record_v2 = arc.ARCRecord(payload = "BlahBlah", headers = header)
f = StringIO.StringIO()
- record_v1.write_to(f)
- record_v1_string = f.getvalue()
- assert record_v1_string == "\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nBlahBlah"
+ record_v2.write_to(f)
+ record_v2_string = f.getvalue()
+ assert record_v2_string == "\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nBlahBlah"
def test_arc_v1_writer():
"Try writing records to an ARC V1 file. This is what API will feel like to a user of the library"
@@ -241,3 +241,34 @@ def test_arc_reader_v2():
assert r1['filename'] == "sample.arc.gz"
assert r1['length'] == "8"
assert r1.payload == "Payload1"
+
+def test_arc_v1_record_from_string():
+ "Validate ARC V1 record creation from string"
+ record_v1_string = "\nhttp://www.archive.org 127.0.0.1 20120301093000 text/html 500\n\nBlahBlah"
+ record = arc.ARCRecord.from_string(record_v1_string, 1)
+
+ assert record['url'] == "http://www.archive.org"
+ assert record['ip_address'] == "127.0.0.1"
+ assert record['date'] == "20120301093000"
+ assert record['content_type'] == "text/html"
+ assert record['length'] == "500"
+ assert record.payload == "BlahBlah"
+
+
+def xtest_arc_v2_record_from_string():
+ "Validate ARC V2 record creation from string"
+ record_v2_string = "\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nBlahBlah"
+ record = arc.ARCRecord.from_string(record_v2_string, 2)
+
+ assert record['url'] == "http://archive.org"
+ assert record['ip_address'] == "127.0.0.1"
+ assert record['date'] == "20120301093000"
+ assert record['content_type'] == "text/html"
+ assert record['checksum'] == "a123456"
+ assert record['location'] == "http://www.archive.org"
+ assert record['offset'] == "300"
+ assert record['filename'] == "sample.arc.gz"
+ assert record['length'] == "500"
+ assert record.payload == "BlahBlah"
+
+

No commit comments for this range

Something went wrong with that request. Please try again.