-
Notifications
You must be signed in to change notification settings - Fork 33
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add .getHttpStatus and .getFilename to ArchiveRecordImpl class #198 & #164 #292
Changes from 4 commits
82314ad
9bca66e
37ae8e0
46b32a2
739e06c
b722d1d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,16 +18,22 @@ | |
package io.archivesunleashed | ||
|
||
import java.text.SimpleDateFormat | ||
import java.io.ByteArrayInputStream | ||
|
||
import io.archivesunleashed.data.{ArcRecordUtils, WarcRecordUtils, ArchiveRecordWritable} | ||
import io.archivesunleashed.matchbox.{ExtractDate, ExtractDomain, RemoveHttpHeader} | ||
import org.apache.spark.SerializableWritable | ||
import org.archive.io.arc.ARCRecord | ||
import org.archive.io.warc.WARCRecord | ||
import org.archive.util.ArchiveUtils | ||
import scala.util.Try | ||
import org.apache.commons.httpclient.{Header, HttpParser, StatusLine} | ||
|
||
/** Trait for a record in a web archive. */ | ||
trait ArchiveRecord extends Serializable { | ||
/** Returns the filename containing the Archive Records */ | ||
def getFilename: String | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. Just to confirm Resourcename (to mimic Filename) and not ResourceName (because resource name are two separate words)? I'm okay with either. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Current commit uses Resourcename. I think that's fine - I'm probably just overthinking it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
|
||
/** Returns the crawl date. */ | ||
def getCrawlDate: String | ||
|
||
|
@@ -51,6 +57,10 @@ trait ArchiveRecord extends Serializable { | |
|
||
/** Returns a raw array of bytes for an image. */ | ||
def getImageBytes: Array[Byte] | ||
|
||
/** Returns the http status of the crawl. */ | ||
def getHttpStatus: String | ||
|
||
} | ||
|
||
/** Default implementation of a record in a web archive. | ||
|
@@ -64,6 +74,7 @@ class ArchiveRecordImpl(r: SerializableWritable[ArchiveRecordWritable]) extends | |
var arcRecord: ARCRecord = null | ||
var warcRecord: WARCRecord = null | ||
// scalastyle:on null | ||
var headerResponseFormat: String = "US-ASCII" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What's the rationale for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I suppose it this? https://tools.ietf.org/html/rfc7230#section-3.2.4 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes - this part was a mimic of @dportabella 's suggestion. Seemed weird to me at first as well. |
||
|
||
if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.ARC) { | ||
arcRecord = r.t.getRecord.asInstanceOf[ARCRecord] | ||
|
@@ -72,6 +83,14 @@ class ArchiveRecordImpl(r: SerializableWritable[ArchiveRecordWritable]) extends | |
} | ||
val ISO8601 = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssX") | ||
|
||
val getFilename: String = { | ||
if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.ARC){ | ||
arcRecord.getMetaData.getReaderIdentifier() | ||
} else { | ||
warcRecord.getHeader.getReaderIdentifier() | ||
} | ||
} | ||
|
||
val getCrawlDate: String = { | ||
if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.ARC){ | ||
ExtractDate(arcRecord.getMetaData.getDate, ExtractDate.DateComponent.YYYYMMDD) | ||
|
@@ -107,9 +126,10 @@ class ArchiveRecordImpl(r: SerializableWritable[ArchiveRecordWritable]) extends | |
|
||
val getMimeType: String = { | ||
if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.ARC) { | ||
arcRecord.getMetaData.getMimetype | ||
Option(arcRecord.getMetaData.getMimetype).getOrElse("unknown") | ||
} else { | ||
WarcRecordUtils.getWarcResponseMimeType(getContentBytes) | ||
Option(WarcRecordUtils.getWarcResponseMimeType(getContentBytes)) | ||
.getOrElse("unknown") | ||
} | ||
} | ||
|
||
|
@@ -121,6 +141,19 @@ class ArchiveRecordImpl(r: SerializableWritable[ArchiveRecordWritable]) extends | |
} | ||
} | ||
|
||
val getHttpStatus: String = { | ||
if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.ARC) { | ||
Option(arcRecord.getMetaData.getStatusCode).getOrElse("000") | ||
} else { | ||
Try(new StatusLine(new String(HttpParser.readRawLine | ||
(new ByteArrayInputStream(getContentBytes)))) | ||
.getStatusCode).toOption match { | ||
case Some(x) => x.toString | ||
case None => "000" | ||
} | ||
} | ||
} | ||
|
||
val getDomain: String = { | ||
ExtractDomain(getUrl) | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,6 +22,7 @@ import org.apache.spark.{SparkConf, SparkContext} | |
import org.junit.runner.RunWith | ||
import org.scalatest.junit.JUnitRunner | ||
import org.scalatest.{BeforeAndAfter, FunSuite} | ||
import org.apache.commons.io.FilenameUtils | ||
|
||
@RunWith(classOf[JUnitRunner]) | ||
class ArchiveRecordTest extends FunSuite with BeforeAndAfter { | ||
|
@@ -37,17 +38,76 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter { | |
.setAppName(appName) | ||
conf.set("spark.driver.allowMultipleContexts", "true"); | ||
sc = new SparkContext(conf) | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Remove blank line. |
||
} | ||
|
||
test("count records") { | ||
assert(RecordLoader.loadArchives(arcPath, sc).count == 300L) | ||
assert(RecordLoader.loadArchives(warcPath, sc).count == 299L) | ||
} | ||
|
||
test("file name") { | ||
val textSampleArc = RecordLoader.loadArchives(arcPath, sc) | ||
.map(x => FilenameUtils.getName(x.getFilename)) | ||
.take(3) | ||
val textSampleWarc = RecordLoader.loadArchives(warcPath, sc) | ||
.map(x => FilenameUtils.getName(x.getFilename)).take(3) | ||
assert(textSampleArc.deep == Array("example.arc.gz", | ||
"example.arc.gz", "example.arc.gz").deep) | ||
assert(textSampleWarc.deep == Array("example.warc.gz", | ||
"example.warc.gz", "example.warc.gz").deep) | ||
} | ||
|
||
test("Crawl Dates") { | ||
val textSampleArc = RecordLoader.loadArchives(arcPath, sc) | ||
.map(x => x.getCrawlDate).take(3) | ||
val textSampleWarc = RecordLoader.loadArchives(warcPath, sc) | ||
.map(x => x.getCrawlDate).take(3) | ||
assert(textSampleArc.deep == Array("20080430", "20080430", "20080430").deep) | ||
assert(textSampleWarc.deep == Array("20080430", "20080430", "20080430").deep) | ||
} | ||
|
||
test("Domains") { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There are a few extra test methods here. Are they scope for the issues posted in the original comment? Or do they cover other tickets? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No other tickets. Basically, last PR I had good test coverage, but failed to test particular cases and my code passed Travis with bugs. I decided to include these additional tests in case that happened again (it was unlikely, but I wanted this PR to go more smoothly). Since ArchiveRecord is used widely across the tests, I did not expect the tests to improve coverage (as per #260). |
||
val textSampleArc = RecordLoader.loadArchives(arcPath, sc) | ||
.map(x => x.getDomain).take(3) | ||
val textSampleWarc = RecordLoader.loadArchives(warcPath, sc) | ||
.map(x => x.getDomain).take(3) | ||
assert(textSampleArc.deep == Array("", "", "www.archive.org").deep) | ||
assert(textSampleWarc.deep == Array("", "www.archive.org", "www.archive.org").deep) | ||
} | ||
|
||
test("Urls") { | ||
val textSampleArc = RecordLoader.loadArchives(arcPath, sc) | ||
.map(x => x.getUrl).take(3) | ||
val textSampleWarc = RecordLoader.loadArchives(warcPath, sc) | ||
.map(x => x.getUrl).take(3) | ||
assert(textSampleArc.deep == Array("filedesc://IAH-20080430204825-00000-blackbook.arc", | ||
"dns:www.archive.org", "http://www.archive.org/robots.txt").deep) | ||
assert(textSampleWarc.deep == Array("dns:www.archive.org", | ||
"http://www.archive.org/robots.txt", "http://www.archive.org/").deep) | ||
} | ||
|
||
test("Mime-Type") { | ||
val textSampleArc = RecordLoader.loadArchives(arcPath, sc) | ||
.map(x => x.getMimeType).take(3) | ||
val textSampleWarc = RecordLoader.loadArchives(warcPath, sc) | ||
.map(x => x.getMimeType).take(3) | ||
assert (textSampleArc.deep == Array ("text/plain", "text/dns", "text/plain").deep) | ||
assert (textSampleWarc.deep == Array("unknown", "text/plain", "text/html").deep) | ||
} | ||
|
||
test("Get Http Status") { | ||
val textSampleArc = RecordLoader.loadArchives(arcPath, sc) | ||
.map(x => x.getHttpStatus).take(3) | ||
val textSampleWarc = RecordLoader.loadArchives(warcPath, sc) | ||
.map(x => x.getHttpStatus).take(3) | ||
assert (textSampleArc.deep == Array("000", "000", "200").deep) | ||
assert (textSampleWarc.deep == Array("000", "200", "200").deep) | ||
} | ||
|
||
after { | ||
if (sc != null) { | ||
sc.stop() | ||
} | ||
} | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Missing fullstop.