Skip to content

Commit

Permalink
Add .getHttpStatus and .getArchiveFile to ArchiveRecordImpl class #198
Browse files Browse the repository at this point in the history
…& #164 (#292)

* Resolves #198
* Resolves #164
* Add getHttpStatus to ArchiveRecord class & trait
  - add .getHttpStatus to potential outputs
  - add tests for .getHttpStatus calls
  - improve ArchiveRecord testing overall.
* Add .getArchiveFilename feature to ArchiveRecordImpl.
  - add getArchiveFilename to trait
  - add getArchiveFilename for ArchiveRecordImpl
  - add tests for getArchiveFilename.
* Other code style fixes.
* Include updates to tests.
  • Loading branch information
greebie authored and ruebot committed Nov 28, 2018
1 parent 80b9e2b commit 7731b6d
Show file tree
Hide file tree
Showing 2 changed files with 95 additions and 3 deletions.
37 changes: 35 additions & 2 deletions src/main/scala/io/archivesunleashed/ArchiveRecord.scala
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,22 @@
package io.archivesunleashed

import java.text.SimpleDateFormat
import java.io.ByteArrayInputStream

import io.archivesunleashed.data.{ArcRecordUtils, WarcRecordUtils, ArchiveRecordWritable}
import io.archivesunleashed.matchbox.{ExtractDate, ExtractDomain, RemoveHttpHeader}
import org.apache.spark.SerializableWritable
import org.archive.io.arc.ARCRecord
import org.archive.io.warc.WARCRecord
import org.archive.util.ArchiveUtils
import scala.util.Try
import org.apache.commons.httpclient.{Header, HttpParser, StatusLine}

/** Trait for a record in a web archive. */
trait ArchiveRecord extends Serializable {
/** Returns the full path or URL of the archive file that contains this record. */
def getArchiveFilename: String

/** Returns the crawl date. */
def getCrawlDate: String

Expand All @@ -51,6 +57,10 @@ trait ArchiveRecord extends Serializable {

/** Returns a raw array of bytes for an image. */
def getImageBytes: Array[Byte]

/** Returns the HTTP status code of the record, as a string. */
def getHttpStatus: String

}

/** Default implementation of a record in a web archive.
Expand All @@ -64,6 +74,7 @@ class ArchiveRecordImpl(r: SerializableWritable[ArchiveRecordWritable]) extends
var arcRecord: ARCRecord = null
var warcRecord: WARCRecord = null
// scalastyle:on null
var headerResponseFormat: String = "US-ASCII"

if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.ARC) {
arcRecord = r.t.getRecord.asInstanceOf[ARCRecord]
Expand All @@ -72,6 +83,14 @@ class ArchiveRecordImpl(r: SerializableWritable[ArchiveRecordWritable]) extends
}
val ISO8601 = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssX")

/** Reader identifier of the archive this record came from, read from the ARC
  * metadata for ARC records and from the WARC header otherwise.
  */
val getArchiveFilename: String = r.t.getFormat match {
  case ArchiveRecordWritable.ArchiveFormat.ARC =>
    arcRecord.getMetaData.getReaderIdentifier()
  case _ =>
    warcRecord.getHeader.getReaderIdentifier()
}

val getCrawlDate: String = {
if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.ARC){
ExtractDate(arcRecord.getMetaData.getDate, ExtractDate.DateComponent.YYYYMMDD)
Expand Down Expand Up @@ -107,9 +126,10 @@ class ArchiveRecordImpl(r: SerializableWritable[ArchiveRecordWritable]) extends

/** MIME type of the record, or "unknown" when the underlying lookup returns null.
  *
  * ARC records take the value from their metadata; WARC records derive it from
  * the content bytes through WarcRecordUtils.getWarcResponseMimeType.
  */
val getMimeType: String = {
  // Each lookup runs exactly once. The previous body also evaluated both
  // lookups as bare statements and threw the results away, which meant the
  // WARC MIME type was computed twice per record.
  val rawMimeType =
    if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.ARC) {
      arcRecord.getMetaData.getMimetype
    } else {
      WarcRecordUtils.getWarcResponseMimeType(getContentBytes)
    }
  // Option(...) maps a null from the Java API to None so callers never see null.
  Option(rawMimeType).getOrElse("unknown")
}

Expand All @@ -121,6 +141,19 @@ class ArchiveRecordImpl(r: SerializableWritable[ArchiveRecordWritable]) extends
}
}

/** HTTP status code of the record as a string, or "000" when none is available.
  *
  * ARC records use the status code stored in their metadata. For WARC records
  * the first line of the content bytes is parsed as an HTTP status line; any
  * failure while reading or parsing it falls back to "000".
  */
val getHttpStatus: String = {
  if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.ARC) {
    Option(arcRecord.getMetaData.getStatusCode).getOrElse("000")
  } else {
    Try {
      val rawStatusLine = HttpParser.readRawLine(new ByteArrayInputStream(getContentBytes))
      // Decode with the declared header charset instead of the platform default,
      // so the result does not vary with the JVM's file.encoding setting.
      val statusLine = new StatusLine(new String(rawStatusLine, headerResponseFormat))
      statusLine.getStatusCode.toString
    }.getOrElse("000")
  }
}

/** Domain of this record, obtained by applying ExtractDomain to its URL. */
val getDomain: String = ExtractDomain(getUrl)
Expand Down
61 changes: 60 additions & 1 deletion src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfter, FunSuite}
import org.apache.commons.io.FilenameUtils

@RunWith(classOf[JUnitRunner])
class ArchiveRecordTest extends FunSuite with BeforeAndAfter {
Expand All @@ -44,10 +45,68 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter {
assert(RecordLoader.loadArchives(warcPath, sc).count == 299L)
}

test("Resource name produces expected result.") {
  // Compare base file names only, as extracted by FilenameUtils.getName.
  val arcNames = RecordLoader.loadArchives(arcPath, sc)
    .map(record => FilenameUtils.getName(record.getArchiveFilename))
    .take(3)
  val warcNames = RecordLoader.loadArchives(warcPath, sc)
    .map(record => FilenameUtils.getName(record.getArchiveFilename))
    .take(3)

  val expectedArcNames = Array("example.arc.gz", "example.arc.gz", "example.arc.gz")
  val expectedWarcNames = Array("example.warc.gz", "example.warc.gz", "example.warc.gz")

  assert(arcNames.deep == expectedArcNames.deep)
  assert(warcNames.deep == expectedWarcNames.deep)
}

test("Crawl Dates") {
  // The first three records of both sample archives share one crawl date.
  val expectedDates = Array("20080430", "20080430", "20080430")

  val arcDates = RecordLoader.loadArchives(arcPath, sc).map(_.getCrawlDate).take(3)
  val warcDates = RecordLoader.loadArchives(warcPath, sc).map(_.getCrawlDate).take(3)

  assert(arcDates.deep == expectedDates.deep)
  assert(warcDates.deep == expectedDates.deep)
}

test("Domains") {
  // Expected domains of the first three records; some records yield an empty domain.
  val expectedArcDomains = Array("", "", "www.archive.org")
  val expectedWarcDomains = Array("", "www.archive.org", "www.archive.org")

  val arcDomains = RecordLoader.loadArchives(arcPath, sc).map(_.getDomain).take(3)
  val warcDomains = RecordLoader.loadArchives(warcPath, sc).map(_.getDomain).take(3)

  assert(arcDomains.deep == expectedArcDomains.deep)
  assert(warcDomains.deep == expectedWarcDomains.deep)
}

test("Urls") {
  // Expected URLs of the first three records in each sample archive.
  val expectedArcUrls = Array(
    "filedesc://IAH-20080430204825-00000-blackbook.arc",
    "dns:www.archive.org",
    "http://www.archive.org/robots.txt")
  val expectedWarcUrls = Array(
    "dns:www.archive.org",
    "http://www.archive.org/robots.txt",
    "http://www.archive.org/")

  val arcUrls = RecordLoader.loadArchives(arcPath, sc).map(_.getUrl).take(3)
  val warcUrls = RecordLoader.loadArchives(warcPath, sc).map(_.getUrl).take(3)

  assert(arcUrls.deep == expectedArcUrls.deep)
  assert(warcUrls.deep == expectedWarcUrls.deep)
}

test("Mime-Type") {
  // The first WARC record is expected to report the "unknown" MIME type.
  val expectedArcTypes = Array("text/plain", "text/dns", "text/plain")
  val expectedWarcTypes = Array("unknown", "text/plain", "text/html")

  val arcTypes = RecordLoader.loadArchives(arcPath, sc).map(_.getMimeType).take(3)
  val warcTypes = RecordLoader.loadArchives(warcPath, sc).map(_.getMimeType).take(3)

  assert(arcTypes.deep == expectedArcTypes.deep)
  assert(warcTypes.deep == expectedWarcTypes.deep)
}

test("Get Http Status") {
  // Some of the leading records are expected to report the status "000".
  val expectedArcStatuses = Array("000", "000", "200")
  val expectedWarcStatuses = Array("000", "200", "200")

  val arcStatuses = RecordLoader.loadArchives(arcPath, sc).map(_.getHttpStatus).take(3)
  val warcStatuses = RecordLoader.loadArchives(warcPath, sc).map(_.getHttpStatus).take(3)

  assert(arcStatuses.deep == expectedArcStatuses.deep)
  assert(warcStatuses.deep == expectedWarcStatuses.deep)
}

after {
  // Stop the SparkContext after each test, skipping the call when sc is null.
  Option(sc).foreach(_.stop())
}
}

0 comments on commit 7731b6d

Please sign in to comment.