Skip to content

Commit

Permalink
ArchiveRecord + impl moved into same Scala file; code cleanup. (#230)
Browse files Browse the repository at this point in the history
  • Loading branch information
lintool authored and ruebot committed May 21, 2018
1 parent a9649aa commit e57a99c
Show file tree
Hide file tree
Showing 2 changed files with 91 additions and 104 deletions.
91 changes: 91 additions & 0 deletions src/main/scala/io/archivesunleashed/ArchiveRecord.scala
Original file line number Diff line number Diff line change
Expand Up @@ -17,20 +17,111 @@

package io.archivesunleashed

import java.text.SimpleDateFormat

import io.archivesunleashed.data.{ArcRecordUtils, WarcRecordUtils, ArchiveRecordWritable}
import io.archivesunleashed.matchbox.{ExtractDate, ExtractDomain, RemoveHttpHeader}
import org.apache.spark.SerializableWritable
import org.archive.io.arc.ARCRecord
import org.archive.io.warc.WARCRecord
import org.archive.util.ArchiveUtils

/** A single record read from a web archive.
  *
  * Implementations expose the record's crawl metadata and its payload through
  * the accessors declared here.
  */
trait ArchiveRecord extends Serializable {

  /** The date on which the record was crawled. */
  def getCrawlDate: String

  /** The month in which the record was crawled. */
  def getCrawlMonth: String

  /** The record's content as raw bytes. */
  def getContentBytes: Array[Byte]

  /** The record's content decoded into a String. */
  def getContentString: String

  /** The MIME type of the record. */
  def getMimeType: String

  /** The URL of the record. */
  def getUrl: String

  /** The domain of the record. */
  def getDomain: String

  /** The raw bytes of an image carried by the record. */
  def getImageBytes: Array[Byte]
}

/** Default implementation of a record in a web archive.
  *
  * Every accessor is a `val`, so all values are computed exactly once, in
  * declaration order, while the instance is being constructed. Later members
  * rely on earlier ones (e.g. `getContentString` on `getContentBytes`), so the
  * declaration order below must be preserved.
  *
  * Any record whose format is not ARC is treated as WARC by the accessors.
  *
  * @constructor an archive record.
  * @param r the serialized record
  */
class ArchiveRecordImpl(r: SerializableWritable[ArchiveRecordWritable]) extends ArchiveRecord {

  // Evaluated once instead of re-reading the format in every accessor.
  private[this] val isArc: Boolean =
    r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.ARC

  /** The wrapped ARC record; `null` unless the record is in ARC format.
    *
    * Kept as a public `var` for compatibility with existing callers.
    */
  var arcRecord: ARCRecord =
    if (isArc) r.t.getRecord.asInstanceOf[ARCRecord] else null

  /** The wrapped WARC record; `null` unless the record is in WARC format.
    *
    * Kept as a public `var` for compatibility with existing callers.
    */
  var warcRecord: WARCRecord =
    if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.WARC) {
      r.t.getRecord.asInstanceOf[WARCRecord]
    } else {
      null
    }

  /** Parser for the ISO 8601 timestamps read from WARC record headers. */
  val ISO8601: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssX")

  // Timestamp handed to ExtractDate. ARC metadata dates are used as-is; WARC
  // header dates are parsed once here (rather than once per accessor) and
  // converted to the 14-digit form.
  private[this] val crawlTimestamp: String =
    if (isArc) {
      arcRecord.getMetaData.getDate
    } else {
      ArchiveUtils.get14DigitDate(ISO8601.parse(warcRecord.getHeader.getDate))
    }

  /** The crawl date, extracted with the `YYYYMMDD` date component. */
  val getCrawlDate: String =
    ExtractDate(crawlTimestamp, ExtractDate.DateComponent.YYYYMMDD)

  /** The crawl month, extracted with the `YYYYMM` date component. */
  val getCrawlMonth: String =
    ExtractDate(crawlTimestamp, ExtractDate.DateComponent.YYYYMM)

  /** The content of the record as an array of bytes. */
  val getContentBytes: Array[Byte] =
    if (isArc) ArcRecordUtils.getBodyContent(arcRecord)
    else WarcRecordUtils.getContent(warcRecord)

  /** The content of the record as a String.
    *
    * NOTE: decoding uses the JVM's default charset, as no charset is passed
    * to the String constructor.
    */
  val getContentString: String = new String(getContentBytes)

  /** The MIME type: taken from the ARC metadata, or derived from the content
    * bytes for WARC records.
    */
  val getMimeType: String =
    if (isArc) arcRecord.getMetaData.getMimetype
    else WarcRecordUtils.getWarcResponseMimeType(getContentBytes)

  /** The URL, read from the ARC metadata or the WARC header. */
  val getUrl: String =
    if (isArc) arcRecord.getMetaData.getUrl
    else warcRecord.getHeader.getUrl

  /** The domain extracted from [[getUrl]]. */
  val getDomain: String = ExtractDomain(getUrl)

  /** The raw bytes of an image, with any leading HTTP header removed.
    *
    * When the content starts with `HTTP/`, everything up to and including the
    * first occurrence of `RemoveHttpHeader.headerEnd` is dropped. The
    * terminator is located in the raw bytes rather than in the decoded string,
    * so the offset stays correct even when header bytes do not decode to one
    * character each. If no terminator is present the content is returned
    * unchanged instead of being sliced at a bogus offset.
    */
  val getImageBytes: Array[Byte] =
    if (getContentString.startsWith("HTTP/")) {
      // ISO-8859-1 maps every char below 256 to exactly one byte.
      val terminator =
        RemoveHttpHeader.headerEnd.getBytes(java.nio.charset.StandardCharsets.ISO_8859_1)
      val terminatorAt = getContentBytes.indexOfSlice(terminator)
      if (terminatorAt >= 0) {
        getContentBytes.slice(terminatorAt + terminator.length, getContentBytes.length)
      } else {
        getContentBytes
      }
    } else {
      getContentBytes
    }
}
104 changes: 0 additions & 104 deletions src/main/scala/io/archivesunleashed/ArchiveRecordImpl.scala

This file was deleted.

0 comments on commit e57a99c

Please sign in to comment.