Skip to content

Commit

Permalink
Adding getCrawlYear in ArchiveRecords, #104 (#105)
Browse files Browse the repository at this point in the history
Will test locally to see if this fits the use case.
  • Loading branch information
ianmilligan1 authored and ruebot committed Oct 26, 2017
1 parent a539a1c commit 010fe24
Show file tree
Hide file tree
Showing 4 changed files with 14 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ class ArcRecord(r: SerializableWritable[ArcRecordWritable]) extends ArchiveRecor

/** Date from the wrapped record's metadata, reduced by `ExtractDate` to its `YYYYMM` component. */
val getCrawlMonth: String = {
  val metadataDate = r.t.getRecord.getMetaData.getDate
  ExtractDate(metadataDate, DateComponent.YYYYMM)
}

/** Date from the wrapped record's metadata, reduced by `ExtractDate` to its `YYYY` component. */
val getCrawlYear: String = {
  val metadataDate = r.t.getRecord.getMetaData.getDate
  ExtractDate(metadataDate, DateComponent.YYYY)
}

/** MIME type as reported by the wrapped record's metadata. */
val getMimeType: String = {
  val metaData = r.t.getRecord.getMetaData
  metaData.getMimetype
}

/** URL as reported by the wrapped record's metadata. */
val getUrl: String = {
  val metaData = r.t.getRecord.getMetaData
  metaData.getUrl
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ trait ArchiveRecord extends Serializable {

/** Record date reduced to year and month (implementations extract the `DateComponent.YYYYMM` component). */
val getCrawlMonth: String

/** Record date reduced to the year (implementations extract the `DateComponent.YYYY` component). */
val getCrawlYear: String

/** URL of the record; the ARC implementation reads it from the record's metadata. */
val getUrl: String

/** Domain associated with the record — presumably derived from the URL; TODO confirm against the implementations. */
val getDomain: String
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,14 @@ class GenericArchiveRecord(r: SerializableWritable[GenericArchiveRecordWritable]
}
}

/**
 * Year (`YYYY` component) of the record's date.
 *
 * ARC records supply the date string straight from their metadata. For any
 * other format the WARC header date is parsed with `ISO8601` and rendered
 * as a 14-digit timestamp before the year is extracted.
 */
val getCrawlYear: String = {
  val isArc = r.t.getFormat == ArchiveFormat.ARC
  // Resolve the format-specific date string first so ExtractDate is called in one place.
  val dateString =
    if (isArc) {
      arcRecord.getMetaData.getDate
    } else {
      val headerDate = ISO8601.parse(warcRecord.getHeader.getDate)
      ArchiveUtils.get14DigitDate(headerDate)
    }
  ExtractDate(dateString, DateComponent.YYYY)
}

val getContentBytes: Array[Byte] = {
if (r.t.getFormat == ArchiveFormat.ARC) {
ArcRecordUtils.getBodyContent(arcRecord)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ class WarcRecord(r: SerializableWritable[WarcRecordWritable]) extends ArchiveRec

/**
 * Header date of the wrapped record, parsed with `ISO8601`, rendered as a
 * 14-digit timestamp, then reduced by `ExtractDate` to its `YYYYMM` component.
 */
val getCrawlMonth: String = {
  val headerDate = ISO8601.parse(r.t.getRecord.getHeader.getDate)
  val timestamp14 = ArchiveUtils.get14DigitDate(headerDate)
  ExtractDate(timestamp14, DateComponent.YYYYMM)
}

/**
 * Header date of the wrapped record, parsed with `ISO8601`, rendered as a
 * 14-digit timestamp, then reduced by `ExtractDate` to its `YYYY` component.
 */
val getCrawlYear: String = {
  val headerDate = ISO8601.parse(r.t.getRecord.getHeader.getDate)
  val timestamp14 = ArchiveUtils.get14DigitDate(headerDate)
  ExtractDate(timestamp14, DateComponent.YYYY)
}

/** Content of the wrapped record as bytes, obtained through `WarcRecordUtils.getContent`. */
val getContentBytes: Array[Byte] = {
  val record = r.t.getRecord
  WarcRecordUtils.getContent(record)
}

/** The bytes of `getContentBytes` decoded into a `String`. */
val getContentString: String = {
  val contentBytes = getContentBytes
  // NOTE(review): the single-argument String constructor decodes with the
  // platform default charset — confirm that is intended for archived content.
  new String(contentBytes)
}
Expand Down

0 comments on commit 010fe24

Please sign in to comment.