Skip to content

Commit

Permalink
Initial check-in of new keepDate to support lists
Browse files Browse the repository at this point in the history
For #108
  • Loading branch information
ianmilligan1 committed Jan 6, 2018
1 parent 128b86d commit 597cd96
Show file tree
Hide file tree
Showing 3 changed files with 5 additions and 5 deletions.
4 changes: 2 additions & 2 deletions src/main/scala/io/archivesunleashed/spark/rdd/RecordRDD.scala
Expand Up @@ -73,8 +73,8 @@ object RecordRDD extends java.io.Serializable {
rdd.filter(r => mimeTypes.contains(r.getMimeType))
}

- def keepDate(date: String, component: DateComponent = DateComponent.YYYYMMDD) = {
- rdd.filter(r => ExtractDate(r.getCrawlDate, component) == date)
+ def keepDate(dates: List[String], component: DateComponent = DateComponent.YYYYMMDD) = {
+ rdd.filter(r => dates.contains(ExtractDate(r.getCrawlDate, component)))
}

def keepUrls(urls: Set[String]) = {
Expand Down
4 changes: 2 additions & 2 deletions src/test/scala/io/archivesunleashed/spark/ArcTest.scala
Expand Up @@ -45,12 +45,12 @@ class ArcTest extends FunSuite with BeforeAndAfter {

test("filter date") {
val four = RecordLoader.loadArchives(arcPath, sc, keepValidPages = false)
- .keepDate("200804", DateComponent.YYYYMM)
+ .keepDate(List("200804"), DateComponent.YYYYMM)
.map(r => r.getCrawlDate)
.collect()

val five = RecordLoader.loadArchives(arcPath, sc, keepValidPages = false)
- .keepDate("200805", DateComponent.YYYYMM)
+ .keepDate(List("200805"), DateComponent.YYYYMM)
.map(r => r.getCrawlDate)
.collect()

Expand Down
Expand Up @@ -62,7 +62,7 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter {
val r = base
.filter (x => ExtractDate(x.getCrawlDate, component) == "2008")
.map ( mp => mp.getUrl).take(3)
- val r2 = base.keepDate("2008", component)
+ val r2 = base.keepDate(List("2008"), component)
.map ( mp => mp.getUrl).take(3)
assert (r2.sameElements(r)) }

Expand Down

0 comments on commit 597cd96

Please sign in to comment.