-
Notifications
You must be signed in to change notification settings - Fork 33
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add .getHttpStatus and .getFilename to ArchiveRecordImpl class #198 & #164 #292
Changes from all commits
82314ad
9bca66e
37ae8e0
46b32a2
739e06c
b722d1d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,6 +22,7 @@ import org.apache.spark.{SparkConf, SparkContext} | |
import org.junit.runner.RunWith | ||
import org.scalatest.junit.JUnitRunner | ||
import org.scalatest.{BeforeAndAfter, FunSuite} | ||
import org.apache.commons.io.FilenameUtils | ||
|
||
@RunWith(classOf[JUnitRunner]) | ||
class ArchiveRecordTest extends FunSuite with BeforeAndAfter { | ||
|
@@ -44,10 +45,68 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter { | |
assert(RecordLoader.loadArchives(warcPath, sc).count == 299L) | ||
} | ||
|
||
test("Resource name produces expected result.") { | ||
  // File-name component (FilenameUtils.getName) of the archive filename reported | ||
  // by the first three records of each sample archive. | ||
  val arcNames = RecordLoader.loadArchives(arcPath, sc) | ||
    .map(record => FilenameUtils.getName(record.getArchiveFilename)) | ||
    .take(3) | ||
  val warcNames = RecordLoader.loadArchives(warcPath, sc) | ||
    .map(record => FilenameUtils.getName(record.getArchiveFilename)) | ||
    .take(3) | ||
  // All three records of a sample are expected to report the same archive name. | ||
  val expectedArc = Array.fill(3)("example.arc.gz") | ||
  val expectedWarc = Array.fill(3)("example.warc.gz") | ||
  assert(arcNames.deep == expectedArc.deep) | ||
  assert(warcNames.deep == expectedWarc.deep) | ||
} | ||
|
||
test("Crawl Dates") { | ||
  // The first three records of both samples are expected to share one crawl date. | ||
  val expectedDates = Array("20080430", "20080430", "20080430") | ||
  val arcDates = RecordLoader.loadArchives(arcPath, sc).map(_.getCrawlDate).take(3) | ||
  val warcDates = RecordLoader.loadArchives(warcPath, sc).map(_.getCrawlDate).take(3) | ||
  assert(arcDates.deep == expectedDates.deep) | ||
  assert(warcDates.deep == expectedDates.deep) | ||
} | ||
|
||
test("Domains") { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There are a few extra test methods here. Are they in scope for the issues posted in the original comment? Or do they cover other tickets? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. No other tickets. Basically, in my last PR I had good test coverage, but failed to test particular cases, and my code passed Travis with bugs. I decided to include these additional tests in case that happened again (it was unlikely, but I wanted this PR to go more smoothly). Since ArchiveRecord is used widely across the tests, I did not expect the tests to improve coverage (as per #260). |
||
val textSampleArc = RecordLoader.loadArchives(arcPath, sc) | ||
.map(x => x.getDomain).take(3) | ||
val textSampleWarc = RecordLoader.loadArchives(warcPath, sc) | ||
.map(x => x.getDomain).take(3) | ||
assert(textSampleArc.deep == Array("", "", "www.archive.org").deep) | ||
assert(textSampleWarc.deep == Array("", "www.archive.org", "www.archive.org").deep) | ||
} | ||
|
||
test("Urls") { | ||
  // URLs of the first three records in each sample archive. | ||
  val arcUrls = RecordLoader.loadArchives(arcPath, sc).map(_.getUrl).take(3) | ||
  val warcUrls = RecordLoader.loadArchives(warcPath, sc).map(_.getUrl).take(3) | ||
  val expectedArc = Array( | ||
    "filedesc://IAH-20080430204825-00000-blackbook.arc", | ||
    "dns:www.archive.org", | ||
    "http://www.archive.org/robots.txt") | ||
  val expectedWarc = Array( | ||
    "dns:www.archive.org", | ||
    "http://www.archive.org/robots.txt", | ||
    "http://www.archive.org/") | ||
  assert(arcUrls.deep == expectedArc.deep) | ||
  assert(warcUrls.deep == expectedWarc.deep) | ||
} | ||
|
||
test("Mime-Type") { | ||
  // MIME types of the first three records in each sample archive. | ||
  val arcMimeTypes = RecordLoader.loadArchives(arcPath, sc).map(_.getMimeType).take(3) | ||
  val warcMimeTypes = RecordLoader.loadArchives(warcPath, sc).map(_.getMimeType).take(3) | ||
  val expectedArc = Array("text/plain", "text/dns", "text/plain") | ||
  val expectedWarc = Array("unknown", "text/plain", "text/html") | ||
  assert(arcMimeTypes.deep == expectedArc.deep) | ||
  assert(warcMimeTypes.deep == expectedWarc.deep) | ||
} | ||
|
||
test("Get Http Status") { | ||
  // HTTP status strings of the first three records in each sample archive. | ||
  val arcStatuses = RecordLoader.loadArchives(arcPath, sc).map(_.getHttpStatus).take(3) | ||
  val warcStatuses = RecordLoader.loadArchives(warcPath, sc).map(_.getHttpStatus).take(3) | ||
  val expectedArc = Array("000", "000", "200") | ||
  val expectedWarc = Array("000", "200", "200") | ||
  assert(arcStatuses.deep == expectedArc.deep) | ||
  assert(warcStatuses.deep == expectedWarc.deep) | ||
} | ||
|
||
after { | ||
  // Stop sc when it is set; Option(...) turns a null reference into None, so nothing is called. | ||
  Option(sc).foreach(_.stop()) | ||
} | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What's the rationale for `US-ASCII`?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I suppose it is this? https://tools.ietf.org/html/rfc7230#section-3.2.4
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes - this part mimics @dportabella's suggestion. It seemed weird to me at first as well.