Skip to content

Commit

Permalink
Deal with final scalastyle assessments, and Convert nulls to Option(T). (#249)
Browse files Browse the repository at this point in the history

* Fully resolves #196 
* Resolves #212
  • Loading branch information
greebie authored and ruebot committed Aug 9, 2018
1 parent 77dbd51 commit 004ce1f
Show file tree
Hide file tree
Showing 27 changed files with 152 additions and 122 deletions.
3 changes: 3 additions & 0 deletions src/main/scala/io/archivesunleashed/ArchiveRecord.scala
Expand Up @@ -59,8 +59,11 @@ trait ArchiveRecord extends Serializable {
* @param r the serialized record
*/
class ArchiveRecordImpl(r: SerializableWritable[ArchiveRecordWritable]) extends ArchiveRecord {
// Using Option[T] for these fields would require refactoring methods, so null is kept and the scalastyle null check is disabled here.
// scalastyle:off null
var arcRecord: ARCRecord = null
var warcRecord: WARCRecord = null
// scalastyle:on null

if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.ARC) {
arcRecord = r.t.getRecord.asInstanceOf[ARCRecord]
Expand Down
4 changes: 2 additions & 2 deletions src/main/scala/io/archivesunleashed/app/CommandLineApp.scala
Expand Up @@ -70,14 +70,14 @@ class CmdAppConf(args: Seq[String]) extends ScallopConf(args) {
*
* @param e exception that Scallop throws
*/
// scalastyle:off regex
override def onError(e: Throwable): Unit = e match {
case ScallopException(message) =>
// scalastyle:off
println(message)
// scalastyle:on
throw new IllegalArgumentException()
case other: Any => throw other
}
// scalastyle:on regex

mainOptions = Seq(input, output)
var extractor = opt[String](descr = "extractor", required = true)
Expand Down
13 changes: 9 additions & 4 deletions src/main/scala/io/archivesunleashed/app/NERCombinedJson.scala
Expand Up @@ -61,7 +61,7 @@ class NERCombinedJson extends Serializable {
val tmpPath = new Path(tmpFile)

// Merge part-files into single file.
FileUtil.copyMerge(hdfs, srcPath, hdfs, tmpPath, false, hadoopConfig, null)
FileUtil.copyMerge(hdfs, srcPath, hdfs, tmpPath, false, hadoopConfig, "")

// Read file of JSON arrays, write into single JSON array of arrays.
val fsInStream = hdfs.open(tmpPath)
Expand All @@ -71,9 +71,14 @@ class NERCombinedJson extends Serializable {
// now is a file of JSON
val outFile = new BufferedWriter(new OutputStreamWriter(fsOutStream))
outFile.write("[")
val line = inFile.readLine()
if (line != null) outFile.write(line)
Iterator.continually(inFile.readLine()).takeWhile(_ != null).foreach(s => {outFile.write(", " + s)})
val line: Option[String] = Option(inFile.readLine())
line match {
case Some(line) =>
outFile.write(line)
case None =>
}
Iterator.continually(inFile.readLine()).takeWhile(Option(_) != None)
.foreach(s => {outFile.write(", " + s)})
outFile.write("]")
outFile.close()

Expand Down
Expand Up @@ -35,9 +35,11 @@ object ComputeImageSize {
try {
val in = new ByteArrayInputStream(bytes)
val image = ImageIO.read(in)
// scalastyle:off null
if (image == null) {
nullImage
}
// scalastyle:on null
(image.getWidth(), image.getHeight())
} catch {
case e: Throwable => {
Expand Down
4 changes: 2 additions & 2 deletions src/main/scala/io/archivesunleashed/matchbox/ComputeMD5.scala
Expand Up @@ -19,9 +19,9 @@ package io.archivesunleashed.matchbox
import java.security.MessageDigest

/** Compute MD5 checksum. */
// scalastyle: off
// scalastyle:off object.name
object ComputeMD5 {
// scalastyle: on
// scalastyle:on object.name
/** Computes the MD5 checksum of a byte array (eg. an image).
*
* For string data, it is better to use `StringUtils.computeHash()`.
Expand Down
Expand Up @@ -29,29 +29,13 @@ object ExtractBoilerpipeText {
* @return text with boilerplate removed, or an empty string if the input is null.
*/
def apply(input: String): String = {
try {
if (input.isEmpty) {
null
} else {
extract(input)
}
} catch {
case e: Exception =>
throw new IOException("Caught exception processing input row " + e)
}
}

/** Extracts boilerplate.
*
* @param input an html string possibly containing boilerpipe text
* @return filtered text or Nil if the text is empty.
*/
def extract (input: String): String = {
val text = DefaultExtractor.INSTANCE.getText(input).replaceAll("[\\r\\n]+", " ").trim()
if (text.isEmpty) {
null
} else {
text
val maybeInput = Option(input)
maybeInput match {
case Some(text) =>
DefaultExtractor.INSTANCE
.getText(input).replaceAll("[\\r\\n]+", " ").trim()
case None =>
""
}
}
}
21 changes: 12 additions & 9 deletions src/main/scala/io/archivesunleashed/matchbox/ExtractDate.scala
Expand Up @@ -37,15 +37,18 @@ object ExtractDate {
val yearSS = 4
val monthSS = 6
val daySS = 8
if (fullDate == null) {
fullDate
} else {dateFormat match {
case YYYY => fullDate.substring(startSS, yearSS)
case MM => fullDate.substring(yearSS, monthSS)
case DD => fullDate.substring(monthSS, daySS)
case YYYYMM => fullDate.substring(startSS, monthSS)
case _ => fullDate.substring(startSS, daySS)
}
val maybeFullDate: Option[String] = Option(fullDate)
maybeFullDate match {
case Some(fulldate) =>
dateFormat match {
case YYYY => fullDate.substring(startSS, yearSS)
case MM => fullDate.substring(yearSS, monthSS)
case DD => fullDate.substring(monthSS, daySS)
case YYYYMM => fullDate.substring(startSS, monthSS)
case _ => fullDate.substring(startSS, daySS)
}
case None =>
""
}
}
}
28 changes: 19 additions & 9 deletions src/main/scala/io/archivesunleashed/matchbox/ExtractDomain.scala
Expand Up @@ -27,18 +27,28 @@ object ExtractDomain {
* @return host of source when source parses as a URL, otherwise host of url, or an empty string when neither parses.
*/
def apply(url: String, source: String = ""): String = {
if (url == null) return null
var host: String = null
try {
host = new URL(url).getHost
} catch {
case e: Exception => // it's okay
val maybeSource: Option[URL] = checkUrl(source)
val maybeHost: Option[URL] = checkUrl(url)
maybeSource match {
case Some(source) =>
source.getHost

case None =>
maybeHost match {
case Some(host) =>
host.getHost
case None =>
""
}
}
if (host != null || source == null) return host
}

def checkUrl(url: String): Option[URL] = {
try {
new URL(source).getHost
Some(new URL(url))
} catch {
case e: Exception => null
case e: Exception =>
None
}
}
}
47 changes: 25 additions & 22 deletions src/main/scala/io/archivesunleashed/matchbox/ExtractLinks.scala
Expand Up @@ -20,6 +20,7 @@ import java.io.IOException
import org.jsoup.Jsoup
import org.jsoup.select.Elements
import scala.collection.mutable
import scala.Option

/** Extracts links from a webpage given the HTML content (using Jsoup). */
object ExtractLinks {
Expand All @@ -32,28 +33,30 @@ object ExtractLinks {
* @return a sequence of (source, target, anchortext).
*/
def apply(src: String, html: String, base: String = ""): Seq[(String, String, String)] = {
try {
val output = mutable.MutableList[(String, String, String)]()

// Basic input checking, return empty list if we fail.
if (src == null) return output
if (html.isEmpty) return output

val doc = Jsoup.parse(html)
val links: Elements = doc.select("a[href]")
val it = links.iterator()
while (it.hasNext) {
val link = it.next()
if (base.nonEmpty) link.setBaseUri(base)
val target = link.attr("abs:href")
if (target.nonEmpty) {
output += ((src, target, link.text))
}
val srcMaybe: Option[String] = Option(src)
val htmlMaybe: Option[String] = Option(html)
val output = mutable.MutableList[(String, String, String)]()
srcMaybe match {
case Some(valid_src) =>
htmlMaybe match {
case Some (valid_html) =>
val doc = Jsoup.parse(valid_html)
val links: Elements = doc.select("a[href]")
val it = links.iterator()
while (it.hasNext) {
val link = it.next()
if (base.nonEmpty) link.setBaseUri(base)
val target = link.attr("abs:href")
if (target.nonEmpty) {
output += ((valid_src, target, link.text))
}
}
case None =>
// do nothing
}
case None =>
// do nothing
}
output
} catch {
case e: Exception =>
throw new IOException("Caught exception processing input ", e);
}
output
}
}
Expand Up @@ -25,6 +25,7 @@ import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel}
import java.util
import scala.collection.mutable

// scalastyle:off
/** Reads in a text string, and returns entities identified by the configured Stanford NER classifier. */
object NERClassifier {

Expand Down Expand Up @@ -114,3 +115,4 @@ object NERClassifier {
}
}
}
// scalastyle:on
11 changes: 6 additions & 5 deletions src/main/scala/io/archivesunleashed/matchbox/RemoveHTML.scala
Expand Up @@ -28,11 +28,12 @@ object RemoveHTML {
* @return content without html markup.
*/
def apply(content: String): String = {
try {
Jsoup.parse(content).text().replaceAll("[\\r\\n]+", " ")
}
catch {
case e: Exception => throw new IOException("Caught exception processing input row ", e)
val maybeContent: Option[String] = Option(content)
maybeContent match {
case Some(content) =>
Jsoup.parse(content).text().replaceAll("[\\r\\n]+", " ")
case None =>
""
}
}
}
20 changes: 10 additions & 10 deletions src/main/scala/io/archivesunleashed/matchbox/RemoveHttpHeader.scala
Expand Up @@ -26,16 +26,16 @@ object RemoveHttpHeader {
* @return string with HTTP headers removed.
*/
def apply(content: String): String = {
try {
if (content.startsWith("HTTP/")){
content.substring(content.indexOf(headerEnd) + headerEnd.length)
} else {
content
}
} catch {
case e: Exception => {
null
}
val maybeContent: Option[String] = Option(content)
maybeContent match {
case Some(content) =>
if (content.startsWith("HTTP/")){
content.substring(content.indexOf(headerEnd) + headerEnd.length)
} else {
content
}
case None =>
""
}
}
}
Expand Up @@ -25,23 +25,25 @@ import ops.tuple.ToList
object TupleFormatter {
/** Borrowed from shapeless' flatten.scala example. */
trait LowPriorityFlatten extends Poly1 {
// scalastyle:off public.methods.have.type
implicit def default[T] = at[T](Tuple1(_))
}

/** Flattens nested tuples, taking as an argument a tuple of any size. */
// scalastyle:off
// scalastyle:off object.name
object flatten extends LowPriorityFlatten {
// scalastyle:on
// scalastyle:on object.name
implicit def caseTuple[T <: Product](implicit fm: FlatMapper[T, flatten.type]) =
at[T](_.flatMap(flatten))
}

/** Transforms a tuple into a tab-delimited string, flattening any nesting,
* taking as an argument a tuple of any size. */
// scalastyle:off
// scalastyle:off object.name
object tabDelimit extends Poly1 {
// scalastyle:on object.name
implicit def caseTuple[T <: Product, Lub](implicit tl: ToList[T, Lub], fm: FlatMapper[T, flatten.type]) =
at[T](flatten(_).asInstanceOf[Product].productIterator.mkString("\t"))
}
// scalastyle:on
// scalastyle:on public.methods.have.type
}
8 changes: 4 additions & 4 deletions src/main/scala/io/archivesunleashed/matchbox/package.scala
Expand Up @@ -28,10 +28,10 @@ import scala.xml.Utility._
package object matchbox {
implicit class WWWLink(s: String) {
def removePrefixWWW(): String = {
if (s == null) {
null
} else {
s.replaceAll("^\\s*www\\.", "")
val maybeString: Option[String] = Option(s)
maybeString match {
case Some(s) => s.replaceAll("^\\s*www\\.", "")
case None => ""
}
}

Expand Down
2 changes: 2 additions & 0 deletions src/main/scala/io/archivesunleashed/package.scala
Expand Up @@ -60,8 +60,10 @@ package object archivesunleashed {
* @return an RDD of JValue (json objects) for mapping.
*/
def loadTweets(path: String, sc: SparkContext): RDD[JValue] =
// scalastyle:off null
sc.textFile(path).filter(line => !line.startsWith("{\"delete\":"))
.map(line => try { parse(line) } catch { case e: Exception => null }).filter(x => x != null)
// scalastyle:on null
}

/** A Wrapper class around RDD to simplify counting. */
Expand Down
10 changes: 5 additions & 5 deletions src/main/scala/io/archivesunleashed/util/TweetUtils.scala
Expand Up @@ -31,15 +31,15 @@ object TweetUtils {
val user = "user"
implicit lazy val formats = org.json4s.DefaultFormats
/** Get Twitter status id. */
def id(): String = try { (tweet \ "id_str").extract[String] } catch { case e: Exception => null}
def id(): String = try { (tweet \ "id_str").extract[String] } catch { case e: Exception => ""}
/** Get the date a status was created. */
def createdAt(): String = try { (tweet \ "created_at").extract[String] } catch { case e: Exception => null}
def createdAt(): String = try { (tweet \ "created_at").extract[String] } catch { case e: Exception => ""}
/** Get the status text. */
def text(): String = try { (tweet \ "text").extract[String] } catch { case e: Exception => null}
def text(): String = try { (tweet \ "text").extract[String] } catch { case e: Exception => ""}
/** Get the language code (ISO 639-1). */
def lang: String = try { (tweet \ "lang").extract[String] } catch { case e: Exception => null}
def lang: String = try { (tweet \ "lang").extract[String] } catch { case e: Exception => ""}
/** Get the username of the user who wrote the status. */
def username(): String = try { (tweet \ user \ "screen_name").extract[String] } catch { case e: Exception => null}
def username(): String = try { (tweet \ user \ "screen_name").extract[String] } catch { case e: Exception => ""}
/** Check if user of status is "verified" (true or false). */
def isVerifiedUser(): Boolean = try { (tweet \ user \ "verified").extract[Boolean] } catch { case e: Exception => false}
/** Get the number of followers the user has. */
Expand Down
Expand Up @@ -72,13 +72,13 @@ class CommandLineAppTest extends FunSuite with BeforeAndAfter {
}

test("command line app tests") {
for (a <- testSuccessCmds) {
for {a <- testSuccessCmds} {
app.CommandLineAppRunner.test(a, sc)
assert(Files.exists(Paths.get(outputDir)))
FileUtils.deleteDirectory(new File(outputDir))
}

for (a <- testFailCmds) {
for {a <- testFailCmds} {
try {
app.CommandLineAppRunner.test(a, sc)
assert(false)
Expand Down

0 comments on commit 004ce1f

Please sign in to comment.