Deal with final scalastyle assessments, and Convert nulls to Option(T). (#249)

* Fully resolves #196 * Resolves #212
archivesunleashed · Aug 9, 2018 · 004ce1f · 004ce1f
1 parent 77dbd51
commit 004ce1f
Show file tree

Hide file tree

Showing 27 changed files with 152 additions and 122 deletions.
diff --git a/src/main/scala/io/archivesunleashed/ArchiveRecord.scala b/src/main/scala/io/archivesunleashed/ArchiveRecord.scala
@@ -59,8 +59,11 @@ trait ArchiveRecord extends Serializable {
  *  @param r the serialized record
  */
 class ArchiveRecordImpl(r: SerializableWritable[ArchiveRecordWritable]) extends ArchiveRecord {
+  // Using Option[T] here would require refactoring the methods; allow null for these fields.
+  // scalastyle:off null
   var arcRecord: ARCRecord = null
   var warcRecord: WARCRecord = null
+  // scalastyle:on null
 
   if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.ARC) {
     arcRecord = r.t.getRecord.asInstanceOf[ARCRecord]

diff --git a/src/main/scala/io/archivesunleashed/app/CommandLineApp.scala b/src/main/scala/io/archivesunleashed/app/CommandLineApp.scala
@@ -70,14 +70,14 @@ class CmdAppConf(args: Seq[String]) extends ScallopConf(args) {
     *
     * @param e exception that Scallop throws
     */
+  // scalastyle:off regex
   override def onError(e: Throwable): Unit = e match {
     case ScallopException(message) =>
-      // scalastyle:off
       println(message)
-      // scalastyle:on
       throw new IllegalArgumentException()
     case other: Any => throw other
   }
+  // scalastyle:on regex
 
   mainOptions = Seq(input, output)
   var extractor = opt[String](descr = "extractor", required = true)

diff --git a/src/main/scala/io/archivesunleashed/app/NERCombinedJson.scala b/src/main/scala/io/archivesunleashed/app/NERCombinedJson.scala
@@ -61,7 +61,7 @@ class NERCombinedJson extends Serializable {
     val tmpPath = new Path(tmpFile)
 
     // Merge part-files into single file.
-    FileUtil.copyMerge(hdfs, srcPath, hdfs, tmpPath, false, hadoopConfig, null)
+    FileUtil.copyMerge(hdfs, srcPath, hdfs, tmpPath, false, hadoopConfig, "")
 
     // Read file of JSON arrays, write into single JSON array of arrays.
     val fsInStream = hdfs.open(tmpPath)
@@ -71,9 +71,14 @@ class NERCombinedJson extends Serializable {
                                                  // now is a file of JSON
     val outFile = new BufferedWriter(new OutputStreamWriter(fsOutStream))
     outFile.write("[")
-    val line = inFile.readLine()
-    if (line != null) outFile.write(line)
-    Iterator.continually(inFile.readLine()).takeWhile(_ != null).foreach(s => {outFile.write(", " + s)})
+    val line: Option[String] = Option(inFile.readLine())
+    line match {
+      case Some(line) =>
+        outFile.write(line)
+      case None =>
+    }
+    Iterator.continually(inFile.readLine()).takeWhile(Option(_) != None)
+      .foreach(s => {outFile.write(", " + s)})
     outFile.write("]")
     outFile.close()
 

diff --git a/src/main/scala/io/archivesunleashed/matchbox/ComputeImageSize.scala b/src/main/scala/io/archivesunleashed/matchbox/ComputeImageSize.scala
@@ -35,9 +35,11 @@ object ComputeImageSize {
     try {
       val in = new ByteArrayInputStream(bytes)
       val image = ImageIO.read(in)
+      // scalastyle:off null
       if (image == null) {
         nullImage
       }
+      // scalastyle:on null
       (image.getWidth(), image.getHeight())
     } catch {
       case e: Throwable => {

diff --git a/src/main/scala/io/archivesunleashed/matchbox/ComputeMD5.scala b/src/main/scala/io/archivesunleashed/matchbox/ComputeMD5.scala
@@ -19,9 +19,9 @@ package io.archivesunleashed.matchbox
 import java.security.MessageDigest
 
 /** Compute MD5 checksum. */
-// scalastyle: off
+// scalastyle:off object.name
 object ComputeMD5 {
-// scalastyle: on
+// scalastyle:on object.name
   /** Computes the MD5 checksum of a byte array (eg. an image).
     *
     * For string data, it is better to use `StringUtils.computeHash()`.

diff --git a/src/main/scala/io/archivesunleashed/matchbox/ExtractBoilerpipeText.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractBoilerpipeText.scala
@@ -29,29 +29,13 @@ object ExtractBoilerpipeText {
   * @return text with boilerplate removed, or an empty string if the input is null.
    */
   def apply(input: String): String = {
-    try {
-      if (input.isEmpty) {
-        null
-      } else {
-        extract(input)
-      }
-    } catch {
-      case e: Exception =>
-        throw new IOException("Caught exception processing input row " + e)
-    }
-  }
-
-  /** Extracts boilerplate.
-   *
-   * @param input an html string possibly containing boilerpipe text
-   * @return filtered text or Nil if the text is empty.
-   */
-  def extract (input: String): String = {
-    val text = DefaultExtractor.INSTANCE.getText(input).replaceAll("[\\r\\n]+", " ").trim()
-    if (text.isEmpty) {
-      null
-    } else {
-      text
+    val maybeInput = Option(input)
+    maybeInput match {
+      case Some(text) =>
+        DefaultExtractor.INSTANCE
+          .getText(input).replaceAll("[\\r\\n]+", " ").trim()
+      case None =>
+        ""
     }
   }
 }
diff --git a/src/main/scala/io/archivesunleashed/matchbox/ExtractDate.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractDate.scala
@@ -37,15 +37,18 @@ object ExtractDate {
     val yearSS = 4
     val monthSS = 6
     val daySS = 8
-    if (fullDate == null) {
-      fullDate
-    } else {dateFormat match {
-      case YYYY => fullDate.substring(startSS, yearSS)
-      case MM => fullDate.substring(yearSS, monthSS)
-      case DD => fullDate.substring(monthSS, daySS)
-      case YYYYMM => fullDate.substring(startSS, monthSS)
-      case _ => fullDate.substring(startSS, daySS)
-      }
+    val maybeFullDate: Option[String] = Option(fullDate)
+    maybeFullDate match {
+      case Some(fulldate) =>
+        dateFormat match {
+          case YYYY => fullDate.substring(startSS, yearSS)
+          case MM => fullDate.substring(yearSS, monthSS)
+          case DD => fullDate.substring(monthSS, daySS)
+          case YYYYMM => fullDate.substring(startSS, monthSS)
+          case _ => fullDate.substring(startSS, daySS)
+        }
+      case None =>
+        ""
     }
   }
 }
diff --git a/src/main/scala/io/archivesunleashed/matchbox/ExtractDomain.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractDomain.scala
@@ -27,18 +27,28 @@ object ExtractDomain {
   * @return host of source if it is a valid URL, otherwise host of url, or an empty string if neither is valid.
    */
   def apply(url: String, source: String = ""): String = {
-    if (url == null) return null
-    var host: String = null
-    try {
-      host = new URL(url).getHost
-    } catch {
-      case e: Exception => // it's okay
+    val maybeSource: Option[URL] = checkUrl(source)
+    val maybeHost: Option[URL] = checkUrl(url)
+    maybeSource match {
+      case Some(source) =>
+        source.getHost
+
+      case None =>
+        maybeHost match {
+          case Some(host) =>
+            host.getHost
+          case None =>
+            ""
+      }
     }
-    if (host != null || source == null) return host
+  }
+
+  def checkUrl(url: String): Option[URL] = {
     try {
-      new URL(source).getHost
+      Some(new URL(url))
     } catch {
-      case e: Exception => null
+      case e: Exception =>
+        None
     }
   }
 }
diff --git a/src/main/scala/io/archivesunleashed/matchbox/ExtractLinks.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractLinks.scala
@@ -20,6 +20,7 @@ import java.io.IOException
 import org.jsoup.Jsoup
 import org.jsoup.select.Elements
 import scala.collection.mutable
+import scala.Option
 
 /** Extracts links from a webpage given the HTML content (using Jsoup). */
 object ExtractLinks {
@@ -32,28 +33,30 @@ object ExtractLinks {
     * @return a sequence of (source, target, anchortext).
     */
   def apply(src: String, html: String, base: String = ""): Seq[(String, String, String)] = {
-    try {
-      val output = mutable.MutableList[(String, String, String)]()
-
-      // Basic input checking, return empty list if we fail.
-      if (src == null) return output
-      if (html.isEmpty) return output
-
-      val doc = Jsoup.parse(html)
-      val links: Elements = doc.select("a[href]")
-      val it = links.iterator()
-      while (it.hasNext) {
-        val link = it.next()
-        if (base.nonEmpty) link.setBaseUri(base)
-        val target = link.attr("abs:href")
-        if (target.nonEmpty) {
-          output += ((src, target, link.text))
-        }
+    val srcMaybe: Option[String] = Option(src)
+    val htmlMaybe: Option[String] = Option(html)
+    val output = mutable.MutableList[(String, String, String)]()
+    srcMaybe match {
+      case Some(valid_src) =>
+        htmlMaybe match {
+          case Some (valid_html) =>
+            val doc = Jsoup.parse(valid_html)
+            val links: Elements = doc.select("a[href]")
+            val it = links.iterator()
+            while (it.hasNext) {
+              val link = it.next()
+              if (base.nonEmpty) link.setBaseUri(base)
+              val target = link.attr("abs:href")
+              if (target.nonEmpty) {
+                output += ((valid_src, target, link.text))
+              }
+            }
+          case None =>
+            // do nothing
+          }
+      case None =>
+        // do nothing
       }
-      output
-    } catch {
-      case e: Exception =>
-        throw new IOException("Caught exception processing input ", e);
-    }
+    output
   }
 }
diff --git a/src/main/scala/io/archivesunleashed/matchbox/NERClassifier.scala b/src/main/scala/io/archivesunleashed/matchbox/NERClassifier.scala
@@ -25,6 +25,7 @@ import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel}
 import java.util
 import scala.collection.mutable
 
+// scalastyle:off
 /** Reads in a text string, and returns entities identified by the configured Stanford NER classifier. */
 object NERClassifier {
 
@@ -114,3 +115,4 @@ object NERClassifier {
     }
   }
 }
+// scalastyle:on
diff --git a/src/main/scala/io/archivesunleashed/matchbox/RemoveHTML.scala b/src/main/scala/io/archivesunleashed/matchbox/RemoveHTML.scala
@@ -28,11 +28,12 @@ object RemoveHTML {
    * @return content without html markup.
    */
   def apply(content: String): String = {
-    try {
-      Jsoup.parse(content).text().replaceAll("[\\r\\n]+", " ")
-    }
-    catch {
-      case e: Exception => throw new IOException("Caught exception processing input row ", e)
+    val maybeContent: Option[String] = Option(content)
+    maybeContent match {
+      case Some(content) =>
+        Jsoup.parse(content).text().replaceAll("[\\r\\n]+", " ")
+      case None =>
+        ""
     }
   }
 }
diff --git a/src/main/scala/io/archivesunleashed/matchbox/RemoveHttpHeader.scala b/src/main/scala/io/archivesunleashed/matchbox/RemoveHttpHeader.scala
@@ -26,16 +26,16 @@ object RemoveHttpHeader {
    * @return string with HTTP headers removed.
    */
   def apply(content: String): String = {
-    try {
-      if (content.startsWith("HTTP/")){
-        content.substring(content.indexOf(headerEnd) + headerEnd.length)
-      } else {
-        content
-      }
-    } catch {
-      case e: Exception => {
-        null
-      }
+    val maybeContent: Option[String] = Option(content)
+    maybeContent match {
+      case Some(content) =>
+        if (content.startsWith("HTTP/")){
+          content.substring(content.indexOf(headerEnd) + headerEnd.length)
+        } else {
+          content
+        }
+      case None =>
+        ""
     }
   }
 }
diff --git a/src/main/scala/io/archivesunleashed/matchbox/TupleFormatter.scala b/src/main/scala/io/archivesunleashed/matchbox/TupleFormatter.scala
@@ -25,23 +25,25 @@ import ops.tuple.ToList
 object TupleFormatter {
   /** Borrowed from shapeless' flatten.scala example. */
   trait LowPriorityFlatten extends Poly1 {
+    // scalastyle:off public.methods.have.type
     implicit def default[T] = at[T](Tuple1(_))
   }
 
   /** Flattens nested tuples, taking an argument a tuple of any size. */
-  // scalastyle:off
+  // scalastyle:off object.name
   object flatten extends LowPriorityFlatten {
-  // scalastyle:on
+  // scalastyle:on object.name
     implicit def caseTuple[T <: Product](implicit fm: FlatMapper[T, flatten.type]) =
       at[T](_.flatMap(flatten))
   }
 
   /** Transforms a tuple into a tab-delimited string, flattening any nesting,
     * taking an argument a tuple of any size. */
-  // scalastyle:off
+  // scalastyle:off object.name
   object tabDelimit extends Poly1 {
+  // scalastyle:on object.name
     implicit def caseTuple[T <: Product, Lub](implicit tl: ToList[T, Lub], fm: FlatMapper[T, flatten.type]) =
       at[T](flatten(_).asInstanceOf[Product].productIterator.mkString("\t"))
   }
-  // scalastyle:on
+  // scalastyle:on public.methods.have.type
 }
diff --git a/src/main/scala/io/archivesunleashed/matchbox/package.scala b/src/main/scala/io/archivesunleashed/matchbox/package.scala
@@ -28,10 +28,10 @@ import scala.xml.Utility._
 package object matchbox {
   implicit class WWWLink(s: String) {
     def removePrefixWWW(): String = {
-      if (s == null) {
-        null
-      } else {
-        s.replaceAll("^\\s*www\\.", "")
+      val maybeString: Option[String] = Option(s)
+      maybeString match {
+        case Some(s) => s.replaceAll("^\\s*www\\.", "")
+        case None => ""
       }
     }
 

diff --git a/src/main/scala/io/archivesunleashed/package.scala b/src/main/scala/io/archivesunleashed/package.scala
@@ -60,8 +60,10 @@ package object archivesunleashed {
       * @return an RDD of JValue (json objects) for mapping.
       */
     def loadTweets(path: String, sc: SparkContext): RDD[JValue] =
+      // scalastyle:off null
       sc.textFile(path).filter(line => !line.startsWith("{\"delete\":"))
         .map(line => try { parse(line) } catch { case e: Exception => null }).filter(x => x != null)
+      // scalastyle:on null
   }
 
   /** A Wrapper class around RDD to simplify counting. */

diff --git a/src/main/scala/io/archivesunleashed/util/TweetUtils.scala b/src/main/scala/io/archivesunleashed/util/TweetUtils.scala
@@ -31,15 +31,15 @@ object TweetUtils {
     val user = "user"
     implicit lazy val formats = org.json4s.DefaultFormats
     /** Get Twitter status id. */
-    def id(): String = try { (tweet \ "id_str").extract[String] } catch { case e: Exception => null}
+    def id(): String = try { (tweet \ "id_str").extract[String] } catch { case e: Exception => ""}
     /** Get the date a status was created. */
-    def createdAt(): String = try { (tweet \ "created_at").extract[String] } catch { case e: Exception => null}
+    def createdAt(): String = try { (tweet \ "created_at").extract[String] } catch { case e: Exception => ""}
     /** Get the status text. */
-    def text(): String = try { (tweet \ "text").extract[String] } catch { case e: Exception => null}
+    def text(): String = try { (tweet \ "text").extract[String] } catch { case e: Exception => ""}
     /** Get the language code (ISO 639-1). */
-    def lang: String = try { (tweet \ "lang").extract[String] } catch { case e: Exception => null}
+    def lang: String = try { (tweet \ "lang").extract[String] } catch { case e: Exception => ""}
     /** Get the username of the user who wrote the status. */
-    def username(): String = try { (tweet \ user \ "screen_name").extract[String] } catch { case e: Exception => null}
+    def username(): String = try { (tweet \ user \ "screen_name").extract[String] } catch { case e: Exception => ""}
     /** Check if user of status is "verified" (true or false). */
     def isVerifiedUser(): Boolean = try { (tweet \ user \ "verified").extract[Boolean] } catch { case e: Exception => false}
     /** Get the number of followers the user has. */

diff --git a/src/test/scala/io/archivesunleashed/app/CommandLineAppTest.scala b/src/test/scala/io/archivesunleashed/app/CommandLineAppTest.scala
@@ -72,13 +72,13 @@ class CommandLineAppTest extends FunSuite with BeforeAndAfter {
   }
 
   test("command line app tests") {
-    for (a <- testSuccessCmds) {
+    for {a <- testSuccessCmds} {
       app.CommandLineAppRunner.test(a, sc)
       assert(Files.exists(Paths.get(outputDir)))
       FileUtils.deleteDirectory(new File(outputDir))
     }
 
-    for (a <- testFailCmds)  {
+    for {a <- testFailCmds}  {
       try {
         app.CommandLineAppRunner.test(a, sc)
         assert(false)