In [ ]:
// Parameters that will get overridden by the pipeline.
// Name of the documents table; appended to minted_tables_output_path to form the parquet path that is read.
val documents_tbl_name = ""
// Name of the temporary view under which the cracked documents are registered.
val documents_cracked_view_name = ""
// Value assigned to Hadoop's fs.defaultFS when opening the files to crack.
val file_system = ""
// Path prefix of the parquet tables; concatenated with documents_tbl_name as-is (no separator is inserted).
val minted_tables_output_path = ""

In [ ]:
import scala.util.{Try,Success,Failure}

import org.apache.spark.sql.functions._
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{Path, FileSystem}

// Tika
import org.apache.tika.Tika
import org.apache.tika.metadata.Metadata
import org.apache.tika.parser.{AutoDetectParser, Parser, ParseContext}
import org.apache.tika.sax.BodyContentHandler
import org.apache.tika.config.TikaConfig
import org.xml.sax.ContentHandler

/**
 * Reads the whole stream as UTF-8 text.
 *
 * The content is consumed line by line and the lines are re-joined with "\n",
 * so the original line terminators are normalised and a trailing terminator is dropped.
 * The stream is not closed here; that remains the caller's responsibility.
 *
 * @param is stream positioned at the start of the text to read
 * @return the decoded text, or an empty string when the stream has no content
 */
def parseUnicodeText(is: java.io.InputStream): String = {
    val utf8Decoder = new java.io.InputStreamReader(is, java.nio.charset.StandardCharsets.UTF_8)
    val lineReader = new java.io.BufferedReader(utf8Decoder)
    lineReader.lines().collect(java.util.stream.Collectors.joining("\n"))
}

/**
 * Extracts the body text of a document using Tika's auto-detecting parser.
 *
 * @param is stream with the raw document content
 * @return the text collected by the body content handler
 */
def extractTextWithTika(is: java.io.InputStream): String = {
    // Parser built from Tika's default configuration
    val autoParser: Parser = new AutoDetectParser(TikaConfig.getDefaultConfig())

    // -1 so that Tika is not restricted in the size of the content it collects
    val bodySink: BodyContentHandler = new BodyContentHandler(-1)

    // Parse the stream; the metadata gathered during parsing is not used afterwards
    autoParser.parse(is, bodySink, new Metadata(), new ParseContext())
    bodySink.toString()
}

/**
 * Opens a file on the configured file system and extracts its text content.
 *
 * Files whose type is txt, html, htm or json (all lower-case or all upper-case)
 * are read verbatim as UTF-8 text; every other type is handed to Tika.
 *
 * Any non-fatal failure (building the path, opening the file, or extracting the
 * text) is reported through the second tuple element instead of being thrown, so
 * a single bad file does not abort the whole job. The input stream is always
 * closed before returning.
 *
 * @param filePath path of the file to crack, resolved against `file_system`
 * @param fileType file extension used to choose the extraction method
 * @return `(text, null)` on success, `(null, error description)` on failure
 */
def crack(filePath: String, fileType: String): (String, String) = {
    // Decide which method to use for cracking
    val method: (java.io.InputStream => String) = fileType match {
        case "txt" | "html" | "htm" | "json" | "TXT" | "HTML" | "HTM" | "JSON" => {
            parseUnicodeText
        }
        case _ => {
            extractTextWithTika
        }
    }

    val outcome = Try {
        val conf = new Configuration()
        conf.set("fs.defaultFS", file_system)
        // FileSystem.get may hand back a shared, cached instance, so it is deliberately not closed here
        val fs = FileSystem.get(conf)

        // Get a stream for the file being cracked
        val is: java.io.InputStream = fs.open(new Path(filePath))
        try {
            method(is)
        } finally {
            // Best-effort close: a failure while closing must neither discard a
            // successful extraction nor mask the original extraction error
            Try(is.close())
        }
    }

    outcome match {
        case Success(v) => {
            (v, null)
        }
        case Failure(e) => {
            (null, e.toString())
        }
    }
}

// Spark SQL UDF wrapping crack: takes two String arguments (file path, file type)
// and returns crack's (String, String) tuple.
val crackUdf = udf[(String, String), String, String](crack)

In [ ]:
// Load the documents table, keeping only the columns needed for cracking
val docs = {
    val documentsPath = s"${minted_tables_output_path}${documents_tbl_name}"
    spark.read.parquet(documentsPath).select("file_name", "file_path", "file_type")
}

// Crack every document; the UDF result is stored as a single tuple column
val cracked = docs.withColumn(
    "crack_result",
    crackUdf(docs.col("file_path"), docs.col("file_type"))
)

// Unfold the tuple returned by crackUdf into separate text and error columns
val extracted = {
    val crackResult = col("crack_result")
    cracked.select(
        crackResult.getItem("_1").alias("text_content"),
        crackResult.getItem("_2").alias("extraction_error"),
        col("file_name"),
        col("file_type"),
        col("file_path")
    )
}

// Expose the result under the view name supplied by the pipeline
extracted.createOrReplaceTempView(documents_cracked_view_name)