### 1. prep

In [None]:
// import spark
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.ArrayType
import org.apache.spark.sql.SaveMode
// Obtain the SparkSession for this job (an already-active session is reused by getOrCreate).
val spark = SparkSession
  .builder()
  .appName("medistream-05")
  .getOrCreate()

// Wall-clock start in epoch milliseconds; read again at the end to report the elapsed time.
val st = System.currentTimeMillis()

// Load the raw JSON input. `data_path` is not defined in this cell and must already be in scope.
val path = data_path
val data = spark.read.json(path)

### 2. preprocessing hospital data

In [None]:
// One row per element of the top-level `hospital` array, exposed as the struct column `h`.
val hospitalData = data.select(explode(col("hospital")).alias("h"))

// Flat per-hospital table: projects the fields of `h`, cleans a few text columns, derives
// helper columns, and finally drops the raw columns that were only needed as inputs.
val hospitalDataSelected = {
  // (path below `h`, output column name), listed in output column order.
  val projection: Seq[(String, String)] = Seq(
    "id" -> "id",
    "name" -> "name",
    "category" -> "category",
    "category_code" -> "category_code",
    "category_code_list" -> "category_code_list",
    "category_count" -> "category_count",
    "description" -> "description",
    "road_address" -> "road_address",
    "road" -> "road",
    "rcode" -> "rcode",
    "virtual_phone" -> "virtual_phone",
    "phone" -> "phone",
    "payment_info" -> "payment_info",
    "conveniences" -> "conveniences",
    "review_setting.keyword" -> "review_keyword",
    "keywords" -> "keywords",
    "booking_business_id" -> "booking_business_id",
    "booking_display_name" -> "booking_display_name",
    "visitor_reviews_score" -> "visitor_reviews_score",
    "visitor_reviews_total" -> "visitor_reviews_total",
    "visitor_reviews_text_review_total" -> "visitor_reviews_text_review_total",
    "images" -> "images",
    "homepages.etc" -> "homepages_etc",
    "homepages.repr" -> "homepages_repr",
    // NOTE(review): `is_rep` is filled from the representative homepage URL; the original
    // author was unsure whether an `isRep` field was intended -- confirm against the schema.
    "homepages.repr.url" -> "is_rep",
    "booking_url" -> "booking_url",
    "talktalk_url" -> "talktalk_url",
    "coordinate.x" -> "lon",
    "coordinate.y" -> "lat"
  )

  // Characters removed from free-text columns: newline, carriage return, '*' and ','.
  val strippedChars = "[\n\r*,]"

  // Puts the representative homepage's `field` in front of the `field` values of the
  // remaining homepages, producing a single array column.
  def mergedHomepageField(field: String) =
    flatten(array(
      array(col(s"homepages_repr.$field")),
      col(s"homepages_etc.$field")
    ))

  val projected = hospitalData.select(
    projection.map { case (path, name) => col(s"h.$path").alias(name) }: _*
  )

  // In-place cleanup of existing columns (column positions are unchanged).
  val cleaned = projected
    .withColumn("description", regexp_replace(col("description"), strippedChars, ""))
    .withColumn("road", regexp_replace(col("road"), strippedChars, ""))
    .withColumn("review_keyword", regexp_replace(col("review_keyword"), "[\\\"]", ""))

  // Derived columns, appended in this order. `description_length` is measured on the
  // already-cleaned description.
  val derived = cleaned
    .withColumn("description_length", length(col("description")))
    .withColumn("images_count", size(col("images")))
    .withColumn(
      "photo_review_ratio",
      (col("visitor_reviews_total") - col("visitor_reviews_text_review_total")) / col("visitor_reviews_total")
    )
    .withColumn("homepages_url", mergedHomepageField("url"))
    .withColumn("homepages_type", mergedHomepageField("type"))
    .withColumn(
      "homepages_order",
      // Without a representative order the column falls back to the single-element array [0].
      when(col("homepages_repr.order").isNull, array(lit(0)))
        .otherwise(mergedHomepageField("order"))
    )
    .withColumn("is_smart_phone", col("phone").startsWith("010"))
    .withColumn("is_zero_pay", array_contains(col("payment_info"), "제로페이"))
    .withColumn("is_dead_url", mergedHomepageField("isDeadUrl"))

  // keywords_1 .. keywords_5 hold the first five entries of the `keywords` array.
  val withKeywords = (0 until 5).foldLeft(derived) { (acc, idx) =>
    acc.withColumn(s"keywords_${idx + 1}", col("keywords")(idx))
  }

  // The raw source columns are no longer needed once the derived columns exist.
  withKeywords.drop(
    "images",
    "keywords",
    "homepages_repr",
    "homepages_etc"
  )
}
// Names of all columns that still have an array type after the preprocessing above.
val arrColList = hospitalDataSelected.schema.fields.filter(_.dataType.isInstanceOf[ArrayType]).map(_.name)

// Replace every array column by a single string of its elements joined with ",".
// The DataFrame is threaded through a fold, so no mutable variable is reassigned.
val hospitalDf = arrColList.foldLeft(hospitalDataSelected) { (acc, arrCol) =>
  acc.withColumn(arrCol, concat_ws(",", col(arrCol)))
}

### 3. preprocessing root data

In [None]:
// One row per element of the top-level `root` array, exposed as the struct column `r`.
val rootData = data.select(explode(col("root")).alias("r"))

// Per-hospital counters from the root data, keyed by the id taken from the base reference.
val rootDf = {
  // The reference string has the form "HospitalBase:<id>"; capture group 1 is the id part.
  val rootId = regexp_extract(col("r.hospital.base.__ref"), "HospitalBase:([\\w]+)", 1)
  val fsasReviewsCount = col("r.hospital.fsasReviews.total")
  val kinQnaCount = col("r.hospital.kinQna.answerCount")

  rootData.select(
    rootId.alias("root_id"),
    fsasReviewsCount.alias("fsas_reviews_count"),
    kinQnaCount.alias("kin_qna_count")
  )
}

### 4. join dataframes

In [None]:
// Attach the root counters to each hospital row. The left outer join keeps hospitals that
// have no matching root entry; the join key `root_id` is dropped afterwards.
val df = {
  val sameHospital = hospitalDf("id") === rootDf("root_id")
  hospitalDf
    .join(rootDf, sameHospital, "left_outer")
    .drop("root_id")
}

### 5. save dataframe

In [None]:
// Write the de-duplicated result as parquet, replacing any previous output at `savePath`.
// NOTE(review): the previous `val savePath = savePath` referred to itself and does not compile
// ("recursive value savePath needs type"). This assumes `savePath` is supplied before this
// cell runs (e.g. as an injected notebook parameter) -- confirm the parameter name.
df.dropDuplicates().write.mode(SaveMode.Overwrite).parquet(savePath)

### 6. get task time

In [None]:
// Wall-clock end in epoch milliseconds; the difference to `st` is reported in whole seconds.
val ft = System.currentTimeMillis()
val elapsedSeconds = (ft - st) / 1000
println(s"Spark task time: $elapsedSeconds s")

### 7. upload to redshift

In [None]:
// Connection settings for the Redshift export. The values are placeholders to be filled in.
val jdbcUrl = "<jdbc_url>"
val tempDir = "<temp_dir>"
val dbTable = "<db_table>"

// Overwrite the target table with the joined result, staged through `tempDir`.
// Rows are de-duplicated first so that the table matches what the parquet export writes.
// NOTE(review): `df` is not cached, so this export recomputes the whole pipeline.
df.dropDuplicates()
  .write
  .format("io.github.spark_redshift_community.spark.redshift")
  .option("driver", "com.amazon.redshift.jdbc42.Driver")
  .option("forward_spark_s3_credentials", "true")
  .option("url", jdbcUrl)
  .option("dbtable", dbTable)
  .option("tempdir", tempDir)
  .mode(SaveMode.Overwrite)
  .save()