### 1. 준비

In [1]:
from functools import reduce
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StructType, StructField, StringType, ArrayType
from pyspark.sql.functions import col, concat_ws, split, regexp_replace, regexp_extract, when, length, get_json_object, explode, size, array_contains, array, flatten

In [2]:
# Reuse the active SparkSession if there is one, otherwise start a new one
# registered under the application name "medi_test".
spark = (
    SparkSession.builder
    .appName("medi_test")
    .getOrCreate()
)

23/12/06 13:19:42 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
# Project locations.
# NOTE(review): root_path is an absolute path on one specific machine; it has to be
# edited before the notebook can run anywhere else.
root_path = '/Users/b06/Desktop/yeardream/medi-05'
# Directory holding the naverplace_meta_*.json input files.
json_root_path = root_path + '/data/naverplace_meta'
# Not referenced by any other cell visible in this notebook.
text_root_path = root_path + '/spark-scala-project/test.txt'

In [4]:
# Path of the first metadata file under json_root_path.
test_json_path = json_root_path + '/naverplace_meta_1.json'

In [5]:
# Load the sample file; the "multiline" option lets a single JSON document
# span several lines instead of requiring one record per line.
data = (
    spark.read
    .option("multiline", "true")
    .json(test_json_path)
)
# Alternative input kept from the original notebook:
# data = spark.read.option("multiline", "true").json('/Users/b06/Desktop/yeardream/medi-05/data/test.json')

23/12/06 13:19:43 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


### 2. hospital dataframe

In [6]:
# explode() emits one row per element of the `hospital` collection; each row
# carries that element in a column that is again named "hospital".
hospital_data = data.select(explode(col("hospital")).alias("hospital"))

In [7]:
# (field path below `hospital`, output column name) pairs, listed in output order.
_hospital_fields = [
    ("id", "id"),
    ("name", "name"),
    ("reviewSettings.keyword", "review_keyword"),
    ("keywords", "keywords"),
    ("naverBookingUrl", "booking_url"),
    ("talktalkUrl", "talktalk_url"),
    ("category", "category"),
    ("categoryCode", "category_code"),
    ("categoryCodeList", "category_code_list"),
    ("categoryCount", "category_count"),
    ("conveniences", "conveniences"),
    ("paymentInfo", "payment_info"),
    ("rcode", "rcode"),
    ("virtualPhone", "virtual_phone"),
    ("phone", "phone"),
    ("roadAddress", "road_address"),
    ("road", "road"),
    ("description", "description"),
    ("bookingBusinessId", "booking_business_id"),
    ("bookingDisplayName", "booking_display_name"),
    ("visitorReviewsScore", "visitor_reviews_score"),
    ("visitorReviewsTotal", "visitor_reviews_total"),
    ("visitorReviewsTextReviewTotal", "visitor_reviews_text_review_total"),
    ("images", "images"),
    ("homepages.etc", "homepages_etc"),
    ("homepages.repr", "homepages_repr"),
    # NOTE(review): `is_rep` is filled from homepages.repr.url, which looks
    # unintended (the original carried an "isRep?" remark here) - confirm the
    # intended source field against the JSON schema.
    ("homepages.repr.url", "is_rep"),
    ("coordinate.x", "lon"),
    ("coordinate.y", "lat"),
]

# Flatten the nested hospital struct into one column per selected field.
hospital_df = hospital_data.select(
    *[col(f"hospital.{source}").alias(target) for source, target in _hospital_fields]
)

In [8]:
# Remove line breaks, asterisks and commas from the free-text columns
# (the frame is later written out as CSV).
hospital_df = reduce(
    lambda frame, name: frame.withColumn(name, regexp_replace(name, "[\n\r*,]", "")),
    ("description", "road"),
    hospital_df,
)

In [9]:
def _repr_and_etc(field):
    """Build a single array column from one homepage attribute.

    The result holds `homepages_repr.<field>` followed by the values of
    `homepages_etc.<field>`.
    NOTE(review): presumably `homepages_etc` is an array of structs, so the
    second operand is itself an array - confirm against the schema.
    """
    return flatten(array(array(f'homepages_repr.{field}'), f'homepages_etc.{field}'))


# Derived columns. The order of the withColumn calls determines the order in
# which the new columns are appended.
hospital_df = (
    hospital_df
    .withColumn("description_length", length("description"))
    # NOTE(review): size() of a null array yields -1 or null depending on the
    # Spark configuration - confirm which value downstream consumers expect.
    .withColumn("images_count", size("images"))
    # Share of visitor reviews that are not counted as text reviews.
    .withColumn(
        'photo_review_ratio',
        (col('visitor_reviews_total') - col('visitor_reviews_text_review_total'))
        / col('visitor_reviews_total'),
    )
    # True when the phone number starts with the prefix 010.
    .withColumn('is_smart_phone', col('phone').startswith('010'))
    .withColumn('is_zero_pay', array_contains(col('payment_info'), '제로페이'))
    .withColumn('homepages_url', _repr_and_etc('url'))
    .withColumn('homepages_type', _repr_and_etc('type'))
    # Despite its name this column is a count: 0 when the representative
    # homepage has no `order`, otherwise the number of collected order values.
    .withColumn(
        'homepages_order',
        when(col('homepages_repr.order').isNull(), 0)
        .otherwise(size(_repr_and_etc('order'))),
    )
    .withColumn('isDeadUrl', _repr_and_etc('isDeadUrl'))
)

# keywords_1 .. keywords_5 hold the first five entries of the `keywords` array.
hospital_df = reduce(
    lambda frame, position: frame.withColumn(
        f'keywords_{position + 1}', col('keywords')[position]
    ),
    range(5),
    hospital_df,
)

In [24]:
# hospital_df.columns

### 3. root dataframe

In [11]:
# explode() emits one row per element of the `root` collection.
root_data = data.select(explode(col("root")).alias("root"))
# Quick look at the three fields extracted in the next cell:
# root_data.select("root.base.__ref").show()
# root_data.select("root.fsasReviews.total").show()
# root_data.select("root.kinQna.answerCount").show()

In [12]:
# Pull the reference string and the two counters out of the nested `root` struct.
root_df = root_data.select(
    *[
        col(f"root.{source}").alias(target)
        for source, target in (
            ("base.__ref", "root_id"),
            ("fsasReviews.total", "fsas_reviews_count"),
            ("kinQna.answerCount", "kin_qna_count"),
        )
    ]
)

In [13]:
# Replace root_id with capture group 1 of the pattern, i.e. the word characters
# that follow the "HospitalBase:" prefix.
root_df = root_df.withColumn(
    "root_id", regexp_extract(col("root_id"), "HospitalBase:([\\w]+)", 1)
)

In [26]:
# root_df.show()

### 4. join, save dataframe

In [25]:
# Left outer join: every hospital row is kept, with the root-level counters
# attached where root_id equals the hospital id. The join key and the nested
# source columns are then dropped.
df = (
    hospital_df
    .join(root_df, hospital_df["id"] == root_df["root_id"], "left_outer")
    .drop("root_id", "images", "keywords", "homepages_repr", "homepages_etc")
)
# id_check = df.filter(col("root_id") != col("id"))
# id_check.show()
# Names of all columns that are still array-typed.
arr_col = [
    column.name
    for column in df.schema.fields
    if isinstance(column.dataType, ArrayType)
]
# arr_col

In [16]:
def arr_to_str(df, col):
    """Return `df` with column `col` replaced by its elements joined with ",".

    Args:
        df: DataFrame containing the column.
        col: Name of the column to convert. Inside this function the parameter
            hides the `col` helper imported from pyspark.sql.functions.

    Returns:
        A new DataFrame in which `col` is the concat_ws(",") result.
    """
    return df.withColumn(col, concat_ws(",", col))

In [21]:
# Fold arr_to_str over every array-typed column name so each one becomes a
# comma-joined string.
df = reduce(arr_to_str, arr_col, df)

In [None]:
# save_root_path = f'{root_path}/spark-scala-project/output/pyspark/'
# save_path = '{save_root_path}/naverplace_{n}'
# df.write.parquet(save_path)

In [22]:
# print(n)
# Number of the input file being processed (naverplace_meta_1.json here).
n = 1

In [23]:
# Output directory for the test run.
save_root_path = f'{root_path}/spark-scala-project/output/pyspark/test'
# The f-prefix is required here: without it the placeholders are never filled in
# and Spark writes to a literal relative path named "{save_root_path}/naverplace_{n}.csv".
save_path = f'{save_root_path}/naverplace_{n}.csv'
# coalesce(1) merges the data into a single partition before the write. Spark
# creates a directory at save_path and places the part file inside it.
# NOTE(review): mode('append') adds to existing output, so re-running this cell
# appends the same rows again - confirm append (rather than overwrite) is intended.
df.coalesce(1).write.mode('append').option("encoding", "utf-8").csv(save_path, header=True)