### 1. 라이브러리 불러오기 및 세션 생성

In [1]:
import json
from pyspark import SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql.functions import explode, map_keys, col, first, get_json_object, array, to_json, struct, regexp_replace, split


# Create the Spark session (getOrCreate reuses a session that already exists).
spark = (
    SparkSession.builder
    .appName("processingJson")
    .getOrCreate()
)

23/11/20 11:01:52 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


### 2. 데이터 불러오기

In [2]:
# Build the path of one of the split JSON files.
def nth_json_path(n, base_dir='/Users/b06/Desktop/yeardream/medi-05/data/naverplace_meta'):
    """Return the path of the ``n``-th split ``naverplace_meta`` JSON file.

    Args:
        n: Index substituted into the file name ``naverplace_meta_{n}.json``.
        base_dir: Directory that holds the split files. Defaults to the
            location that used to be hard-coded, so existing calls that pass
            only ``n`` return exactly the same path as before.

    Returns:
        The path string ``{base_dir}/naverplace_meta_{n}.json``.
    """
    # Tolerate a trailing slash so the result never contains a doubled "//".
    return f"{base_dir.rstrip('/')}/naverplace_meta_{n}.json"

In [3]:
# Load the first split JSON file into a DataFrame.
n = 1
data = spark.read.format("json").load(nth_json_path(n))

23/11/20 11:01:54 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


### 3. 변수

In [4]:
# Column names of the loaded DataFrame.
columns = data.columns

In [5]:
# Keep only the columns whose name contains "HospitalBase".
hospital_bases = [name for name in columns if "HospitalBase" in name]

In [6]:
# Fields pulled out of each HospitalBase column, in output order.
target_columns = [
    "id",
    "name",
    "road",
    "reviewSettings",
    "conveniences",
    "keywords",
    "phone",
    "virtualPhone",
    "naverBookingUrl",
    "talktalkUrl",
    "paymentInfo",
    "homepages",
    "visitorReviewsTotal",
    "description",
    "Images",
]

In [7]:
# Fields selected for the string frame (see string_columns_schema).
string_columns = [
    "id",
    "name",
    "road",
    "phone",
    "virtualPhone",
    "naverBookingUrl",
    "talktalkUrl",
    "description",
]

In [8]:
# Fields selected for the struct frame; "id" is carried along with them.
struct_columns = ["id", "reviewSettings", "homepages"]

In [9]:
# Fields grouped as array columns; "id" is carried along with them.
array_columns = ["id", "conveniences", "keywords", "Images"]

In [10]:
# Columns selected when extracting review keywords: the id plus the nested
# reviewSettings.keyword field.
review_keyword_column = ["id", "reviewSettings.keyword"]

In [11]:
# The id plus the nested fields under homepages.repr.
url_columns = ["id"] + [
    f"homepages.repr.{field}" for field in ("isDeadUrl", "landingUrl", "url")
]

In [12]:
# Schema of the string frame: one nullable string field per entry of
# string_columns, in the same order. Deriving it from string_columns keeps the
# schema and the selected columns from drifting apart, which matters because
# get_string_df appends rows with a positional union.
string_columns_schema = StructType(
    [StructField(name, StringType(), True) for name in string_columns]
)

In [13]:
# Schema of the review-keyword frame: two nullable string fields.
review_keyword_column_schema = StructType(
    [
        StructField(name, StringType(), True)
        for name in ("id", "review_keyword")
    ]
)

In [14]:
# struct_schema
# array_schema

In [15]:
# Empty frame with the string schema; rows are appended to it later.
string_df = spark.createDataFrame(data=[], schema=string_columns_schema)

In [16]:
# Empty frame with the review-keyword schema; rows are appended to it later.
review_keyword_df = spark.createDataFrame(
    data=[], schema=review_keyword_column_schema
)

### 4. 함수

In [17]:
# Append the string-valued rows of one frame to the accumulated string frame.
def get_string_df(df, string_columns, string_df):
    """Select the string columns of ``df`` and append them to ``string_df``.

    Args:
        df: Frame to take the rows from.
        string_columns: Columns to select from ``df``.
        string_df: Frame the selected rows are appended to.

    Returns:
        ``string_df`` unioned with the selected rows after ``remove_null``
        has been applied to them. ``union`` matches columns by position.
    """
    selected = df.select(string_columns)
    return string_df.union(remove_null(selected))

In [18]:
# Drop rows with a missing value.
def remove_null(df, column='name'):
    """Return ``df`` without the rows whose ``column`` is null.

    Args:
        df: Frame to filter.
        column: Name of the column that must be non-null. Defaults to
            ``'name'``, the column that used to be hard-coded, so existing
            calls that pass only ``df`` behave as before.

    Returns:
        The filtered frame.
    """
    return df.filter(col(column).isNotNull())

In [19]:
def get_struct_df(df, struct_columns):
    """Return ``df`` restricted to ``struct_columns``.

    Args:
        df: Frame to select from.
        struct_columns: Columns to keep.

    Returns:
        The result of ``df.select(struct_columns)``.
    """
    selected = df.select(struct_columns)
    return selected

In [20]:
def preprocessing_review_keyword(review_keyword_row):
    """Normalise the keyword column and emit one row per keyword.

    The ``keyword`` column is renamed to ``review_keyword``; every " & " is
    replaced by ", ", parentheses are removed, and the resulting string is
    split on ", " and exploded into one row per piece.

    Args:
        review_keyword_row: Frame that contains a ``keyword`` column.

    Returns:
        The transformed frame with a ``review_keyword`` column.
    """
    target = "review_keyword"
    return (
        review_keyword_row
        .withColumnRenamed("keyword", target)
        # Turn " & " into the same ", " separator used for the split below.
        .withColumn(target, regexp_replace(target, " & ", ", "))
        # Remove opening and closing parentheses.
        .withColumn(target, regexp_replace(target, "[()]", ""))
        .withColumn(target, explode(split(col(target), ", ")))
    )

In [21]:
def get_review_keyword_df(struct_df, review_keyword_column, review_keyword_df):
    """Extract review keywords from ``struct_df`` and append them.

    Args:
        struct_df: Frame to select the keyword columns from.
        review_keyword_column: Columns to select from ``struct_df``.
        review_keyword_df: Frame the processed rows are appended to.

    Returns:
        ``review_keyword_df`` unioned with the selected rows after
        ``remove_null`` and ``preprocessing_review_keyword`` were applied.
    """
    # NOTE(review): remove_null filters on 'name', which is not among the
    # columns selected here - presumably Spark resolves it from the parent
    # frame; confirm this is the intended null check.
    keyword_rows = remove_null(struct_df.select(review_keyword_column))
    return review_keyword_df.union(preprocessing_review_keyword(keyword_rows))

### 5. 데이터 전처리

In [22]:
# For every HospitalBase column, pull out the target fields and append the
# string rows and review keywords to the accumulated result frames.
for hospital_base in hospital_bases:
    hospital_base_data = data.select(hospital_base)

    # Address each nested field by its full path and expose it under its
    # short name.
    get_columns = [
        col(f"{hospital_base}.{target}").alias(target)
        for target in target_columns
    ]
    df = hospital_base_data.select(get_columns)

    struct_df = get_struct_df(df, struct_columns)
    string_df = get_string_df(df, string_columns, string_df)
    review_keyword_df = get_review_keyword_df(
        struct_df, review_keyword_column, review_keyword_df
    )

### Test
---

In [24]:
# Flatten only the first HospitalBase column for manual inspection.
hb = hospital_bases[0]
hb_data = data.select(hb)
get_columns = [
    col(f"{hb}.{target}").alias(target)
    for target in target_columns
]
df = hb_data.select(get_columns)

In [None]:
# Try the review-keyword extraction on the single test frame.
struct_df = df.select(struct_columns)
# get_review_keyword_df takes three arguments: the frame, the columns to
# select and the frame to append to. A fresh empty accumulator is passed so
# the output shows only the keywords of this test frame.
keyword_df = get_review_keyword_df(
    struct_df,
    review_keyword_column,
    spark.createDataFrame([], review_keyword_column_schema),
)
keyword_df.show()

In [None]:
from pyspark.sql.functions import col

# Extract the keyword field of reviewSettings.
keyword_df = struct_df.select(col("reviewSettings.keyword"))

# Extract the isDeadUrl, landingUrl and url fields under homepages.repr.
url_info_df = struct_df.select(
    *[
        col(f"homepages.repr.{field}")
        for field in ("isDeadUrl", "landingUrl", "url")
    ]
)

# Bare expression: the notebook displays it as the cell result.
keyword_df