### 1. 라이브러리 불러오기 및 세션 생성

In [1]:
import json
import pandas as pd
from pyspark import SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, IntegerType
from pyspark.sql.functions import explode, map_keys, col, first, get_json_object, array, to_json, struct, regexp_replace, split

In [2]:
# Create the Spark session (getOrCreate returns an existing session if one is running)
spark = (
    SparkSession.builder
    .appName("processingJson")
    .getOrCreate()
)

23/11/20 14:43:15 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


### 2. 데이터 불러오기

In [3]:
# Build the path of one of the split JSON files
def nth_json_path(n, base_dir='/Users/b06/Desktop/yeardream/medi-05/data/naverplace_meta'):
    """Return the path of the ``n``-th split ``naverplace_meta`` JSON file.

    Args:
        n: Index of the split file; interpolated into the file name as-is.
        base_dir: Directory that holds the split files. Defaults to the
            location that used to be hard-coded here, so existing one-argument
            calls keep returning exactly the same path.

    Returns:
        The string ``{base_dir}/naverplace_meta_{n}.json``.
    """
    return f'{base_dir}/naverplace_meta_{n}.json'

In [4]:
# Load the first split JSON file into a Spark DataFrame
n = 1
data = spark.read.json(path=nth_json_path(n))

23/11/20 14:43:16 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


### 3. 변수

In [5]:
# Top-level column names of the loaded JSON DataFrame
columns = data.columns

In [6]:
# Keep only the top-level columns whose name contains "HospitalBase"
hospital_bases = [
    column_name
    for column_name in columns
    if "HospitalBase" in column_name
]

In [7]:
# Nested fields extracted from each HospitalBase column
target_columns = [
    "id", "name", "road",
    "reviewSettings", "conveniences", "keywords",
    "phone", "virtualPhone",
    "naverBookingUrl", "talktalkUrl",
    "paymentInfo", "homepages",
    "visitorReviewsTotal", "description",
    "Images",
]

In [8]:
# Columns selected when building string_table
string_columns = [
    "id", "name", "road",
    "phone", "virtualPhone",
    "naverBookingUrl", "talktalkUrl",
    "visitorReviewsTotal", "description",
]

In [9]:
# id plus the two nested columns that are processed separately
struct_columns = ["id", "reviewSettings", "homepages"]

In [10]:
# id plus the keyword field nested under reviewSettings
review_keyword_columns = ["id", "reviewSettings.keyword"]

In [11]:
# id plus the fields read from homepages.repr
homepages_columns = ["id"] + [
    f"homepages.repr.{field_name}"
    for field_name in ("url", "type", "isDeadUrl", "landingUrl")
]

In [39]:
# id plus the conveniences column
conveniences_columns = ["id", "conveniences"]

In [40]:
# id plus the keywords column
keywords_columns = ["id", "keywords"]

In [41]:
# id plus the paymentInfo column
paymentInfo_columns = ["id", "paymentInfo"]

In [13]:
# Schema of string_table: every field is a nullable string except
# visitorReviewsTotal, which is declared as a nullable integer.
string_columns_schema = (
    StructType()
    .add("id", StringType(), True)
    .add("name", StringType(), True)
    .add("road", StringType(), True)
    .add("phone", StringType(), True)
    .add("virtualPhone", StringType(), True)
    .add("naverBookingUrl", StringType(), True)
    .add("talktalkUrl", StringType(), True)
    .add("visitorReviewsTotal", IntegerType(), True)
    .add("description", StringType(), True)
)

In [14]:
# Schema of review_keyword_table: two nullable string fields
review_keyword_columns_schema = (
    StructType()
    .add("id", StringType(), True)
    .add("review_keyword", StringType(), True)
)

In [15]:
# Schema of homepages_table: nullable strings plus a nullable boolean isDeadUrl
homepages_columns_schema = (
    StructType()
    .add("id", StringType(), True)
    .add("url", StringType(), True)
    .add("type", StringType(), True)
    .add("isDeadUrl", BooleanType(), True)
    .add("landingUrl", StringType(), True)
)

In [None]:
# Schema of conveniences_table: two nullable string fields.
# The field list was previously never closed with "]", so this cell could not be parsed.
# NOTE(review): the printSchema output in this notebook shows `conveniences` as
# array<string>; confirm StringType is the intended type before unioning rows in.
conveniences_columns_schema = StructType([
    StructField("id", StringType(), True),
    StructField("conveniences", StringType(), True),
])

In [None]:
# Schema of keywords_table: two nullable string fields.
# The field list was previously never closed with "]", so this cell could not be parsed.
# NOTE(review): the printSchema output in this notebook shows `keywords` as
# array<string>; confirm StringType is the intended type before unioning rows in.
keywords_columns_schema = StructType([
    StructField("id", StringType(), True),
    StructField("keywords", StringType(), True),
])

In [16]:
# Empty DataFrame with the string schema; rows are unioned into it later
string_table = spark.createDataFrame(data=[], schema=string_columns_schema)

In [17]:
# Empty DataFrame with the review-keyword schema; rows are unioned into it later
review_keyword_table = spark.createDataFrame(
    data=[],
    schema=review_keyword_columns_schema,
)

In [18]:
# Empty DataFrame with the homepages schema; rows are unioned into it later
homepages_table = spark.createDataFrame(data=[], schema=homepages_columns_schema)

In [None]:
# Empty DataFrame with the conveniences schema
conveniences_table = spark.createDataFrame(
    data=[],
    schema=conveniences_columns_schema,
)

In [None]:
# Empty DataFrame with the keywords schema
keywords_table = spark.createDataFrame(data=[], schema=keywords_columns_schema)

### 4. 함수

In [19]:
def get_table(df, columns, table):
    """Append the selected, null-filtered rows of ``df`` to ``table``.

    Args:
        df: Source DataFrame.
        columns: Columns to select from ``df``.
        table: DataFrame the resulting rows are unioned onto.

    Returns:
        ``table`` unioned with the rows of ``df.select(columns)`` that pass
        ``remove_null``.
    """
    selected = df.select(columns)
    return table.union(remove_null(selected))

In [20]:
def remove_null(df):
    """Return ``df`` restricted to the rows whose ``name`` column is not null."""
    has_name = col('name').isNotNull()
    return df.filter(has_name)

In [21]:
def preprocessing_review_keyword(review_keyword_row):
    """Normalise the review keyword string and emit one row per keyword.

    Steps, in order:
      1. rename the ``keyword`` column to ``review_keyword``;
      2. replace every `` & `` separator with ``, ``;
      3. strip parentheses;
      4. split on ``, `` and explode so each keyword gets its own row.

    Args:
        review_keyword_row: DataFrame containing a ``keyword`` column.

    Returns:
        The transformed DataFrame with a ``review_keyword`` column.
    """
    target = "review_keyword"
    return (
        review_keyword_row
        .withColumnRenamed("keyword", target)
        .withColumn(target, regexp_replace(target, " & ", ", "))
        .withColumn(target, regexp_replace(target, "[()]", ""))
        .withColumn(target, explode(split(col(target), ", ")))
    )

In [22]:
def get_review_keyword_table(struct_df, review_keyword_columns, review_keyword_df):
    """Append the preprocessed review keywords of ``struct_df`` to ``review_keyword_df``.

    Args:
        struct_df: Source DataFrame.
        review_keyword_columns: Columns to select from ``struct_df``.
        review_keyword_df: DataFrame the resulting rows are unioned onto.

    Returns:
        ``review_keyword_df`` unioned with the selected rows after
        ``remove_null`` and ``preprocessing_review_keyword`` have been applied.
    """
    selected = struct_df.select(review_keyword_columns)
    keyword_rows = preprocessing_review_keyword(remove_null(selected))
    return review_keyword_df.union(keyword_rows)

In [23]:
def check_null(df, column, expected_nulls=10):
    """Report whether ``column`` is null in exactly ``expected_nulls`` rows of ``df``.

    Args:
        df: DataFrame to inspect.
        column: Name of the column (may be a nested path) to test for nulls.
        expected_nulls: Null count that makes the check succeed. Defaults to
            10, the value that used to be hard-coded here, so existing
            two-argument calls behave exactly as before.
            NOTE(review): 10 presumably equals the number of rows per input
            file, i.e. "every row is null" — confirm, and pass the real row
            count when that differs.

    Returns:
        ``True`` if the number of rows where ``column`` is null equals
        ``expected_nulls``, otherwise ``False``.
    """
    null_count = df.filter(col(column).isNull()).count()
    return null_count == expected_nulls

In [24]:
def get_homepages_table(struct_df, homepages_columns, homepages_table):
    """Append homepage rows from ``struct_df`` unless ``check_null`` flags them.

    Args:
        struct_df: Source DataFrame containing a ``homepages.repr`` field.
        homepages_columns: Columns to select from ``struct_df``.
        homepages_table: DataFrame the resulting rows are unioned onto.

    Returns:
        ``homepages_table`` unchanged when ``check_null(struct_df,
        'homepages.repr')`` is true; otherwise the result of ``get_table``.
    """
    if not check_null(struct_df, 'homepages.repr'):
        return get_table(struct_df, homepages_columns, homepages_table)
    return homepages_table

### 5. 데이터 전처리

In [25]:
# For every HospitalBase column, extract the target fields and append the
# resulting rows to the three accumulator tables.
for hospital_base in hospital_bases:
    hospital_base_data = data.select(hospital_base)

    # Expose each nested target field under its bare name
    get_columns = [
        col(f"{hospital_base}.{target}").alias(target)
        for target in target_columns
    ]
    df = hospital_base_data.select(get_columns)
    struct_df = df.select(struct_columns)

    string_table = get_table(df, string_columns, string_table)
    review_keyword_table = get_review_keyword_table(
        struct_df,
        review_keyword_columns,
        review_keyword_table,
    )
    homepages_table = get_homepages_table(df, homepages_columns, homepages_table)

In [32]:
# string_table.show(50)
# review_keyword_table.show(50)
# homepages_table.show(50)

### Test
---

In [28]:
# Build the extracted-field DataFrame for a single HospitalBase column (index 1)
hb = hospital_bases[1]
hb_data = data.select(hb)
get_columns = [
    col(f"{hb}.{target}").alias(target)
    for target in target_columns
]
df = hb_data.select(get_columns)

In [29]:
# Print the name of the first HospitalBase column
print(hospital_bases[0])

HospitalBase:11779766


In [30]:
# Print the schema of df
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- road: string (nullable = true)
 |-- reviewSettings: struct (nullable = true)
 |    |-- __typename: string (nullable = true)
 |    |-- blog: long (nullable = true)
 |    |-- cafe: long (nullable = true)
 |    |-- keyword: string (nullable = true)
 |    |-- showVisitorReviewScore: boolean (nullable = true)
 |-- conveniences: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- keywords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- phone: string (nullable = true)
 |-- virtualPhone: string (nullable = true)
 |-- naverBookingUrl: string (nullable = true)
 |-- talktalkUrl: string (nullable = true)
 |-- paymentInfo: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- homepages: struct (nullable = true)
 |    |-- __typename: string (nullable = true)
 |    |-- etc: array (nullable = true)
 |    |    |-- element: string (containsNull = t

In [31]:
HospitalBase:11779766