In [33]:
from functools import reduce
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql.functions import col, coalesce, split, regexp_replace, when, length, get_json_object, explode

In [34]:
# Create (or reuse) the Spark session for this notebook under the app name "medi_test".
spark = (
    SparkSession.builder
    .appName("medi_test")
    .getOrCreate()
)

In [35]:
# Project locations. Everything below hangs off `root_path`, which is an
# absolute path on the author's machine.
root_path = '/Users/b06/Desktop/yeardream/medi-05'

# Input JSON directory, output directory, and a text file inside the Spark project.
json_root_path = root_path + '/data/naverplace_meta'
save_root_path = root_path + '/spark-scala-project/output/pyspark'
text_root_path = root_path + '/spark-scala-project/test.txt'

In [36]:
# NOTE(review): neither `chunk_size` nor `test_json_path` is referenced by the
# other cells visible in this notebook — confirm they are still needed.
chunk_size = 100
# First metadata file under the JSON input directory.
test_json_path = f'{json_root_path}/naverplace_meta_1.json'

In [37]:
# Alternative sample input, kept for reference:
# df = spark.read.option("multiline", "true").json('/Users/b06/Desktop/yeardream/medi-05/data/test/naverplace_meta_1.json')

# Load the sample JSON. The "multiline" option lets Spark parse a JSON document
# that spans several lines instead of expecting one record per line.
# The path is derived from `root_path` (config cell) so the project location is
# defined in one place; it resolves to the same file as the previous literal.
df = (
    spark.read
    .option("multiline", "true")
    .json(f'{root_path}/data/test.json')
)

In [38]:
# Preview the loaded frame.
df.show()

+---------------------+--------------------+
|             Hospital|            Panorama|
+---------------------+--------------------+
|[{HospitalBase, 청...|[{Panorama, W1EwJ...|
+---------------------+--------------------+



In [39]:
# Flatten the "Hospital" array: explode() emits one row per array element, in a
# single struct column that gets the default name `col` (see the schema below).
df1 = df.select(explode("Hospital"))

In [40]:
# Inspect the struct fields available under `col`.
df1.printSchema()

root
 |-- col: struct (nullable = true)
 |    |-- __typename: string (nullable = true)
 |    |-- address: string (nullable = true)
 |    |-- bookingBusinessId: string (nullable = true)
 |    |-- bookingButtonName: string (nullable = true)
 |    |-- bookingDisplayName: string (nullable = true)
 |    |-- bookingHubButtonName: string (nullable = true)
 |    |-- broadcastInfos: string (nullable = true)
 |    |-- businessHours: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- __typename: string (nullable = true)
 |    |    |    |-- day: string (nullable = true)
 |    |    |    |-- description: string (nullable = true)
 |    |    |    |-- endTime: string (nullable = true)
 |    |    |    |-- hourString: string (nullable = true)
 |    |    |    |-- index: long (nullable = true)
 |    |    |    |-- isDayOff: boolean (nullable = true)
 |    |    |    |-- startTime: string (nullable = true)
 |    |-- category: string (nullable = true)
 |    |-- cat

In [41]:
# Look up the name of the hospital whose id is '11779766'.
# Filter on the struct's id first, then project just the name.
(
    df1
    .filter(df1["col"]["id"] == '11779766')
    .select('col.name')
    .show()
)

+----------+
|      name|
+----------+
|다인한의원|
+----------+



In [None]:
# df_schema = StructType([
#     StructField('id', StringType(), True),
#     StructField('name', StringType(), True),
#     StructField('review_keywords', StringType(), True),
#     StructField('description', StringType(), True),
#     StructField('road', StringType(), True),
#     StructField('booking_business_id', StringType(), True),
#     StructField('booking_display_name', StringType(), True),
#     StructField('category', StringType(), True),
#     StructField('category_code', StringType(), True),
#     StructField('category_code_list', StringType(), True),
#     StructField('category_count', StringType(), True),
#     StructField('rcode', StringType(), True),
#     StructField('virtual_phone', StringType(), True),
#     StructField('phone', StringType(), True),
#     StructField('naver_booking_url', StringType(), True),
#     StructField('conveniences', StringType(), True),
#     StructField('talktalk_url', StringType(), True),
#     StructField('road_address', StringType(), True),
#     StructField('keywords', StringType(), True),
#     StructField('payment_info', StringType(), True),
#     StructField('ref', StringType(), True),
#     StructField('lon', StringType(), True),
#     StructField('lat', StringType(), True) 
# ])
# schema_df = spark.createDataFrame([], df_schema)

In [None]:
# target_keys = [
#     'id',
#     'name',
#     'bookingBusinessId',
#     'bookingDisplayName',
#     'category',
#     'categoryCode',
#     'categoryCount',    
#     'description',
#     'virtualPhone',
#     'phone',
#     'rcode',
#     'reviewSettings.keyword',
#     'roadAddress',
#     'streetPanorama.__ref',
#     'naverBookingUrl',
#     'talktalkUrl',
#     'categoryCodeList',
#     'conveniences',
#     'keywords',
#     'paymentInfo'
# ]

In [None]:
# def get_hospital_keys(df, str):
#     return [k for k in df.columns if k.startswith(str)]

In [None]:
# hospital_keys = get_hospital_keys(df, 'root.HospitalBase:')
# # hospital_keys

In [None]:
# hospital_df = df.select(hospital_keys)
# hospital_df

In [None]:
# hospital_df.select('HospitalBase:1024029483').dtypes

In [None]:
# Field paths passed to select_target_cols() as `target_keys`.
# Dotted entries (e.g. "reviewSettings.keyword") get their dots replaced with
# "_" when select_target_cols() builds the output alias.
# NOTE(review): the string/array split presumably mirrors the field types in
# the source schema — confirm against printSchema().
string_keys = [
    "id", "name",
    "bookingBusinessId", "bookingDisplayName",
    "category", "categoryCode", "categoryCount",
    "description",
    "virtualPhone", "phone",
    "rcode",
    "reviewSettings.keyword",
    "roadAddress",
    "streetPanorama.__ref",
    "naverBookingUrl", "talktalkUrl",
]

array_keys = ["categoryCodeList", "conveniences", "keywords", "paymentInfo"]

In [None]:
def select_target_cols(hospital_keys, target_keys):
    """Build Spark column expressions for every (hospital key, target key) pair.

    Args:
        hospital_keys: Column names of the form ``'<prefix>:<id>'``. The text
            after the last ``':'`` is used as the id part of each alias.
        target_keys: Field paths appended to each hospital key as
            ``'<hospital_key>.<target_key>'``. Dots in a target key are
            replaced with ``'_'`` when forming the alias.

    Returns:
        A ``(select_cols, target_cols)`` tuple of lists, ordered by hospital
        key first and target key second:

        * ``select_cols`` -- ``col('<hospital_key>.<target_key>')`` aliased to
          ``'<id>_<target_key with dots replaced>'``.
        * ``target_cols`` -- ``col('<alias>')`` for each alias, for addressing
          the columns of a frame produced with ``select_cols``.

    Raises:
        ValueError: If a hospital key contains no ``':'``.
    """
    name_pairs = []
    for hospital_key in hospital_keys:
        # The id depends only on the hospital key, so derive it once per key.
        # rsplit(..., 1) keeps working when the prefix itself contains ':'.
        _, id_num = hospital_key.rsplit(':', 1)
        for target_key in target_keys:
            alias_name = f"{id_num}_{target_key.replace('.', '_')}"
            name_pairs.append((f'{hospital_key}.{target_key}', alias_name))

    # Select from the source path and rename it to the flat alias. The previous
    # version built col(alias).alias(alias), i.e. it selected the alias name
    # itself instead of the source path.
    select_cols = [
        col(source_path).alias(alias_name)
        for source_path, alias_name in name_pairs
    ]
    target_cols = [col(alias_name) for _, alias_name in name_pairs]
    return select_cols, target_cols

In [None]:
# Build the (aliased select expressions, alias column references) pairs for
# `string_keys` and `array_keys`.
# NOTE(review): `hospital_keys` is only assigned in a commented-out cell in the
# visible notebook, so this cell presumably fails on a fresh kernel — confirm
# where it is meant to come from.
select_string_cols, string_cols = select_target_cols(hospital_keys, string_keys)
select_array_cols, array_cols = select_target_cols(hospital_keys, array_keys)

In [None]:
# Display the generated select expressions; the commented names below are the
# other results that can be inspected the same way.
select_string_cols
# string_cols
#select_array_cols
# array_cols

In [None]:
# hospital_df.printSchema()

In [None]:
# Project the aliased string columns and inspect the resulting schema.
# NOTE(review): `hospital_df` is only assigned in a commented-out cell in the
# visible notebook — confirm it is defined before this cell runs.
select_string_df = hospital_df.select(*select_string_cols)
select_string_df.printSchema()

In [None]:
# Show the contents of a single 'HospitalBase:<id>' column.
# NOTE(review): the df.show() output earlier in this notebook lists only the
# `Hospital` and `Panorama` columns, so this column may not exist in the
# currently loaded `df` — confirm which input this cell was written for.
temp = df.select('HospitalBase:1024029483')
temp.select("*").show()

In [None]:
# Show the non-null values of the aliased id column for hospital 1024029483.
select_string_df.select('1024029483_id').dropna().show()

In [None]:
# One single-column DataFrame per aliased string column, with null rows dropped.
lst = [select_string_df.select(column).dropna() for column in string_cols]

In [None]:
# Display the list of per-column DataFrames built in the previous cell.
lst

23/12/04 20:11:29 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 179897 ms exceeds timeout 120000 ms
23/12/04 20:11:29 WARN SparkContext: Killing executors is not supported by current scheduler.
