In [23]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [24]:
import json
import pandas as pd
from pyspark import SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql.functions import explode, map_keys, col, first, get_json_object, array, to_json, struct

In [25]:
# Spark 세션 생성
spark = SparkSession \
        .builder \
        .appName("processingJson") \
        .getOrCreate()

In [26]:
# 분할된 JSON 파일 경로 선택하는 함수
def nth_json_path(n):
    return f'/Users/b06/Desktop/yeardream/medi-05/data/naverplace_meta/naverplace_meta_{n}.json'

In [27]:
# 첫번째 JSON 파일 경로
n = 1
json_path_1 = nth_json_path(n)
json_path_1

'/Users/b06/Desktop/yeardream/medi-05/data/naverplace_meta/naverplace_meta_1.json'

In [28]:
data = spark.read.json(json_path_1)

In [29]:
## EDA
#df.show(1)
#df.printSchema(1)
#df.dtypes
columns = data.columns

In [30]:
hb_columns = [c for c in columns if "HospitalBase" in c]
hb_columns

['HospitalBase:11779766',
 'HospitalBase:1233961055',
 'HospitalBase:12857046',
 'HospitalBase:1344046290',
 'HospitalBase:1413208872',
 'HospitalBase:1934828030',
 'HospitalBase:19518318',
 'HospitalBase:19530874',
 'HospitalBase:265615453',
 'HospitalBase:33237939']

In [31]:
hb_column = hb_columns.pop(0)
hb_column

'HospitalBase:11779766'

In [32]:
df = data.select(hb_column)
df.printSchema(2)

root
 |-- HospitalBase:11779766: struct (nullable = true)
 |    |-- __typename: string (nullable = true)
 |    |-- address: string (nullable = true)
 |    |-- bookingBusinessId: string (nullable = true)
 |    |-- bookingButtonName: string (nullable = true)
 |    |-- bookingDisplayName: string (nullable = true)
 |    |-- bookingHubButtonName: string (nullable = true)
 |    |-- broadcastInfos: string (nullable = true)
 |    |-- businessHours: array (nullable = true)
 |    |-- category: string (nullable = true)
 |    |-- categoryCode: string (nullable = true)
 |    |-- categoryCodeList: array (nullable = true)
 |    |-- categoryCount: long (nullable = true)
 |    |-- cescoCheck: string (nullable = true)
 |    |-- cescoLink: string (nullable = true)
 |    |-- cescofsCheck: string (nullable = true)
 |    |-- cescofsLink: string (nullable = true)
 |    |-- chatBotUrl: string (nullable = true)
 |    |-- conveniences: array (nullable = true)
 |    |-- coordinate: struct (nullable = true)
 |   

In [33]:
targets = [
    'name', 
    'road', 
    'reviewSettings', 
    'conveniences', 
    'keywords', 
    'phone', 
    'virtualPhone', 
    'naverBookingUrl', 
    'talktalkUrl', 
    'paymentInfo', 
    'homepages',
    'visitorReviewsTotal',
    'description',
    'Images'
]
targets

['name',
 'road',
 'reviewSettings',
 'conveniences',
 'keywords',
 'phone',
 'virtualPhone',
 'naverBookingUrl',
 'talktalkUrl',
 'paymentInfo',
 'homepages',
 'visitorReviewsTotal',
 'description',
 'Images']

In [34]:
sel_cols = [col(hb_column + "." + t).alias(t) for t in targets]
sel_cols

[Column<'HospitalBase:11779766.name AS name'>,
 Column<'HospitalBase:11779766.road AS road'>,
 Column<'HospitalBase:11779766.reviewSettings AS reviewSettings'>,
 Column<'HospitalBase:11779766.conveniences AS conveniences'>,
 Column<'HospitalBase:11779766.keywords AS keywords'>,
 Column<'HospitalBase:11779766.phone AS phone'>,
 Column<'HospitalBase:11779766.virtualPhone AS virtualPhone'>,
 Column<'HospitalBase:11779766.naverBookingUrl AS naverBookingUrl'>,
 Column<'HospitalBase:11779766.talktalkUrl AS talktalkUrl'>,
 Column<'HospitalBase:11779766.paymentInfo AS paymentInfo'>,
 Column<'HospitalBase:11779766.homepages AS homepages'>,
 Column<'HospitalBase:11779766.visitorReviewsTotal AS visitorReviewsTotal'>,
 Column<'HospitalBase:11779766.description AS description'>,
 Column<'HospitalBase:11779766.Images AS Images'>]

In [35]:
sel_df = df.select(sel_cols)
sel_df.printSchema(1)

root
 |-- name: string (nullable = true)
 |-- road: string (nullable = true)
 |-- reviewSettings: struct (nullable = true)
 |-- conveniences: array (nullable = true)
 |-- keywords: array (nullable = true)
 |-- phone: string (nullable = true)
 |-- virtualPhone: string (nullable = true)
 |-- naverBookingUrl: string (nullable = true)
 |-- talktalkUrl: string (nullable = true)
 |-- paymentInfo: array (nullable = true)
 |-- homepages: struct (nullable = true)
 |-- visitorReviewsTotal: long (nullable = true)
 |-- description: string (nullable = true)
 |-- Images: array (nullable = true)



In [36]:
string_cols = [
    'name', 
    'road',
    'phone',
    'virtualPhone',
    'naverBookingUrl',
    'talktalkUrl',
    'description'
]

In [37]:
struct_cols = [
    'reviewSettings',
    'homepages'
]

In [38]:
array_cols = [
    'conveniences',
    'keywords',
    'Images'
]

In [52]:
string_schema = StructType([
    StructField("name", StringType(), True),
    StructField("road", StringType(), True),
    StructField("phone", StringType(), True),
    StructField("virtualPhone", StringType(), True),
    StructField("naverBookingUrl", StringType(), True),
    StructField("talktalkUrl", StringType(), True),
    StructField("description", StringType(), True)
])
df_str = spark.createDataFrame([], string_schema)

In [53]:
sel_str_df = sel_df.select
str_df_row = sel_str_df.filter(~col('name').isNull())
str_df_row.show()

+----------+--------------------------------+--------------------+-----------------------------+---------------------------------+-----------+------------+--------------------+-----------+-----------+--------------------+-------------------+---------------------------------+--------------------+
|      name|                            road|      reviewSettings|                 conveniences|                         keywords|      phone|virtualPhone|     naverBookingUrl|talktalkUrl|paymentInfo|           homepages|visitorReviewsTotal|                      description|              Images|
+----------+--------------------------------+--------------------+-----------------------------+---------------------------------+-----------+------------+--------------------+-----------+-----------+--------------------+-------------------+---------------------------------+--------------------+
|다인한의원|지하철 숙대입구역 10번에서 뒤...|{ReviewSettings, ...|[주차, 예약, 무선 인터넷, ...|[통증퇴행성질환, 다이어트비만,...|02-711-9557|      

In [54]:
df_str = df_str.union(str_df_row)

AnalysisException: [NUM_COLUMNS_MISMATCH] UNION can only be performed on inputs with the same number of columns, but the first input has 7 columns and the second input has 14 columns.;
'Union false, false
:- LogicalRDD [name#1298, road#1299, phone#1300, virtualPhone#1301, naverBookingUrl#1302, talktalkUrl#1303, description#1304], false
+- Filter NOT isnull(name#1082)
   +- Project [HospitalBase:11779766#645.name AS name#1082, HospitalBase:11779766#645.road AS road#1083, HospitalBase:11779766#645.reviewSettings AS reviewSettings#1084, HospitalBase:11779766#645.conveniences AS conveniences#1085, HospitalBase:11779766#645.keywords AS keywords#1086, HospitalBase:11779766#645.phone AS phone#1087, HospitalBase:11779766#645.virtualPhone AS virtualPhone#1088, HospitalBase:11779766#645.naverBookingUrl AS naverBookingUrl#1089, HospitalBase:11779766#645.talktalkUrl AS talktalkUrl#1090, HospitalBase:11779766#645.paymentInfo AS paymentInfo#1091, HospitalBase:11779766#645.homepages AS homepages#1092, HospitalBase:11779766#645.visitorReviewsTotal AS visitorReviewsTotal#1093L, HospitalBase:11779766#645.description AS description#1094, HospitalBase:11779766#645.Images AS Images#1095]
      +- Project [HospitalBase:11779766#645]
         +- Relation [BaseNaverBlog:betbetter#574,BaseNaverBlog:bondiolsc#575,BaseNaverBlog:dainhani#576,BaseNaverBlog:kundaeclinic#577,BaseNaverBlog:memeetsworld#578,BusStation:104094#579,BusStation:104154#580,BusStation:104172#581,BusStation:104181#582,BusStation:104212#583,BusStation:104222#584,BusStation:104231#585,BusStation:104321#586,BusStation:104459#587,BusStation:104500#588,BusStation:104532#589,BusStation:104554#590,BusStation:104573#591,BusStation:104578#592,BusStation:104582#593,BusStation:104602#594,BusStation:104773#595,BusStation:123595#596,BusStation:123596#597,... 229 more fields] json
