In [1]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session for this notebook. The session connects
# to the standalone master and sets explicit resource limits for the app.
spark = (
    SparkSession.builder
    .appName("จาก raw data สู่ Disk")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "2000m")  # heap size of each executor
    .config("spark.executor.cores", "2")       # cores used by each executor
    .config("spark.cores.max", "6")            # total cores this app may claim
    .getOrCreate()
)

23/01/30 17:14:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
from pyspark.sql.functions import explode
from pyspark.sql.types import StructType, StructField, StringType, MapType

# Explicit schema for the business JSON file, so Spark does not have to infer
# it. Every scalar field is read as a nullable string; the two nested objects
# (`attributes` and `hours`) are read as string -> string maps.
# NOTE(review): latitude/longitude/stars/review_count/is_open look numeric in
# the sample output but are deliberately kept as strings here — confirm
# whether downstream steps expect to cast them.
df_schema = (
    StructType()
    .add("business_id", StringType(), True)
    .add("name", StringType(), True)
    .add("address", StringType(), True)
    .add("city", StringType(), True)
    .add("state", StringType(), True)
    .add("postal_code", StringType(), True)
    .add("latitude", StringType(), True)
    .add("longitude", StringType(), True)
    .add("stars", StringType(), True)
    .add("review_count", StringType(), True)
    .add("is_open", StringType(), True)
    .add("attributes", MapType(StringType(), StringType()), True)
    .add("categories", StringType(), True)
    .add("hours", MapType(StringType(), StringType()), True)
)


In [3]:
# Load the raw business records, applying the explicit schema on the reader
# instead of letting Spark infer one from the data.
df = spark.read.schema(df_schema).json('yelp_academic_dataset_business.json')


In [4]:
# Print the column names and types of the loaded DataFrame to confirm that the
# explicit schema (string scalars, map-typed `attributes` and `hours`) was applied.
df.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- stars: string (nullable = true)
 |-- review_count: string (nullable = true)
 |-- is_open: string (nullable = true)
 |-- attributes: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- categories: string (nullable = true)
 |-- hours: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



In [5]:
# Explode the nested `hours` map into one row per (business_id, day).
# Kept as a standalone frame because it is a convenient narrow view of the
# opening hours on its own.
hours_df = df.select("business_id", explode("hours").alias("day", "hours"))

# Build the flattened result in a single pass: keep every business column
# except the original map-typed `hours`, and append the exploded `day` /
# `hours` pair.
#
# The previous version joined `df` back to `hours_df` on business_id, which
# (a) left TWO columns named `hours` in the result (the original map and the
#     exploded string), so any later reference to "hours" is ambiguous and
#     writers that validate column names reject the frame, and
# (b) forced a shuffle join just to re-attach columns that were already on
#     the row being exploded.
#
# Like the inner join it replaces, explode() emits no row for a business whose
# `hours` map is null or empty.
# NOTE(review): equivalence with the old join assumes business_id is unique
# per input row — confirm for this dataset.
result_df = df.select(
    *[column for column in df.columns if column != "hours"],
    explode("hours").alias("day", "hours"),
)


In [6]:
# Count the rows of the flattened result. This is an action, so it makes Spark
# actually execute the read and explode steps defined above.
result_df.count()

                                                                                

801015

In [7]:
# Preview the first ten rows by business_id. Only those ten rows are brought
# into pandas, then transposed so each record reads as a column.
(
    result_df
    .orderBy('business_id')
    .limit(10)
    .toPandas()
    .T
)

                                                                                

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
business_id,---kPU91CF4Lq2-WlRu9Lw,---kPU91CF4Lq2-WlRu9Lw,---kPU91CF4Lq2-WlRu9Lw,---kPU91CF4Lq2-WlRu9Lw,---kPU91CF4Lq2-WlRu9Lw,---kPU91CF4Lq2-WlRu9Lw,--7jw19RH9JKXgFohspgQw,--7jw19RH9JKXgFohspgQw,--7jw19RH9JKXgFohspgQw,--7jw19RH9JKXgFohspgQw
name,Frankie's Raw Bar,Frankie's Raw Bar,Frankie's Raw Bar,Frankie's Raw Bar,Frankie's Raw Bar,Frankie's Raw Bar,McIlwain Family Dentistry & Ahrens Orthodontics,McIlwain Family Dentistry & Ahrens Orthodontics,McIlwain Family Dentistry & Ahrens Orthodontics,McIlwain Family Dentistry & Ahrens Orthodontics
address,4903 State Rd 54,4903 State Rd 54,4903 State Rd 54,4903 State Rd 54,4903 State Rd 54,4903 State Rd 54,"26908 Foggy Creek Rd, Ste 101","26908 Foggy Creek Rd, Ste 101","26908 Foggy Creek Rd, Ste 101","26908 Foggy Creek Rd, Ste 101"
city,New Port Richey,New Port Richey,New Port Richey,New Port Richey,New Port Richey,New Port Richey,Wesley Chapel,Wesley Chapel,Wesley Chapel,Wesley Chapel
state,FL,FL,FL,FL,FL,FL,FL,FL,FL,FL
postal_code,34652,34652,34652,34652,34652,34652,33544,33544,33544,33544
latitude,28.2172884,28.2172884,28.2172884,28.2172884,28.2172884,28.2172884,28.1891564,28.1891564,28.1891564,28.1891564
longitude,-82.7333444,-82.7333444,-82.7333444,-82.7333444,-82.7333444,-82.7333444,-82.3714207,-82.3714207,-82.3714207,-82.3714207
stars,4.5,4.5,4.5,4.5,4.5,4.5,4.0,4.0,4.0,4.0
review_count,24,24,24,24,24,24,13,13,13,13
