In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.\
        builder.\
        appName("จาก raw data สู่ Disk").\
        master("spark://spark-master:7077").\
        config("spark.executor.memory", "2000m").\
        config("spark.executor.cores", "2").\
        config("spark.cores.max", "6").\
        getOrCreate()

23/01/30 14:50:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, MapType

# Create a schema for the dataframe
schema = StructType([
    StructField("business_id", StringType(), True),
    StructField("name", StringType(), True),
    StructField("address", StringType(), True),
    StructField("city", StringType(), True),
    StructField("state", StringType(), True),
    StructField("postal_code", StringType(), True),
    StructField("latitude", DoubleType(), True),
    StructField("longitude", DoubleType(), True),
    StructField("stars", DoubleType(), True),
    StructField("review_count", IntegerType(), True),
    StructField("is_open", IntegerType(), True),
    StructField("attributes", MapType(StringType(), StringType()), True),
    StructField("categories", StringType(), True),
    StructField("hours", MapType(StringType(), StringType()), True)
])

# Read the JSON text into a dataframe
df = spark.read.json('yelp_academic_dataset_business.json', schema=schema)

# Show the dataframe
df.show()


[Stage 0:>                                                          (0 + 1) / 1]

+--------------------+--------------------+--------------------+--------------+-----+-----------+-------------+--------------+-----+------------+-------+--------------------+--------------------+--------------------+
|         business_id|                name|             address|          city|state|postal_code|     latitude|     longitude|stars|review_count|is_open|          attributes|          categories|               hours|
+--------------------+--------------------+--------------------+--------------+-----+-----------+-------------+--------------+-----+------------+-------+--------------------+--------------------+--------------------+
|Pns2l4eNsfO8kk83d...|Abby Rappoport, L...|1616 Chapala St, ...| Santa Barbara|   CA|      93101|   34.4266787|  -119.7111968|  5.0|           7|      0|[ByAppointmentOnl...|Doctors, Traditio...|                null|
|mpf3x-BjTdTEA3yCZ...|       The UPS Store|87 Grasso Plaza S...|        Affton|   MO|      63123|    38.551126|    -90.335695|  3.0|

                                                                                

In [3]:
df.take(5)

                                                                                

[Row(business_id='Pns2l4eNsfO8kk83dixA6A', name='Abby Rappoport, LAC, CMQ', address='1616 Chapala St, Ste 2', city='Santa Barbara', state='CA', postal_code='93101', latitude=34.4266787, longitude=-119.7111968, stars=5.0, review_count=7, is_open=0, attributes={'ByAppointmentOnly': 'True'}, categories='Doctors, Traditional Chinese Medicine, Naturopathic/Holistic, Acupuncture, Health & Medical, Nutritionists', hours=None),
 Row(business_id='mpf3x-BjTdTEA3yCZrAYPw', name='The UPS Store', address='87 Grasso Plaza Shopping Center', city='Affton', state='MO', postal_code='63123', latitude=38.551126, longitude=-90.335695, stars=3.0, review_count=15, is_open=1, attributes={'BusinessAcceptsCreditCards': 'True'}, categories='Shipping Centers, Local Services, Notaries, Mailbox Centers, Printing Services', hours={'Tuesday': '8:0-18:30', 'Monday': '0:0-0:0', 'Thursday': '8:0-18:30', 'Friday': '8:0-18:30', 'Wednesday': '8:0-18:30', 'Saturday': '8:0-14:0'}),
 Row(business_id='tUFrWirKiKi_TAnsVWINQQ', 

# Hour Split

In [4]:
from pyspark.sql.functions import explode, col
# Explode the nested hours column
hours_df = df.select("business_id", explode("hours").alias("day", "hours"))

In [5]:
hours_df.show(truncate=False)

+----------------------+---------+---------+
|business_id           |day      |hours    |
+----------------------+---------+---------+
|mpf3x-BjTdTEA3yCZrAYPw|Monday   |0:0-0:0  |
|mpf3x-BjTdTEA3yCZrAYPw|Tuesday  |8:0-18:30|
|mpf3x-BjTdTEA3yCZrAYPw|Wednesday|8:0-18:30|
|mpf3x-BjTdTEA3yCZrAYPw|Thursday |8:0-18:30|
|mpf3x-BjTdTEA3yCZrAYPw|Friday   |8:0-18:30|
|mpf3x-BjTdTEA3yCZrAYPw|Saturday |8:0-14:0 |
|tUFrWirKiKi_TAnsVWINQQ|Monday   |8:0-22:0 |
|tUFrWirKiKi_TAnsVWINQQ|Tuesday  |8:0-22:0 |
|tUFrWirKiKi_TAnsVWINQQ|Wednesday|8:0-22:0 |
|tUFrWirKiKi_TAnsVWINQQ|Thursday |8:0-22:0 |
|tUFrWirKiKi_TAnsVWINQQ|Friday   |8:0-23:0 |
|tUFrWirKiKi_TAnsVWINQQ|Saturday |8:0-23:0 |
|tUFrWirKiKi_TAnsVWINQQ|Sunday   |8:0-22:0 |
|MTSW4McQd7CbVtyjqoe9mw|Monday   |7:0-20:0 |
|MTSW4McQd7CbVtyjqoe9mw|Tuesday  |7:0-20:0 |
|MTSW4McQd7CbVtyjqoe9mw|Wednesday|7:0-20:0 |
|MTSW4McQd7CbVtyjqoe9mw|Thursday |7:0-20:0 |
|MTSW4McQd7CbVtyjqoe9mw|Friday   |7:0-21:0 |
|MTSW4McQd7CbVtyjqoe9mw|Saturday |7:0-21:0 |
|MTSW4McQd

In [6]:
# Join the two dataframes on business_id
result_df = df.join(hours_df, "business_id")

In [7]:
result_df.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- stars: double (nullable = true)
 |-- review_count: integer (nullable = true)
 |-- is_open: integer (nullable = true)
 |-- attributes: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- categories: string (nullable = true)
 |-- hours: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- day: string (nullable = false)
 |-- hours: string (nullable = true)



In [8]:
df.describe().toPandas().transpose()

                                                                                

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
business_id,150346,,,---kPU91CF4Lq2-WlRu9Lw,zzyx5x0Z7xXWWvWnZFuxlQ
name,150346,1252.4,811.1275005954502,Grow Academy,​​Transformational Abdominal Massage by Jada D...
address,150346,7369.333333333333,8738.777641447725,,​185 E State St
city,150346,,,AB Edmonton,​Lithia
state,150346,,,AB,XMS
postal_code,150346,45177.81755426108,26395.882085856494,,T9E 0V3
latitude,150346,36.67115006414579,5.872758917014046,27.555127,53.6791969
longitude,150346,-89.3573394897143,14.91850167993061,-120.095137,-73.2004570502
stars,150346,3.5967235576603303,0.9744207509201366,1.0,5.0


In [9]:
df.select('business_id').distinct().count()

                                                                                

150346

In [10]:
df.filter(col('hours').isNull()).count()

23223

In [11]:
result_df.select('business_id').distinct().count()

                                                                                

127123

In [12]:
result_df.orderBy('business_id').limit(10).toPandas().transpose()

                                                                                

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
business_id,---kPU91CF4Lq2-WlRu9Lw,---kPU91CF4Lq2-WlRu9Lw,---kPU91CF4Lq2-WlRu9Lw,---kPU91CF4Lq2-WlRu9Lw,---kPU91CF4Lq2-WlRu9Lw,---kPU91CF4Lq2-WlRu9Lw,--7jw19RH9JKXgFohspgQw,--7jw19RH9JKXgFohspgQw,--7jw19RH9JKXgFohspgQw,--7jw19RH9JKXgFohspgQw
name,Frankie's Raw Bar,Frankie's Raw Bar,Frankie's Raw Bar,Frankie's Raw Bar,Frankie's Raw Bar,Frankie's Raw Bar,McIlwain Family Dentistry & Ahrens Orthodontics,McIlwain Family Dentistry & Ahrens Orthodontics,McIlwain Family Dentistry & Ahrens Orthodontics,McIlwain Family Dentistry & Ahrens Orthodontics
address,4903 State Rd 54,4903 State Rd 54,4903 State Rd 54,4903 State Rd 54,4903 State Rd 54,4903 State Rd 54,"26908 Foggy Creek Rd, Ste 101","26908 Foggy Creek Rd, Ste 101","26908 Foggy Creek Rd, Ste 101","26908 Foggy Creek Rd, Ste 101"
city,New Port Richey,New Port Richey,New Port Richey,New Port Richey,New Port Richey,New Port Richey,Wesley Chapel,Wesley Chapel,Wesley Chapel,Wesley Chapel
state,FL,FL,FL,FL,FL,FL,FL,FL,FL,FL
postal_code,34652,34652,34652,34652,34652,34652,33544,33544,33544,33544
latitude,28.217288,28.217288,28.217288,28.217288,28.217288,28.217288,28.189156,28.189156,28.189156,28.189156
longitude,-82.733344,-82.733344,-82.733344,-82.733344,-82.733344,-82.733344,-82.371421,-82.371421,-82.371421,-82.371421
stars,4.5,4.5,4.5,4.5,4.5,4.5,4.0,4.0,4.0,4.0
review_count,24,24,24,24,24,24,13,13,13,13


In [13]:
result_df.take(1)

                                                                                

[Row(business_id='-0iIxySkp97WNlwK66OGWg', name='Truckee Bagel Company - Midtown', address='538 S Virginia St, Ste B', city='Reno', state='NV', postal_code='89501', latitude=39.5202401, longitude=-119.810022, stars=3.5, review_count=219, is_open=1, attributes={'BikeParking': 'True', 'BusinessParking': "{'garage': False, 'street': False, 'validated': False, 'lot': True, 'valet': False}", 'RestaurantsGoodForGroups': 'True', 'RestaurantsPriceRange2': '1', 'Alcohol': "'none'", 'RestaurantsTakeOut': 'True', 'ByAppointmentOnly': 'False', 'Caters': 'True', 'RestaurantsAttire': "'casual'", 'GoodForKids': 'True', 'HasTV': 'False', 'BusinessAcceptsCreditCards': 'True', 'RestaurantsDelivery': 'True', 'BusinessAcceptsBitcoin': 'False', 'NoiseLevel': "u'average'", 'RestaurantsTableService': 'False', 'RestaurantsReservations': 'False', 'WheelchairAccessible': 'True', 'HappyHour': 'False', 'BYOB': 'False', 'Ambience': "{'touristy': False, 'hipster': False, 'romantic': False, 'divey': False, 'intimate

# Attribute Split

In [14]:
from pyspark.sql.functions import explode, col
# Explode the nested attributes column
attributes_df = df.select("business_id", explode("attributes").alias("businessAttrType", "businessAttrValue"))

In [15]:
attributes_df.count()

1206820

In [16]:
attributes_df.orderBy('business_id').limit(30).toPandas().transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
business_id,---kPU91CF4Lq2-WlRu9Lw,---kPU91CF4Lq2-WlRu9Lw,---kPU91CF4Lq2-WlRu9Lw,---kPU91CF4Lq2-WlRu9Lw,---kPU91CF4Lq2-WlRu9Lw,---kPU91CF4Lq2-WlRu9Lw,---kPU91CF4Lq2-WlRu9Lw,---kPU91CF4Lq2-WlRu9Lw,---kPU91CF4Lq2-WlRu9Lw,---kPU91CF4Lq2-WlRu9Lw,...,--0iUa4sNDFiZFrAdIWhZQ,--0iUa4sNDFiZFrAdIWhZQ,--0iUa4sNDFiZFrAdIWhZQ,--0iUa4sNDFiZFrAdIWhZQ,--0iUa4sNDFiZFrAdIWhZQ,--0iUa4sNDFiZFrAdIWhZQ,--0iUa4sNDFiZFrAdIWhZQ,--0iUa4sNDFiZFrAdIWhZQ,--30_8IhuyMHbSOcNWd6DQ,--30_8IhuyMHbSOcNWd6DQ
businessAttrType,WheelchairAccessible,BusinessAcceptsBitcoin,Ambience,RestaurantsTakeOut,BikeParking,OutdoorSeating,Alcohol,Corkage,HasTV,DogsAllowed,...,RestaurantsPriceRange2,GoodForKids,BusinessAcceptsCreditCards,RestaurantsReservations,RestaurantsAttire,RestaurantsDelivery,RestaurantsTakeOut,RestaurantsGoodForGroups,GoodForKids,BusinessAcceptsCreditCards
businessAttrValue,True,False,"{'touristy': False, 'hipster': False, 'romanti...",True,True,True,u'none',False,False,True,...,1,True,False,True,'casual',False,True,True,True,True


In [17]:
# Join the two dataframes on business_id
result_df = df.join(attributes_df, "business_id")

In [18]:
result_df.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- stars: double (nullable = true)
 |-- review_count: integer (nullable = true)
 |-- is_open: integer (nullable = true)
 |-- attributes: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- categories: string (nullable = true)
 |-- hours: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- businessAttrType: string (nullable = false)
 |-- businessAttrValue: string (nullable = true)



In [19]:
result_df.count()

1206820

In [20]:
result_df.orderBy('business_id').limit(10).toPandas().transpose()

                                                                                

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
business_id,---kPU91CF4Lq2-WlRu9Lw,---kPU91CF4Lq2-WlRu9Lw,---kPU91CF4Lq2-WlRu9Lw,---kPU91CF4Lq2-WlRu9Lw,---kPU91CF4Lq2-WlRu9Lw,---kPU91CF4Lq2-WlRu9Lw,---kPU91CF4Lq2-WlRu9Lw,---kPU91CF4Lq2-WlRu9Lw,---kPU91CF4Lq2-WlRu9Lw,---kPU91CF4Lq2-WlRu9Lw
name,Frankie's Raw Bar,Frankie's Raw Bar,Frankie's Raw Bar,Frankie's Raw Bar,Frankie's Raw Bar,Frankie's Raw Bar,Frankie's Raw Bar,Frankie's Raw Bar,Frankie's Raw Bar,Frankie's Raw Bar
address,4903 State Rd 54,4903 State Rd 54,4903 State Rd 54,4903 State Rd 54,4903 State Rd 54,4903 State Rd 54,4903 State Rd 54,4903 State Rd 54,4903 State Rd 54,4903 State Rd 54
city,New Port Richey,New Port Richey,New Port Richey,New Port Richey,New Port Richey,New Port Richey,New Port Richey,New Port Richey,New Port Richey,New Port Richey
state,FL,FL,FL,FL,FL,FL,FL,FL,FL,FL
postal_code,34652,34652,34652,34652,34652,34652,34652,34652,34652,34652
latitude,28.217288,28.217288,28.217288,28.217288,28.217288,28.217288,28.217288,28.217288,28.217288,28.217288
longitude,-82.733344,-82.733344,-82.733344,-82.733344,-82.733344,-82.733344,-82.733344,-82.733344,-82.733344,-82.733344
stars,4.5,4.5,4.5,4.5,4.5,4.5,4.5,4.5,4.5,4.5
review_count,24,24,24,24,24,24,24,24,24,24
