In [1]:
# Neo4j Spark connector documentation (write strategies): https://neo4j.com/docs/spark/4.1/writing/#strategies
# Yelp dataset description: https://www.yelp.com/dataset/documentation/main

In [2]:
# Category name that later cells look for inside each business's split category list.
filterCategories = "Beauty & Spas"

In [3]:
#! wget https://repo1.maven.org/maven2/org/neo4j/neo4j-connector-apache-spark_2.12/4.1.5_for_spark_3/neo4j-connector-apache-spark_2.12-4.1.5_for_spark_3.jar

In [5]:
import os

from pyspark.sql import SparkSession

# Build (or reuse) the Spark session against the standalone cluster master.
# The Neo4j connector jar is referenced by a path relative to the working directory.
spark = (
    SparkSession.builder
    .appName("จาก raw data สู่ Disk")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "2000m")
    .config("spark.executor.cores", "2")
    .config("spark.cores.max", "6")
    .config("spark.jars", "neo4j-connector-apache-spark_2.12-4.1.5_for_spark_3.jar")
    .getOrCreate()
)

23/02/15 13:19:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [6]:
from pyspark.sql.functions import explode
from pyspark.sql.types import StructType, StructField, StringType, MapType

# Explicit schema for the business JSON file: every scalar field is declared as a
# nullable string, while `attributes` and `hours` are string -> string maps.
df_schema = (
    StructType()
    .add("business_id", StringType(), True)
    .add("name", StringType(), True)
    .add("address", StringType(), True)
    .add("city", StringType(), True)
    .add("state", StringType(), True)
    .add("postal_code", StringType(), True)
    .add("latitude", StringType(), True)
    .add("longitude", StringType(), True)
    .add("stars", StringType(), True)
    .add("review_count", StringType(), True)
    .add("is_open", StringType(), True)
    .add("attributes", MapType(StringType(), StringType()), True)
    .add("categories", StringType(), True)
    .add("hours", MapType(StringType(), StringType()), True)
)


In [7]:
# Work on a random sample (roughly 30% of rows) of the business file.
# The seed is fixed so that the sample -- and therefore every count and every
# Neo4j write derived from it -- is reproducible after a kernel restart.
# Without a seed each fresh run draws a different set of businesses.
SAMPLE_FRACTION = 0.3
SAMPLE_SEED = 42

df = (
    spark.read
    .json('yelp_academic_dataset_business.json', schema=df_schema)
    .sample(fraction=SAMPLE_FRACTION, seed=SAMPLE_SEED)
)
# To use the full dataset instead, drop the .sample(...) call above.

In [8]:
# Print the schema of the loaded business DataFrame.
df.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- stars: string (nullable = true)
 |-- review_count: string (nullable = true)
 |-- is_open: string (nullable = true)
 |-- attributes: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- categories: string (nullable = true)
 |-- hours: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



In [9]:
from pyspark.sql.functions import split

# Add `categories_split`: the `categories` string split on ", " into an array.
business_df = df.withColumn(
    "categories_split",
    split(df["categories"], ", "),
)

In [10]:
# Number of business rows in the working DataFrame.
business_df.count()

                                                                                

44920

In [11]:
# Print the schema, which now includes the `categories_split` array column.
business_df.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- stars: string (nullable = true)
 |-- review_count: string (nullable = true)
 |-- is_open: string (nullable = true)
 |-- attributes: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- categories: string (nullable = true)
 |-- hours: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- categories_split: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [12]:
from pyspark.sql import functions as sparkf

In [13]:
# Explode the `attributes` map into (attrBusiness, attrValue) rows, then count
# the exploded entries per attribute key, most frequent first (up to 100 keys).
attribute_pairs = business_df.select(
    "business_id",
    explode("attributes").alias("attrBusiness", "attrValue"),
)
(
    attribute_pairs
    .groupBy('attrBusiness')
    .count()
    .orderBy('count', ascending=False)
    .show(100, truncate=False)
)

                                                                                

+--------------------------+-----+
|attrBusiness              |count|
+--------------------------+-----+
|BusinessAcceptsCreditCards|35845|
|BusinessParking           |27243|
|RestaurantsPriceRange2    |25534|
|BikeParking               |21686|
|RestaurantsTakeOut        |17828|
|WiFi                      |17057|
|RestaurantsDelivery       |16790|
|GoodForKids               |15971|
|OutdoorSeating            |14629|
|RestaurantsReservations   |13578|
|HasTV                     |13539|
|Ambience                  |13294|
|RestaurantsGoodForGroups  |13221|
|Alcohol                   |12894|
|ByAppointmentOnly         |12747|
|Caters                    |12027|
|RestaurantsAttire         |11772|
|NoiseLevel                |11380|
|GoodForMeal               |8797 |
|WheelchairAccessible      |8753 |
|RestaurantsTableService   |6150 |
|DogsAllowed               |5441 |
|BusinessAcceptsBitcoin    |5257 |
|HappyHour                 |4578 |
|DriveThru                 |2332 |
|Music              

In [14]:
# Explode the `attributes` map into one row per (business_id, key, value),
# de-duplicate, and keep only the rows whose key is 'WiFi'.
exploded_attributes = business_df.select(
    "business_id",
    explode("attributes").alias("attrBusiness", "attrValue"),
)
attributesWiFi_df = (
    exploded_attributes
    .distinct()
    .filter(sparkf.col('attrBusiness') == 'WiFi')
)

# Attach the WiFi key/value columns to the business rows (join on business_id).
business_attributesWiFi_df = business_df.join(attributesWiFi_df, on="business_id")


In [15]:
# Number of business rows that have a WiFi attribute entry after the join.
business_attributesWiFi_df.count()

17057

In [16]:
from pyspark.sql.functions import regexp_replace

# Normalise the WiFi attribute values.
# The raw values are stored as quoted strings with an optional `u` prefix
# (u'no' / 'no', u'paid' / 'paid', u'free' / 'free'). A single regex pass strips
# the prefix and the quotes for these three values; `$1` is the Java-regex
# reference to the captured word. This replaces six separate replace passes.
wifi_value = regexp_replace(sparkf.col("attrValue"), "u?'(no|paid|free)'", "$1")

# The literal string 'None' is reported as 'N/A'. Null values are left as null:
# the comparison yields null for them, so `otherwise` passes them through.
business_attributesWiFi_df = business_attributesWiFi_df.withColumn(
    "attrValue",
    sparkf.when(wifi_value == 'None', 'N/A').otherwise(wifi_value),
)

# Show the distribution of the normalised values.
business_attributesWiFi_df.groupBy('attrValue').count().show()


                                                                                

+---------+-----+
|attrValue|count|
+---------+-----+
|     free|10357|
|     paid|  181|
|      N/A|   17|
|       no| 6502|
+---------+-----+



In [17]:
# Count rows per exact `categories_split` array, most frequent first.
(
    business_attributesWiFi_df
    .groupBy('categories_split')
    .count()
    .orderBy('count', ascending=False)
    .show(truncate=False)
)

+----------------------------------------------------+-----+
|categories_split                                    |count|
+----------------------------------------------------+-----+
|[Restaurants, Chinese]                              |160  |
|[Restaurants, Pizza]                                |158  |
|[Food, Coffee & Tea]                                |156  |
|[Restaurants, Mexican]                              |147  |
|[Chinese, Restaurants]                              |143  |
|[Coffee & Tea, Food]                                |132  |
|[Pizza, Restaurants]                                |129  |
|[Mexican, Restaurants]                              |125  |
|[Event Planning & Services, Hotels, Hotels & Travel]|82   |
|[Food, Ice Cream & Frozen Yogurt]                   |74   |
|[Hotels, Hotels & Travel, Event Planning & Services]|70   |
|[Event Planning & Services, Hotels & Travel, Hotels]|68   |
|[Italian, Restaurants]                              |67   |
|[Hotels & Travel, Hotel

In [18]:
# Print the schema after the WiFi join and value clean-up.
business_attributesWiFi_df.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- stars: string (nullable = true)
 |-- review_count: string (nullable = true)
 |-- is_open: string (nullable = true)
 |-- attributes: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- categories: string (nullable = true)
 |-- hours: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- categories_split: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- attrBusiness: string (nullable = false)
 |-- attrValue: string (nullable = true)



In [19]:
# Row count of the WiFi-joined DataFrame.
business_attributesWiFi_df.count()

17057

In [20]:
# Count the distinct (business_id, categories, attrValue) rows for businesses
# whose split category list contains `filterCategories`.
# The filter runs BEFORE the projection because `categories_split` is not one of
# the selected columns; filtering after the select only works through Spark's
# implicit resolution of missing references and is easy to break.
(
    business_attributesWiFi_df
    .filter(sparkf.array_contains(sparkf.col('categories_split'), filterCategories))
    .select('business_id', 'categories', 'attrValue')
    .distinct()
    .count()
)

1400

In [21]:
# Distinct (business_id, categories, attrValue) rows for businesses whose split
# category list contains `filterCategories`.
# The filter runs BEFORE the projection because `categories_split` is not one of
# the selected columns; filtering after the select only works through Spark's
# implicit resolution of missing references and is easy to break.
final_business_df = (
    business_attributesWiFi_df
    .filter(sparkf.array_contains(sparkf.col('categories_split'), filterCategories))
    .select('business_id', 'categories', 'attrValue')
    .distinct()
)

In [22]:
# Write the selected businesses to Neo4j as nodes labelled `business_id`,
# keyed on the `business_id` column.
# Connection settings are read from the environment so that credentials do not
# have to be stored in the notebook; the previous literals remain as fallbacks.
neo4j_url = os.environ.get("NEO4J_URL", "neo4j://192.168.1.104:7687")
neo4j_user = os.environ.get("NEO4J_USERNAME", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD", "xxxxxxxxxxxxxxxx")

(
    final_business_df
    .coalesce(1)  # write from a single partition
    .write
    .format("org.neo4j.spark.DataSource")
    .mode("Overwrite")
    .option("url", neo4j_url)
    .option("labels", "business_id")
    .option("node.keys", "business_id")
    .option("authentication.basic.username", neo4j_user)
    .option("authentication.basic.password", neo4j_password)
    .save()
)

23/02/15 13:19:21 WARN SchemaService: Switching to query schema resolution
23/02/15 13:19:21 WARN SchemaService: For the following exception
org.neo4j.driver.exceptions.ClientException: Unable to compute the resulting schema from APOC
	at org.neo4j.spark.service.SchemaService.retrieveSchemaFromApoc(SchemaService.scala:95)
	at org.neo4j.spark.service.SchemaService.liftedTree1$1(SchemaService.scala:47)
	at org.neo4j.spark.service.SchemaService.structForNode(SchemaService.scala:36)
	at org.neo4j.spark.service.SchemaService.struct(SchemaService.scala:249)
	at org.neo4j.spark.DataSource.$anonfun$inferSchema$1(DataSource.scala:29)
	at org.neo4j.spark.util.Neo4jUtil$.callSchemaService(Neo4jUtil.scala:329)
	at org.neo4j.spark.DataSource.inferSchema(DataSource.scala:29)
	at org.apache.spark.sql.execution.datasources.v2.DataSourceV2Utils$.getTableFromProvider(DataSourceV2Utils.scala:81)
	at org.apache.spark.sql.DataFrameWriter.getTable$1(DataFrameWriter.scala:304)
	at org.apache.spark.sql.DataFr

In [23]:
# Load the review file; no schema is supplied, so Spark infers it from the JSON.
reviews_df = spark.read.format("json").load("yelp_academic_dataset_review.json")

                                                                                

In [24]:
# Number of review rows loaded.
reviews_df.count()

                                                                                

6990280

In [25]:
# Line count of the raw review file, for comparison with the DataFrame row count.
! wc -l yelp_academic_dataset_review.json

6990280 yelp_academic_dataset_review.json


In [26]:
# Print the inferred schema of the review DataFrame.
reviews_df.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- cool: long (nullable = true)
 |-- date: string (nullable = true)
 |-- funny: long (nullable = true)
 |-- review_id: string (nullable = true)
 |-- stars: double (nullable = true)
 |-- text: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- user_id: string (nullable = true)



In [27]:
# One row per (business_id, user_id) pair with the mean of that pair's `stars`.
reviews_by_pair = reviews_df.groupBy('business_id', 'user_id')
final_reviews_df = reviews_by_pair.agg(
    sparkf.avg('stars').alias('avgStars')
)

In [28]:
# Print the schema of the aggregated per-(business, user) ratings.
final_reviews_df.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- avgStars: double (nullable = true)



In [29]:
# For each business, count the rows sharing each avgStars value,
# listed by descending business_id.
(
    final_reviews_df
    .groupBy('business_id', 'avgStars')
    .count()
    .orderBy('business_id', ascending=False)
    .show()
)



+--------------------+--------+-----+
|         business_id|avgStars|count|
+--------------------+--------+-----+
|zzyx5x0Z7xXWWvWnZ...|     4.0|    1|
|zzyx5x0Z7xXWWvWnZ...|     2.0|    2|
|zzyx5x0Z7xXWWvWnZ...|     3.0|    2|
|zzyx5x0Z7xXWWvWnZ...|     1.0|    1|
|zzw66H6hVjXQEt0Js...|     1.0|    1|
|zzw66H6hVjXQEt0Js...|     2.0|    1|
|zzw66H6hVjXQEt0Js...|     5.0|    3|
|zzu6_r3DxBJuXcjnO...|     2.0|    1|
|zzu6_r3DxBJuXcjnO...|     3.0|    2|
|zzu6_r3DxBJuXcjnO...|     5.0|    1|
|zzu6_r3DxBJuXcjnO...|     4.0|    4|
|zztOG2cKm87I6Iw_t...|     5.0|    5|
|zztOG2cKm87I6Iw_t...|     4.0|    1|
|zznZqH9CiAznbkV6f...|     5.0|   11|
|zznZqH9CiAznbkV6f...|     4.0|    1|
|zznJox6-nmXlGYNWg...|     2.0|    2|
|zznJox6-nmXlGYNWg...|     1.0|   23|
|zznJox6-nmXlGYNWg...|     3.0|    1|
|zznJox6-nmXlGYNWg...|     4.0|    1|
|zznJox6-nmXlGYNWg...|     5.0|    3|
+--------------------+--------+-----+
only showing top 20 rows



                                                                                

In [30]:
# Distinct row count; compared with the plain count in the next cell to check for duplicates.
final_reviews_df.distinct().count()

                                                                                

6745760

In [31]:
# Total row count of the aggregated ratings.
final_reviews_df.count()

                                                                                

6745760

In [32]:
# Preview the aggregated ratings ordered by business_id.
final_reviews_df.orderBy('business_id').show()



+--------------------+--------------------+--------+
|         business_id|             user_id|avgStars|
+--------------------+--------------------+--------+
|---kPU91CF4Lq2-Wl...|7DKv40qZEDctS9G0M...|     5.0|
|---kPU91CF4Lq2-Wl...|YhbCO1DVlNYkmVv8D...|     5.0|
|---kPU91CF4Lq2-Wl...|Q-ia5eY9smWBTwYOZ...|     5.0|
|---kPU91CF4Lq2-Wl...|V8oYXtc0hMuYzG5Hf...|     3.0|
|---kPU91CF4Lq2-Wl...|5r_jlIQvSr7VG5YFb...|     5.0|
|---kPU91CF4Lq2-Wl...|lEPRGtvLLfJ2BQcU_...|     5.0|
|---kPU91CF4Lq2-Wl...|7F5NSUrKYEPdrewd2...|     5.0|
|---kPU91CF4Lq2-Wl...|i48cHEyRBl5g9_npY...|     4.0|
|---kPU91CF4Lq2-Wl...|WINRnvRO7iGEhrf9i...|     5.0|
|---kPU91CF4Lq2-Wl...|zmgsdGzOp08BWJZ2y...|     5.0|
|---kPU91CF4Lq2-Wl...|zEe31kAcPExQgFR-A...|     5.0|
|---kPU91CF4Lq2-Wl...|goqGoC76zemDagYvR...|     4.0|
|---kPU91CF4Lq2-Wl...|TIx1jZXl57mY-JnS3...|     5.0|
|---kPU91CF4Lq2-Wl...|nRy6-tnPPeShNhZg9...|     5.0|
|---kPU91CF4Lq2-Wl...|YUVbBNr_dSJNP2pwD...|     1.0|
|---kPU91CF4Lq2-Wl...|jtPb5gfrvYixrC0ax...|   

                                                                                

In [33]:
# Row count of the aggregated ratings before joining with the businesses.
final_reviews_df.count()

                                                                                

6745760

In [34]:
# Combine the selected businesses with the per-user average ratings
# (join on business_id) and keep only the four columns used downstream.
joined_df = final_business_df.join(final_reviews_df, on="business_id")
abt_df = joined_df.select('business_id', 'user_id', 'avgStars', 'categories')

In [35]:
# Number of (business, user) rating rows after the join.
abt_df.count()

                                                                                

47051

In [36]:
# Print the schema of the joined table.
abt_df.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- avgStars: double (nullable = true)
 |-- categories: string (nullable = true)



In [40]:
# List the column names of the joined table.
abt_df.columns

['business_id', 'user_id', 'avgStars', 'categories']

In [41]:
# Number of distinct users present in the joined table.
abt_df.select('user_id').distinct().count()

                                                                                

43631

In [42]:
# Total row count of the joined table, for comparison with the distinct-user count.
abt_df.count()

                                                                                

47051

In [43]:
# Each business must have exactly one average rating per user (the mean over that
# user's possibly multiple reviews). Group on all four columns and list the
# largest group sizes first to check this.
rating_columns = ['business_id', 'user_id', 'avgStars', 'categories']
(
    abt_df
    .select(rating_columns)
    .groupBy(*rating_columns)
    .count()
    .orderBy('count', ascending=False)
    .show()
)



+--------------------+--------------------+--------+--------------------+-----+
|         business_id|             user_id|avgStars|          categories|count|
+--------------------+--------------------+--------+--------------------+-----+
|Bdmym_Ozztd6rekAs...|UamE2yMBWkGKzOY0Q...|     5.0|Skin Care, Waxing...|    1|
|Bdmym_Ozztd6rekAs...|2cWv0uwoRIcoTYidY...|     5.0|Skin Care, Waxing...|    1|
|Bdmym_Ozztd6rekAs...|VHGNAjutVRUiSsdEF...|     5.0|Skin Care, Waxing...|    1|
|Bdmym_Ozztd6rekAs...|7DEyddzwcFD1bbt1_...|     5.0|Skin Care, Waxing...|    1|
|Bdmym_Ozztd6rekAs...|5fDYBzmZ9kXuFTYGR...|     5.0|Skin Care, Waxing...|    1|
|Bdmym_Ozztd6rekAs...|tuPapfxmp514bR2Hf...|     5.0|Skin Care, Waxing...|    1|
|Bdmym_Ozztd6rekAs...|tzEU6o8PW0jpKRE8w...|     5.0|Skin Care, Waxing...|    1|
|Bdmym_Ozztd6rekAs...|8nxAYcpD12tdbyx7E...|     5.0|Skin Care, Waxing...|    1|
|Bdmym_Ozztd6rekAs...|gBXTKk0iMUblYx3uk...|     5.0|Skin Care, Waxing...|    1|
|Bdmym_Ozztd6rekAs...|IHKD0ixPkzQbUBeQ5.

                                                                                

In [44]:
# Number of distinct (user_id, categories) combinations.
abt_df.select(['user_id', 'categories']).distinct().count()

                                                                                

47035

In [45]:
# Number of distinct (business_id, user_id, categories) combinations.
abt_df.select(['business_id', 'user_id','categories']).distinct().count()

                                                                                

47051

In [46]:
# Write the distinct users to Neo4j as nodes labelled `user_id`,
# keyed on the `user_id` column.
# Connection settings are read from the environment so that credentials do not
# have to be stored in the notebook; the previous literals remain as fallbacks.
neo4j_url = os.environ.get("NEO4J_URL", "neo4j://192.168.1.104:7687")
neo4j_user = os.environ.get("NEO4J_USERNAME", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD", "xxxxxxxxxxxxxxxx")

(
    abt_df
    .select('user_id')
    .distinct()
    .coalesce(1)  # write from a single partition
    .write
    .format("org.neo4j.spark.DataSource")
    .mode("Overwrite")
    .option("url", neo4j_url)
    .option("labels", "user_id")
    .option("node.keys", "user_id")
    .option("authentication.basic.username", neo4j_user)
    .option("authentication.basic.password", neo4j_password)
    .save()
)

23/02/15 13:21:34 WARN SchemaService: Switching to query schema resolution
23/02/15 13:21:34 WARN SchemaService: For the following exception
org.neo4j.driver.exceptions.ClientException: Unable to compute the resulting schema from APOC
	at org.neo4j.spark.service.SchemaService.retrieveSchemaFromApoc(SchemaService.scala:95)
	at org.neo4j.spark.service.SchemaService.liftedTree1$1(SchemaService.scala:47)
	at org.neo4j.spark.service.SchemaService.structForNode(SchemaService.scala:36)
	at org.neo4j.spark.service.SchemaService.struct(SchemaService.scala:249)
	at org.neo4j.spark.DataSource.$anonfun$inferSchema$1(DataSource.scala:29)
	at org.neo4j.spark.util.Neo4jUtil$.callSchemaService(Neo4jUtil.scala:329)
	at org.neo4j.spark.DataSource.inferSchema(DataSource.scala:29)
	at org.apache.spark.sql.execution.datasources.v2.DataSourceV2Utils$.getTableFromProvider(DataSourceV2Utils.scala:81)
	at org.apache.spark.sql.DataFrameWriter.getTable$1(DataFrameWriter.scala:304)
	at org.apache.spark.sql.DataFr

In [47]:
# Number of distinct `categories` strings each user appears with, largest first.
user_category_pairs = abt_df.select('user_id', 'categories').distinct()
(
    user_category_pairs
    .groupBy('user_id')
    .count()
    .orderBy('count', ascending=False)
    .show()
)



+--------------------+-----+
|             user_id|count|
+--------------------+-----+
|ET8n-r7glWYqZhuR6...|   11|
|A5o65BbfpKkt0Bv-r...|    9|
|ylE_w4QR7JCz9cr9u...|    9|
|NRRPUnrCEIjDSt4uG...|    7|
|TFWtU4CQU8l84Uj88...|    7|
|5BIhkthDtf7Gt2QUd...|    6|
|OdEaOuyA6bI_0cErz...|    6|
|_BcWyKQL16ndpBdgg...|    6|
|s9fb6K62gxO9ZYWn8...|    6|
|ETJZm6PJVc9ANZbT8...|    6|
|hwB9QTmjZRd48mbGP...|    6|
|1kdXLwdjvuk7FIS3c...|    6|
|X3Z2Kml_8aCMiJnPA...|    6|
|G9P3h7ZGdbdc_Zt6I...|    6|
|h7p-GuaHFGsiKCF4g...|    6|
|iJdknUsOPUDA2BTfT...|    6|
|1PJ-RjMqXHeymSkLR...|    6|
|MN1f6pyuuDvfmd9LL...|    5|
|sE0natqYMuJ0cmLys...|    5|
|eNtU9lv7gCb-qNy7n...|    5|
+--------------------+-----+
only showing top 20 rows



                                                                                

In [48]:
from pyspark.sql import functions as F


# Write one RATED_BY relationship per distinct row, from the node labelled
# `business_id` (key: business_id) to the node labelled `user_id` (key: user_id),
# using the "keys" save strategy and storing avgStars as a relationship property.
# Connection settings are read from the environment so that credentials do not
# have to be stored in the notebook; the previous literals remain as fallbacks.
neo4j_url = os.environ.get("NEO4J_URL", "neo4j://192.168.1.104:7687")
neo4j_user = os.environ.get("NEO4J_USERNAME", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD", "xxxxxxxxxxxxxxxx")

(
    abt_df
    .select('business_id', 'user_id', 'avgStars', 'categories')
    .distinct()
    .coalesce(1)  # write from a single partition
    .write
    .format("org.neo4j.spark.DataSource")
    .mode("Overwrite")
    .option("url", neo4j_url)
    .option("relationship", "RATED_BY")
    .option("relationship.save.strategy", "keys")
    .option("relationship.source.labels", "business_id")
    .option("relationship.source.node.keys", "business_id")
    .option("relationship.target.labels", "user_id")
    .option("relationship.target.node.keys", "user_id")
    .option("relationship.properties", "avgStars")
    .option("authentication.basic.username", neo4j_user)
    .option("authentication.basic.password", neo4j_password)
    .save()
)


23/02/15 13:29:51 WARN SchemaService: Switching to query schema resolution
23/02/15 13:29:51 WARN SchemaService: For the following exception
org.neo4j.driver.exceptions.ClientException: Unable to compute the resulting schema from APOC
	at org.neo4j.spark.service.SchemaService.retrieveSchemaFromApoc(SchemaService.scala:95)
	at org.neo4j.spark.service.SchemaService.liftedTree2$1(SchemaService.scala:177)
	at org.neo4j.spark.service.SchemaService.structForRelationship(SchemaService.scala:161)
	at org.neo4j.spark.service.SchemaService.struct(SchemaService.scala:250)
	at org.neo4j.spark.DataSource.$anonfun$inferSchema$1(DataSource.scala:29)
	at org.neo4j.spark.util.Neo4jUtil$.callSchemaService(Neo4jUtil.scala:329)
	at org.neo4j.spark.DataSource.inferSchema(DataSource.scala:29)
	at org.apache.spark.sql.execution.datasources.v2.DataSourceV2Utils$.getTableFromProvider(DataSourceV2Utils.scala:81)
	at org.apache.spark.sql.DataFrameWriter.getTable$1(DataFrameWriter.scala:304)
	at org.apache.spark.