In [19]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import expr
from pyspark.sql.functions import split, col
from pyspark.sql.types import ArrayType, TimestampType
from pyspark.sql.types import StructType, MapType, StructField, StringType, DoubleType, LongType

In [20]:
# Create the Spark session shared by every cell below, or reuse the one that
# is already running in this kernel (getOrCreate).
spark = SparkSession.builder \
    .appName("Project App") \
    .config("spark.executor.memory", "8000mb") \
    .config("spark.sql.shuffle.partitions", 4) \
    .config("spark.default.parallelism", 4) \
    .getOrCreate()

# Keep the notebook output readable: only ERROR-level Spark logs are shown.
spark.sparkContext.setLogLevel("ERROR")

# Bare expression so the notebook renders the session summary.
spark

In [10]:
# Dataset locations used by every process_*_data function.
#
# Both values can be overridden through environment variables so the notebook
# is not tied to one machine's directory layout; when the variables are unset
# the original hard-coded locations are used.
#   YELP_INPUT_PATH  - directory holding the yelp_academic_dataset_*.json files
#   YELP_OUTPUT_PATH - directory where the Parquet copies are written and read
baseInputPath = os.environ.get("YELP_INPUT_PATH", "/Users/hims/Downloads/yelp_dataset/")
baseOutputPath = os.environ.get("YELP_OUTPUT_PATH", "/tmp/test-1")

In [11]:
def process_business_data():
    """Return the business DataFrame, building its Parquet copy if needed.

    The Parquet directory ``{baseOutputPath}/business`` is tried first. If
    that read raises any ``Exception``, the raw business JSON is read with an
    explicit schema, its schema is printed, it is written as a single
    partition (overwriting the target), and then read back from Parquet.

    Side effects: prints the first 4 rows; on the rebuild path also prints the
    schema and writes Parquet files.

    Returns:
        The DataFrame read from the Parquet directory.
    """
    parquet_path = f"{baseOutputPath}/business"

    try:
        business = spark.read.parquet(parquet_path)
    except Exception:
        # Parquet copy could not be read: rebuild it from the raw JSON.
        # Every column is declared nullable.
        column_types = [
            ("address", StringType()),
            ("attributes", MapType(StringType(), StringType())),
            ("business_id", StringType()),
            ("categories", StringType()),
            ("city", StringType()),
            ("hours", MapType(StringType(), StringType())),
            ("is_open", LongType()),
            ("latitude", DoubleType()),
            ("longitude", DoubleType()),
            ("name", StringType()),
            ("postal_code", StringType()),
            ("review_count", LongType()),
            ("stars", DoubleType()),
            ("state", StringType()),
        ]
        schema = StructType(
            [StructField(field_name, field_type, True) for field_name, field_type in column_types]
        )

        raw_business = spark.read.json(f'{baseInputPath}/yelp_academic_dataset_business.json', schema)
        raw_business.printSchema()
        raw_business.coalesce(1).write.mode("overwrite").parquet(parquet_path)

        # Hand back the Parquet-backed frame rather than the JSON-backed one.
        business = spark.read.parquet(parquet_path)

    business.show(4)
    return business

In [12]:
def process_user_data():
    """Return the user DataFrame, building its Parquet copy if needed.

    The Parquet directory ``{baseOutputPath}/user`` is tried first. If that
    read raises any ``Exception``, the raw user JSON is read, the ``friends``
    column is dropped, ``elite`` is split on "," into an array, and
    ``yelping_since`` is cast to timestamp. The result is written as a single
    partition (overwriting the target) and read back from Parquet.

    Side effects: prints the first 3 rows; on the rebuild path also prints the
    schema and writes Parquet files.

    Returns:
        The DataFrame read from the Parquet directory.
    """
    parquet_path = f"{baseOutputPath}/user"

    try:
        users = spark.read.parquet(parquet_path)
    except Exception:
        # Parquet copy could not be read: rebuild it from the raw JSON.
        raw_users = spark.read.json(f'{baseInputPath}/yelp_academic_dataset_user.json')
        without_friends = raw_users.drop("friends")
        with_elite_array = without_friends.withColumn("elite", split(col("elite"), ","))
        prepared = with_elite_array.withColumn(
            "yelping_since", col("yelping_since").cast("timestamp")
        )

        prepared.printSchema()
        prepared.coalesce(1).write.mode("overwrite").parquet(parquet_path)
        users = spark.read.parquet(parquet_path)

    users.show(3)
    return users

In [13]:
def process_friends_data():
    """Return the (user_id, friends) DataFrame, building its Parquet copy if needed.

    The Parquet directory ``{baseOutputPath}/friends`` is tried first. If that
    read raises any ``Exception``, the raw user JSON is read and reduced to
    ``user_id`` plus ``friends`` split on ", " into an array. The result is
    written as a single partition (overwriting the target) and read back from
    Parquet.

    Side effects: prints the first 4 rows; on the rebuild path also prints the
    schema and writes Parquet files.

    Returns:
        The DataFrame read from the Parquet directory.
    """
    parquet_path = f"{baseOutputPath}/friends"

    try:
        friends = spark.read.parquet(parquet_path)
    except Exception:
        # Parquet copy could not be read: derive it from the user JSON.
        raw_users = spark.read.json(f'{baseInputPath}/yelp_academic_dataset_user.json')
        friend_lists = raw_users.select(
            "user_id",
            split(col("friends"), ", ").alias("friends"),
        )

        friend_lists.printSchema()
        friend_lists.coalesce(1).write.mode("overwrite").parquet(parquet_path)
        friends = spark.read.parquet(parquet_path)

    friends.show(4)
    return friends


In [14]:
def process_checkin_data():
    """Return the check-in DataFrame, building its Parquet copy if needed.

    The Parquet directory ``{baseOutputPath}/checkin`` is tried first. If that
    read raises any ``Exception``, the raw check-in JSON is read and its
    ``date`` column is split on ", ", each piece is passed through
    ``to_timestamp``, and the result is cast to an array of timestamps. The
    frame is written as a single partition (overwriting the target) and read
    back from Parquet.

    Side effects: prints the first 4 rows; on the rebuild path also prints the
    schema and writes Parquet files.

    Returns:
        The DataFrame read from the Parquet directory.
    """
    parquet_path = f"{baseOutputPath}/checkin"

    try:
        checkins = spark.read.parquet(parquet_path)
    except Exception:
        # Parquet copy could not be read: rebuild it from the raw JSON.
        raw_checkins = spark.read.json(f'{baseInputPath}/yelp_academic_dataset_checkin.json')
        timestamps = expr("transform(split(date, ', '), d -> to_timestamp(d))")
        prepared = raw_checkins.withColumn(
            "date", timestamps.cast(ArrayType(TimestampType()))
        )

        prepared.printSchema()

        prepared.coalesce(1).write.mode("overwrite").parquet(parquet_path)
        checkins = spark.read.parquet(parquet_path)

    checkins.show(4)
    return checkins

In [15]:
def process_tip_data():
    """Return the tip DataFrame, building its Parquet copy if needed.

    The Parquet directory ``{baseOutputPath}/tip`` is tried first. If that
    read raises any ``Exception``, the raw tip JSON is read, ``date`` is cast
    to timestamp, and the result is written as a single partition (overwriting
    the target) and read back from Parquet.

    Side effects: prints the first 4 rows untruncated; on the rebuild path
    also prints the schema and writes Parquet files.

    Returns:
        The DataFrame read from the Parquet directory.
    """
    # Read and write must use the same "tip" directory. The cached read
    # previously pointed at ".../checkin", so once the check-in Parquet
    # existed this function returned check-in rows and never built the tips.
    tip_path = f"{baseOutputPath}/tip"

    try:
        tipDf = spark.read.parquet(tip_path)
    except Exception:
        # Parquet copy could not be read: rebuild it from the raw JSON.
        tipDf = spark.read.json(f'{baseInputPath}/yelp_academic_dataset_tip.json') \
            .withColumn("date", col("date").cast("timestamp"))

        tipDf.printSchema()
        tipDf.coalesce(1).write.mode("overwrite").parquet(tip_path)
        tipDf = spark.read.parquet(tip_path)

    tipDf.show(4, truncate=False)
    return tipDf

In [16]:
def process_review_data():
    """Return the review DataFrame, building its Parquet copy if needed.

    The Parquet directory ``{baseOutputPath}/review`` is tried first. If that
    read raises any ``Exception``, the raw review JSON is read, ``date`` is
    cast to timestamp, and the result is written (overwriting the target) and
    read back from Parquet. This write does not coalesce to one partition.

    Side effects: prints the first 4 rows; on the rebuild path also prints the
    schema and writes Parquet files.

    Returns:
        The DataFrame read from the Parquet directory.
    """
    parquet_path = f"{baseOutputPath}/review"

    try:
        reviews = spark.read.parquet(parquet_path)
    except Exception:
        # Parquet copy could not be read: rebuild it from the raw JSON.
        raw_reviews = spark.read.json(f'{baseInputPath}/yelp_academic_dataset_review.json')
        prepared = raw_reviews.withColumn("date", col("date").cast("timestamp"))

        prepared.printSchema()
        prepared.write.mode("overwrite").parquet(parquet_path)
        reviews = spark.read.parquet(parquet_path)

    reviews.show(4)
    return reviews

In [17]:
# Load every dataset into a notebook-level DataFrame.
# Each process_*_data() call prints a short preview of its rows via show()
# and returns a DataFrame read from a Parquet directory under baseOutputPath,
# first creating that Parquet copy from the raw JSON when it cannot be read.
business_df = process_business_data()
user_df = process_user_data()
friends_df = process_friends_data()
checkin_df = process_checkin_data()
tip_df = process_tip_data()
review_df = process_review_data()

                                                                                

+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+-------+----------+------------+--------------------+-----------+------------+-----+-----+
|             address|          attributes|         business_id|          categories|         city|               hours|is_open|  latitude|   longitude|                name|postal_code|review_count|stars|state|
+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+-------+----------+------------+--------------------+-----------+------------+-----+-----+
|1616 Chapala St, ...|{ByAppointmentOnl...|Pns2l4eNsfO8kk83d...|Doctors, Traditio...|Santa Barbara|                NULL|      0|34.4266787|-119.7111968|Abby Rappoport, L...|      93101|           7|  5.0|   CA|
|87 Grasso Plaza S...|{BusinessAcceptsC...|mpf3x-BjTdTEA3yCZ...|Shipping Centers,...|       Affton|{Monday -> 0:0-0:...|      1| 38.551126|  -90.335695|    