In [0]:
#import
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType
from pyspark.sql.functions import col, trim, to_date
from pyspark.sql.functions import trim, lower, regexp_replace

In [0]:
# --- Bronze layer: read the raw CSV files from the managed volume as-is ---

# Explicit schema for Reviews.csv; every column is nullable.
# NOTE(review): IntegerType is a 32-bit type — confirm that listing/review/
# reviewer ids in the export fit, otherwise consider a wider type.
reviews_schema = (
    StructType()
    .add("listing_id", IntegerType(), True)
    .add("id", IntegerType(), True)
    .add("date", TimestampType(), True)
    .add("reviewer_id", IntegerType(), True)
    .add("reviewer_name", StringType(), True)
    .add("comments", StringType(), True)
)

# Reviews: the first line of the file is treated as a header row.
bronze_reviews_df = spark.read.csv(
    "/Volumes/diggibyte/airbnb/bronze_vol/Reviews.csv",
    header=True,
    schema=reviews_schema,
)

# Explicit schema for Listings.csv. Every column (including id, host_id and
# price) is loaded as a string here; price is converted to a number later in
# the silver layer.
# NOTE(review): presumably the file's columns appear in exactly this order —
# verify against the CSV header, since a user-supplied schema is not derived
# from the header names.
listings_schema = (
    StructType()
    .add("id", StringType(), True)
    .add("name", StringType(), True)
    .add("description", StringType(), True)
    .add("host_id", StringType(), True)
    .add("host_name", StringType(), True)
    .add("neighbourhood", StringType(), True)
    .add("room_type", StringType(), True)
    .add("price", StringType(), True)
)

# Listings: same reader options as for the reviews file.
bronze_listings_df = spark.read.csv(
    "/Volumes/diggibyte/airbnb/bronze_vol/Listings.csv",
    header=True,
    schema=listings_schema,
)

# Quick visual check of both bronze DataFrames.
bronze_reviews_df.show()
bronze_listings_df.show()

In [0]:
%sql
-- List volumes; no IN clause is given, so the session's current schema applies.
SHOW VOLUMES;

In [0]:
# Silver layer: clean and standardise the bronze DataFrames, then persist them.

# Reviews
#   1. keep only rows that have both a listing_id and a date
#   2. strip surrounding whitespace from reviewer_name
#   3. reduce the parsed timestamp to a calendar date
#   4. keep a single row per review id
silver_reviews_df = (
    bronze_reviews_df
    .filter(col("listing_id").isNotNull() & col("date").isNotNull())
    .withColumn("reviewer_name", trim(col("reviewer_name")))
    .withColumn("date", to_date(col("date")))
    .dropDuplicates(["id"])
)

# Listings
#   1. strip "$" and "," from price and cast the remainder to double
#   2. lower-case and trim neighbourhood so grouping keys are uniform
#   3. trim room_type, then keep only the four recognised room types
#      (rows with any other value, or a NULL room_type, are dropped)
silver_listings_df = (
    bronze_listings_df
    .withColumn("price", regexp_replace(col("price"), "[$,]", "").cast("double"))
    .withColumn("neighbourhood", trim(lower(col("neighbourhood"))))
    .withColumn("room_type", trim(col("room_type")))
    # A stray character after this call previously made the whole cell a
    # SyntaxError; it has been removed.
    .filter(col("room_type").isin("Entire home/apt", "Private room", "Shared room", "Hotel room"))
)

# Register as tables in Unity Catalog, replacing any existing contents.
silver_reviews_df.write.mode("overwrite").saveAsTable("diggibyte.airbnb.silver_reviews")
silver_listings_df.write.mode("overwrite").saveAsTable("diggibyte.airbnb.silver_listings")

### Summary of Gold Layer Outputs

- Host Performance KPIs → top hosts, avg pricing, reviews.
- Neighbourhood Insights → avg price, popularity, demand hotspots.
- Room Type Trends → what’s popular and at what cost.
- Time-based Review Trends → seasonality, growth.
- Revenue Estimation → host & neighbourhood revenue.

In [0]:
#Gold layer imports
from pyspark.sql.functions import (
    count,
    avg,
    sum as _sum,
    year,
    month,
    col
)


In [0]:
# 1. Host Performance
# Number of reviews per listing: one row per listing_id.
review_counts_df = (
    silver_reviews_df
    .groupBy("listing_id")
    .agg(count("*").alias("review_count"))
)

# Per-host KPIs: listing count, average price and total reviews.
#
# The review counts are attached to the listings first (left join, so
# listings without reviews are kept) and everything is aggregated in a single
# groupBy. Joining two separately aggregated frames on (host_id, host_name)
# would lose the review totals for hosts whose host_name is NULL, because
# NULL keys never compare equal in an equi-join.
gold_host_perf = (
    silver_listings_df
    .join(review_counts_df, silver_listings_df.id == review_counts_df.listing_id, "left")
    .groupBy("host_id", "host_name")
    .agg(
        count("id").alias("total_listings"),
        avg("price").alias("avg_price"),
        _sum("review_count").alias("total_reviews"),
    )
    # Hosts whose listings have no reviews at all sum to NULL; report 0 instead.
    .fillna(0, subset=["total_reviews"])
)

# Aggregate expressions shared by the listing-level roll-ups below.
listing_count_expr = count("id").alias("total_listings")
mean_price_expr = avg("price").alias("avg_price")

# 2. Neighbourhood Insights: average price and listing count per neighbourhood.
gold_neigh = silver_listings_df.groupBy("neighbourhood").agg(
    mean_price_expr,
    listing_count_expr,
)

# 3. Room Type Trends: listing count and average price per room type.
gold_roomtype = silver_listings_df.groupBy("room_type").agg(
    listing_count_expr,
    mean_price_expr,
)

# 4. Revenue Estimation
# Every review row is paired with its listing, so each review contributes the
# listing's price once to the host total. Listings without any review are
# excluded by the inner join.
listings_with_reviews = silver_listings_df.join(
    silver_reviews_df,
    on=silver_listings_df["id"] == silver_reviews_df["listing_id"],
    how="inner",
)
gold_revenue = (
    listings_with_reviews
    .groupBy("host_id", "host_name")
    .agg(_sum("price").alias("est_revenue"))
)

# Persist each gold DataFrame as a Unity Catalog table, replacing any
# existing contents of that table.
gold_host_perf.write.mode("overwrite").saveAsTable("diggibyte.airbnb.gold_host_perf")
gold_neigh.write.mode("overwrite").saveAsTable("diggibyte.airbnb.gold_neigh")
gold_roomtype.write.mode("overwrite").saveAsTable("diggibyte.airbnb.gold_roomtype")
gold_revenue.write.mode("overwrite").saveAsTable("diggibyte.airbnb.gold_revenue")

In [0]:
%sql
-- Inspect the silver reviews table written by the silver-layer cell.
SELECT * FROM diggibyte.airbnb.silver_reviews