In [3]:
# Zone dimension: load the taxi zone lookup CSV (header row, inferred column
# types) and publish it as a Delta table, replacing any previous contents and schema.
zone_path = "Files/nyc_taxi/taxi_zone_lookup"
ZONE_DIM_TABLE = "dbo.gold_dim_zone_v2"

zone_csv_reader = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
)
z = zone_csv_reader.csv(zone_path)

(
    z.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(ZONE_DIM_TABLE)
)

# Read the table back so the count and preview reflect what was actually saved.
zone_dim_saved = spark.table(ZONE_DIM_TABLE)
print("gold_dim_zone_v2:", zone_dim_saved.count())
zone_dim_saved.show(5, truncate=False)

StatementMeta(, 5dd3e9f8-ecd9-463b-be3e-30549e10807e, 5, Finished, Available, Finished)

gold_dim_zone_v2: 265
+----------+-------------+-----------------------+------------+
|LocationID|Borough      |Zone                   |service_zone|
+----------+-------------+-----------------------+------------+
|1         |EWR          |Newark Airport         |EWR         |
|2         |Queens       |Jamaica Bay            |Boro Zone   |
|3         |Bronx        |Allerton/Pelham Gardens|Boro Zone   |
|4         |Manhattan    |Alphabet City          |Yellow Zone |
|5         |Staten Island|Arden Heights          |Boro Zone   |
+----------+-------------+-----------------------+------------+
only showing top 5 rows



In [1]:
from pyspark.sql import functions as F
# Date dimension: one row per distinct trip_day present in the daily taxi fact,
# so the dimension only covers days that actually appear in that fact table.
DATE_DIM_TABLE = "dbo.gold_dim_date_v2"

trip_days = (
    spark.table("dbo.gold_fact_taxi_daily_v2")
    .select(F.col("trip_day").alias("date"))
    .distinct()
    .orderBy("date")
)

d = trip_days.select(
    "date",
    # Integer surrogate key in yyyyMMdd form, e.g. 20190101.
    F.date_format("date", "yyyyMMdd").cast("int").alias("date_key"),
    F.year("date").alias("year"),
    F.month("date").alias("month"),
    F.dayofmonth("date").alias("day"),
    # Spark's dayofweek counts Sunday as 1 ... Saturday as 7
    # (2019-01-01, a Tuesday, comes out as 3).
    F.dayofweek("date").alias("dow"),
)

(
    d.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(DATE_DIM_TABLE)
)

# Read the table back so the count and preview reflect what was actually saved.
date_dim_saved = spark.table(DATE_DIM_TABLE)
print("gold_dim_date_v2:", date_dim_saved.count())
date_dim_saved.show(5, truncate=False)

StatementMeta(, 1bb2d0c9-6f1d-4dd2-b95e-b402571ed2b2, 3, Finished, Available, Finished)

gold_dim_date_v2: 31
+----------+--------+----+-----+---+---+
|date      |date_key|year|month|day|dow|
+----------+--------+----+-----+---+---+
|2019-01-01|20190101|2019|1    |1  |3  |
|2019-01-02|20190102|2019|1    |2  |4  |
|2019-01-03|20190103|2019|1    |3  |5  |
|2019-01-04|20190104|2019|1    |4  |6  |
|2019-01-05|20190105|2019|1    |5  |7  |
+----------+--------+----+-----+---+---+
only showing top 5 rows



In [2]:
from pyspark.sql import functions as F
# FX dimension: rows of the silver FX table whose day falls in the half-open
# window [2019-01-01, 2019-02-01), keyed by an integer yyyyMMdd date_key.
FX_DIM_TABLE = "dbo.gold_dim_fx_v2"
GDP_DIM_TABLE = "dbo.gold_dim_gdp_v2"

fx_window_start = F.lit("2019-01-01")  # inclusive
fx_window_end = F.lit("2019-02-01")    # exclusive

fx_source = spark.table("dbo.silver_ecb_fx_usd_eur_daily_v2").select(
    F.col("day").alias("date"),
    F.col("usd_eur_rate"),
)
inside_fx_window = (F.col("date") >= fx_window_start) & (F.col("date") < fx_window_end)
fx = fx_source.where(inside_fx_window).withColumn(
    "date_key", F.date_format("date", "yyyyMMdd").cast("int")
)

(
    fx.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(FX_DIM_TABLE)
)
print("gold_dim_fx_v2:", spark.table(FX_DIM_TABLE).count())

# GDP dimension: the (year, gdp_usd) pair from the silver GDP table, copied as-is.
gdp_source = spark.table("dbo.silver_worldbank_gdp_us_v2")
gdp = gdp_source.select("year", "gdp_usd")

(
    gdp.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(GDP_DIM_TABLE)
)
print("gold_dim_gdp_v2:", spark.table(GDP_DIM_TABLE).count())

StatementMeta(, 1bb2d0c9-6f1d-4dd2-b95e-b402571ed2b2, 4, Finished, Available, Finished)

gold_dim_fx_v2: 22
gold_dim_gdp_v2: 50


In [3]:
from pyspark.sql import functions as F

# Star-schema taxi facts.
#
# Both facts (daily and zone-daily) are built the same way: attach date_key/year
# from the date dimension, attach usd_eur_rate from the FX dimension, attach one
# fact-specific dimension (GDP by year, or pickup zone attributes), then derive
# revenue_usd / revenue_eur and drop the raw date in favour of date_key.

date_dim = spark.table("dbo.gold_dim_date_v2").select("date", "date_key", "year")
fx_dim   = spark.table("dbo.gold_dim_fx_v2").select("date_key", "usd_eur_rate")
gdp_dim  = spark.table("dbo.gold_dim_gdp_v2").select("year", "gdp_usd")
zone_dim = spark.table("dbo.gold_dim_zone_v2").select(
    F.col("LocationID").cast("int").alias("PULocationID"),
    F.col("Borough").alias("pu_borough"),
    F.col("Zone").alias("pu_zone"),
    F.col("service_zone").alias("pu_service_zone")
)


def _attach_date_and_fx(fact_df, date_dim, fx_dim):
    """Left-join the date dimension on ``date``, then the FX dimension on ``date_key``."""
    return (
        fact_df
        .join(date_dim, on="date", how="left")
        .join(fx_dim, on="date_key", how="left")
    )


def _finish_star_fact(joined_df):
    """Add ``revenue_usd`` and ``revenue_eur``, then drop ``date`` (``date_key`` stays as the key)."""
    return (
        joined_df
        .withColumn("revenue_usd", F.col("revenue_total").cast("double"))
        .withColumn("revenue_eur", F.col("revenue_usd") / F.col("usd_eur_rate"))
        .drop("date")  # keep date_key as the key
    )


def _save_star_fact(star_df, source_df, table_name):
    """Validate ``star_df`` against its source fact, overwrite ``table_name``, and report.

    Raises AssertionError (before anything is written) if the joins changed the
    number of rows. All joins used here are left joins, so a changed count means
    a dimension has duplicate join keys and fact rows were multiplied.
    """
    source_rows = source_df.count()
    star_rows = star_df.count()
    assert star_rows == source_rows, (
        f"{table_name}: joins changed the row count from {source_rows} to {star_rows}; "
        "check the dimension tables for duplicate join keys"
    )

    (
        star_df.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(table_name)
    )

    saved = spark.table(table_name)
    label = table_name.split(".", 1)[-1]
    print(f"{label}:", saved.count())
    # revenue_eur is revenue_usd / usd_eur_rate, so it is NULL wherever the FX
    # dimension has no rate for the row's date_key. Surface that instead of
    # letting it pass silently.
    # NOTE(review): in the recorded run the FX dimension had 22 rows against 31
    # dates, so some days have no rate. Whether to carry the previous rate
    # forward for those days is a business decision - confirm before filling.
    missing_fx_rows = saved.filter(F.col("usd_eur_rate").isNull()).count()
    print(f"{label} rows without usd_eur_rate (revenue_eur is NULL):", missing_fx_rows)


# Daily fact: additionally enriched with GDP by year.
taxi_daily = spark.table("dbo.gold_fact_taxi_daily_v2")
fact_daily = _finish_star_fact(
    _attach_date_and_fx(taxi_daily.withColumnRenamed("trip_day", "date"), date_dim, fx_dim)
    .join(gdp_dim, on="year", how="left")
)
_save_star_fact(fact_daily, taxi_daily, "dbo.gold_fact_taxi_daily_star_v2")

# Zone-daily fact: additionally enriched with pickup-zone attributes.
taxi_zone = spark.table("dbo.gold_fact_taxi_zone_daily_v2").withColumnRenamed("trip_day", "date")
fact_zone = _finish_star_fact(
    _attach_date_and_fx(taxi_zone, date_dim, fx_dim)
    .join(zone_dim, on="PULocationID", how="left")
)
_save_star_fact(fact_zone, taxi_zone, "dbo.gold_fact_taxi_zone_daily_star_v2")

StatementMeta(, 1bb2d0c9-6f1d-4dd2-b95e-b402571ed2b2, 5, Finished, Available, Finished)

gold_fact_taxi_daily_star_v2: 31
gold_fact_taxi_zone_daily_star_v2: 7671


In [4]:
from pyspark.sql import functions as F

# Air-quality star fact (left-join variant): swap the calendar date for date_key.
# Because this is a left join, every air-quality row is kept; a row whose date is
# not present in the date dimension ends up with a NULL date_key.
# NOTE(review): a later cell overwrites this same table using an inner join.
AQ_STAR_TABLE = "dbo.gold_fact_air_quality_daily_star_v2"

date_dim = spark.table("dbo.gold_dim_date_v2").select("date", "date_key")

aq_source = spark.table("dbo.gold_fact_air_quality_daily_v2")
aq = aq_source.withColumnRenamed("day_utc", "date")

aq_with_key = aq.join(date_dim, on="date", how="left")
fact_aq = aq_with_key.drop("date")

(
    fact_aq.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(AQ_STAR_TABLE)
)
print("gold_fact_air_quality_daily_star_v2:", spark.table(AQ_STAR_TABLE).count())

StatementMeta(, 1bb2d0c9-6f1d-4dd2-b95e-b402571ed2b2, 6, Finished, Available, Finished)

gold_fact_air_quality_daily_star_v2: 490


In [1]:
from pyspark.sql import functions as F

# Air-quality star fact (inner-join variant): keep only air-quality rows whose
# day_utc exists in the date dimension, and attach that dimension's date_key.
date_dim = spark.table("dbo.gold_dim_date_v2").select(F.col("date").alias("day_utc"), "date_key")
aq = spark.table("dbo.gold_fact_air_quality_daily_v2")  # joined on its day_utc column
aq_star = aq.join(date_dim, on="day_utc", how="inner")
aq_star.write.format("delta").mode("overwrite").option("overwriteSchema","true").saveAsTable("dbo.gold_fact_air_quality_daily_star_v2")

df = spark.table("dbo.gold_fact_air_quality_daily_star_v2")
source_rows = aq.count()
star_rows = df.count()
print("rows:", star_rows)
# After an inner join on the date dimension this count can only be non-zero if
# the dimension itself holds a NULL date_key, so it says nothing about rows the
# join discarded. It is kept for continuity with earlier runs.
print("null date_key:", df.filter(F.col("date_key").isNull()).count())
# Make the effect of the inner join visible: rows whose day_utc has no match in
# the date dimension are silently removed from the star table.
# Assumes the date dimension has one row per date, so the difference is exactly
# the number of discarded rows - TODO confirm if the dimension build changes.
print("source rows:", source_rows)
print("rows dropped (day_utc not in gold_dim_date_v2):", source_rows - star_rows)

StatementMeta(, 5e91a36d-bfd9-4a16-bff5-b5c476e805a6, 3, Finished, Available, Finished)

rows: 474
null date_key: 0
