# Анализ больших данных — лабораторная работа №2 — ETL, реализованный с помощью Spark

## Работа со Spark: преобразование данных в схему «снежинка»

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col

In [2]:
# Create (or reuse) the Spark session for this lab; the PostgreSQL JDBC
# driver jar is registered through the spark.jars setting.
spark = (
    SparkSession.builder
    .appName("BigDataLab2")
    .config("spark.jars", "/opt/spark/jars/postgresql-42.7.6.jar")
    .getOrCreate()
)

In [3]:
import os

# JDBC connection settings for the PostgreSQL instance.
# Each value may be overridden through an environment variable so that
# credentials do not have to live in the notebook; when the variables are
# not set, the local lab defaults below are used.
pg_url = os.environ.get("PG_URL", "jdbc:postgresql://postgres:5432/database")
pg_properties = {
    "user": os.environ.get("PG_USER", "postgres"),
    # Default is only suitable for the local lab container.
    "password": os.environ.get("PG_PASSWORD", "password"),
    "driver": "org.postgresql.Driver",
}

Функции для чтения таблиц из БД и записи таблиц в БД

In [7]:
def read_table(table_name):
    """Load a PostgreSQL table into a Spark DataFrame over JDBC.

    The connection uses the module-level ``pg_url`` and ``pg_properties``.

    Args:
        table_name: Name of the table to read.

    Returns:
        The DataFrame produced by ``spark.read.jdbc`` for that table.
    """
    jdbc_reader = spark.read
    return jdbc_reader.jdbc(pg_url, table_name, properties=pg_properties)

In [8]:
def write_table(df, table_name, mode="append"):
    """Write a DataFrame to a PostgreSQL table over JDBC.

    Uses the same module-level ``pg_url`` / ``pg_properties`` as
    ``read_table`` so that user, password and driver are defined in one
    place only.

    Args:
        df: DataFrame to persist.
        table_name: Name of the target table.
        mode: Spark save mode. Defaults to ``"append"``, which is what the
            notebook relies on: the tables are created beforehand by the
            DDL statements and rows are added to them.
    """
    df.write.jdbc(
        url=pg_url,
        table=table_name,
        mode=mode,
        properties=pg_properties,
    )

In [4]:
mock_data = read_table("mock_data")

In [5]:
mock_data.show(5)
mock_data.count()

+-------------------+------------------+------------+--------------------+----------------+--------------------+-----------------+-----------------+------------------+-----------------+----------------+--------------------+--------------+------------------+------------+----------------+--------------------+----------------+----------+----------------+--------------+---------------+-------------+--------------------+----------+--------------+----------+-----------+-------------+------------+--------------------+------------+--------------------+-------------+------------+-------------+----------------+--------------------+--------------------+---------------+--------------------+-------------------+-------------+----------------+--------------------+--------------+----------------+------------------+----------------+---+
|customer_first_name|customer_last_name|customer_age|      customer_email|customer_country|customer_postal_code|customer_pet_type|customer_pet_name|customer_pet_breed|s

10000

In [6]:
# DDL for the snowflake schema. The statements are ordered so that every
# table named in a REFERENCES clause is created before the table that
# references it; fact_sales, which points at the dimensions, comes last.
ddl_queries = [
    "CREATE TABLE dim_country (country_id SERIAL PRIMARY KEY, country_name VARCHAR(100));",
    "CREATE TABLE dim_city (city_id SERIAL PRIMARY KEY, city_name VARCHAR(100));",
    "CREATE TABLE dim_pet_type (pet_type_id SERIAL PRIMARY KEY, pet_type_name VARCHAR(100));",
    "CREATE TABLE dim_pet_breed (pet_breed_id SERIAL PRIMARY KEY, pet_breed_name VARCHAR(100));",
    "CREATE TABLE dim_pet_category (pet_category_id SERIAL PRIMARY KEY, pet_category_name VARCHAR(100));",
    "CREATE TABLE dim_pet (pet_id SERIAL PRIMARY KEY, name VARCHAR(50), pet_type_id INT REFERENCES dim_pet_type(pet_type_id), pet_breed_id INT REFERENCES dim_pet_breed(pet_breed_id), pet_category_id INT REFERENCES dim_pet_category(pet_category_id));",
    "CREATE TABLE dim_customer (customer_id SERIAL PRIMARY KEY, first_name VARCHAR(100), last_name VARCHAR(100), age INT, email VARCHAR(255) UNIQUE, country_id INT REFERENCES dim_country(country_id), postal_code VARCHAR(20), pet_id INT REFERENCES dim_pet(pet_id));",
    "CREATE TABLE dim_seller (seller_id SERIAL PRIMARY KEY, first_name VARCHAR(100), last_name VARCHAR(100), email VARCHAR(255) UNIQUE, country_id INT REFERENCES dim_country(country_id), postal_code VARCHAR(20));",
    "CREATE TABLE dim_supplier (supplier_id SERIAL PRIMARY KEY, name VARCHAR(200), contact VARCHAR(200), email VARCHAR(255) UNIQUE, phone VARCHAR(50), address TEXT, city_id INT REFERENCES dim_city(city_id), country_id INT REFERENCES dim_country(country_id));",
    "CREATE TABLE dim_store (store_id SERIAL PRIMARY KEY, name VARCHAR(200), location VARCHAR(200), city_id INT REFERENCES dim_city(city_id), state VARCHAR(100), country_id INT REFERENCES dim_country(country_id), phone VARCHAR(50), email VARCHAR(255));",
    "CREATE TABLE dim_product_name (product_name_id SERIAL PRIMARY KEY, product_name VARCHAR(50) UNIQUE);",
    "CREATE TABLE dim_product_category (category_id SERIAL PRIMARY KEY, category_name VARCHAR(100) UNIQUE);",
    "CREATE TABLE dim_brand (brand_id SERIAL PRIMARY KEY, brand_name VARCHAR(100) UNIQUE);",
    "CREATE TABLE dim_material (material_id SERIAL PRIMARY KEY, material_name VARCHAR(100) UNIQUE);",
    "CREATE TABLE dim_color (color_id SERIAL PRIMARY KEY, color_name VARCHAR(50) UNIQUE);",
    "CREATE TABLE dim_size (size_id SERIAL PRIMARY KEY, size_name VARCHAR(50) UNIQUE);",
    "CREATE TABLE dim_product (product_id SERIAL PRIMARY KEY, product_name_id INT REFERENCES dim_product_name(product_name_id), category_id INT REFERENCES dim_product_category(category_id), price NUMERIC(12,2), quantity INT, weight NUMERIC(12,2), color_id INT REFERENCES dim_color(color_id), size_id INT REFERENCES dim_size(size_id), brand_id INT REFERENCES dim_brand(brand_id), material_id INT REFERENCES dim_material(material_id), description TEXT, rating NUMERIC(3,2), reviews INT, release_date DATE, expiry_date DATE, supplier_id INT REFERENCES dim_supplier(supplier_id));",
    "CREATE TABLE fact_sales (sale_id SERIAL PRIMARY KEY, sale_date DATE, customer_id INT REFERENCES dim_customer(customer_id), seller_id INT REFERENCES dim_seller(seller_id), store_id INT REFERENCES dim_store(store_id), product_id INT REFERENCES dim_product(product_id), sale_quantity INT, sale_total_price NUMERIC(14,2));"
]

# Execute all DDL statements over a single JDBC connection obtained through
# the Spark JVM gateway. The connection and statement are closed explicitly
# so that no PostgreSQL sessions are left open, even if a statement fails.
ddl_connection = spark.sparkContext._jvm.java.sql.DriverManager.getConnection(
    pg_url, pg_properties["user"], pg_properties["password"]
)
try:
    ddl_statement = ddl_connection.createStatement()
    try:
        for query in ddl_queries:
            ddl_statement.execute(query)
    finally:
        ddl_statement.close()
finally:
    ddl_connection.close()

Таблица dim_country

In [9]:
# Gather the country names of customers, sellers, stores and suppliers
# into one column and keep each distinct value once.
country_sources = [
    "customer_country",
    "seller_country",
    "store_country",
    "supplier_country",
]
dim_country = mock_data.select(col(country_sources[0]).alias("country_name"))
for source_column in country_sources[1:]:
    dim_country = dim_country.union(
        mock_data.select(col(source_column).alias("country_name"))
    )
dim_country = dim_country.distinct()



In [10]:
dim_country.show()
dim_country.count()

+--------------------+
|        country_name|
+--------------------+
|                Chad|
|              Russia|
|            Paraguay|
|               Yemen|
| U.S. Virgin Islands|
|             Senegal|
|              Sweden|
|Svalbard and Jan ...|
|              Guyana|
|         Philippines|
|             Eritrea|
|            Djibouti|
|            Malaysia|
|              Turkey|
|              Malawi|
|                Iraq|
|             Germany|
|Northern Mariana ...|
|             Comoros|
|         Afghanistan|
+--------------------+
only showing top 20 rows



230

In [11]:
write_table(dim_country, "dim_country")

Таблица dim_city

In [12]:
# City names appear on both store and supplier columns; merge them and
# keep each distinct value once.
store_cities = mock_data.select(col("store_city").alias("city_name"))
supplier_cities = mock_data.select(col("supplier_city").alias("city_name"))
dim_city = store_cities.union(supplier_cities).distinct()

In [13]:
dim_city.show()
dim_city.count()

+-----------+
|  city_name|
+-----------+
|     Takefu|
|Trollhättan|
|   Żyrardów|
|   Jaboatão|
|    Nanshan|
|Jiujianfang|
|      Tocok|
|  Sułkowice|
|   Borūjerd|
|      Pakel|
|      Tyler|
|      Trzin|
|    Palermo|
| Curpahuasi|
|     Raheny|
|      Apodi|
|       Īlām|
|   Tokarnia|
|    Odawara|
|      Bicaj|
+-----------+
only showing top 20 rows



14181

In [14]:
write_table(dim_city, "dim_city")

Таблица dim_pet_type

In [15]:
# Distinct values of the customers' pet type column.
pet_type_column = col("customer_pet_type").alias("pet_type_name")
dim_pet_type = mock_data.select(pet_type_column).distinct()

In [16]:
dim_pet_type.show()
dim_pet_type.count()

+-------------+
|pet_type_name|
+-------------+
|          dog|
|          cat|
|         bird|
+-------------+



3

In [17]:
write_table(dim_pet_type, "dim_pet_type")

Таблица dim_pet_breed

In [18]:
# Distinct values of the customers' pet breed column.
pet_breed_column = col("customer_pet_breed").alias("pet_breed_name")
dim_pet_breed = mock_data.select(pet_breed_column).distinct()

In [19]:
dim_pet_breed.show()
dim_pet_breed.count()

+------------------+
|    pet_breed_name|
+------------------+
|Labrador Retriever|
|          Parakeet|
|           Siamese|
+------------------+



3

In [20]:
write_table(dim_pet_breed, "dim_pet_breed")

Таблица dim_pet_category

In [21]:
# Distinct values of the pet_category column.
pet_category_column = col("md.pet_category").alias("pet_category_name")
dim_pet_category = mock_data.alias("md").select(pet_category_column).distinct()

In [22]:
dim_pet_category.show()
dim_pet_category.count()

+-----------------+
|pet_category_name|
+-----------------+
|         Reptiles|
|             Fish|
|            Birds|
|             Dogs|
|             Cats|
+-----------------+



5

In [23]:
write_table(dim_pet_category, "dim_pet_category")

Таблица dim_pet

In [24]:
# Read the pet lookup tables back from PostgreSQL so that their SERIAL ids
# are available for the joins below.
dim_pet_type_df = read_table("dim_pet_type")
dim_pet_breed_df = read_table("dim_pet_breed")
dim_pet_category_df = read_table("dim_pet_category")

# Join conditions: source text value -> matching lookup row.
pet_type_match = col("md.customer_pet_type") == col("pt.pet_type_name")
pet_breed_match = col("md.customer_pet_breed") == col("pb.pet_breed_name")
pet_category_match = col("md.pet_category") == col("pc.pet_category_name")

# One row per distinct (name, type, breed, category) combination.
dim_pet = (
    mock_data.alias("md")
    .join(dim_pet_type_df.alias("pt"), pet_type_match)
    .join(dim_pet_breed_df.alias("pb"), pet_breed_match)
    .join(dim_pet_category_df.alias("pc"), pet_category_match)
    .select(
        col("md.customer_pet_name").alias("name"),
        col("pt.pet_type_id"),
        col("pb.pet_breed_id"),
        col("pc.pet_category_id"),
    )
    .distinct()
)

In [25]:
dim_pet.show()
dim_pet.count()

+--------+-----------+------------+---------------+
|    name|pet_type_id|pet_breed_id|pet_category_id|
+--------+-----------+------------+---------------+
|    Bili|          3|           2|              3|
|   Cordi|          3|           2|              3|
|    Addy|          3|           2|              3|
|Isabelle|          1|           2|              3|
| Hedvige|          1|           2|              3|
|   Gayle|          3|           3|              3|
|   Worth|          1|           3|              3|
|   Maddy|          2|           1|              5|
| Inesita|          2|           1|              5|
|    Perl|          3|           2|              5|
|   Moria|          3|           2|              5|
|   Shawn|          3|           2|              5|
|  Lamont|          3|           2|              5|
|     Ira|          1|           2|              5|
|Catriona|          1|           2|              5|
| Ethelyn|          2|           3|              5|
| Florian|  

9850

In [26]:
write_table(dim_pet, "dim_pet")

Таблица dim_customer

In [27]:
# Lookup tables needed to resolve a customer's country and pet.
dim_country_df = read_table("dim_country")
dim_pet_df = read_table("dim_pet")
dim_pet_type_df = read_table("dim_pet_type")
dim_pet_breed_df = read_table("dim_pet_breed")
dim_pet_category_df = read_table("dim_pet_category")

# A pet row is identified by its name together with the ids of its
# category, breed and type, so those three lookups are joined first.
pet_match = (
    (col("md.customer_pet_name") == col("pt.name")) &
    (col("pt.pet_category_id") == col("dpc.pet_category_id")) &
    (col("pt.pet_breed_id") == col("dpb.pet_breed_id")) &
    (col("pt.pet_type_id") == col("dpt.pet_type_id"))
)

customer_columns = [
    col("md.customer_first_name").alias("first_name"),
    col("md.customer_last_name").alias("last_name"),
    col("md.customer_age").alias("age"),
    col("md.customer_email").alias("email"),
    col("c.country_id"),
    col("md.customer_postal_code").alias("postal_code"),
    col("pt.pet_id"),
]

dim_customer = (
    mock_data.alias("md")
    .join(dim_country_df.alias("c"), col("md.customer_country") == col("c.country_name"))
    .join(dim_pet_category_df.alias("dpc"), col("md.pet_category") == col("dpc.pet_category_name"))
    .join(dim_pet_breed_df.alias("dpb"), col("md.customer_pet_breed") == col("dpb.pet_breed_name"))
    .join(dim_pet_type_df.alias("dpt"), col("md.customer_pet_type") == col("dpt.pet_type_name"))
    .join(dim_pet_df.alias("pt"), pet_match)
    .select(*customer_columns)
    .distinct()
)


In [28]:
dim_customer.show()
dim_customer.count()

+----------+------------+---+--------------------+----------+-------------+------+
|first_name|   last_name|age|               email|country_id|  postal_code|pet_id|
+----------+------------+---+--------------------+----------+-------------+------+
|   Karlene|   Suermeier| 40|dlafaye6t@moonfru...|        28|37032 CEDEX 1|  1492|
| Hyacintha|     Marmyon| 21|ebewly1b@e-recht2...|        97|        34110|  5526|
|   Ulberto|    Seiffert| 27|syeldingh@zimbio.com|         7|       334 80|  2917|
|     Daron|      Dubois| 38|pmoggle3y@weebly.com|       160|    83750-000|  2928|
|     Hedda|     Enrrico| 45|rdodswell7s@googl...|        10|         1219|  3037|
|   Anderea|   Izakovitz| 59|     dcappsmv@si.edu|       171|       L-6562|  3479|
|   Charles|     Selburn| 25|ogarlettix@jiathi...|         2|       666679|  6670|
|     Molli|    McGeorge| 24|mmaccreacb@alibab...|       176|       87-410|  8550|
|   Marlena|     Huthart| 34|btolletb@blogline...|       169|     839-1301|   359|
|   

10000

In [29]:
write_table(dim_customer, "dim_customer")

Таблица dim_seller

In [31]:
# City lookup; not needed for sellers, but the supplier and store cells
# further down join against it.
dim_city_df = read_table("dim_city")

# Distinct sellers with their country resolved to country_id.
seller_country_match = col("md.seller_country") == col("c.country_name")
dim_seller = (
    mock_data.alias("md")
    .join(dim_country_df.alias("c"), seller_country_match)
    .select(
        col("md.seller_first_name").alias("first_name"),
        col("md.seller_last_name").alias("last_name"),
        col("md.seller_email").alias("email"),
        col("c.country_id"),
        col("md.seller_postal_code").alias("postal_code"),
    )
    .distinct()
)

In [32]:
dim_seller.show()
dim_seller.count()

+----------+------------+--------------------+----------+-------------+
|first_name|   last_name|               email|country_id|  postal_code|
+----------+------------+--------------------+----------+-------------+
|   Mitchel|  Chadderton|mchaddertoncq@tin...|         2|       141309|
|     Sukey|       Pashe|    spasher8@163.com|         2|       632147|
| Inglebert|       Hearn|    ihearn5b@sun.com|         2|       427439|
|    Joshua|      Castan|jcastanei@photobu...|        10|         2614|
|      Nero|       Slyde|   nslydeoz@ucsd.edu|        10|         6116|
|    Daveen|  MacIlraith|dmacilraith27@123...|        10|         8407|
|     Berke|    Prestner| bprestnerp2@ucoz.ru|        10|         3813|
|   Filippa|     Gatchel|fgatchelpl@e-rech...|        10|         1470|
|   Feodora|        Meek|fmeekk9@engadget.com|        13|        01606|
|      Nari|     Fitchew|nfitchewfz@refere...|        28|79049 CEDEX 9|
|     Brody|Margaritelli|bmargaritelliof@r...|        28|61891 C

10000

In [33]:
write_table(dim_seller, "dim_seller")

Таблица dim_supplier

In [34]:
# Distinct suppliers with country and city resolved to their ids.
supplier_country_match = col("md.supplier_country") == col("c.country_name")
supplier_city_match = col("md.supplier_city") == col("ct.city_name")

dim_supplier = (
    mock_data.alias("md")
    .join(dim_country_df.alias("c"), supplier_country_match)
    .join(dim_city_df.alias("ct"), supplier_city_match)
    .select(
        col("md.supplier_name").alias("name"),
        col("md.supplier_contact").alias("contact"),
        col("md.supplier_email").alias("email"),
        col("md.supplier_phone").alias("phone"),
        col("md.supplier_address").alias("address"),
        col("ct.city_id"),
        col("c.country_id"),
    )
    .distinct()
)

In [35]:
dim_supplier.show()
dim_supplier.count()

+----------+-------------------+--------------------+------------+------------+-------+----------+
|      name|            contact|               email|       phone|     address|city_id|country_id|
+----------+-------------------+--------------------+------------+------------+-------+----------+
|     Yamia|       Levy Duferie|lduferie7a@typepa...|690-727-9072|   Room 1745|   9584|         2|
| Browsebug|   Humphrey Kelinge|hkelinge2b@scienc...|587-999-7122|    Apt 1654|   4457|         7|
|   Zoombox|    Jordanna Klambt|jklambtb8@feedbur...|987-203-7278|   Room 1901|  12473|         7|
|   Dabtype|     Erma Charrette|  echarretteay@is.gd|590-264-6444|   Room 1682|  13176|        10|
|  Realcube|      Cybill Busain|cbusainex@paypal.com|742-902-5584|    Apt 1496|  14168|        10|
| Rhynoodle|     Giovanna Buzza|   gbuzzahc@hibu.com|743-170-5278|   9th Floor|  12665|        17|
|      Jayo|       Rossy Leedes|rleedesaj@unicef.org|226-483-8338|    Suite 43|  11983|        20|
|    Voont

10000

In [36]:
write_table(dim_supplier, "dim_supplier")

Таблица dim_store

In [37]:
# Distinct stores with country and city resolved to their ids.
store_country_match = col("md.store_country") == col("c.country_name")
store_city_match = col("md.store_city") == col("ct.city_name")

dim_store = (
    mock_data.alias("md")
    .join(dim_country_df.alias("c"), store_country_match)
    .join(dim_city_df.alias("ct"), store_city_match)
    .select(
        col("md.store_name").alias("name"),
        col("md.store_location").alias("location"),
        col("ct.city_id"),
        col("md.store_state").alias("state"),
        col("c.country_id"),
        col("md.store_phone").alias("phone"),
        col("md.store_email").alias("email"),
    )
    .distinct()
)

In [38]:
dim_store.show()
dim_store.count()

+----------+------------+-------+-----+----------+------------+--------------------+
|      name|    location|city_id|state|country_id|       phone|               email|
+----------+------------+-------+-----+----------+------------+--------------------+
| Linklinks|    Suite 76|    680|   CA|        29|760-109-4086|hblackbourn7g@clo...|
|  Innotype|   Room 1471|   6980|   A5|        97|767-675-9070|cfreelandh8@about...|
|   Skippad|   Room 1421|    453|  VER|       123|923-120-1631|alamprecht89@geoc...|
|    Talane|    Room 770|   6170|   BD|       169|119-915-7580|kborgesio54@csmon...|
|    Leenti|   2nd Floor|   4382|   11|       169|369-916-3645|   cchandlaral@is.gd|
| Skipstorm|PO Box 35455|   6189|   11|       177|722-142-5422|    eloosra@bing.com|
| Photolist|  16th Floor|   3030|   ON|       199|719-912-7433|   mhawesky@digg.com|
|   Youfeed|     Apt 633|   7836|   NC|         2|704-280-5105|wzappelo4@merriam...|
|Brainverse|    Suite 83|   1939|   BE|        28|196-979-6650|mr

10000

In [39]:
write_table(dim_store, "dim_store")

Таблица dim_product_name

In [40]:
# Distinct product names (column name already matches the target table).
dim_product_name = mock_data.select(col("product_name")).distinct()

In [41]:
dim_product_name.show()
dim_product_name.count()

+------------+
|product_name|
+------------+
|   Bird Cage|
|    Dog Food|
|     Cat Toy|
+------------+



3

In [42]:
write_table(dim_product_name, "dim_product_name")

Таблица dim_product_category

In [43]:
# Distinct product categories.
category_column = col("product_category").alias("category_name")
dim_product_category = mock_data.select(category_column).distinct()

In [44]:
dim_product_category.show()
dim_product_category.count()

+-------------+
|category_name|
+-------------+
|         Cage|
|         Food|
|          Toy|
+-------------+



3

In [45]:
write_table(dim_product_category, "dim_product_category")

Таблица dim_brand

In [46]:
# Distinct product brands.
brand_column = col("product_brand").alias("brand_name")
dim_brand = mock_data.select(brand_column).distinct()

In [47]:
dim_brand.show()
dim_brand.count()

+------------+
|  brand_name|
+------------+
|     Jetwire|
|    Jaxworks|
|   Reallinks|
| Brainlounge|
|    Snaptags|
|    Feedfish|
|       Kamba|
|    Skipfire|
|      Quimba|
|       Quaxo|
|    Realfire|
|      Oyondu|
|      BlogXS|
|Thoughtworks|
|  Browsezoom|
|     Voonder|
|   Photofeed|
|   Babbleset|
|       Yabox|
|       Einti|
+------------+
only showing top 20 rows



383

In [48]:
write_table(dim_brand, "dim_brand")

Таблица dim_material

In [49]:
# Distinct product materials.
material_column = col("product_material").alias("material_name")
dim_material = mock_data.select(material_column).distinct()

In [50]:
dim_material.show()
dim_material.count()

+-------------+
|material_name|
+-------------+
|        Steel|
|        Vinyl|
|      Granite|
|        Glass|
|      Plastic|
|     Aluminum|
|         Wood|
|       Rubber|
|        Stone|
|   Plexiglass|
|        Brass|
+-------------+



11

In [51]:
write_table(dim_material, "dim_material")

Таблица dim_color

In [52]:
# Distinct product colors.
color_column = col("product_color").alias("color_name")
dim_color = mock_data.select(color_column).distinct()

In [53]:
dim_color.show()
dim_color.count()

+----------+
|color_name|
+----------+
|      Teal|
|     Khaki|
|   Crimson|
|    Orange|
|    Indigo|
|      Puce|
|    Fuscia|
| Turquoise|
|     Green|
|    Purple|
|Aquamarine|
|      Blue|
|    Violet|
|    Yellow|
|       Red|
|      Pink|
| Goldenrod|
|      Mauv|
|    Maroon|
+----------+



19

In [54]:
write_table(dim_color, "dim_color")

Таблица dim_size

In [55]:
# Distinct product sizes.
size_column = col("product_size").alias("size_name")
dim_size = mock_data.select(size_column).distinct()

In [56]:
dim_size.show()
dim_size.count()

+---------+
|size_name|
+---------+
|   Medium|
|    Small|
|    Large|
+---------+



3

In [57]:
write_table(dim_size, "dim_size")

Таблица dim_product

In [58]:
# Read the product-related dimensions back from PostgreSQL (in the same
# order as before) so their SERIAL ids can be used when building dim_product.
(
    dim_product_name_df,
    dim_product_category_df,
    dim_brand_df,
    dim_material_df,
    dim_color_df,
    dim_size_df,
    dim_supplier_df,
) = (
    read_table(lookup_table)
    for lookup_table in (
        "dim_product_name",
        "dim_product_category",
        "dim_brand",
        "dim_material",
        "dim_color",
        "dim_size",
        "dim_supplier",
    )
)

In [59]:
# Each descriptive product attribute is swapped for the id of its
# dimension row; the supplier is resolved through its e-mail address.
product_lookups = [
    (dim_product_name_df.alias("dpn"), col("md.product_name") == col("dpn.product_name")),
    (dim_product_category_df.alias("dpc"), col("md.product_category") == col("dpc.category_name")),
    (dim_brand_df.alias("db"), col("md.product_brand") == col("db.brand_name")),
    (dim_material_df.alias("dm"), col("md.product_material") == col("dm.material_name")),
    (dim_color_df.alias("dc"), col("md.product_color") == col("dc.color_name")),
    (dim_size_df.alias("ds"), col("md.product_size") == col("ds.size_name")),
    (dim_supplier_df.alias("ppp"), col("md.supplier_email") == col("ppp.email")),
]

product_rows = mock_data.alias("md")
for lookup_df, lookup_match in product_lookups:
    product_rows = product_rows.join(lookup_df, lookup_match)

dim_product = product_rows.select(
    col("dpn.product_name_id"),
    col("dpc.category_id"),
    col("md.product_price").alias("price"),
    col("md.product_quantity").alias("quantity"),
    col("md.product_weight").alias("weight"),
    col("dc.color_id"),
    col("ds.size_id"),
    col("db.brand_id"),
    col("dm.material_id"),
    col("md.product_description").alias("description"),
    col("md.product_rating").alias("rating"),
    col("md.product_reviews").alias("reviews"),
    col("md.product_release_date").alias("release_date"),
    col("md.product_expiry_date").alias("expiry_date"),
    col("ppp.supplier_id"),
).distinct()

In [60]:
dim_product.show()
dim_product.count()

+---------------+-----------+--------------------+--------+--------------------+--------+-------+--------+-----------+--------------------+--------------------+-------+------------+-----------+-----------+
|product_name_id|category_id|               price|quantity|              weight|color_id|size_id|brand_id|material_id|         description|              rating|reviews|release_date|expiry_date|supplier_id|
+---------------+-----------+--------------------+--------+--------------------+--------+-------+--------+-----------+--------------------+--------------------+-------+------------+-----------+-----------+
|              3|          3|15.68000000000000...|      47|4.200000000000000000|      11|      3|     240|          5|Duis bibendum, fe...|3.300000000000000000|    991|  2020-07-06| 2023-02-13|       4133|
|              3|          1|25.33000000000000...|      46|45.90000000000000...|      12|      3|     277|          2|Sed sagittis. Nam...|4.700000000000000000|    192|  2021-0

10000

In [61]:
write_table(dim_product, "dim_product")

Таблица fact_sales

In [62]:
# Read the dimensions referenced by fact_sales back from PostgreSQL
# (same order as before) to obtain their generated ids.
dim_customer_df, dim_seller_df, dim_store_df, dim_product_df = (
    read_table(lookup_table)
    for lookup_table in ("dim_customer", "dim_seller", "dim_store", "dim_product")
)

In [64]:
# Customers and sellers are matched by e-mail address.
customer_match = col("md.customer_email") == col("c.email")
seller_match = col("md.seller_email") == col("s.email")

# Stores are matched on name + location + phone.
store_match = (
    (col("md.store_name") == col("st.name")) &
    (col("md.store_location") == col("st.location")) &
    (col("md.store_phone") == col("st.phone"))
)

# Products are matched on name id + price + quantity + weight.
# NOTE(review): this is only a subset of the dim_product columns; if that
# combination is not unique the join would duplicate sales rows — confirm.
product_name_match = col("md.product_name") == col("dpn.product_name")
product_match = (
    (col("dpn.product_name_id") == col("pr.product_name_id")) &
    (col("md.product_price") == col("pr.price")) &
    (col("md.product_quantity") == col("pr.quantity")) &
    (col("md.product_weight") == col("pr.weight"))
)

fact_sales = (
    mock_data.alias("md")
    .join(dim_customer_df.alias("c"), customer_match)
    .join(dim_seller_df.alias("s"), seller_match)
    .join(dim_store_df.alias("st"), store_match)
    .join(dim_product_name_df.alias("dpn"), product_name_match)
    .join(dim_product_df.alias("pr"), product_match)
    .select(
        col("md.sale_date"),
        col("md.sale_quantity"),
        col("md.sale_total_price"),
        col("c.customer_id"),
        col("s.seller_id"),
        col("st.store_id"),
        col("pr.product_id"),
    )
)

In [65]:
fact_sales.show()
fact_sales.count()

+----------+-------------+--------------------+-----------+---------+--------+----------+
| sale_date|sale_quantity|    sale_total_price|customer_id|seller_id|store_id|product_id|
+----------+-------------+--------------------+-----------+---------+--------+----------+
|2021-01-14|            7|19.18000000000000...|       3101|     2707|    3006|      8413|
|2021-08-11|            4|134.3400000000000...|       3563|      435|     155|      7032|
|2021-01-22|            5|322.1600000000000...|       9084|     1279|    7210|      1808|
|2021-07-06|            1|271.1000000000000...|       1161|     1651|     811|      6702|
|2021-12-03|            6|89.39000000000000...|       4742|     6091|    9672|      5174|
|2021-06-28|            6|380.5500000000000...|       9484|     5410|    3761|      7564|
|2021-03-31|            6|80.96000000000000...|       9057|     5432|    6208|      6659|
|2021-02-25|            6|223.0900000000000...|       4848|     6878|    5077|      8975|
|2021-06-1

10000

In [66]:
write_table(fact_sales, "fact_sales")

## Работа с ClickHouse 