In [1]:
import os
from datetime import date, timedelta

from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql import SparkSession 
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType
from pyspark.sql.functions import col, max, to_date, expr

from delta.tables import DeltaTable

# Service endpoints, named once so the S3A and catalog settings cannot drift apart.
MINIO_ENDPOINT = "http://192.168.86.192:9000"
HIVE_METASTORE_URI = "thrift://192.168.86.192:9083"

# Spark configuration: S3A access to MinIO, driver/executor memory, and the
# Delta Lake SQL extension + catalog.
conf = (
    SparkConf()
    .setAppName("Spark minIO Test")
    # --- S3A (MinIO) ---
    .set("spark.hadoop.fs.s3a.endpoint", MINIO_ENDPOINT)
    # os.environ[...] raises KeyError when the variable is missing; os.getenv
    # would return None and a bogus credential would be handed to Spark instead.
    .set("spark.hadoop.fs.s3a.access.key", os.environ["MINIO_ROOT_USER"])
    .set("spark.hadoop.fs.s3a.secret.key", os.environ["MINIO_ROOT_PASSWORD"])
    .set("spark.hadoop.fs.s3a.path.style.access", "true")
    .set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # --- Resources ---
    .set("spark.driver.memory", "8g")
    .set("spark.executor.memory", "8g")
    # --- Delta Lake ---
    .set("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore")
    .set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .set("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # --- Catalog options ---
    # The "uri" and "warehouse" keys previously ended in a space, so they were
    # stored under property names nothing would ever look up.
    # NOTE(review): confirm DeltaCatalog actually reads these spark_catalog.*
    # options; a Hive metastore is usually addressed via hive.metastore.uris.
    .set("spark.sql.catalog.spark_catalog.type", "hive")
    .set("spark.sql.catalog.spark_catalog.uri", HIVE_METASTORE_URI)
    .set("spark.sql.catalog.spark_catalog.warehouse", "s3a://warehouse")
    .set("spark.sql.catalog.spark_catalog.s3.endpoint", MINIO_ENDPOINT)
)

# Build (or reuse) the session through the builder so that the configuration and
# Hive support are applied when the session is created, and so that re-running
# this cell reuses the live session instead of trying to start a second
# SparkContext.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .enableHiveSupport()
    .getOrCreate()
)
sc = spark.sparkContext

22/08/06 07:09:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


## Define date range

In [2]:
# Bounds of the generated calendar. The generation loop iterates over
# range((end_date - start_date).days), so end_date itself is not emitted.
start_date = date(year=1900, month=1, day=1)
end_date = date(year=2100, month=2, day=1)

In [3]:
def get_is_leap_year(year: int):
    """Return 1 if ``year`` is a Gregorian leap year, otherwise 0.

    A year is a leap year when it is divisible by 4, except for century
    years, which must also be divisible by 400.
    """
    divisible_by_4 = year % 4 == 0
    is_century = year % 100 == 0
    divisible_by_400 = year % 400 == 0
    return int(divisible_by_400 or (divisible_by_4 and not is_century))

In [4]:
# Lookup tables are 1-indexed so they line up with ISO weekday numbers
# (1 = Monday) and calendar month numbers (1 = January); slot 0 is a placeholder.
days_of_week = [
    "dummy",
    "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday",
]
months = [
    'dummy',
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
month_abbrs = [
    'dummy',
    'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
]

# One row per calendar day in [start_date, end_date); the field order matches
# the column order of the schema the rows are loaded with.
date_series = []
total_days = (end_date - start_date).days
for day in range(total_days):
    sql_date = start_date + timedelta(days=day)
    # isocalendar() yields (ISO year, ISO week, ISO weekday); the ISO year is unused.
    _iso_year, week, day_of_week_number = sql_date.isocalendar()
    month_number = sql_date.month
    year = sql_date.year
    date_series.append([
        sql_date,
        day_of_week_number,
        days_of_week[day_of_week_number],
        1 if day_of_week_number <= 5 else 0,  # Monday..Friday count as weekdays
        sql_date.day,
        week,
        month_number,
        month_abbrs[month_number],
        months[month_number],
        year,
        get_is_leap_year(year),
    ])
    

In [5]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType
# Column layout of the date dimension as (column name, Spark type), in the
# same order as the fields of each generated row. Every column is nullable.
_dim_date_columns = [
    ("date", DateType()),
    ("day_of_week_number", IntegerType()),
    ("day_of_week", StringType()),
    ("is_weekday", IntegerType()),
    ("day_of_month", IntegerType()),
    ("week", IntegerType()),
    ("month_number", IntegerType()),
    ("month_abbr", StringType()),
    ("month", StringType()),
    ("year", IntegerType()),
    ("is_leap_year", IntegerType()),
]
schema = StructType(
    [StructField(column_name, column_type, True) for column_name, column_type in _dim_date_columns]
)

In [6]:
# Materialise the generated rows as a Spark DataFrame with the explicit schema,
# then print a preview (show() displays the first 20 rows by default).
df = spark.createDataFrame(
    data=date_series,
    schema=schema,
)
df.show()

[Stage 0:>                                                          (0 + 1) / 1]

+----------+------------------+-----------+----------+------------+----+------------+----------+-------+----+------------+
|      date|day_of_week_number|day_of_week|is_weekday|day_of_month|week|month_number|month_abbr|  month|year|is_leap_year|
+----------+------------------+-----------+----------+------------+----+------------+----------+-------+----+------------+
|1900-01-01|                 1|     Monday|         1|           1|   1|           1|       Jan|January|1900|           0|
|1900-01-02|                 2|    Tuesday|         1|           2|   1|           1|       Jan|January|1900|           0|
|1900-01-03|                 3|  Wednesday|         1|           3|   1|           1|       Jan|January|1900|           0|
|1900-01-04|                 4|   Thursday|         1|           4|   1|           1|       Jan|January|1900|           0|
|1900-01-05|                 5|     Friday|         1|           5|   1|           1|       Jan|January|1900|           0|
|1900-01-06|    

                                                                                

In [7]:
# Persist the date dimension as a Delta table in object storage, replacing any
# data already stored at that path.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .save("s3a://silver-generated/dim_date")
)

22/08/06 07:09:34 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

In [10]:
# Register the Delta files at s3a://silver-generated/dim_date in the catalog as
# table `date`. IF NOT EXISTS keeps this cell re-runnable: a plain CREATE TABLE
# fails on the second run because the table is already registered.
spark.sql(
'''
CREATE TABLE IF NOT EXISTS date
USING DELTA
LOCATION 's3a://silver-generated/dim_date';
'''
)


DataFrame[]

In [11]:
# List the tables registered in the current database to confirm the
# registration succeeded.
tables = spark.sql("SHOW TABLES")
tables.show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
| default|     date|      false|
+--------+---------+-----------+

