### Date functions in PySpark SQL

In [0]:
# Load the raw transactions CSV. No schema or inferSchema option is given, so
# Spark's CSV reader leaves every column as a string; toDF() then assigns the
# column names positionally.
txns_df = (
    spark.read.format("csv")
    .load("dbfs:/Volumes/inceptez_catalog/inputdb/customerdata/txns")
    .toDF(
        "txnid", "txndate", "custid", "amount", "prodcategory",
        "prodname", "city", "state", "paymenttype",
    )
)


In [0]:
# Inspect the freshly loaded frame: column types first, then the rows.
txns_df.printSchema()
display(txns_df)

- The **to_date** function is used to convert a string column to the date type.

In [0]:
from pyspark.sql.functions import to_date

# Parse the txndate strings as month-day-year and overwrite the column in place
# (same column name, now date-typed), so later cells can use date functions on it.
txndate_as_date = to_date("txndate", "MM-dd-yyyy")
txns_df = txns_df.withColumn("txndate", txndate_as_date)

# Confirm the new column type, then show the converted rows.
txns_df.printSchema()
display(txns_df)

In [0]:
# Break txndate into its individual calendar components.
from pyspark.sql.functions import (
    year,
    month,
    dayofmonth,
    dayofweek,
    dayofyear,
    weekofyear,
    quarter,
)

# One derived column per date part; the alias becomes the output column name.
date_part_columns = [
    year("txndate").alias("year"),
    month("txndate").alias("month"),
    dayofmonth("txndate").alias("day"),
    dayofweek("txndate").alias("weekday"),      # Spark numbers days 1 (Sunday) .. 7 (Saturday)
    dayofyear("txndate").alias("day_of_year"),
    weekofyear("txndate").alias("week"),
    quarter("txndate").alias("quarter"),
]

# Keep the transaction id and original date next to the extracted parts.
txns_df_parts = txns_df.select("txnid", "txndate", *date_part_columns)
txns_df_parts.display()


In [0]:
# Current date/timestamp, custom formatting, and truncation of txndate.
from pyspark.sql.functions import (
    current_date,
    current_timestamp,
    date_format,
    date_trunc,
    trunc,
)

# Bind the transaction date column once instead of repeating its name.
txn_date = txns_df["txndate"]

df_current = txns_df.select(
    current_date().alias("today"),
    current_timestamp().alias("now"),
    # Renders the date as a day-month-year string.
    date_format(txn_date, "dd-MM-yyyy").alias("formatted"),
    # date_trunc takes the unit first and yields a timestamp ...
    date_trunc("month", txn_date).alias("month_start"),
    # ... whereas trunc takes the column first and yields a date.
    trunc(txn_date, "year").alias("year_start"),
    trunc(txn_date, "month").alias("month_start_dt"),
)
display(df_current)

In [0]:
# Date arithmetic: shifting dates and measuring gaps relative to today.
from pyspark.sql.functions import (
    date_add,
    date_sub,
    add_months,
    months_between,
    datediff,
    next_day,
    last_day,
)

# current_date is not imported here; it comes from the import in the earlier
# "Current Date & Timestamp Functions" cell.
today = current_date()

arithmetic_columns = [
    date_add("txndate", 10).alias("plus_10_days"),
    date_sub("txndate", 5).alias("minus_5_days"),
    add_months("txndate", 2).alias("plus_2_months"),
    # Both gap measures are computed as (today - txndate).
    months_between(today, "txndate").alias("months_since_txn"),
    datediff(today, "txndate").alias("days_since_txn"),
    next_day("txndate", "Sunday").alias("next_sunday"),
    last_day("txndate").alias("last_day_month"),
]

df_arith = txns_df.select("txnid", "txndate", *arithmetic_columns)
df_arith.display()