In [None]:
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [None]:
# Build (or fetch) the local Spark session used by the rest of the notebook.
# The chain is wrapped in parentheses so each builder option sits on its own
# line without backslash continuations.
spark = (
    SparkSession.builder
    .appName("Date Column Cleaning")
    .master("local[2]")  # run locally with two worker threads
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "1g")
    .config("spark.sql.shuffle.partitions", "4")
    # Use the legacy datetime parser/formatter for pattern handling.
    .config("spark.sql.legacy.timeParserPolicy", "LEGACY")
    .getOrCreate()
)

In [None]:
import pyspark.sql.functions  as F

# The `spark` session configured earlier in the notebook is reused here;
# calling builder.getOrCreate() again would simply return that same session.

# Sample data with a date column in a standard format (yyyy-MM-dd)
data = [("2023-01-15",)]
df = spark.createDataFrame(data, ["date_column"])

# Parse the string exactly once. The pattern handed to to_date() describes the
# layout of the INPUT string, so it has to match "yyyy-MM-dd" here.
df = df.withColumn("standard_date", F.to_date(F.col("date_column"), "yyyy-MM-dd"))

# Render the parsed date in other layouts. date_format() is the function that
# takes an OUTPUT pattern; passing these patterns to to_date() on the raw
# string would not match "2023-01-15" and so would not produce the date in
# the requested layout.
df = df.withColumn("short_date", F.date_format(F.col("standard_date"), "dd-MM-yy"))
df = df.withColumn("long_date", F.date_format(F.col("standard_date"), "dd MMMM yyyy"))
df = df.withColumn("iso_date", F.date_format(F.col("standard_date"), "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"))
df = df.withColumn("custom_date", F.date_format(F.col("standard_date"), "yyyy-MM-dd HH:mm:ss z"))

# Independent example: turn a literal epoch-seconds value into a formatted string.
df = df.withColumn("epoch_timestamp", F.from_unixtime(F.lit(1642185600), "yyyy-MM-dd HH:mm:ss"))

# Calendar parts derived from the parsed date.
df = df.withColumn("quarter", F.quarter(F.col("standard_date")))
df = df.withColumn("year", F.year(F.col("standard_date")))
df = df.withColumn("day_of_week", F.date_format(F.col("standard_date"), "EEEE"))
# NOTE(review): the week-based pattern letter "w" is documented as unsupported
# by the Spark 3+ formatter; it presumably works here only because the session
# sets spark.sql.legacy.timeParserPolicy=LEGACY - confirm before changing that.
df = df.withColumn("week_of_year", F.date_format(F.col("standard_date"), "w"))
df = df.withColumn("month_abbreviation", F.date_format(F.col("standard_date"), "MMM"))

# Show the resulting DataFrame without cutting off the longer formatted strings
df.show(truncate=False)

In [None]:
# Reader options passed to the CSV load below.
options = dict(
    header="true",         # first row holds the column names
    inferSchema="true",    # let Spark infer the column types
    sep=",",               # field separator
    encoding="UTF-8",      # character encoding of the file
    mode="DROPMALFORMED",  # drop rows that cannot be parsed
)

In [None]:
# Relative path of the CSV file that is loaded in the next step.
csv_file_path = "input-data/date_formats.csv"

In [None]:
# Load the CSV: apply the reader options first, then point the reader at the file.
csv_reader = spark.read.options(**options)
df = csv_reader.csv(csv_file_path)

# Display the column names and types of the loaded DataFrame.
df.printSchema()

In [None]:
def date_to_standard_date_format(df, to_standard_format=True, columns=None, date_format="yyyy-MM-dd"):
    """Convert columns of ``df`` to dates by parsing them with ``F.to_date``.

    Args:
        df: DataFrame whose columns should be converted.
        to_standard_format: When False, ``df`` is returned untouched.
        columns: Column name, or iterable of column names, to convert.
            ``None`` (the default) converts every column of ``df``.
        date_format: Pattern describing how the existing values are laid out.
            Defaults to ``"yyyy-MM-dd"``.

    Returns:
        A DataFrame in which each selected column has been replaced, under the
        same name, by ``F.to_date(column, date_format)``.

    Raises:
        ValueError: If ``columns`` names a column that ``df`` does not have.
    """
    if not to_standard_format:
        return df

    if columns is None:
        target_columns = list(df.columns)
    elif isinstance(columns, str):
        # A bare string is one column name, not an iterable of characters.
        target_columns = [columns]
    else:
        target_columns = list(columns)

    # Fail early with a readable message instead of a Spark analysis error.
    unknown = [name for name in target_columns if name not in df.columns]
    if unknown:
        raise ValueError(f"Columns not found in DataFrame: {unknown}")

    for column_name in target_columns:
        # withColumn with an existing name replaces that column in place, so
        # no rename step is needed afterwards.
        df = df.withColumn(column_name, F.to_date(F.col(column_name), date_format))

    return df

# Run the conversion over the loaded DataFrame (flag spelled out; it is the default).
df = date_to_standard_date_format(df, to_standard_format=True)

# Inspect the resulting column types.
df.printSchema()