# 1. Read the "clean_me.csv" file, whose unknown columns are marked x1–x4.

In [1]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session used by this notebook.
spark = (
    SparkSession.builder
    .appName("Spark CSV file reader")
    .getOrCreate()
)

# Relative path of the raw CSV input.
filePath = "input_csv_files/clean_me.csv"

# Load the CSV: the first line supplies the column names and Spark infers
# each column's type from the data.
clean_meDF = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(filePath)
)

# Expose the DataFrame to spark.sql() under the view name "clean_me".
clean_meDF.createOrReplaceTempView("clean_me")

clean_meDF.show()


+--------+----------------+--------+------+------------+-------------------+--------------------+--------------------+-----+-----+
|order_id|delivery_company|quantity| price|ordered_date|            address|                  x1|                  x2|   x3|   x4|
+--------+----------------+--------+------+------------+-------------------+--------------------+--------------------+-----+-----+
|       1| delivery_comp_1|       1| 245.0|          52|           9/2/2022|  Cedar Lane Houston|            CA 90001| null| null|
|       2| delivery_comp_2|       2| 114.0|          77|               null|         Main Street|   New York CA 60601| null| null|
|       3| delivery_comp_3|    null| 739.0|          43|          14-3-2022|         Main Street|    Chicago TX 10001| null| null|
|       4| delivery_comp_0|       1|878.93|   20/4/2022|         Oak Avenue|Los Angeles FL 90001|                null| null| null|
|       5| delivery_comp_1|       2| 481.0|          44|               null| Maple 

# 2. CSV Format Fixing

In [5]:
# Repair rows of the "clean_me" view whose values sit in the wrong column
# (e.g. a quantity under ordered_date and a date under address) and fold the
# overflow columns x1..x4 back into a single address column.
# Throughout the query a value is treated as a date when it contains '-' or '/'.
fixedFormatDF = spark.sql(
    "SELECT " +
      "order_id, " +
      "delivery_company, " +
      # quantity: when quantity is NULL or one of the listed values and
      # ordered_date does not look like a date, take ordered_date instead.
      "CASE " +
        "WHEN (quantity IS NULL OR quantity IN ('1', '2', 'NA', 'null', '#NA', 'NaN', 'NULL')) AND ordered_date NOT LIKE '%-%' AND ordered_date NOT LIKE '%/%' " +
        "THEN ordered_date " +
        "ELSE quantity " +
      "END AS quantity, " +
      "price, " +
      # ordered_date: when ordered_date does not look like a date, take the
      # address column if it is the literal string 'null' or looks like a date.
      "CASE " +
        "WHEN ordered_date NOT LIKE '%-%' AND ordered_date NOT LIKE '%/%' AND address = 'null' " +
        "THEN address " +
        "WHEN ordered_date NOT LIKE '%-%' AND ordered_date NOT LIKE '%/%' AND (address LIKE '%-%' OR address LIKE '%/%') " +
        "THEN address " +
        "ELSE ordered_date " +
      "END AS ordered_date, " +
      # address: the "leading part" is address when it is neither 'null' nor
      # date-like, otherwise x1. If the space-joined x1..x4 already contains
      # that leading part, use x1..x4 alone; otherwise prefix it.
      "CASE " +
        "WHEN CONTAINS( " +
          "CONCAT(IFNULL(x1,''), ' ', IFNULL(x2,''), ' ', IFNULL(x3,''), ' ', IFNULL(x4,'')), " +
          "CASE " +
            "WHEN address != 'null' AND address NOT LIKE '%-%' AND address NOT LIKE '%/%' " +
            "THEN address " +
            "ELSE x1 " +
          "END) = TRUE " +
        "THEN CONCAT(IFNULL(x1,''), ' ', IFNULL(x2,''), ' ', IFNULL(x3,''), ' ', IFNULL(x4,'')) " +
        "ELSE CONCAT( " +
          "CASE " +
            "WHEN address != 'null' AND address NOT LIKE '%-%' AND address NOT LIKE '%/%' " +
            "THEN address " +
            "ELSE x1 " +
          "END, " +
          "' ', " +
          "CONCAT(IFNULL(x1,''), ' ', IFNULL(x2,''), ' ', IFNULL(x3,''), ' ', IFNULL(x4,''))) " +
      "END AS address " +
    "FROM clean_me"
)

# show() only prints and returns None, so it must not be chained onto the
# assignment above; otherwise fixedFormatDF would be None instead of a DataFrame.
fixedFormatDF.show(20, truncate=False)

+--------+----------------+--------+------+------------+----------------------------------+
|order_id|delivery_company|quantity|price |ordered_date|address                           |
+--------+----------------+--------+------+------------+----------------------------------+
|1       |delivery_comp_1 |52      |245.0 |9/2/2022    |Cedar Lane Houston CA 90001       |
|2       |delivery_comp_2 |77      |114.0 |null        |Main Street New York CA 60601     |
|3       |delivery_comp_3 |43      |739.0 |14-3-2022   |Main Street Chicago TX 10001      |
|4       |delivery_comp_0 |1       |878.93|20/4/2022   |Oak Avenue Los Angeles FL 90001   |
|5       |delivery_comp_1 |44      |481.0 |null        |Maple Drive Chicago FL 60601      |
|6       |delivery_comp_2 |13      |78.0  |null        |Main Street Houston NY 77001      |
|7       |delivery_comp_3 |1       |832.17|20-2-2022   |Oak Avenue New York CA 10001      |
|8       |delivery_comp_0 |8       |687.0 |1/4/2022    |Maple Drive Los Angeles 