In [0]:
############
# Answer to question 
# csv parsing
#-------------
#Author: AdrianJ
#V1.0 Created(2023-09-01)
############

In [0]:
from datetime import datetime

### CSV Parsing
The following examples involve working with simple CSV data.

#### **Question #1**: CSV Header Rows
Given the simple RDD `full_csv` below, write the most efficient Spark job you can to remove the header row.

In [0]:
%scala
val full_csv = sc.parallelize(Array(
  "col_1, col_2, col_3",
  "1, ABC, Foo1",
  "2, ABCD, Foo2",
  "3, ABCDE, Foo3",
  "4, ABCDEF, Foo4",
  "5, DEF, Foo5",
  "6, DEFGHI, Foo6",
  "7, GHI, Foo7",
  "8, GHIJKL, Foo8",
  "9, JKLMNO, Foo9",
  "10, MNO, Foo10",
))

//remove by index to avoid accidentally removing rows that contain the same value than the header
val no_header_csv = full_csv.zipWithIndex().filter { case (_, index) => index > 0 }.map { case (line, _) => line }

println("Example of the csv file without the first row")
println(no_header_csv.toDF().show(3))

#### Question #2 DataFrame UDFs and DataFrame SparkSQL Functions

Below we've created a small DataFrame. You should use DataFrame API functions and UDFs to accomplish two tasks.

1. You need to parse the State and city into two different columns.
2. You need to get the number of days in between the start and end dates. You need to do this two ways.
  - Firstly, you should use SparkSQL functions to get this date difference.
  - Secondly, you should write a udf that gets the number of days between the end date and the start date.

In [0]:
%python

from pyspark.sql import functions as F
from pyspark.sql.types import *

# Build an example DataFrame dataset to work with. 
dbutils.fs.rm("/tmp/dataframe_sample.csv", True)
dbutils.fs.put("/tmp/dataframe_sample.csv", """id|end_date|start_date|location
1|2015-10-14 00:00:00|2015-09-14 00:00:00|CA-SF
2|2015-10-15 01:00:20|2015-08-14 00:00:00|CA-SD
3|2015-10-16 02:30:00|2015-01-14 00:00:00|NY-NY
4|2015-10-17 03:00:20|2015-02-14 00:00:00|NY-NY
5|2015-10-18 04:30:00|2014-04-14 00:00:00|CA-SD
""", True)

formatPackage = "csv" if sc.version > '1.6' else "com.databricks.spark.csv"
df = sqlContext.read.format(formatPackage).options(
  header='true', 
  delimiter = '|',
).load("/tmp/dataframe_sample.csv")
df.printSchema()


Wrote 272 bytes.
root
 |-- id: string (nullable = true)
 |-- end_date: string (nullable = true)
 |-- start_date: string (nullable = true)
 |-- location: string (nullable = true)



In [0]:
#parse the city and state in two different columns
df = (
    df
    .select(
        F.col("id"),
        F.col("end_date"),
        F.col("start_date"),
        F.split("location", "-")[0].alias("city"),
        F.split("location", "-")[1].alias("State")
    )
)

print("INFO: Showing the state and city parsed into two columns")
df.show()

#Create a udf that return number of days between the end and start date

def calculate_days_between(start_date: str, end_date:str):
    start_date = datetime.strptime(start_date, '%Y-%m-%d %H:%M:%S')
    end_date = datetime.strptime(end_date, '%Y-%m-%d %H:%M:%S')
    diff = end_date - start_date
    return diff.days

#register udf
calculate_days_between_udf = F.udf(calculate_days_between)

print("INFO: Showing the number of days between to dates using and UDF")
df.withColumn("days_between", calculate_days_between_udf(F.col("start_date"), F.col("end_date"))).show()

INFO: Showing the state and city parsed into two columns
+---+-------------------+-------------------+----+-----+
| id|           end_date|         start_date|city|State|
+---+-------------------+-------------------+----+-----+
|  1|2015-10-14 00:00:00|2015-09-14 00:00:00|  CA|   SF|
|  2|2015-10-15 01:00:20|2015-08-14 00:00:00|  CA|   SD|
|  3|2015-10-16 02:30:00|2015-01-14 00:00:00|  NY|   NY|
|  4|2015-10-17 03:00:20|2015-02-14 00:00:00|  NY|   NY|
|  5|2015-10-18 04:30:00|2014-04-14 00:00:00|  CA|   SD|
+---+-------------------+-------------------+----+-----+

INFO: Showing the number of days between to dates using and UDF
+---+-------------------+-------------------+----+-----+------------+
| id|           end_date|         start_date|city|State|days_between|
+---+-------------------+-------------------+----+-----+------------+
|  1|2015-10-14 00:00:00|2015-09-14 00:00:00|  CA|   SF|          30|
|  2|2015-10-15 01:00:20|2015-08-14 00:00:00|  CA|   SD|          62|
|  3|2015-10-16