In [0]:
# spark.read.csv
# header
# inferSchema
# printSchema
# describe
# show
# df schema > struct type / struct fields
# select columns - 4 methods
# renaming column using .withColumnRenamed > using alias method in select statement
# new column using .withColumn("column_name", value) 
# current_timestamp function spark - try printing the value directly
# adding a literal value column
# 

In [0]:
from pyspark.sql.types import IntegerType, DoubleType, StringType, StructType, StructField
from pyspark.sql.functions import col, current_timestamp, lit

In [0]:
# Explicit schema for the circuits data. StructField arguments are given
# positionally as (name, dataType, nullable); only circuitId is declared
# non-nullable.
circuits_schema = StructType(fields=[
    StructField("circuitId", IntegerType(), False),
    StructField("circuitRef", StringType(), True),
    StructField("name", StringType(), True),
    StructField("location", StringType(), True),
    StructField("country", StringType(), True),
    StructField("lat", DoubleType(), True),
    StructField("lng", DoubleType(), True),
    StructField("alt", IntegerType(), True),
    StructField("url", StringType(), True),
])

In [0]:
# List the entries under the raw mount path.
dbutils.fs.ls("/mnt/sanformula1dl/raw")

In [0]:
# Load circuits.csv from the raw mount: the first row is treated as a header
# and the explicit circuits_schema is applied to the columns.
circuits_df = (
    spark.read
    .schema(circuits_schema)
    .option("header", True)
    .csv("/mnt/sanformula1dl/raw/circuits.csv")
)

In [0]:
# Print rows as plain text; truncate=False keeps long cell values intact.
circuits_df.show(truncate=False)

In [0]:
# Render circuits_df through the notebook's display() helper.
display(circuits_df)

In [0]:
# Print the schema (column names and types) of circuits_df.
circuits_df.printSchema()

Select Column examples / Options

In [0]:
# #1 > Select by column-name strings. Plain strings carry no Column methods,
# so operations such as .alias() cannot be applied with this form.
circuits_selected_df = circuits_df.select(
    "circuitId",
    "circuitRef",
    "name",
    "location",
    "country",
    "lat",
    "lng",
    "alt",
)

In [0]:
# #2 > Select by attribute access (df.column). Each argument is a Column
# object, so column-based operations such as .alias() are available.
# The last argument must be the `alt` column; passing the DataFrame itself
# is not a valid select() argument.
circuits_selected_df = circuits_df.select(
    circuits_df.circuitId,
    circuits_df.circuitRef,
    circuits_df.name,
    circuits_df.location,
    circuits_df.country,
    circuits_df.lat,
    circuits_df.lng,
    circuits_df.alt,
)

In [0]:
# #3 > Select by item access (df["column"]). Each argument is a Column
# object, so column-based operations such as .alias() are available.
# Assigned to circuits_selected_df (not the misspelled circuit_selected_df)
# and includes `alt` so the result matches select method #1.
circuits_selected_df = circuits_df.select(
    circuits_df["circuitId"],
    circuits_df["circuitRef"],
    circuits_df["name"],
    circuits_df["location"],
    circuits_df["country"],
    circuits_df["lat"],
    circuits_df["lng"],
    circuits_df["alt"],
)

In [0]:
# #4 > Select with the col() function. Each argument is a Column object,
# so column-based operations such as .alias() are available.
# `alt` is included so the result matches select method #1.
circuits_selected_df = circuits_df.select(
    col("circuitId"),
    col("circuitRef"),
    col("name"),
    col("location"),
    col("country"),
    col("lat"),
    col("lng"),
    col("alt"),
)

Renaming a column

In [0]:
# #1 Using alias inside select: columns without .alias() keep their names.
circuits_selected_df = circuits_df.select(
    col("circuitId").alias("circuit_id"),
    col("circuitRef").alias("circuit_ref"),
    col("name"),
    col("location"),
    col("country"),
    col("lat").alias("latitude"),
    col("lng").alias("longitude"),
    col("alt").alias("altitude"),
)

In [0]:
# #2 Using withColumnRenamed
# Start from the original column names in circuits_df: withColumnRenamed does
# nothing when the source column is absent, so chaining it onto a DataFrame
# whose columns were already aliased would not rename anything. Building from
# circuits_df also makes this cell give the same result however often it runs.
circuits_selected_df = (
    circuits_df
    .select("circuitId", "circuitRef", "name", "location", "country", "lat", "lng", "alt")
    .withColumnRenamed("circuitId", "circuit_id")
    .withColumnRenamed("circuitRef", "circuit_ref")
    .withColumnRenamed("lat", "latitude")
    .withColumnRenamed("lng", "longitude")
    .withColumnRenamed("alt", "altitude")
)

In [0]:
# Add two columns: ingestion_date from current_timestamp() and data_source
# holding the literal value "manual".
circuits_final_df = (
    circuits_selected_df
    .withColumn("ingestion_date", current_timestamp())
    .withColumn("data_source", lit("manual"))
)

In [0]:
# Render circuits_final_df through the notebook's display() helper.
display(circuits_final_df)

In [0]:
# Write the final DataFrame as Parquet to the processed path; "overwrite"
# mode replaces any data already at that location.
circuits_final_df.write.parquet("/mnt/sanformula1dl/processed/circuits", mode="overwrite")

In [0]:
%fs
ls /mnt/sanformula1dl/processed