### Setup

Make sure you have the files available from previous demos.

In [0]:
# Connection settings for Azure Data Lake, applied in order as (key, value) pairs.
# NOTE(review): `<account_name>` looks like a placeholder - presumably it should be
# the storage account used in the abfss:// paths further down; confirm before running.
# NOTE(review): the client id, client secret and tenant are masked here; keep real
# values out of the notebook source (e.g. load them from a secret store).
adls_oauth_settings = [
    ("fs.azure.account.auth.type.<account_name>.dfs.core.windows.net", "OAuth"),
    ("fs.azure.account.oauth.provider.type.<account_name>.dfs.core.windows.net", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider"),
    ("fs.azure.account.oauth2.client.id.<account_name>.dfs.core.windows.net", "****************************"),
    ("fs.azure.account.oauth2.client.secret.<account_name>.dfs.core.windows.net", "*******************************"),
    ("fs.azure.account.oauth2.client.endpoint.<account_name>.dfs.core.windows.net", "https://login.microsoftonline.com/************************/oauth2/token"),
]
for conf_key, conf_value in adls_oauth_settings:
    spark.conf.set(conf_key, conf_value)

Verify that the cloud storage is accessible by listing the root of the container.

In [0]:
# List the root of the "etl1" container; a successful listing shows the storage is reachable.
container_root = "abfss://etl1@dbstoragebbpbs73u57xmm.dfs.core.windows.net/"
display(dbutils.fs.ls(container_root))

Let's load the customer data from the Parquet file into a DataFrame.

In [0]:
# Read the customers Parquet file into a DataFrame and preview the first 20 rows.
parquet_path = "abfss://etl1@dbstoragebbpbs73u57xmm.dfs.core.windows.net/customers_data.parquet"
df_customers = spark.read.format("parquet").load(parquet_path)

print("Parquet Data:")
df_customers.limit(20).display()


Let's try some filters

In [0]:
from pyspark.sql import functions as F

# Filter syntax 1: build the predicate from a column expression created with F.col().
df_over_40 = df_customers.filter(F.col("age") > 40)
display(df_over_40.limit(5))


In [0]:
# Filter syntax 2: reference the column through bracket indexing on the DataFrame.
df_over_40 = df_customers.filter(df_customers['age'] > 40)
display(df_over_40.limit(5))


In [0]:
# Filter syntax 3: reference the column as an attribute (dataframe.column notation).
df_over_40 = df_customers.filter(df_customers.age > 40)
display(df_over_40.limit(5))

In [0]:
# Keep only the rows whose country is "USA", then print how many rows matched.
df_usa = df_customers.filter(F.col("country") == "USA")
usa_row_count = df_usa.count()
print(usa_row_count)


In [0]:
# Combine two conditions with `&`. Each comparison needs its own parentheses because
# `&` binds more tightly than `==` and `>` in Python.
# The column is referenced as "age" (lowercase), matching every other cell, so the
# lookup does not rely on Spark's case-insensitive column resolution.
df_usa_over_40 = df_customers.filter((F.col("country") == "USA") & (F.col("age") > 40))
display(df_usa_over_40)

We can easily sort the dataset as well

In [0]:
# Order the customers by age, youngest first.
df_sorted = df_customers.orderBy(F.asc("age"))
df_sorted.display()


In [0]:
# Order by country (A-Z) and, within each country, by age from oldest to youngest.
df_sorted_country_age = df_customers.orderBy(
    F.asc("country"),
    F.desc("age"),
)
df_sorted_country_age.display()


We can also perform column operations, such as selecting and renaming columns.

In [0]:
# Project the DataFrame down to three columns and preview the first 5 rows.
df_select_columns = df_customers.select(["customer_id", "first_name", "country"])
display(df_select_columns.limit(5))


In [0]:
# Rename the two name columns; df_customers itself is left unchanged because the
# result is bound to a new variable.
df_renamed = (
    df_customers
    .withColumnRenamed("first_name", "fname")
    .withColumnRenamed("last_name", "lname")
)
display(df_renamed.limit(5))


We can also cast data types if needed

In [0]:
# Add a string-typed copy of "age" next to the original column, then print the
# schema and preview the first 5 rows.
df_casted = df_customers.withColumn(
    "ageAsString",
    F.col("age").cast("string"),
)
df_casted.printSchema()
display(df_casted.limit(5))

We can add and drop columns

In [0]:
# Derive a categorical 'AgeGroup' column from 'age':
#   age < 30        -> 'Young'
#   30 <= age < 50  -> 'Middle-aged'
#   anything else   -> 'Senior'
# NOTE(review): a null age matches neither condition and therefore falls through
# to 'Senior' - confirm that is the intended label for missing ages.
age_group_expr = (
    F.when(F.col('age') < 30, 'Young')
    .when((F.col('age') >= 30) & (F.col('age') < 50), 'Middle-aged')
    .otherwise('Senior')
)
df_age_group = df_customers.withColumn('AgeGroup', age_group_expr)

display(df_age_group.limit(5))


In [0]:
# Remove the 'email' column and preview the first 5 rows of the result.
df_dropped = df_customers.drop('email')
display(df_dropped.limit(5))

And we can combine all these operations if needed

In [0]:
# One pipeline combining the earlier steps, in this order: filter, sort, select,
# rename, cast, derive the age group.
# NOTE(review): rows are restricted to age > 30 first, so the "Young" branch below
# can never match in this pipeline; it is kept so the bucketing rule stays the same
# three-way rule.
age_col = F.col("age")
age_group_expr = (
    F.when(age_col < 30, "Young")
    .when((age_col >= 30) & (age_col < 50), "Middle-aged")
    .otherwise("Senior")
)

df_over_30_sorted = df_customers.filter(age_col > 30).orderBy(age_col.desc())
df_transformed = (
    df_over_30_sorted
    .select("customer_id", "first_name", "last_name", "age", "country")
    .withColumnRenamed("first_name", "fname")
    .withColumnRenamed("last_name", "lname")
    .withColumn("age_str", age_col.cast("string"))
    .withColumn("age_group", age_group_expr)
)

display(df_transformed.limit(5))