### Setup

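Make sure the files from the previous demos are available in cloud storage — in particular `customers_data.parquet` in the `imports` folder of the `pyspark` container, which this notebook reads.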

In [0]:
# Configure the Spark session to reach Azure Data Lake over ABFS using an
# OAuth client-credentials token provider.
# The masked values and <account_name> are placeholders to fill in.
# NOTE(review): in a shared notebook, prefer reading the client id/secret from a
# secret store instead of pasting them into the cell — confirm team practice.
abfs_host = "<account_name>.dfs.core.windows.net"

oauth_settings = {
    "fs.azure.account.auth.type": "OAuth",
    "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id": "****************************",
    "fs.azure.account.oauth2.client.secret": "*******************************",
    "fs.azure.account.oauth2.client.endpoint": "https://login.microsoftonline.com/************************/oauth2/token",
}

# Every key is scoped to the storage account by appending its host name.
for setting_prefix, setting_value in oauth_settings.items():
    spark.conf.set(f"{setting_prefix}.{abfs_host}", setting_value)

Verify that cloud storage is accessible

In [0]:
# List the root of the `pyspark` container; an error here means the storage
# account is not reachable with the current Spark configuration.
container_root = "abfss://pyspark@warnerdatalake.dfs.core.windows.net/"
dbutils.fs.ls(container_root)

Let's load the dataframe

In [0]:
# Location of the customers dataset (Parquet) inside the `pyspark` container.
# Path segments are joined with a single "/": doubled separators are not part of
# the object path and only resolve when the filesystem layer collapses them.
parquet_path = "abfss://pyspark@warnerdatalake.dfs.core.windows.net/imports/customers_data.parquet"

# Load the file into a DataFrame; the schema comes from the Parquet metadata.
df_customers = spark.read.parquet(parquet_path)

# Preview the first five rows as a plain-text table.
print("Parquet Data:")
df_customers.show(5)

Parquet Data:
+-----------+----------+---------+--------------------+---+-------+
|customer_id|first_name|last_name|               email|age|country|
+-----------+----------+---------+--------------------+---+-------+
|          1|   First_1|   Last_1|First_1.Last_1@ex...| 40| Canada|
|          2|   First_2|   Last_2|First_2.Last_2@ex...| 55|    USA|
|          3|   First_3|   Last_3|First_3.Last_3@ex...| 59|    USA|
|          4|   First_4|   Last_4|First_4.Last_4@ex...| 49| Canada|
|          5|   First_5|   Last_5|First_5.Last_5@ex...| 58| Canada|
+-----------+----------+---------+--------------------+---+-------+
only showing top 5 rows


Let's try some filters

In [0]:
from pyspark.sql import functions as F

# Build the predicate from a column expression, then keep only the customers
# whose age is strictly greater than 40.
is_over_40 = F.col("age") > 40
df_over_40 = df_customers.filter(is_over_40)

# Preview the first five matching rows.
df_over_40.limit(5).display()


customer_id,first_name,last_name,email,age,country
2,First_2,Last_2,First_2.Last_2@example.com,55,USA
3,First_3,Last_3,First_3.Last_3@example.com,59,USA
4,First_4,Last_4,First_4.Last_4@example.com,49,Canada
5,First_5,Last_5,First_5.Last_5@example.com,58,Canada
6,First_6,Last_6,First_6.Last_6@example.com,55,USA


In [0]:
# Same filter as before, but the column is looked up on the DataFrame with []
age_by_index = df_customers['age']
df_over_40 = df_customers.filter(age_by_index > 40)

# Preview the first five matching rows.
df_over_40.limit(5).display()


customer_id,first_name,last_name,email,age,country
2,First_2,Last_2,First_2.Last_2@example.com,55,USA
3,First_3,Last_3,First_3.Last_3@example.com,59,USA
4,First_4,Last_4,First_4.Last_4@example.com,49,Canada
5,First_5,Last_5,First_5.Last_5@example.com,58,Canada
6,First_6,Last_6,First_6.Last_6@example.com,55,USA


In [0]:
# Same filter once more, this time reaching the column as a DataFrame attribute
# (dataframe.column notation).
age_by_attribute = df_customers.age
df_over_40 = df_customers.filter(age_by_attribute > 40)

# Preview the first five matching rows.
df_over_40.limit(5).display()

customer_id,first_name,last_name,email,age,country
2,First_2,Last_2,First_2.Last_2@example.com,55,USA
3,First_3,Last_3,First_3.Last_3@example.com,59,USA
4,First_4,Last_4,First_4.Last_4@example.com,49,Canada
5,First_5,Last_5,First_5.Last_5@example.com,58,Canada
6,First_6,Last_6,First_6.Last_6@example.com,55,USA


In [0]:
# Equality filter on a string column: keep only the rows whose country is "USA".
in_usa = F.col("country") == "USA"
df_usa = df_customers.filter(in_usa)

# Preview the first five matching rows.
df_usa.limit(5).display()


customer_id,first_name,last_name,email,age,country
2,First_2,Last_2,First_2.Last_2@example.com,55,USA
3,First_3,Last_3,First_3.Last_3@example.com,59,USA
6,First_6,Last_6,First_6.Last_6@example.com,55,USA
7,First_7,Last_7,First_7.Last_7@example.com,32,USA
8,First_8,Last_8,First_8.Last_8@example.com,56,USA


In [0]:
# Two conditions combined with & (column-wise AND): country is "USA" and age is
# strictly greater than 40. Naming each condition first avoids the extra
# parentheses that an inline `(a == b) & (c > d)` expression needs.
in_usa = F.col("country") == "USA"
over_40 = F.col("age") > 40
df_usa_over_40 = df_customers.filter(in_usa & over_40)

# Preview the first five matching rows.
df_usa_over_40.limit(5).display()

customer_id,first_name,last_name,email,age,country
2,First_2,Last_2,First_2.Last_2@example.com,55,USA
3,First_3,Last_3,First_3.Last_3@example.com,59,USA
6,First_6,Last_6,First_6.Last_6@example.com,55,USA
8,First_8,Last_8,First_8.Last_8@example.com,56,USA
9,First_9,Last_9,First_9.Last_9@example.com,47,USA


We can easily sort the dataset as well

In [0]:
# Sort the customers by age in ascending order (youngest first).
age_ascending = F.col("age").asc()
df_sorted_age = df_customers.orderBy(age_ascending)

# Preview the first five rows of the sorted result.
df_sorted_age.limit(5).display()


customer_id,first_name,last_name,email,age,country
107,First_107,Last_107,First_107.Last_107@example.com,18,USA
50,First_50,Last_50,First_50.Last_50@example.com,18,USA
69,First_69,Last_69,First_69.Last_69@example.com,18,USA
32,First_32,Last_32,First_32.Last_32@example.com,18,USA
157,First_157,Last_157,First_157.Last_157@example.com,18,USA


In [0]:
# Multi-key sort: country ascending first, then age descending within a country.
sort_keys = [
    F.col("country").asc(),
    F.col("age").desc(),
]
df_sorted_country_age = df_customers.orderBy(*sort_keys)

# Preview the first five rows of the sorted result.
df_sorted_country_age.limit(5).display()


customer_id,first_name,last_name,email,age,country
557,First_557,Last_557,First_557.Last_557@example.com,60,Canada
987,First_987,Last_987,First_987.Last_987@example.com,60,Canada
669,First_669,Last_669,First_669.Last_669@example.com,60,Canada
512,First_512,Last_512,First_512.Last_512@example.com,60,Canada
825,First_825,Last_825,First_825.Last_825@example.com,60,Canada


We can do column operations

In [0]:
# Project the DataFrame down to three columns, in the order listed.
kept_columns = ["customer_id", "first_name", "country"]
df_select_columns = df_customers.select(*kept_columns)

# Preview the first five rows of the narrowed DataFrame.
df_select_columns.limit(5).display()


customer_id,first_name,country
1,First_1,Canada
2,First_2,USA
3,First_3,USA
4,First_4,Canada
5,First_5,Canada


In [0]:
# Rename the two name columns to shorter labels; all other columns are untouched.
df_renamed = (
    df_customers
    .withColumnRenamed("first_name", "fname")
    .withColumnRenamed("last_name", "lname")
)

# Preview the first five rows with the new column names.
df_renamed.limit(5).display()


customer_id,fname,lname,email,age,country
1,First_1,Last_1,First_1.Last_1@example.com,40,Canada
2,First_2,Last_2,First_2.Last_2@example.com,55,USA
3,First_3,Last_3,First_3.Last_3@example.com,59,USA
4,First_4,Last_4,First_4.Last_4@example.com,49,Canada
5,First_5,Last_5,First_5.Last_5@example.com,58,Canada


We can also cast data types if needed

In [0]:
# Add `age_str`, a string-typed copy of `age`; the original column is kept.
age_as_text = F.col("age").cast("string")
df_casted = df_customers.withColumn("age_str", age_as_text)

# Print the schema to confirm the new column's type, then preview five rows.
df_casted.printSchema()
df_casted.limit(5).display()

root
 |-- customer_id: long (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- age: long (nullable = true)
 |-- country: string (nullable = true)
 |-- age_str: string (nullable = true)



customer_id,first_name,last_name,email,age,country,age_str
1,First_1,Last_1,First_1.Last_1@example.com,40,Canada,40
2,First_2,Last_2,First_2.Last_2@example.com,55,USA,55
3,First_3,Last_3,First_3.Last_3@example.com,59,USA,59
4,First_4,Last_4,First_4.Last_4@example.com,49,Canada,49
5,First_5,Last_5,First_5.Last_5@example.com,58,Canada,58


We can add and drop columns

In [0]:
# Derive a categorical `age_group` column from `age`:
#   age < 30        -> "Young"
#   30 <= age < 50  -> "Middle-aged"
#   anything else   -> "Senior"
age_col = F.col("age")
age_group_label = (
    F.when(age_col < 30, "Young")
    .when((age_col >= 30) & (age_col < 50), "Middle-aged")
    .otherwise("Senior")
)
df_age_group = df_customers.withColumn("age_group", age_group_label)

# Preview the first five rows with the new column.
df_age_group.limit(5).display()


customer_id,first_name,last_name,email,age,country,age_group
1,First_1,Last_1,First_1.Last_1@example.com,40,Canada,Middle-aged
2,First_2,Last_2,First_2.Last_2@example.com,55,USA,Senior
3,First_3,Last_3,First_3.Last_3@example.com,59,USA,Senior
4,First_4,Last_4,First_4.Last_4@example.com,49,Canada,Middle-aged
5,First_5,Last_5,First_5.Last_5@example.com,58,Canada,Senior


In [0]:
# Remove the `email` column; every other column is carried over unchanged.
column_to_remove = "email"
df_dropped = df_customers.drop(column_to_remove)

# Preview the first five rows without the dropped column.
df_dropped.limit(5).display()

customer_id,first_name,last_name,age,country
1,First_1,Last_1,40,Canada
2,First_2,Last_2,55,USA
3,First_3,Last_3,59,USA
4,First_4,Last_4,49,Canada
5,First_5,Last_5,58,Canada


And we can combine all these operations if needed

In [0]:
# One pipeline that chains the operations shown earlier in this notebook:
# filter -> sort -> select -> rename -> cast -> derived column.
age_col = F.col("age")

# Same bucketing rule as in the age_group example. Because the pipeline keeps
# only age > 30, the "Young" branch cannot match any row here.
age_group_label = (
    F.when(age_col < 30, "Young")
    .when((age_col >= 30) & (age_col < 50), "Middle-aged")
    .otherwise("Senior")
)

df_transformed = (
    df_customers
    .filter(age_col > 30)                      # keep customers older than 30
    .orderBy(age_col.desc())                   # oldest first
    .select("customer_id", "first_name", "last_name", "age", "country")
    .withColumnRenamed("first_name", "fname")
    .withColumnRenamed("last_name", "lname")
    .withColumn("age_str", age_col.cast("string"))
    .withColumn("age_group", age_group_label)
)

# Preview the first five rows of the combined result.
df_transformed.limit(5).display()

customer_id,fname,lname,age,country,age_str,age_group
49,First_49,Last_49,60,USA,60,Senior
47,First_47,Last_47,60,USA,60,Senior
136,First_136,Last_136,60,USA,60,Senior
92,First_92,Last_92,60,USA,60,Senior
135,First_135,Last_135,60,USA,60,Senior
