In [0]:
%fs
ls /Volumes/sample_catalog/default/db_catalog

In [0]:
# Load customer.csv from /Volumes/sample_catalog/default/db_catalog.
# The first row supplies the column names and the column types are inferred.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/Volumes/sample_catalog/default/db_catalog/customer.csv")
)
display(df)

In [0]:
# Print the column names, types and nullability of df as a tree.
df.printSchema()

In [0]:
from pyspark.sql.types import *

# Explicit schema for customer.csv, supplied instead of inferSchema.
# Every column is declared nullable (third argument True).
schema = (
    StructType()
    .add("customer_id", IntegerType(), True)
    .add("name", StringType(), True)
    .add("email", StringType(), True)
    .add("country", StringType(), True)
    .add("customer_type", StringType(), True)
    .add("registration_date", DateType(), True)
    .add("age", IntegerType(), True)
    .add("gender", StringType(), True)
    .add("total_purchases", IntegerType(), True)
    .add("ingestion_timestamp", TimestampType(), True)
)

# Re-read the same file, this time applying the schema declared above.
df1 = (
    spark.read
    .schema(schema)
    .option("header", True)
    .csv("/Volumes/sample_catalog/default/db_catalog/customer.csv")
)
df1.printSchema()

In [0]:
from pyspark.sql.functions import col, column, when

# Equivalent ways to reference a column inside filter():
#   df["name"], column("name") / col("name"), and attribute access df.name.
# Note: df1 is rebound here, so any earlier df1 is no longer reachable.
df1 = df.filter(df["customer_type"] == "VIP")
df2 = df.filter(column("customer_type") == "Regular")
# Spelling fixed: the literal used to be "Premimum", which would match no rows
# if the data uses the usual spelling.
# NOTE(review): confirm the exact category value present in customer.csv.
df3 = df.filter(column("customer_type") == "Premium")
df4 = df.filter(df.customer_type == "VIP")

# Combine conditions with & (and) / | (or); each comparison needs its own
# parentheses because & and | bind tighter than == in Python.
df5 = df.filter((df.customer_type == 'VIP') & (df.country == 'India'))

# where() is used here with the same kind of condition as filter().
df6 = df.where((df.customer_type == 'VIP') & (df.country == 'India'))
df7 = df.where((df.customer_type == 'VIP') | (df.country == 'USA'))
display(df7)

In [0]:
# Add a derived numeric column: Salary = age * 1000.
df1 = df.withColumn("Salary", df["age"] * 1000)
df1.printSchema()
display(df1)

# Add a conditional column: 'Senior' when age > 30, 'Junior' for everything else.
df2 = df.withColumn(
    "Seniority",
    when(col("age") > 30, 'Senior').otherwise('Junior'),
)
display(df2)

In [0]:
# Rename a column: Seniority -> Is_Senior.
df3 = df2.withColumnRenamed("Seniority", "Is_Senior")

# Drop (delete) a single column.
df4 = df3.drop("Is_Senior")

# Drop multiple columns in one call.
# NOTE(review): when df2 is the frame built with only the "Seniority" column
# added, df3 has no "Salary" column; drop() ignores unknown string names, so
# that entry is a no-op here — confirm whether df1 (with Salary) was intended.
df5 = df3.drop(*["Is_Senior", "Salary", "age"])
df5.printSchema()


In [0]:
# Plain projection of three columns.
df1 = df.select(["age", "gender", "customer_type"])

# Same three columns plus a computed expression that is left unaliased.
df2 = df.select(*df1.columns, col("age") * 1000)

# Same again, with the computed expression named "Salary".
df3 = df.select(
    *df1.columns,
    (col("age") * 1000).alias("Salary"),
)
display(df3)

In [0]:
# Read the nested JSON sample, then pull two fields out of the "address"
# struct (dot notation) alongside three top-level columns.
df1 = spark.read.format("json").load("/Volumes/sample_catalog/default/db_catalog/SampleNested.json")
df2 = df1.select(["address.city", "address.state", "age", "name", "email"])
display(df2)

In [0]:
# Two single-category subsets, selected with SQL expression strings.
df1 = df.where("customer_type == 'VIP'")
df2 = df.where("customer_type == 'Regular'")

# Stack the two subsets.
df3 = df1.union(df2)
# Stack the full frame on top as well, so the VIP and Regular rows now
# appear twice in df4.
df4 = df3.union(df)

# Remove duplicate rows, first over all columns, then over a two-column projection.
df5 = df4.dropDuplicates()
df6 = df4.select("customer_type", "country").dropDuplicates()

# Swap in df1..df5 here to inspect the intermediate frames.
display(df6)


In [0]:
# Row count before dropping rows that contain nulls.
print(df.count())
# dropna() with default arguments removes every row that has a null in any column.
df1 = df.dropna()
# Row count after the drop.
print(df1.count())
# df.na.drop() is the same operation reached through the .na accessor.
df1 = df.na.drop()

# Rows whose email is missing.
df2 = df.where(df["email"].isNull())

# Replace nulls with a single value. A string replacement only applies to
# string columns (see the PySpark fillna documentation).
df3 = df.fillna("Unknown")
# Replace nulls per column using a {column: replacement} mapping.
df4 = df.fillna({"email": "Unknown", "age": 0})

display(df4)




In [0]:
from pyspark.sql.functions import desc

# Ascending sort on one column, then on two columns.
df1 = df.sort("age")
df2 = df.sort("age", "gender")

# Descending sort on age; in df4 "gender" is a secondary ascending key.
df3 = df.orderBy(col("age").desc())
df4 = df.orderBy(col("age").desc(), "gender")
df5 = df.orderBy(desc("age"))

# Explicit placement of null emails: first vs. last.
df6 = df.sort(col("email").asc_nulls_first())
df7 = df.sort(col("email").asc_nulls_last())

display(df7)


In [0]:
# NOTE: these names shadow Python's built-in sum/max/min from here on.
from pyspark.sql.functions import sum, avg, max, min

# Shared grouping handle for the single-key aggregations below.
by_gender = df.groupBy("gender")

# Row counts per group (one key, then two keys).
df1 = by_gender.count()
df2 = df.groupBy("gender", "customer_type").count()

# Single built-in aggregates over age.
df3 = by_gender.sum("age")
df4 = by_gender.max("age")
df5 = by_gender.min("age")
df6 = by_gender.avg("age")

# Several aggregates in one pass via agg(); df8 also names each result column.
df7 = by_gender.agg(sum("age"), max("age"), min("age"))
df8 = by_gender.agg(
    sum("age").alias("Sum"),
    max("age").alias("Highest"),
    min("age").alias("Lowest"),
    avg("age").alias("Average"),
)

display(df8)

In [0]:
from pyspark.sql.functions import (
    upper, lower, rtrim, ltrim, trim,
    regexp_replace, split, contains, length, concat_ws,
)

# Case conversion.
df1 = df.select(upper(col("country")))
df2 = df.select(lower(col("country")))

# Whitespace trimming: left, right, both sides.
df3 = df.select(
    ltrim(col("country")).alias("New_Country"),
    rtrim(col("country")),
    trim(col("country")),
)

# Regex replacement of "Unknown" with "Not Sure" in country.
df4 = df.select(regexp_replace("country", "Unknown", "Not Sure"))

# Split email on "@".
df5 = df.select("email", split(col("email"), "@"))

# Boolean column: does email contain the substring "customer"?
df6 = df.select("email", col("email").contains("customer"))

# Character length of email.
df7 = df.select("email", length(col("email")))

# Join age and country with the separator "@@".
df8 = df.select(concat_ws("@@", col("age"), col("country")))

display(df8)

In [0]:
from pyspark.sql.functions import *

# Column name shared by the date examples below.
reg_date = "registration_date"

# Year / month / day-of-month parts.
df1 = df.select(reg_date, year(reg_date), month(reg_date), dayofmonth(reg_date))

# Day-of-week, week-of-year, day-of-year and quarter.
df2 = df.select(
    reg_date,
    dayofweek(reg_date),
    weekofyear(reg_date),
    dayofyear(reg_date),
    quarter(reg_date),
)

# Shift the date 10 days forward and 10 days back.
df3 = df.select(reg_date, date_add(reg_date, 10), date_sub(reg_date, 10))

# Today's date, limited to a single row.
df4 = df.select(current_date()).limit(1)
display(df4)

In [0]:
from pyspark.sql.functions import *

# Column name shared by the timestamp examples below.
ts = "ingestion_timestamp"

# Date and time parts of the timestamp.
df1 = df.select(
    ts,
    year(ts),
    month(ts),
    dayofmonth(ts),
    hour(ts),
    minute(ts),
    second(ts),
)

# Current timestamp, limited to a single row.
df2 = df.select(current_timestamp()).limit(1)

# date_diff between the current timestamp and the ingestion timestamp.
df3 = df.select(ts, date_diff(current_timestamp(), ts))
display(df3)

