In [10]:
import pandas as pd
import dask.dataframe as dd
from pyspark.sql.functions import col, when, year, month, current_date, datediff, to_date, explode, get_json_object, udf
from pyspark.sql.types import StringType
import json
import datetime
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder\
    .appName("SalesData")\
    .getOrCreate())

In [3]:
from google.colab import drive
drive.mount('/content/drive')
df = spark.read.csv("/content/drive/MyDrive/Sales_Dataset__500_Records_.csv", header=True, inferSchema=True)

Mounted at /content/drive


**1. DataFrame Creation and Inspection**

In [11]:
# PySpark
df.show(5)
df.show(5, truncate=False)
df.tail(5)
print(df.printSchema())

# Pandas
pdf = pd.read_csv("/content/drive/MyDrive/Sales_Dataset__500_Records_.csv")
print(pdf.head(5))
print(pdf.tail(5))
print(pdf.dtypes)

# Dask
ddf = dd.read_csv("/content/drive/MyDrive/Sales_Dataset__500_Records_.csv")
print(ddf.head(5))
print(ddf.tail(5))
print(ddf.dtypes)

+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+
|OrderID|  CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|            City|PaymentMode|CustomerSince|
+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+
|   2824| Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|    Lake Joyside|Credit Card|   2020-10-15|
|   7912|  Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|   New Jamesside|     Wallet|   2022-03-15|
|   4611|  Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|    Lake Roberto|     Wallet|   2021-08-07|
|   3547|Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15|West Melanieview|     Wallet|   2020-08-08|
|   8527|  Valerie Gray|           Toys| 77.87|2024-08-04|     Delivered|    0.17|       Mariastad|       Cash|   2022

**Selection, Renaming, and Filtering**

In [19]:
# PySpark
# Select columns
selected_df = df.select("OrderID", "CustomerName", "Amount")
selected_df.show()

# Rename column
renamed_df = selected_df.withColumnRenamed("Amount", "OrderAmount")
renamed_df.show()

# Filter amount > 500
filtered_df = renamed_df.filter(col("OrderAmount") > 500)
filtered_df.show()

# Filter specific city
city_filtered_df = df.filter(col("City") == "Port Roy")
city_filtered_df.show()

+-------+------------------+------+
|OrderID|      CustomerName|Amount|
+-------+------------------+------+
|   2824|     Donald Walker|783.04|
|   7912|      Brandon Hall| 905.0|
|   4611|      Donald Booth|657.96|
|   3547|    Phillip Garcia|606.89|
|   8527|      Valerie Gray| 77.87|
|   4150|       Amber Perez|352.37|
|   5554|        Roy Martin|148.33|
|   2169|    Carolyn Daniel| 14.09|
|   6313|       Patty Perez| 79.83|
|   6155|Jonathan Wilkerson|882.68|
|   9830|       Kevin Hurst|870.55|
|   9085| Anthony Rodriguez|921.73|
|   2040|     Kyle Mcdonald|327.52|
|   6573|    Jeffrey Chavez|676.02|
|   2743|  Elizabeth Fowler| 47.06|
|   9837|     Tammy Sellers| 46.15|
|   6038|     David Bradley|348.51|
|   3060|       John Pierce|362.09|
|   4295|   Jennifer Powers|684.26|
|   5061|    George Chapman|251.89|
+-------+------------------+------+
only showing top 20 rows

+-------+------------------+-----------+
|OrderID|      CustomerName|OrderAmount|
+-------+------------------+

**3. Data Manipulation**

In [22]:
df_dropped = df.drop("CustomerSince")
df_dropped.show(10)

df_with_final = df_dropped.withColumn("FinalAmount", col("Amount") - (col("Amount") * col("Discount")))
df_with_final.show()

df_sorted = df_with_final.orderBy(col("FinalAmount").desc())
df_sorted.show()

df_replaced = df_sorted.withColumn("DeliveryStatus",
                                 when(col("DeliveryStatus") == "Cancelled", "Order Cancelled")
                                 .otherwise(col("DeliveryStatus")))
df_replaced.show(10)

+-------+------------------+---------------+------+----------+--------------+--------+----------------+-----------+
|OrderID|      CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|            City|PaymentMode|
+-------+------------------+---------------+------+----------+--------------+--------+----------------+-----------+
|   2824|     Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|    Lake Joyside|Credit Card|
|   7912|      Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|   New Jamesside|     Wallet|
|   4611|      Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|    Lake Roberto|     Wallet|
|   3547|    Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15|West Melanieview|     Wallet|
|   8527|      Valerie Gray|           Toys| 77.87|2024-08-04|     Delivered|    0.17|       Mariastad|       Cash|
|   4150|       Amber Perez|          Books|352.37|2024-01-13|     Cance

**4. Aggregations and GroupBy**

In [24]:
status_count = df.groupBy("DeliveryStatus").count()
status_count.show()
avg_amount = df.groupBy("ProductCategory").avg("Amount")
avg_amount.show()
city_sales = df.groupBy("City").sum("Amount")
city_sales.show(10)

+--------------+-----+
|DeliveryStatus|count|
+--------------+-----+
|      Returned|  117|
|     Cancelled|  149|
|     Delivered|  119|
|       Pending|  115|
+--------------+-----+

+---------------+------------------+
|ProductCategory|       avg(Amount)|
+---------------+------------------+
|        Fashion| 500.6308235294116|
|      Groceries|459.51786407766957|
|    Electronics|           551.745|
|          Books| 568.6003773584907|
|           Toys| 534.2837499999999|
+---------------+------------------+

+----------------+-----------+
|            City|sum(Amount)|
+----------------+-----------+
|     Ramseymouth|     761.06|
|East Edwardshire|     291.26|
|      Thomasberg|     882.68|
|     Laurenville|     383.26|
| South Colinstad|     786.27|
|    Lake Douglas|     975.09|
|   Williamsmouth|      10.78|
|      Gordonport|     514.99|
|  West Dawnmouth|       12.8|
|        Seanbury|     814.39|
+----------------+-----------+
only showing top 10 rows



**5. Null Handling & Update**

In [27]:
from pyspark.sql.functions import lit
df_with_nulls = df.withColumn("City", when(col("OrderID") % 5 == 0, lit(None)).otherwise(col("City")))

df_filled = df_with_nulls.fillna({"City": "Unknown"})

# Handle nulls - drop rows with nulls in City
df_dropped_nulls = df_with_nulls.dropna(subset=["City"])

# Tag high-value customers
df_tagged = df.withColumn("CustomerType", when(col("Amount") > 800, "High Value").otherwise("Regular"))
df_tagged.show()

+-------+------------------+---------------+------+----------+--------------+--------+-----------------+-----------+-------------+------------+
|OrderID|      CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|             City|PaymentMode|CustomerSince|CustomerType|
+-------+------------------+---------------+------+----------+--------------+--------+-----------------+-----------+-------------+------------+
|   2824|     Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|     Lake Joyside|Credit Card|   2020-10-15|     Regular|
|   7912|      Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|    New Jamesside|     Wallet|   2022-03-15|  High Value|
|   4611|      Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|     Lake Roberto|     Wallet|   2021-08-07|     Regular|
|   3547|    Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15| West Melanieview|     Wallet|   2020-08-08|     R

**6. Date & Time Functions**

In [28]:
df_dates = df.withColumn("OrderYear", year("OrderDate")) \
             .withColumn("OrderMonth", month("OrderDate"))
df_dates.show()

df_loyalty = df_dates.withColumn("CustomerSinceDate", to_date("CustomerSince")) \
.withColumn ("LoyaltyYears",datediff(current_date(), col("CustomerSinceDate")) / 365)
df_dates.show()

+-------+------------------+---------------+------+----------+--------------+--------+-----------------+-----------+-------------+---------+----------+
|OrderID|      CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|             City|PaymentMode|CustomerSince|OrderYear|OrderMonth|
+-------+------------------+---------------+------+----------+--------------+--------+-----------------+-----------+-------------+---------+----------+
|   2824|     Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|     Lake Joyside|Credit Card|   2020-10-15|     2024|        12|
|   7912|      Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|    New Jamesside|     Wallet|   2022-03-15|     2024|         9|
|   4611|      Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|     Lake Roberto|     Wallet|   2021-08-07|     2025|         1|
|   3547|    Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15| W

**7. Joins and Unions**

In [30]:
# Second DataFrame
from pyspark.sql import Row
regions_data = [Row(City="Port Jesseville", Region="West"),
    Row(City="Grayside", Region="South")]
regions_df = spark.createDataFrame(regions_data)

# Inner and left join
inner_join = df.join(regions_df, "City", "inner")
inner_join.show()
left_join = df.join(regions_df, "City", "left")
left_join.show()

#Union
df_2023 = df.filter(year("OrderDate") == 2023)
df_2024 = df.filter(year("OrderDate") == 2024)
union_df = df_2023.union(df_2024)
union_df.show()

+---------------+-------+--------------+---------------+------+----------+--------------+--------+-----------+-------------+------+
|           City|OrderID|  CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|PaymentMode|CustomerSince|Region|
+---------------+-------+--------------+---------------+------+----------+--------------+--------+-----------+-------------+------+
|Port Jesseville|   4150|   Amber Perez|          Books|352.37|2024-01-13|     Cancelled|    0.24|       Cash|   2022-01-13|  West|
|       Grayside|   2169|Carolyn Daniel|    Electronics| 14.09|2023-10-07|     Delivered|    0.25|Credit Card|   2021-05-09| South|
+---------------+-------+--------------+---------------+------+----------+--------------+--------+-----------+-------------+------+

+-----------------+-------+------------------+---------------+------+----------+--------------+--------+-----------+-------------+------+
|             City|OrderID|      CustomerName|ProductCategory|Amount|

 **8. Complex JSON Simulation (Advanced)**

In [33]:
from pyspark.sql.functions import to_json, struct, from_json, schema_of_json
#Convert to JSON
sales_json =df.withColumn("json_data", to_json(struct([col(c) for c in df.columns])))
sales_json.select("json_data").show(truncate=False)

#Load back to DataFrame
sample_json = sales_json.select("json_data").first()["json_data"]
json_schema = schema_of_json(sample_json)
df_loaded = sales_json.select(from_json("json_data", json_schema).alias("data")).select("data.*")
df_loaded.show()

#save the data in json
df.write.mode("overwrite").json("sales_json")

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|json_data                                                                                                                                                                                                                                        |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|{"OrderID":2824,"CustomerName":"Donald Walker","ProductCategory":"Books","Amount":783.04,"OrderDate":"2024-12-26","DeliveryStatus":"Returned","Discount":0.15,"City":"Lake Joyside","PaymentMode":"Credit Card","CustomerSince":"2020-10-15"}    |
|{"OrderID":7912,"Custom

**9. Applying Functions**

In [38]:
def order_size_tag(amount):
    if amount > 800:
        return "Big"
    elif amount > 300:
        return "Medium"
    else:
        return "Small"

# Register UDF
order_size_udf = udf(order_size_tag, StringType())

# Apply UDF
df_tagged = df.withColumn("OrderSize", order_size_udf(col("Amount")))
df_tagged.select("OrderID", "CustomerName", "Amount", "OrderSize").show()

+-------+------------------+------+---------+
|OrderID|      CustomerName|Amount|OrderSize|
+-------+------------------+------+---------+
|   2824|     Donald Walker|783.04|   Medium|
|   7912|      Brandon Hall| 905.0|      Big|
|   4611|      Donald Booth|657.96|   Medium|
|   3547|    Phillip Garcia|606.89|   Medium|
|   8527|      Valerie Gray| 77.87|    Small|
|   4150|       Amber Perez|352.37|   Medium|
|   5554|        Roy Martin|148.33|    Small|
|   2169|    Carolyn Daniel| 14.09|    Small|
|   6313|       Patty Perez| 79.83|    Small|
|   6155|Jonathan Wilkerson|882.68|      Big|
|   9830|       Kevin Hurst|870.55|      Big|
|   9085| Anthony Rodriguez|921.73|      Big|
|   2040|     Kyle Mcdonald|327.52|   Medium|
|   6573|    Jeffrey Chavez|676.02|   Medium|
|   2743|  Elizabeth Fowler| 47.06|    Small|
|   9837|     Tammy Sellers| 46.15|    Small|
|   6038|     David Bradley|348.51|   Medium|
|   3060|       John Pierce|362.09|   Medium|
|   4295|   Jennifer Powers|684.26