In [1]:
!pip install pyspark dask pandas --quiet



In [4]:
from pyspark.sql import SparkSession
import pandas as pd
import dask.dataframe as dd
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Spark session
spark = SparkSession.builder.appName("SalesAnalysis").getOrCreate()

# Load dataset path
path = '/content/drive/MyDrive/Sales_Dataset__500_Records_.csv'



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
df = spark.read.csv(path, header=True, inferSchema=True)
df.show(500, truncate=False)

+-------+----------------------+---------------+------+----------+--------------+--------+---------------------+-----------+-------------+
|OrderID|CustomerName          |ProductCategory|Amount|OrderDate |DeliveryStatus|Discount|City                 |PaymentMode|CustomerSince|
+-------+----------------------+---------------+------+----------+--------------+--------+---------------------+-----------+-------------+
|2824   |Donald Walker         |Books          |783.04|2024-12-26|Returned      |0.15    |Lake Joyside         |Credit Card|2020-10-15   |
|7912   |Brandon Hall          |Groceries      |905.0 |2024-09-12|Cancelled     |0.03    |New Jamesside        |Wallet     |2022-03-15   |
|4611   |Donald Booth          |Fashion        |657.96|2025-01-12|Returned      |0.01    |Lake Roberto         |Wallet     |2021-08-07   |
|3547   |Phillip Garcia        |Fashion        |606.89|2024-03-24|Returned      |0.15    |West Melanieview     |Wallet     |2020-08-08   |
|8527   |Valerie Gray      

In [31]:
#1.DataFrame Creation and Inspection
# Load with Pandas
df_pandas = pd.read_csv(path)
print("Pandas DataFrame:")
display(df_pandas.head())

# Load with PySpark
df_spark = spark.read.csv(path, header=True, inferSchema=True)
print("Spark DataFrame:")
df_spark.show()

# Load with Dask
df_dask = dd.read_csv(path)
print("Dask DataFrame:")
df_dask.head()

Pandas DataFrame:


Unnamed: 0,OrderID,CustomerName,ProductCategory,Amount,OrderDate,DeliveryStatus,Discount,City,PaymentMode,CustomerSince
0,2824,Donald Walker,Books,783.04,2024-12-26,Returned,0.15,Lake Joyside,Credit Card,2020-10-15
1,7912,Brandon Hall,Groceries,905.0,2024-09-12,Cancelled,0.03,New Jamesside,Wallet,2022-03-15
2,4611,Donald Booth,Fashion,657.96,2025-01-12,Returned,0.01,Lake Roberto,Wallet,2021-08-07
3,3547,Phillip Garcia,Fashion,606.89,2024-03-24,Returned,0.15,West Melanieview,Wallet,2020-08-08
4,8527,Valerie Gray,Toys,77.87,2024-08-04,Delivered,0.17,Mariastad,Cash,2022-11-15


Spark DataFrame:
+-------+------------------+---------------+------+----------+--------------+--------+-----------------+-----------+-------------+
|OrderID|      CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|             City|PaymentMode|CustomerSince|
+-------+------------------+---------------+------+----------+--------------+--------+-----------------+-----------+-------------+
|   2824|     Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|     Lake Joyside|Credit Card|   2020-10-15|
|   7912|      Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|    New Jamesside|     Wallet|   2022-03-15|
|   4611|      Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|     Lake Roberto|     Wallet|   2021-08-07|
|   3547|    Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15| West Melanieview|     Wallet|   2020-08-08|
|   8527|      Valerie Gray|           Toys| 77.87|2024-08-04|    

Unnamed: 0,OrderID,CustomerName,ProductCategory,Amount,OrderDate,DeliveryStatus,Discount,City,PaymentMode,CustomerSince
0,2824,Donald Walker,Books,783.04,2024-12-26,Returned,0.15,Lake Joyside,Credit Card,2020-10-15
1,7912,Brandon Hall,Groceries,905.0,2024-09-12,Cancelled,0.03,New Jamesside,Wallet,2022-03-15
2,4611,Donald Booth,Fashion,657.96,2025-01-12,Returned,0.01,Lake Roberto,Wallet,2021-08-07
3,3547,Phillip Garcia,Fashion,606.89,2024-03-24,Returned,0.15,West Melanieview,Wallet,2020-08-08
4,8527,Valerie Gray,Toys,77.87,2024-08-04,Delivered,0.17,Mariastad,Cash,2022-11-15


In [32]:
#2.Selection, Renaming, and Filtering
from pyspark.sql.functions import col

# Select and Rename
df_selected = df.select("OrderID", "CustomerName", "Amount").withColumnRenamed("Amount", "OrderAmount")
df_selected.show()

# Filter where Amount > 500
df_filtered_amount = df.filter(col("Amount") > 500)
df_filtered_amount.show()

# Filter using .filter() by city
df_filtered_city = df.filter(col("City") == "Lake Joyside")
df_filtered_city.show()

+-------+------------------+-----------+
|OrderID|      CustomerName|OrderAmount|
+-------+------------------+-----------+
|   2824|     Donald Walker|     783.04|
|   7912|      Brandon Hall|      905.0|
|   4611|      Donald Booth|     657.96|
|   3547|    Phillip Garcia|     606.89|
|   8527|      Valerie Gray|      77.87|
|   4150|       Amber Perez|     352.37|
|   5554|        Roy Martin|     148.33|
|   2169|    Carolyn Daniel|      14.09|
|   6313|       Patty Perez|      79.83|
|   6155|Jonathan Wilkerson|     882.68|
|   9830|       Kevin Hurst|     870.55|
|   9085| Anthony Rodriguez|     921.73|
|   2040|     Kyle Mcdonald|     327.52|
|   6573|    Jeffrey Chavez|     676.02|
|   2743|  Elizabeth Fowler|      47.06|
|   9837|     Tammy Sellers|      46.15|
|   6038|     David Bradley|     348.51|
|   3060|       John Pierce|     362.09|
|   4295|   Jennifer Powers|     684.26|
|   5061|    George Chapman|     251.89|
+-------+------------------+-----------+
only showing top

In [34]:
#3.Data Manipulation
from pyspark.sql.functions import when

# Drop 'CustomerSince'
df_no_customer_since = df.drop("CustomerSince")

# Add FinalAmount = Amount - (Amount * Discount)
df_final_amount = df_no_customer_since.withColumn("FinalAmount", col("Amount") - (col("Amount") * col("Discount")))

# Sort by FinalAmount descending
df_sorted_final = df_final_amount.orderBy(col("FinalAmount").desc())
df_sorted_final.select("OrderID", "Amount", "Discount", "FinalAmount").show()

# Replace "Cancelled" with "Order Cancelled" in DeliveryStatus
df_status_updated = df_sorted_final.withColumn(
    "DeliveryStatus",
    when(col("DeliveryStatus") == "Cancelled", "Order Cancelled").otherwise(col("DeliveryStatus"))
)
df_status_updated.select("OrderID", "DeliveryStatus").show()

+-------+------+--------+-----------------+
|OrderID|Amount|Discount|      FinalAmount|
+-------+------+--------+-----------------+
|   5573|981.05|    0.02|          961.429|
|   8474|968.91|    0.02|         949.5318|
|   8889| 998.3|    0.06|938.4019999999999|
|   2127|933.32|    0.01|         923.9868|
|   9806|993.17|    0.07|         923.6481|
|   5593|961.35|    0.05|         913.2825|
|   2120|948.84|    0.04|         910.8864|
|   5949|918.14|    0.01|908.9585999999999|
|   1422| 973.2|    0.07|          905.076|
|   2904|922.29|    0.02|         903.8442|
|   7566|899.31|     0.0|           899.31|
|   7511|932.21|    0.04|         894.9216|
|   9085|921.73|    0.03|894.0781000000001|
|   1436|978.96|    0.09|         890.8536|
|   6008|903.71|    0.02|         885.6358|
|   9834|944.55|    0.07|878.4314999999999|
|   8253|998.21|    0.12|         878.4248|
|   7912| 905.0|    0.03|           877.85|
|   1654|903.78|    0.03|         876.6666|
|   9239|897.41|    0.04|       

In [14]:
#4.Aggregations and GroupBy

# Count of orders by DeliveryStatus
df.groupBy("DeliveryStatus").count().show()

# Average Amount by ProductCategory
df.groupBy("ProductCategory").avg("Amount").withColumnRenamed("avg(Amount)", "AvgAmount").show()

# Group by City and show total sales
df.groupBy("City").sum("Amount").withColumnRenamed("sum(Amount)", "TotalSales").show()

+--------------+-----+
|DeliveryStatus|count|
+--------------+-----+
|      Returned|  117|
|     Cancelled|  149|
|     Delivered|  119|
|       Pending|  115|
+--------------+-----+

+---------------+------------------+
|ProductCategory|         AvgAmount|
+---------------+------------------+
|        Fashion| 500.6308235294116|
|      Groceries|459.51786407766957|
|    Electronics|           551.745|
|          Books| 568.6003773584907|
|           Toys| 534.2837499999999|
+---------------+------------------+

+----------------+----------+
|            City|TotalSales|
+----------------+----------+
|     Ramseymouth|    761.06|
|East Edwardshire|    291.26|
|      Thomasberg|    882.68|
|     Laurenville|    383.26|
| South Colinstad|    786.27|
|    Lake Douglas|    975.09|
|   Williamsmouth|     10.78|
|      Gordonport|    514.99|
|  West Dawnmouth|      12.8|
|        Seanbury|    814.39|
|     Sheilaville|    981.05|
|       Mollybury|    222.02|
|       Lisaville|     45.69|
|

In [35]:
#5.Null Handling & Update
from pyspark.sql.functions import lit

# Inject nulls in City (simulate)
df_with_nulls = df.withColumn("City", when(col("OrderID") < 10, lit(None)).otherwise(col("City")))

# Fill nulls with "Unknown"
df_filled = df_with_nulls.fillna({"City": "Unknown"})

# Drop rows where City is null
df_dropped = df_with_nulls.dropna(subset=["City"])

# Tag high-value customers: Amount > 800
df_tagged = df.withColumn("CustomerTag", when(col("Amount") > 800, "High-Value").otherwise("Regular"))
df_tagged.select("OrderID", "Amount", "CustomerTag").show()

+-------+------+-----------+
|OrderID|Amount|CustomerTag|
+-------+------+-----------+
|   2824|783.04|    Regular|
|   7912| 905.0| High-Value|
|   4611|657.96|    Regular|
|   3547|606.89|    Regular|
|   8527| 77.87|    Regular|
|   4150|352.37|    Regular|
|   5554|148.33|    Regular|
|   2169| 14.09|    Regular|
|   6313| 79.83|    Regular|
|   6155|882.68| High-Value|
|   9830|870.55| High-Value|
|   9085|921.73| High-Value|
|   2040|327.52|    Regular|
|   6573|676.02|    Regular|
|   2743| 47.06|    Regular|
|   9837| 46.15|    Regular|
|   6038|348.51|    Regular|
|   3060|362.09|    Regular|
|   4295|684.26|    Regular|
|   5061|251.89|    Regular|
+-------+------+-----------+
only showing top 20 rows



In [36]:
#6.Date & Time Functions
from pyspark.sql.functions import to_date, year, month, current_date, datediff

# Convert date strings to proper date format
df_dates = df.withColumn("OrderDate", to_date(col("OrderDate"))) \
             .withColumn("CustomerSince", to_date(col("CustomerSince")))

# Extract year and month
df_date_parts = df_dates.withColumn("Year", year(col("OrderDate"))) \
                        .withColumn("Month", month(col("OrderDate")))

# Loyalty in years = today - CustomerSince
df_loyalty = df_date_parts.withColumn("LoyaltyYears", (datediff(current_date(), col("CustomerSince")) / 365).cast("int"))

df_loyalty.select("CustomerName", "CustomerSince", "LoyaltyYears").show()

+------------------+-------------+------------+
|      CustomerName|CustomerSince|LoyaltyYears|
+------------------+-------------+------------+
|     Donald Walker|   2020-10-15|           4|
|      Brandon Hall|   2022-03-15|           3|
|      Donald Booth|   2021-08-07|           3|
|    Phillip Garcia|   2020-08-08|           4|
|      Valerie Gray|   2022-11-15|           2|
|       Amber Perez|   2022-01-13|           3|
|        Roy Martin|   2023-04-29|           2|
|    Carolyn Daniel|   2021-05-09|           4|
|       Patty Perez|   2021-04-25|           4|
|Jonathan Wilkerson|   2021-06-20|           3|
|       Kevin Hurst|   2022-08-02|           2|
| Anthony Rodriguez|   2022-12-15|           2|
|     Kyle Mcdonald|   2021-07-21|           3|
|    Jeffrey Chavez|   2022-07-30|           2|
|  Elizabeth Fowler|   2021-02-07|           4|
|     Tammy Sellers|   2021-12-17|           3|
|     David Bradley|   2022-09-07|           2|
|       John Pierce|   2023-05-09|      

In [37]:
#7.Joins and Unions
# Create sample region mapping DataFrame (Pandas to Spark)
region_df = pd.DataFrame({"City": ["Chennai", "Mumbai", "Delhi"], "Region": ["South", "West", "North"]})
spark_region_df = spark.createDataFrame(region_df)

# Inner join
inner_joined = df.join(spark_region_df, on="City", how="inner")
inner_joined.select("City", "Region", "OrderID").show()

# Left join
left_joined = df.join(spark_region_df, on="City", how="left")
left_joined.select("City", "Region", "OrderID").show()

# Union orders from 2023 and 2024
df_with_date = df.withColumn("OrderDate", to_date(col("OrderDate")))
orders_2023 = df_with_date.filter(year("OrderDate") == 2023)
orders_2024 = df_with_date.filter(year("OrderDate") == 2024)
unioned_df = orders_2023.union(orders_2024)

unioned_df.select("OrderID", "OrderDate").show()

+----+------+-------+
|City|Region|OrderID|
+----+------+-------+
+----+------+-------+

+-----------------+------+-------+
|             City|Region|OrderID|
+-----------------+------+-------+
|       Thomasberg|  NULL|   6155|
|   East Nathaniel|  NULL|   2743|
|  Port Jesseville|  NULL|   4150|
|       North Chad|  NULL|   5061|
|    Lake Toddland|  NULL|   6038|
|     Lake Joyside|  NULL|   2824|
|       Brandtside|  NULL|   3060|
|   West Elizabeth|  NULL|   1964|
|Lake Jenniferside|  NULL|   2040|
|        Port Erin|  NULL|   9085|
|      Teresaburgh|  NULL|   6573|
|     Lake Roberto|  NULL|   4611|
|       Tracyville|  NULL|   9837|
|      Jeffreyberg|  NULL|   9830|
|      Lake Joseph|  NULL|   5554|
| West Melanieview|  NULL|   3547|
|    New Jamesside|  NULL|   7912|
|         Grayside|  NULL|   2169|
|        Mariastad|  NULL|   8527|
|      Richardland|  NULL|   6313|
+-----------------+------+-------+
only showing top 20 rows

+-------+----------+
|OrderID| OrderDate|
+--

In [38]:
#8.Complex JSON Simulation
from pyspark.sql.functions import to_json, struct, get_json_object

# Convert row to JSON string
df_json = df.withColumn("json_string", to_json(struct([col(c) for c in df.columns])))

# Extract specific field from JSON
df_extracted_json = df_json.select(get_json_object(col("json_string"), "$.CustomerName").alias("CustomerName"))
df_extracted_json.show()

+------------------+
|      CustomerName|
+------------------+
|     Donald Walker|
|      Brandon Hall|
|      Donald Booth|
|    Phillip Garcia|
|      Valerie Gray|
|       Amber Perez|
|        Roy Martin|
|    Carolyn Daniel|
|       Patty Perez|
|Jonathan Wilkerson|
|       Kevin Hurst|
| Anthony Rodriguez|
|     Kyle Mcdonald|
|    Jeffrey Chavez|
|  Elizabeth Fowler|
|     Tammy Sellers|
|     David Bradley|
|       John Pierce|
|   Jennifer Powers|
|    George Chapman|
+------------------+
only showing top 20 rows



In [39]:
#9.Applying Functions (UDF)
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Define UDF to tag order
@udf(returnType=StringType())
def tag_order(amount):
    if amount is None:
        return "Unknown"
    elif amount > 800:
        return "Big"
    elif amount > 400:
        return "Medium"
    else:
        return "Small"

# Apply UDF
df_order_tagged = df.withColumn("OrderTag", tag_order(col("Amount")))
df_order_tagged.select("OrderID", "Amount", "OrderTag").show()

+-------+------+--------+
|OrderID|Amount|OrderTag|
+-------+------+--------+
|   2824|783.04|  Medium|
|   7912| 905.0|     Big|
|   4611|657.96|  Medium|
|   3547|606.89|  Medium|
|   8527| 77.87|   Small|
|   4150|352.37|   Small|
|   5554|148.33|   Small|
|   2169| 14.09|   Small|
|   6313| 79.83|   Small|
|   6155|882.68|     Big|
|   9830|870.55|     Big|
|   9085|921.73|     Big|
|   2040|327.52|   Small|
|   6573|676.02|  Medium|
|   2743| 47.06|   Small|
|   9837| 46.15|   Small|
|   6038|348.51|   Small|
|   3060|362.09|   Small|
|   4295|684.26|  Medium|
|   5061|251.89|   Small|
+-------+------+--------+
only showing top 20 rows

