# PySpark for Store-Level Insights

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Initializing the Spark session

In [2]:
# Reuse the active Spark session when there is one; otherwise start a new one
# registered under the application name "week-3".
spark = (
    SparkSession.builder
    .appName("week-3")
    .getOrCreate()
)
spark  # last expression of the cell: display the session object

# Loading large sales data into PySpark

In [5]:
from google.colab import files
# Upload the CSV inputs through Colab's upload widget; the load cell that
# follows reads them back from /content.
uploaded = files.upload()

Saving employees.csv to employees.csv
Saving products.csv to products.csv
Saving sales.csv to sales.csv
Saving stores.csv to stores.csv


In [6]:
# All four inputs are read the same way: the first row is the header and the
# column types are inferred from the data.
csv_options = {"header": True, "inferSchema": True}

dfEmp = spark.read.csv("/content/employees.csv", **csv_options)
dfProd = spark.read.csv("/content/products.csv", **csv_options)
dfSales = spark.read.csv("/content/sales.csv", **csv_options)
dfStores = spark.read.csv("/content/stores.csv", **csv_options)

# Printing Schema

In [8]:
# Show the column names and inferred types of the employees DataFrame.
dfEmp.printSchema()

root
 |-- employeeid: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- storeid: integer (nullable = true)
 |-- role: string (nullable = true)
 |-- hiredate: timestamp (nullable = true)



In [9]:
# Show the column names and inferred types of the products DataFrame.
dfProd.printSchema()

root
 |-- productid: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- price: double (nullable = true)
 |-- cost: double (nullable = true)
 |-- discountpercentage: double (nullable = true)
 |-- createdat: timestamp (nullable = true)



In [10]:
# Show the column names and inferred types of the sales DataFrame.
dfSales.printSchema()

root
 |-- saleid: integer (nullable = true)
 |-- productid: integer (nullable = true)
 |-- storeid: integer (nullable = true)
 |-- employeeid: integer (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- saledate: timestamp (nullable = true)



In [11]:
# Show the column names and inferred types of the stores DataFrame.
dfStores.printSchema()

root
 |-- storeid: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- region: string (nullable = true)
 |-- address: string (nullable = true)
 |-- createdat: timestamp (nullable = true)



# Filtering data for underperforming products

In [12]:
# Pair every sale row with the attributes of the product that was sold.
# Inner join: rows without a match on the product id are dropped on both sides.
dfJoined = dfProd.join(
    dfSales,
    on="productID",
    how="inner",
)

In [15]:
from pyspark.sql import functions as F

# Margin as a percentage of the selling price: (price - cost) / price * 100.
unit_margin = F.col("price") - F.col("cost")
dfJoined = dfJoined.withColumn("Margin %", (unit_margin / F.col("price")) * 100)

In [16]:
# Mean of "Margin %" across all rows of the product/sales join, brought back
# to the driver as a plain Python value so it can be used as a filter threshold.
marginMean = dfJoined.agg(F.mean("Margin %")).first()[0]

In [22]:
# Only products whose creation date lies fewer than this many days from today
# are considered. Kept as a named constant so the cutoff is easy to find and tune.
MAX_PRODUCT_AGE_DAYS = 408

# Absolute number of days between the product's creation date and today.
# F.datediff is used instead of F.date_diff because the latter only exists in
# newer PySpark releases; both compute the same day difference.
product_age_days = F.abs(
    F.datediff(F.col("createdat").cast("date"), F.current_date())
)

# Margin earned on a sale line: quantity * price * margin fraction.
line_margin_revenue = F.col("quantity") * F.col("price") * (F.col("Margin %") / 100)

# Underperforming products: below-average margin and inside the age window.
# Aggregate units sold and margin revenue per product, then attach the product
# name and order from the lowest margin revenue upwards.
underPerforming = (
    dfJoined
    .withColumn("tillDate", product_age_days)
    .filter(
        (F.col("Margin %") < marginMean)
        & (F.col("tillDate") < MAX_PRODUCT_AGE_DAYS)
    )
    .withColumn("marginRevenue", line_margin_revenue)
    .groupby("productid")
    .agg(
        F.sum("quantity").alias("totalSold"),
        F.sum("marginRevenue").alias("marginRevenue"),
    )
    .join(dfProd.select(["productid", "name"]), on="productid", how="inner")
    .sort("marginRevenue")
)
underPerforming.select("name", "totalSold", "marginRevenue").show()

+----------------+---------+-------------+
|            name|totalSold|marginRevenue|
+----------------+---------+-------------+
|wireless earbuds|        8|        200.0|
|gaming laptop 16|        3|        900.0|
+----------------+---------+-------------+



# Group by store and calculate average monthly revenue

In [24]:
# Enrich every sale with its store and product attributes, then derive the
# month of the sale and the sale amount (quantity * price).
# NOTE(review): saleMonth is taken with F.month only, so the same month in
# different years would land in one bucket - confirm the data spans one year.
# NOTE(review): stores and products both carry `name` and `createdat`, so those
# columns appear twice in this frame; refer to them via the source DataFrames.
df_sales = (
    dfSales
    .join(dfStores, on="storeID", how="inner")
    .join(dfProd, on="productID", how="inner")
    .withColumn("saleMonth", F.month("saleDate"))
    .withColumn("amount", F.col("quantity") * F.col("price"))
)

In [25]:
# Revenue per store per month is the TOTAL of the sale amounts in that month.
# Taking the mean of `amount` here would yield the average sale-line value
# instead, which understates revenue for any store with more than one sale in
# the month. A per-store average across months can be derived from this frame
# with a further groupBy on the store.
monthly_totals = df_sales.groupBy(["storeID", "saleMonth"]).agg(
    F.sum("amount").alias("monthlyRevenue")
)

# Attach the store name and present the result in month order.
storeSummary = (
    monthly_totals
    .join(dfStores.select("storeID", "name"), on="storeID")
    .sort(["saleMonth"])
    .select(["name", "saleMonth", "monthlyRevenue"])
)

storeSummary.show()

+-----------------+---------+--------------+
|             name|saleMonth|monthlyRevenue|
+-----------------+---------+--------------+
|   city mart - ny|        7|        4500.0|
| super saver - il|        7|         640.0|
| sunny store - fl|        7|         330.0|
|fresh bodega - tx|        7|        1000.0|
| budget shop - la|        7|         300.0|
+-----------------+---------+--------------+



# Deliverables
- PySpark script with filtering, grouping, and aggregation
- Output file showing underperforming products/store summary

### Filtering

In [26]:
# Columns that make up the exported underperforming-products report.
underperforming_report_columns = ["name", "totalSold", "marginRevenue"]
finalUnderperformingDF = underPerforming.select(*underperforming_report_columns)

In [27]:
# Collapse to a single partition so the output directory holds one CSV part
# file, written with a header row and replacing any previous output.
(
    finalUnderperformingDF
    .coalesce(1)
    .write
    .option("header", True)
    .mode("overwrite")
    .csv("/content/underperforming_products")
)

In [28]:
import os
import shutil

# Spark writes a directory of part files; promote the first CSV part file to a
# stable, predictable file name next to that directory.
output_dir = "/content/underperforming_products"
part_files = [entry for entry in os.listdir(output_dir) if entry.endswith(".csv")]

# Fail loudly when the write produced no CSV part file; otherwise the later
# download step would fail confusingly or pick up a stale file from an older run.
if not part_files:
    raise FileNotFoundError(f"No CSV part file found in {output_dir}")

shutil.move(os.path.join(output_dir, part_files[0]), "/content/underperforming_products.csv")

In [29]:
from google.colab import files
# Hand the exported underperforming-products CSV to the browser for download.
files.download("/content/underperforming_products.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Grouping and Aggregation

In [30]:
# Columns that make up the exported store summary report.
store_report_columns = ["name", "saleMonth", "monthlyRevenue"]
finalStoreSummaryDF = storeSummary.select(*store_report_columns)

In [31]:
# Collapse to a single partition so the output directory holds one CSV part
# file, written with a header row and replacing any previous output.
(
    finalStoreSummaryDF
    .coalesce(1)
    .write
    .option("header", True)
    .mode("overwrite")
    .csv("/content/store_monthly_revenue")
)

In [32]:
import os
import shutil

# Spark writes a directory of part files; promote the first CSV part file to a
# stable, predictable file name next to that directory.
output_dir = "/content/store_monthly_revenue"
part_files = [entry for entry in os.listdir(output_dir) if entry.endswith(".csv")]

# Fail loudly when the write produced no CSV part file; otherwise the later
# download step would fail confusingly or pick up a stale file from an older run.
if not part_files:
    raise FileNotFoundError(f"No CSV part file found in {output_dir}")

shutil.move(os.path.join(output_dir, part_files[0]), "/content/store_monthly_revenue.csv")

In [33]:
from google.colab import files
# Hand the exported store summary CSV to the browser for download.
files.download("/content/store_monthly_revenue.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>