In [1]:
import os

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession,SQLContext
from pyspark.sql.functions import *

In [3]:
# Build (or reuse) the Spark session for this analysis.
# `spark.jars.packages` is a single comma-separated property: calling
# .config() twice with the same key keeps only the last value, which would
# request the JDBC driver alone and silently drop the Spark connector.
# Both Maven coordinates are therefore passed in one value.
spark = (
    SparkSession.builder
    .appName("AmazonCanadaSalesAnalysis")
    .config(
        "spark.jars.packages",
        ",".join([
            "net.snowflake:spark-snowflake_2.12:2.10.1-spark_3.1",
            "net.snowflake:snowflake-jdbc:3.13.13",
        ]),
    )
    .getOrCreate()
)

# Snowflake connection options handed to the Spark reader via .options(**sfOptions).
# The password is taken from the SNOWFLAKE_PASSWORD environment variable so the
# real secret never has to be typed into (or committed with) the notebook; the
# masked placeholder is kept only as a fallback for when the variable is unset.
# NOTE(review): ACCOUNTADMIN is a very broad role for a read-only analysis —
# consider a least-privileged role.
sfOptions = {
    "sfURL": "https://pdxuwmc-wb15506.snowflakecomputing.com",
    "sfUser": "Bob35",
    "sfPassword": os.environ.get("SNOWFLAKE_PASSWORD", "******"),
    "sfDatabase": "GBCCLASS",
    "sfSchema": "PUBLIC",
    "sfWarehouse": "GBCCLASS_WH",
    "sfRole": "ACCOUNTADMIN",
}

def read_snowflake_table(table_name):
    """Load one Snowflake table through the notebook's Spark session.

    Uses the module-level ``spark`` session and ``sfOptions`` connection
    settings.

    Args:
        table_name: Value passed to the reader's ``dbtable`` option.

    Returns:
        Whatever the configured reader's ``load()`` returns.
    """
    reader = spark.read.format("snowflake")
    # Connection settings first, then the table to read
    reader = reader.options(**sfOptions)
    reader = reader.option("dbtable", table_name)
    return reader.load()

In [4]:
# Load the processed Amazon Canada products table via read_snowflake_table
# (connection settings, including database and schema, come from sfOptions)
df = read_snowflake_table("amz_ca_total_products_data_processed")
# Preview the loaded rows
df.show()

+----------+--------------------+--------------------+--------------------+-----+-------+------+---------+------------+------------+-----------------+
|      ASIN|               TITLE|              IMGURL|          PRODUCTURL|STARS|REVIEWS| PRICE|LISTPRICE|CATEGORYNAME|ISBESTSELLER|BOUGHTINLASTMONTH|
+----------+--------------------+--------------------+--------------------+-----+-------+------+---------+------------+------------+-----------------+
|B09P7NGVHD|Merory Card Adapt...|https://m.media-a...|https://www.amazo...|  0.0|      0|   0.0|      0.0|Memory Cards|       False|                0|
|B09P7JQT7Y|SUTK Speed Micro ...|https://m.media-a...|https://www.amazo...|  0.0|      0|   0.0|      0.0|Memory Cards|       False|                0|
|B09P78GGKZ|Extreme 16GB Comp...|https://m.media-a...|https://www.amazo...|  4.5|    130| 55.94|      0.0|Memory Cards|       False|                0|
|B09P76FDZG|Extreme 8GB Compa...|https://m.media-a...|https://www.amazo...|  4.3|     24| 53.7

In [5]:
# Build the cleaned frame in one chain from the raw `df`, so re-running this
# cell always starts from the same input.
# NOTE: `round` here is pyspark.sql.functions.round (the wildcard import
# shadows the Python builtin).
df_cleaned = (
    # Drop rows with a null in any column the analysis depends on
    df.dropna(subset=["asin", "title", "categoryName", "stars", "reviews", "price", "boughtInLastMonth"])
    # Convert `isBestSeller` to boolean type, if not already
    .withColumn("isBestSeller", col("isBestSeller").cast("boolean"))
    # Cast both price columns to double explicitly before rounding to 2
    # decimals, so their numeric type no longer depends on how the source
    # table happens to store them
    .withColumn("price", round(col("price").cast("double"), 2))
    .withColumn("listPrice", round(col("listPrice").cast("double"), 2))
)

# Basic statistics
df_cleaned.describe().show()

+-------+--------------------+--------------------+--------------------+--------------------+------------------+-----------------+------------------+------------------+--------------------+-----------------+
|summary|                ASIN|               TITLE|              IMGURL|          PRODUCTURL|             STARS|          REVIEWS|             price|         listPrice|        CATEGORYNAME|BOUGHTINLASTMONTH|
+-------+--------------------+--------------------+--------------------+--------------------+------------------+-----------------+------------------+------------------+--------------------+-----------------+
|  count|             2165926|             2165926|             2165926|             2165926|           2165926|          2165926|           2165926|           2165926|             2165926|          2165926|
|   mean|2.3207400514224043E9|   2.862830523638E10|                NULL|                NULL|2.6236609653327503|545.7390917325893|111.22072642816451| 4.651383025089083|

In [11]:
# Discount relative to the list price, in percent.
# listPrice is 0 for rows without a list price (see the data preview), so the
# division is only evaluated when listPrice is non-zero; every other row gets
# NULL. This makes the "no discount available" case explicit instead of
# relying on Spark's silent divide-by-zero -> NULL behaviour, which turns into
# an error when ANSI SQL mode is enabled.
df_cleaned = df_cleaned.withColumn(
    "discount_percentage",
    when(
        col("listPrice") != 0,
        round(((col("listPrice") - col("price")) / col("listPrice")) * 100, 2),
    ),
)

# Popularity score: number of reviews multiplied by the star rating
df_cleaned = df_cleaned.withColumn("popularity_score", col("reviews") * col("stars"))

# "High Sales" when more than 100 units were bought last month, otherwise "Low Sales"
df_cleaned = df_cleaned.withColumn(
    "sales_performance",
    when(col("boughtInLastMonth") > 100, "High Sales").otherwise("Low Sales"),
)

# Bucket products by review count: > 1000 "High", > 100 "Medium", else "Low"
df_cleaned = df_cleaned.withColumn(
    "reviewCategory",
    when(col("reviews") > 1000, "High").when(col("reviews") > 100, "Medium").otherwise("Low"),
)

# Show a few records after feature engineering
df_cleaned.select(
    "title",
    "categoryName",
    "price",
    "discount_percentage",
    "popularity_score",
    "sales_performance",
    "reviewCategory",
).show(5)


+--------------------+----------------+------+-------------------+----------------+-----------------+--------------+
|               title|    categoryName| price|discount_percentage|popularity_score|sales_performance|reviewCategory|
+--------------------+----------------+------+-------------------+----------------+-----------------+--------------+
|The Two Marys Wat...|Handmade Artwork|461.95|               NULL|             0.0|        Low Sales|           Low|
|Taking on Provisi...|Handmade Artwork|293.95|               NULL|             0.0|        Low Sales|           Low|
|Street Scene, Chr...|Handmade Artwork|265.95|               NULL|             0.0|        Low Sales|           Low|
|The First Animals...|Handmade Artwork|351.95|               NULL|             0.0|        Low Sales|           Low|
|Une dame sous la ...|Handmade Artwork|469.95|               NULL|             0.0|        Low Sales|           Low|
+--------------------+----------------+------+------------------

In [12]:
# Per category: units bought last month (sum), mean price and mean popularity
# score, ordered so the categories with the most units bought come first
sales_by_category = (
    df_cleaned
    .groupBy("categoryName")
    .agg({"boughtInLastMonth": "sum", "price": "avg", "popularity_score": "avg"})
    .orderBy(col("sum(boughtInLastMonth)").desc())
)

# Display the leading categories
sales_by_category.show()

# Same total, but split by (category, price) pair
sales_by_price_category = (
    df_cleaned
    .groupBy("categoryName", "price")
    .agg({"boughtInLastMonth": "sum"})
    .orderBy(col("sum(boughtInLastMonth)").desc())
)

# Display the (category, price) pairs with the most units bought
sales_by_price_category.show()

+--------------------+----------------------+------------------+---------------------+
|        categoryName|sum(boughtInLastMonth)|        avg(price)|avg(popularity_score)|
+--------------------+----------------------+------------------+---------------------+
|              Beauty|             3470100.0| 23.02486917983499|   11706.788172789406|
|             Grocery|             1609450.0| 25.56372952289839|   3542.2711825899837|
|       Home  Kitchen|             1251950.0|51.219832338732054|   24721.365558397254|
|        Pet Supplies|             1063400.0| 34.00641321613692|   11074.306946944853|
|Health  Personal ...|              988750.0|28.642503282275396|    8695.916455142215|
|Health Care Products|              711300.0| 28.03775949540616|    5354.033634992452|
|         Toys  Games|              678800.0|34.503308351177054|    8909.378533190615|
|     Office Products|              550100.0|28.820169283276147|   10097.767412969288|
|Home Storage  Org...|              510900.

In [13]:
# a. Sales Performance by Category

# Total review count per category, used here as a stand-in for sales performance
sales_performance = (
    df_cleaned
    .groupBy("categoryName")
    .agg({"reviews": "sum"})
    .withColumnRenamed("sum(reviews)", "totalReviews")
)

# Ten categories with the most reviews
top_by_reviews = col("totalReviews").desc()
sales_performance.orderBy(top_by_reviews).show(10)

+------------------+------------+
|      categoryName|totalReviews|
+------------------+------------+
|       Electronics|1.09694067E8|
|            Beauty| 5.0192709E7|
|      Data Storage| 3.3968817E7|
|               Men| 2.7540946E7|
|Televisions  Video| 2.6003143E7|
|      Pet Supplies| 2.3943228E7|
|             Women| 2.3700128E7|
|     Home  Kitchen|   1.95412E7|
|   Office Products| 1.7129606E7|
|              Baby| 1.6712775E7|
+------------------+------------+
only showing top 10 rows



In [14]:
# b. Price Sensitivity Analysis
# Average review count for each distinct price value
price_sensitivity = (
    df_cleaned
    .groupBy("price")
    .agg({"reviews": "avg"})
    .withColumnRenamed("avg(reviews)", "avgReviews")
)

# Ten highest price points and their average review count
highest_price_first = col("price").desc()
price_sensitivity.orderBy(highest_price_first).show(10)


+--------+----------+
|   price|avgReviews|
+--------+----------+
| 40900.0|       0.0|
| 34552.5|       0.0|
| 32079.9|       0.0|
|29508.95|       0.0|
|28012.45|       0.0|
| 27890.8|       0.0|
|27504.41|       0.0|
|27409.95|       0.0|
|26975.67|       0.0|
| 26376.3|       0.0|
+--------+----------+
only showing top 10 rows

