1. Data Preprocessing
First, load the dataset and preprocess it to handle any missing values and convert data types.

In [14]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, round

# Initialize Spark Session
spark = SparkSession.builder.appName("AmazonCanadaSalesAnalysis").getOrCreate()

# Load dataset (download the CSV file from the Kaggle URL locally first)
# https://www.kaggle.com/datasets/asaniczka/amazon-canada-products-2023-2-1m-products
file_path = 'data/amz_ca_total_products_data_processed.csv'

# Load dataset into PySpark DataFrame
df = spark.read.csv(file_path, header=True, inferSchema=True)

# Show the schema and first few rows to understand the data
df.printSchema()
df.show(5)


# Handling missing values by removing rows with null in important columns
df_cleaned = df.dropna(subset=["asin", "title", "categoryName", "stars", "reviews", "price", "boughtInLastMonth"])

# Convert `isBestSeller` to boolean type, if not already
df_cleaned = df_cleaned.withColumn("isBestSeller", col("isBestSeller").cast("boolean"))

# Convert `price` and `listPrice` to numeric types (float) and handle any data cleaning there
df_cleaned = df_cleaned.withColumn("price", round(col("price"), 2))
df_cleaned = df_cleaned.withColumn("listPrice", round(col("listPrice"), 2))

# Basic statistics
df_cleaned.describe().show()


root
 |-- asin: string (nullable = true)
 |-- title: string (nullable = true)
 |-- imgUrl: string (nullable = true)
 |-- productURL: string (nullable = true)
 |-- stars: string (nullable = true)
 |-- reviews: string (nullable = true)
 |-- price: string (nullable = true)
 |-- listPrice: string (nullable = true)
 |-- categoryName: string (nullable = true)
 |-- isBestSeller: string (nullable = true)
 |-- boughtInLastMonth: string (nullable = true)

+----------+--------------------+--------------------+--------------------+------------------+--------------------+--------------------+---------+--------------------+------------+-----------------+
|      asin|               title|              imgUrl|          productURL|             stars|             reviews|               price|listPrice|        categoryName|isBestSeller|boughtInLastMonth|
+----------+--------------------+--------------------+--------------------+------------------+--------------------+--------------------+---------+------

2. Feature Engineering
Now, we’ll add some new features to enrich the analysis:

Discount Percentage: The difference between the listPrice and price.
Popularity Score: Combining product reviews and ratings.
Sales Trend Analysis: Using the boughtInLastMonth to gauge sales performance.

In [16]:
# Calculate the discount percentage
from pyspark.sql.functions import when

df_cleaned = df_cleaned.withColumn("discount_percentage", 
                                   round(((col("listPrice") - col("price")) / col("listPrice")) * 100, 2))

# Create a popularity score as a product of reviews and stars (ratings)
df_cleaned = df_cleaned.withColumn("popularity_score", col("reviews") * col("stars"))

# Categorizing products into "High Sales" and "Low Sales" based on units sold last month
df_cleaned = df_cleaned.withColumn("sales_performance", 
                                   when(col("boughtInLastMonth") > 100, "High Sales").otherwise("Low Sales"))

# Classify products based on reviews count (as proxy for sales performance)
df_cleaned =  df_cleaned.withColumn("reviewCategory", when(col("reviews") > 1000, "High").when(col("reviews") > 100, "Medium").otherwise("Low"))

# Show a few records after feature engineering
df_cleaned.select("title", "categoryName", "price", "discount_percentage", "popularity_score", "sales_performance","reviewCategory").show(5)


+--------------------+--------------------+-----+-------------------+------------------+-----------------+--------------+
|               title|        categoryName|price|discount_percentage|  popularity_score|sales_performance|reviewCategory|
+--------------------+--------------------+-----+-------------------+------------------+-----------------+--------------+
|Green Leaf WW3D W...|Industrial  Scien...|47.69|               NULL|12654.400000000001|        Low Sales|          High|
|8pcs Toilet Seat ...|Industrial  Scien...|10.99|               NULL|             209.0|        Low Sales|           Low|
|YaeCCC 19 Pcs Hol...|Industrial  Scien...|25.99|               7.15|             504.0|        Low Sales|        Medium|
|LLPT Butyl Putty ...|Industrial  Scien...|21.99|              29.04|            8712.0|        Low Sales|          High|
|"Lightbeam 16"" L...|                  46| NULL|               NULL|              NULL|        Low Sales|           Low|
+--------------------+--

3. Exploratory Data Analysis (EDA)
We will now explore the data to understand which product categories and price ranges perform well. We'll also explore relationships between ratings, reviews, and sales.

In [5]:
# Group by product category and summarize total sales and average price
sales_by_category = df_cleaned.groupBy("categoryName").agg(
    {"boughtInLastMonth": "sum", "price": "avg", "popularity_score": "avg"}
).orderBy("sum(boughtInLastMonth)", ascending=False)

# Show top-performing categories
sales_by_category.show()

# Sales performance by product category and price
sales_by_price_category = df_cleaned.groupBy("categoryName", "price").agg(
    {"boughtInLastMonth": "sum"}
).orderBy("sum(boughtInLastMonth)", ascending=False)

# Show price-based sales trends
sales_by_price_category.show()


+--------------------+----------------------+------------------+---------------------+
|        categoryName|sum(boughtInLastMonth)|        avg(price)|avg(popularity_score)|
+--------------------+----------------------+------------------+---------------------+
|              Beauty|             3460000.0|  22.9383352111156|   11721.571334299057|
|             Grocery|             1607150.0| 25.55472130754945|   3538.7403220978313|
|       Home  Kitchen|             1207250.0| 51.60657817109107|   24564.285309734485|
|        Pet Supplies|             1049450.0| 33.54416929604863|   11130.233489283017|
|Health  Personal ...|              984650.0|28.625196467990865|    8624.692803531994|
|Health Care Products|              701450.0|27.715260204797016|    5348.252910646641|
|         Toys  Games|              667300.0| 33.76112341197774|    8939.105226860294|
|                Baby|              506150.0|27.809507491293935|    2977.076775026393|
|     Office Products|              501700.

4. Trend Analysis
We’ll analyze the trends for the best-selling product categories, and how factors like price, reviews, and discounts influence sales.


In [17]:
# a. Sales Performance by Category

# Aggregating total reviews by category as a proxy for sales performance
sales_performance = df_cleaned.groupBy("categoryName").agg({"reviews": "sum"}).withColumnRenamed("sum(reviews)", "totalReviews")
sales_performance.orderBy(col("totalReviews").desc()).show(10)

+--------------------+--------------+
|        categoryName|  totalReviews|
+--------------------+--------------+
| Cast Iron Grill ...|1.462016451E10|
| 25708 Random Orb...|   9.0027707E8|
|https://www.amazo...|  5.32403107E8|
|           466420911|  4.63461614E8|
| Backyard BY14-10...|  4.63440109E8|
|           463441514|  4.63420509E8|
|          Tera Gear"|  4.63411712E8|
|https://www.amazo...|  4.63377017E8|
|https://www.amazo...|  4.63251714E8|
|         Electronics|   1.0802928E8|
+--------------------+--------------+
only showing top 10 rows



In [18]:
# b. Price Sensitivity Analysis
# Price vs. Reviews Analysis
price_sensitivity = df_cleaned.groupBy("price").agg({"reviews": "avg"}).withColumnRenamed("avg(reviews)", "avgReviews")
price_sensitivity.orderBy(col("price").desc()).show(10)


+--------------+--------------+
|         price|    avgReviews|
+--------------+--------------+
|1.462367931E10|1.462016451E10|
|  5.32161597E8|          NULL|
|  5.32125907E8|      193214.0|
|   5.3200904E8|          NULL|
|  4.66420909E8|  4.63461614E8|
|  4.63441312E8|  4.63440109E8|
|  4.63440109E8|  4.63420509E8|
|  4.63411911E8|  4.63411712E8|
|    3.162243E8|     1614140.0|
|      235264.0|      236264.0|
+--------------+--------------+
only showing top 10 rows



In [19]:
# c. Best Seller by Category
# Best seller count by category
best_seller = df.filter(df.isBestSeller == True).groupBy("categoryName").count().withColumnRenamed("count", "bestSellerCount")
best_seller.orderBy(col("bestSellerCount").desc()).show(10)


# Identify top-performing products and categories based on the number of units sold in the last month
top_products = df_cleaned.orderBy("boughtInLastMonth", ascending=False)
top_products.show(10)

# Analyzing the impact of discount percentage on sales
sales_vs_discounts = df_cleaned.groupBy("discount_percentage").agg(
    {"boughtInLastMonth": "sum"}
).orderBy("discount_percentage", ascending=False)

sales_vs_discounts.show()


+--------------------+---------------+
|        categoryName|bestSellerCount|
+--------------------+---------------+
|         Electronics|            428|
|       Home  Kitchen|            407|
|    Sports  Outdoors|            345|
|Automotive Replac...|            266|
|Clothing, Shoes  ...|            260|
|              Beauty|            256|
| Patio, Lawn  Garden|            230|
|        Pet Supplies|            221|
|                Baby|            213|
|             Grocery|            213|
+--------------------+---------------+
only showing top 10 rows

+----------+--------------------+--------------------+--------------------+--------------------+-----------------+-----+---------+--------------+------------+--------------------+-------------------+----------------+-----------------+--------------+
|      asin|               title|              imgUrl|          productURL|               stars|          reviews|price|listPrice|  categoryName|isBestSeller|   boughtInLastMonth

5. Exporting Data for Power BI
Once we have the analysis results, we will export the data for use in Power BI.

In [20]:
# Export to CSV for Power BI
sales_performance.toPandas().to_csv("results/sales_performance.csv", index=False)
price_sensitivity.toPandas().to_csv("results/price_sensitivity.csv", index=False)
best_seller.toPandas().to_csv("results/best_seller.csv", index=False)

6. Visualizing in Power BI
Now load these CSV files into Power BI and create visuals like:

Sales Performance: Bar chart of totalReviews per categoryName.
Price Sensitivity: Line chart of avgReviews against price.
Best Sellers: Pie chart or bar chart showing the distribution of best sellers by categoryName.

In [1]:
from pyspark.sql.functions import col, expr
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
df4NetProfit = df.dropna(subset=["asin", "title", "categoryName", "stars", "reviews", "price", "listPrice", "categoryName", "isBestSeller", "boughtInLastMonth"])

df_4NetProfit = df_4NetProfit.withColumn("price", col("price").cast("double")) \
       .withColumn("listPrice", col("listPrice").cast("double")) \
       .withColumn("boughtInLastMonth", col("boughtInLastMonth").cast("int"))
# Handle null values
df_4NetProfit = df_4NetProfit.na.fill({"boughtInLastMonth": 0})



