In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql import types as t

In [3]:
# Start (or reuse) a SparkSession that runs locally on every available core,
# with adaptive query execution switched on.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Electronic Sales Data")
    .config("spark.sql.adaptive.enabled", "true")
    .getOrCreate()
)

24/05/04 17:53:17 WARN Utils: Your hostname, codespaces-0d4183 resolves to a loopback address: 127.0.0.1; using 172.16.5.4 instead (on interface eth0)
24/05/04 17:53:17 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/04 17:53:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/05/04 17:53:30 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [4]:
# Load the raw sales CSV, taking column names from the first line of the file.
# No schema is supplied, so the values are read as they appear in the file.
electronic_sales_df = spark.read.csv(
    "../input_data/sales.csv",
    header=True,
)
electronic_sales_df.show(5)

+----------+---+-----------+-----+-----+----------+------------+
|      date| id|category_id|sales|views|price_cost|price_retail|
+----------+---+-----------+-----+-----+----------+------------+
|2022-02-24|  1|          3|    0|    0|         0|           0|
|2022-02-25|  1|          3|    0|    0|         0|           0|
|2022-02-26|  1|          3|    0|    0|         0|           0|
|2022-02-27|  1|          3|    0|    0|         0|           0|
|2022-02-28|  1|          3|    0|    0|         0|           0|
+----------+---+-----------+-----+-----+----------+------------+
only showing top 5 rows



In [6]:
# Inspect the column names and types of the raw frame as loaded from the CSV.
electronic_sales_df.printSchema()

root
 |-- date: string (nullable = true)
 |-- id: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- sales: string (nullable = true)
 |-- views: string (nullable = true)
 |-- price_cost: string (nullable = true)
 |-- price_retail: string (nullable = true)



In [8]:
# Total number of rows in the raw dataset (an action: runs a Spark job).
electronic_sales_df.count()

                                                                                

2548824

## Easy Level

### Basic Aggregation:

In [13]:
# Per-category summary:
#   * avg(views) - average views per category
#   * sum(sales) - total sales per category
# The columns keep Spark's default aggregate names.

total_sales_per_category_df = (
    electronic_sales_df
    .groupBy("category_id")
    .agg(f.avg("views"), f.sum("sales"))
)
total_sales_per_category_df.show()

[Stage 14:>                                                         (0 + 2) / 2]

+-----------+------------------+----------+
|category_id|        avg(views)|sum(sales)|
+-----------+------------------+----------+
|          3| 42.06166696912871|   73233.0|
|          1| 42.66659622361947|  127308.0|
|          4| 36.96256830601093|     656.0|
|          2|32.377772201776885|  743416.0|
+-----------+------------------+----------+





In [20]:
# Maximum retail price within each category, sorted from highest to lowest,
# so the first row also answers "what is the maximum retail price among all
# products?". `price_retail` is cast to float first so that max() compares
# numbers rather than strings.
max_retail_price_df = (
    electronic_sales_df
    .withColumn("price_retail", f.col("price_retail").cast("float"))
    .groupBy("category_id")
    .agg(f.max("price_retail"))
    .orderBy(f.col("max(price_retail)").desc())
)
max_retail_price_df.show(5)



+-----------+-----------------+
|category_id|max(price_retail)|
+-----------+-----------------+
|          4|         759077.0|
|          1|         300957.0|
|          2|          62401.0|
|          3|          10878.0|
+-----------+-----------------+



                                                                                

### Data Cleaning:

In [41]:
# Check for null values in each column.
#
# Every per-column null count is computed in one aggregation (a single pass
# over the data) instead of launching a separate filter().count() job for
# each column.

columns = electronic_sales_df.columns

# when() without otherwise() yields null for rows that do not match, and
# count() skips nulls, so each expression counts exactly the rows where the
# column is null. count() returns 0 (not null) when nothing matches.
null_counts_row = (
    electronic_sales_df
    .select([f.count(f.when(f.col(c).isNull(), 1)).alias(c) for c in columns])
    .first()
)

# Keep the result as (column, null_count) pairs in column order.
null_columns_comprehensive_list = [(c, null_counts_row[c]) for c in columns]

for column, count in null_columns_comprehensive_list:
    print(f"Column {column} \t null_count:{count} ")


[Stage 148:>                                                        (0 + 2) / 2]

Column date 	 null_count:0 
Column id 	 null_count:0 
Column category_id 	 null_count:0 
Column sales 	 null_count:0 
Column views 	 null_count:0 
Column price_cost 	 null_count:0 
Column price_retail 	 null_count:0 




In [25]:
# Build a typed copy of the raw frame:
#   * `new_date` holds `date` parsed as a date (the original `date` column is
#     left untouched as a string);
#   * `sales` and `views` become integers;
#   * `price_cost` and `price_retail` become floats.
cleaned_df = (
    electronic_sales_df
    .withColumn("new_date", f.col("date").cast("date"))
    .withColumn("sales", f.col("sales").cast("int"))
    .withColumn("views", f.col("views").cast("int"))
    .withColumn("price_cost", f.col("price_cost").cast("float"))
    .withColumn("price_retail", f.col("price_retail").cast("float"))
)
cleaned_df.show(5)

+----------+---+-----------+-----+-----+----------+------------+----------+
|      date| id|category_id|sales|views|price_cost|price_retail|  new_date|
+----------+---+-----------+-----+-----+----------+------------+----------+
|2022-02-24|  1|          3|    0|    0|       0.0|         0.0|2022-02-24|
|2022-02-25|  1|          3|    0|    0|       0.0|         0.0|2022-02-25|
|2022-02-26|  1|          3|    0|    0|       0.0|         0.0|2022-02-26|
|2022-02-27|  1|          3|    0|    0|       0.0|         0.0|2022-02-27|
|2022-02-28|  1|          3|    0|    0|       0.0|         0.0|2022-02-28|
+----------+---+-----------+-----+-----+----------+------------+----------+
only showing top 5 rows



In [26]:
# Confirm the casts took effect: sales/views as integer, prices as float,
# and the added new_date column as date.
cleaned_df.printSchema()

root
 |-- date: string (nullable = true)
 |-- id: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- sales: integer (nullable = true)
 |-- views: integer (nullable = true)
 |-- price_cost: float (nullable = true)
 |-- price_retail: float (nullable = true)
 |-- new_date: date (nullable = true)



### Filtering:

In [33]:
# Keep only rows that have at least one sale (sales > 0) and whose `views`
# value is not null, then report how many rows remain.
sales_greater_than_0_df = (
    cleaned_df
    .where(f.col("sales") > 0)
    .where(f.col("views").isNotNull())
)
sales_greater_than_0_df.count()



194358

                                                                                