### 1. Spark Setup and Data Loading

In [None]:
from pyspark.sql import SparkSession

# Build a Spark session for this analysis, or reuse one that already exists.
# "local[*]" runs Spark on this machine rather than on a cluster.
spark = (
    SparkSession.builder
    .appName("EDA of eCommerce Dataset")
    .master("local[*]")
    .getOrCreate()
)

In [None]:
# Loading Data into Spark DataFrame
import os

# Location of the November 2019 event log. Set the ECOMMERCE_DATA_PATH
# environment variable to point at your own copy of the CSV; when it is unset
# the original author's local path is used, so existing setups keep working.
DATA_PATH = os.environ.get(
    "ECOMMERCE_DATA_PATH",
    "/Users/marcelwinterhalter/Developer/Projects/BDSP/eCommerceDataset/2019-Nov.csv",
)

# header=True takes column names from the first row; inferSchema=True asks
# Spark to infer each column's type instead of reading everything as a string.
df = spark.read.csv(DATA_PATH, header=True, inferSchema=True)


### 2. Basic Data Overview

In [None]:
# Preview of the data: the first 20 rows, with long cell values truncated.
# Both arguments are show()'s defaults, written out here for the reader.
df.show(n=20, truncate=True)

In [None]:
# Data types and nullability of every column, printed as a tree.
# The types shown are the ones Spark inferred while reading the CSV
# (the file is loaded with inferSchema=True), not a hand-declared schema.
df.printSchema()

In [None]:
# Summary statistics for the columns of the DataFrame.
# describe() returns a new DataFrame of statistics, which is then displayed.
summary_stats = df.describe()
summary_stats.show()

In [10]:
from pyspark.sql.functions import col
# Count the rows that have neither a category_code nor a brand and whose
# price is 0.
#
# The result is deliberately NOT stored in a variable called `count`: this
# notebook also imports pyspark.sql.functions.count under that name, and
# rebinding it to an integer would break any later `count(...)` call if the
# cells are re-run out of order.
missing_meta_zero_price = (
    col("category_code").isNull()
    & col("brand").isNull()
    & (col("price") == 0)  # parentheses required: & binds tighter than ==
)
num_missing_meta_zero_price = df.filter(missing_meta_zero_price).count()
print("Number of entries where 'category_code' and 'brand' are NULL and 'price' is 0:", num_missing_meta_zero_price)



Number of entries where 'category_code' and 'brand' are NULL and 'price' is 0: 96167


                                                                                

### 3. Data Cleaning and Preprocessing

#### 3.1 Checking for Missing Values

In [14]:
from pyspark.sql.functions import col, when, count

# Number of NULLs per column, shown as a single-row table.
# For each column, when() produces a non-null value only on rows where that
# column is NULL, and count() counts non-null values, so the aggregate equals
# the column's NULL count. Each result is aliased back to the column name.
null_count_exprs = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in df.columns
]
df.select(null_count_exprs).show()



+----------+----------+----------+-----------+-------------+-------+-----+-------+------------+
|event_time|event_type|product_id|category_id|category_code|  brand|price|user_id|user_session|
+----------+----------+----------+-----------+-------------+-------+-----+-------+------------+
|         0|         0|         0|          0|     21898171|9218235|    0|      0|          10|
+----------+----------+----------+-----------+-------------+-------+-----+-------+------------+



                                                                                