In [18]:
import pyspark
from pyspark.sql import SparkSession

In [19]:
# Start (or reuse) the Spark session that backs every query in this notebook.
spark = (
    SparkSession.builder
    .appName("Sales analysis")
    .getOrCreate()
)

In [20]:
# Load the e-commerce CSV: the first row supplies the column names and Spark
# infers the column types. NOTE: the path is absolute and machine-specific.
df = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .load('C:/Users/Asus/OneDrive/Desktop/gitclone/E-commerce Dataset.csv')
)

# Show the DataFrame summary via display(), then print the first rows as a
# text table.
display(df)
df.show()

DataFrame[Order_Date: date, Time: timestamp, Aging: double, Customer_Id: int, Gender: string, Device_Type: string, Customer_Login_type: string, Product_Category: string, Product: string, Sales: double, Quantity: double, Discount: double, Profit: double, Shipping_Cost: double, Order_Priority: string, Payment_method: string]

+----------+-------------------+-----+-----------+------+-----------+-------------------+------------------+--------------------+-----+--------+--------+------+-------------+--------------+--------------+
|Order_Date|               Time|Aging|Customer_Id|Gender|Device_Type|Customer_Login_type|  Product_Category|             Product|Sales|Quantity|Discount|Profit|Shipping_Cost|Order_Priority|Payment_method|
+----------+-------------------+-----+-----------+------+-----------+-------------------+------------------+--------------------+-----+--------+--------+------+-------------+--------------+--------------+
|2018-01-02|2024-03-21 10:56:33|  8.0|      37077|Female|        Web|             Member|Auto & Accessories|   Car Media Players|140.0|     1.0|     0.3|  46.0|          4.6|        Medium|   credit_card|
|2018-07-24|2024-03-21 20:41:37|  2.0|      59173|Female|        Web|             Member|Auto & Accessories|        Car Speakers|211.0|     1.0|     0.3| 112.0|         11.2|      

In [21]:
# Total number of rows in the loaded sales DataFrame.
df.count()

51290

In [22]:
# Print each column's name, inferred type and nullability.
df.printSchema()

root
 |-- Order_Date: date (nullable = true)
 |-- Time: timestamp (nullable = true)
 |-- Aging: double (nullable = true)
 |-- Customer_Id: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Device_Type: string (nullable = true)
 |-- Customer_Login_type: string (nullable = true)
 |-- Product_Category: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Sales: double (nullable = true)
 |-- Quantity: double (nullable = true)
 |-- Discount: double (nullable = true)
 |-- Profit: double (nullable = true)
 |-- Shipping_Cost: double (nullable = true)
 |-- Order_Priority: string (nullable = true)
 |-- Payment_method: string (nullable = true)



In [23]:
# Register the DataFrame as the temporary view "sales_data" so that it can be
# queried with spark.sql(); an existing view with that name is replaced.
df.createOrReplaceTempView("sales_data")

In [24]:
# 1. Customer segmentation: total spend per customer.
# The aggregated column produced here is named `sum(Sales)`.
customer_data = (
    df
    .groupBy('Customer_Id')
    .sum('Sales')
)
customer_data.show()

+-----------+----------+
|Customer_Id|sum(Sales)|
+-----------+----------+
|      37251|     140.0|
|      49855|     249.0|
|      19204|     336.0|
|      45011|     225.0|
|      19553|     283.0|
|      29285|     211.0|
|      37307|     250.0|
|      22346|     575.0|
|      42834|     486.0|
|      33868|     140.0|
|      49308|      54.0|
|      13832|     202.0|
|      48398|     231.0|
|      54264|     250.0|
|      40383|     250.0|
|      38723|     140.0|
|      53691|     680.0|
|      12940|      54.0|
|      32592|      54.0|
|      45307|     548.0|
+-----------+----------+
only showing top 20 rows



In [25]:
from pyspark.sql.functions import when

# Label every customer with a spending segment based on their total sales:
#   total >= 500        -> 'First Class'
#   300 <= total < 500  -> 'Second Class'
#   total < 300         -> 'Third Class'
#   anything else       -> 'Unknown' (e.g. a null total, which matches no comparison)
#
# Lower bounds are inclusive. With a strict `> 500` on the first branch and
# `< 500` on the second, a total of exactly 500 matched no branch and was
# labelled 'Unknown'.
total_sales = customer_data['sum(Sales)']

customer_data = customer_data.withColumn(
    'Segment',
    # Branches are evaluated in order, so the second one only sees totals
    # that are below 500.
    when(total_sales >= 500, 'First Class')
    .when(total_sales >= 300, 'Second Class')
    .when(total_sales < 300, 'Third Class')
    .otherwise('Unknown'),
)


In [26]:
# Rank customers from highest to lowest total spend and preview the top rows.
data = customer_data.orderBy(
    customer_data['sum(Sales)'].desc()
)
data.show()

+-----------+----------+-----------+
|Customer_Id|sum(Sales)|    Segment|
+-----------+----------+-----------+
|      87989|     994.0|First Class|
|      35324|     949.0|First Class|
|      79016|     935.0|First Class|
|      10436|     929.0|First Class|
|      51093|     927.0|First Class|
|      16997|     925.0|First Class|
|      80884|     904.0|First Class|
|      97843|     903.0|First Class|
|      95831|     894.0|First Class|
|      37805|     890.0|First Class|
|      11050|     887.0|First Class|
|      89861|     887.0|First Class|
|      99254|     885.0|First Class|
|      87539|     884.0|First Class|
|      39084|     881.0|First Class|
|      69462|     866.0|First Class|
|      60059|     864.0|First Class|
|      46082|     862.0|First Class|
|      13590|     854.0|First Class|
|      13264|     853.0|First Class|
+-----------+----------+-----------+
only showing top 20 rows



In [27]:
# 2. Monthly sales trend: total sales per calendar month of Order_Date,
#    listed from the highest-grossing month to the lowest.
monthly_sales_sql = '''select month(Order_Date) as Order_month, sum(Sales) as Total_Sales from sales_data
             group by Order_month 
             order by Total_Sales desc'''

data = spark.sql(monthly_sales_sql)
data.show()

+-----------+-----------+
|Order_month|Total_Sales|
+-----------+-----------+
|         11|   877881.0|
|          5|   824502.0|
|          7|   810205.0|
|         12|   767147.0|
|         10|   743387.0|
|          9|   738303.0|
|          8|   664495.0|
|          6|   642555.0|
|          4|   597312.0|
|          3|   435502.0|
|          1|   379627.0|
|          2|   332495.0|
+-----------+-----------+



In [28]:
# 3. Hourly sales analysis: total sales amount per hour of the Time column,
#    listed from the highest-grossing hour to the lowest.
hourly_sales_sql = '''select hour(Time) as Sale_Hour, sum(Sales) as Total_Sales from sales_data 
             group by Sale_Hour 
             order by Total_Sales desc'''

data = spark.sql(hourly_sales_sql)
data.show()


+---------+-----------+
|Sale_Hour|Total_Sales|
+---------+-----------+
|       11|   522162.0|
|       13|   517504.0|
|       15|   517257.0|
|       14|   516064.0|
|       16|   514393.0|
|       10|   498233.0|
|       20|   493227.0|
|       21|   488724.0|
|       17|   476924.0|
|       12|   463949.0|
|       19|   459328.0|
|       22|   458391.0|
|       18|   445695.0|
|        9|   374256.0|
|       23|   327508.0|
|        8|   236029.0|
|        0|   190577.0|
|        7|    97260.0|
|        1|    89801.0|
|        6|    37674.0|
+---------+-----------+
only showing top 20 rows



In [29]:
# i. Which product category sold the most?
#    Total sales amount per Product_Category, highest first.
category_sales_sql = '''select Product_Category,sum(Sales) as Total_Sales from sales_data
             group by Product_Category
             order by Total_Sales desc'''

data = spark.sql(category_sales_sql)
data.show()

+------------------+-----------+
|  Product_Category|Total_Sales|
+------------------+-----------+
|           Fashion|  4345914.0|
|  Home & Furniture|  1975831.0|
|Auto & Accessories|  1096928.0|
|        Electronic|   394738.0|
+------------------+-----------+



In [30]:
# ii. Which product sold the most?
#     Counts the rows recorded for each Product (not the Quantity column) and
#     keeps the five products with the highest counts.
top_products_sql = '''select Product, count(Product) as number_of_sales from sales_data
             group by Product
             order by number_of_sales desc
             limit 5'''

data = spark.sql(top_products_sql)
data.show()

+------------+---------------+
|     Product|number_of_sales|
+------------+---------------+
|  T - Shirts|           2332|
|       Suits|           2332|
|Fossil Watch|           2332|
|      Shirts|           2332|
|       Jeans|           2332|
+------------+---------------+



In [31]:
# i. What are the most commonly used payment methods?
#    Number of rows per Payment_method, most frequently used first.
payment_usage_sql = '''select Payment_method, count(Payment_method) as mostly_used_count from sales_data
          group by Payment_method
          order by mostly_used_count desc'''

data = spark.sql(payment_usage_sql)
data.show()


+--------------+-----------------+
|Payment_method|mostly_used_count|
+--------------+-----------------+
|   credit_card|            38137|
|   money_order|             9629|
|      e_wallet|             2789|
|    debit_card|              734|
|   not_defined|                1|
+--------------+-----------------+

