# Retail Sales Superstore

In [2]:
# Colab-only step: prompt for a local file and upload it into the runtime.
# The recorded output shows superstore.csv being saved under that same name.
from google.colab import files
uploaded = files.upload()  # result is kept in `uploaded`; later cells open the file by name

Saving superstore.csv to superstore.csv


# PART 1: Pandas DataFrame Operations
🔹 1. Load the CSV using pandas

In [6]:
import pandas as pd

# Load the raw orders file; every later pandas cell works on `df`.
df = pd.read_csv(filepath_or_buffer="superstore.csv")

# 2. Print schema, head, shape, dtypes.

In [7]:
# Print the column dtypes, the first rows and the (rows, columns) shape,
# each preceded by its label.
for label, value in (
    ("Schema:\n", df.dtypes),
    ("\nHead:\n", df.head()),
    ("\nShape:", df.shape),
):
    print(label, value)

Schema:
 OrderID         object
OrderDate       object
Customer        object
Segment         object
Region          object
Product         object
Category        object
SubCategory     object
Quantity         int64
UnitPrice        int64
Discount       float64
Profit           int64
dtype: object

Head:
    OrderID   OrderDate Customer      Segment Region   Product  \
0  CA-1001  2023-01-15     Ravi     Consumer  South    Laptop   
1  CA-1002  2023-02-20    Priya    Corporate  North   Printer   
2  CA-1003  2023-01-25     Amit     Consumer   East  Notebook   
3  CA-1004  2023-03-01    Anita  Home Office   West     Table   
4  CA-1005  2023-02-05    Divya     Consumer  South     Phone   

          Category  SubCategory  Quantity  UnitPrice  Discount  Profit  
0       Technology    Computers         1      55000      0.10    5000  
1       Technology  Peripherals         2      12000      0.15    1800  
2  Office Supplies        Paper         3        200      0.05     150  
3        F

# 3. Select Customer , Product , Profit columns.

In [9]:
# Keep every row but only the three columns of interest.
df.loc[:, ['Customer', 'Product', 'Profit']]

Unnamed: 0,Customer,Product,Profit
0,Ravi,Laptop,5000
1,Priya,Printer,1800
2,Amit,Notebook,150
3,Anita,Table,-1500
4,Divya,Phone,3000


# 4. Filter orders where Profit > 2000 and Discount = 0 .

In [10]:
# Rows whose profit exceeds 2000 and that were sold without any discount.
df.loc[df['Profit'].gt(2000) & df['Discount'].eq(0)]

Unnamed: 0,OrderID,OrderDate,Customer,Segment,Region,Product,Category,SubCategory,Quantity,UnitPrice,Discount,Profit
4,CA-1005,2023-02-05,Divya,Consumer,South,Phone,Technology,Phones,2,20000,0.0,3000


# 5. Sort by Profit descending.

In [11]:
# Most profitable orders first; returns a sorted copy and leaves `df` untouched.
df.sort_values(['Profit'], ascending=[False])

Unnamed: 0,OrderID,OrderDate,Customer,Segment,Region,Product,Category,SubCategory,Quantity,UnitPrice,Discount,Profit
0,CA-1001,2023-01-15,Ravi,Consumer,South,Laptop,Technology,Computers,1,55000,0.1,5000
4,CA-1005,2023-02-05,Divya,Consumer,South,Phone,Technology,Phones,2,20000,0.0,3000
1,CA-1002,2023-02-20,Priya,Corporate,North,Printer,Technology,Peripherals,2,12000,0.15,1800
2,CA-1003,2023-01-25,Amit,Consumer,East,Notebook,Office Supplies,Paper,3,200,0.05,150
3,CA-1004,2023-03-01,Anita,Home Office,West,Table,Furniture,Tables,1,18000,0.2,-1500


# 6. GroupBy Category → Total Profit, Avg Discount.

In [12]:
# Per-category total profit and mean discount, using named aggregation so the
# output columns keep the names Profit and Discount.
df.groupby('Category').agg(
    Profit=('Profit', 'sum'),
    Discount=('Discount', 'mean'),
)

Unnamed: 0_level_0,Profit,Discount
Category,Unnamed: 1_level_1,Unnamed: 2_level_1
Furniture,-1500,0.2
Office Supplies,150,0.05
Technology,9800,0.083333


# 7. Add a column TotalPrice = Quantity * UnitPrice .

In [13]:
# Order value before discount: units sold times the per-unit price.
df['TotalPrice'] = df['Quantity'].mul(df['UnitPrice'])

# Show the inputs next to the derived column.
df.loc[:, ['Quantity', 'UnitPrice', 'TotalPrice']]

Unnamed: 0,Quantity,UnitPrice,TotalPrice
0,1,55000,55000
1,2,12000,24000
2,3,200,600
3,1,18000,18000
4,2,20000,40000


# 8. Drop the SubCategory column.

In [14]:
# Remove SubCategory by reassignment rather than inplace=True.
# errors='ignore' keeps the cell re-runnable: a second run would otherwise
# raise KeyError because the column is already gone.
df = df.drop(columns=['SubCategory'], errors='ignore')

# 9. Fill nulls in Discount with 0.10.

In [17]:
# Treat a missing discount as the default 10%.
df.loc[df['Discount'].isna(), 'Discount'] = 0.10

# 10. Apply a function to categorize orders: Profit > 4000 → 'High', 0 < Profit ≤ 4000 → 'Medium', otherwise 'Low'.

In [18]:
def classify(row):
    """Bucket one order into a profit tier.

    Args:
        row: Mapping-like record (for example a DataFrame row) that exposes
            a 'Profit' value.

    Returns:
        'High' when Profit is above 4000, 'Medium' when it is positive but
        not above 4000, and 'Low' for zero or negative profit.
    """
    profit = row['Profit']
    if profit > 4000:
        return 'High'
    return 'Medium' if profit > 0 else 'Low'

# Evaluate classify once per row (axis=1) and store the labels as a new column.
profit_levels = df.apply(classify, axis=1)
df['ProfitLevel'] = profit_levels

# Display each profit value beside its assigned tier.
df.loc[:, ['Profit', 'ProfitLevel']]

Unnamed: 0,Profit,ProfitLevel
0,5000,High
1,1800,Medium
2,150,Medium
3,-1500,Low
4,3000,Medium


# PART 2: PySpark DataFrame Operations
1. Load the same CSV using PySpark.

In [19]:
!pip install -q pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Superstore").getOrCreate()

# 2. Show schema and first 5 rows.

In [20]:
# Configure the reader: the first line holds the column names, and column types
# are inferred from the data instead of defaulting to string.
csv_reader = spark.read.option("header", True).option("inferSchema", True)
df_spark = csv_reader.csv('superstore.csv')

# Inspect the inferred schema and the first five rows.
df_spark.printSchema()
df_spark.show(n=5)

root
 |-- OrderID: string (nullable = true)
 |-- OrderDate: date (nullable = true)
 |-- Customer: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- SubCategory: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- UnitPrice: integer (nullable = true)
 |-- Discount: double (nullable = true)
 |-- Profit: integer (nullable = true)

+-------+----------+--------+-----------+------+--------+---------------+-----------+--------+---------+--------+------+
|OrderID| OrderDate|Customer|    Segment|Region| Product|       Category|SubCategory|Quantity|UnitPrice|Discount|Profit|
+-------+----------+--------+-----------+------+--------+---------------+-----------+--------+---------+--------+------+
|CA-1001|2023-01-15|    Ravi|   Consumer| South|  Laptop|     Technology|  Computers|       1|    55000|     0.1|  5000|
|CA-1002|2023-02-20|   Priya|  

# 3. Select columns, Rename Customer → Client .

In [21]:
from pyspark.sql.functions import col

# Project the three columns first, then expose Customer under the name Client.
(
    df_spark
    .select('Customer', 'Product', 'Profit')
    .withColumnRenamed('Customer', 'Client')
    .show()
)

+------+--------+------+
|Client| Product|Profit|
+------+--------+------+
|  Ravi|  Laptop|  5000|
| Priya| Printer|  1800|
|  Amit|Notebook|   150|
| Anita|   Table| -1500|
| Divya|   Phone|  3000|
+------+--------+------+



# 4. Filter Segment = 'Consumer' and Profit < 1000 .

In [22]:
# Narrow to the Consumer segment, then keep only orders earning under 1000.
consumer_orders = df_spark.filter(col('Segment') == 'Consumer')
consumer_orders.filter(col('Profit') < 1000).show()

+-------+----------+--------+--------+------+--------+---------------+-----------+--------+---------+--------+------+
|OrderID| OrderDate|Customer| Segment|Region| Product|       Category|SubCategory|Quantity|UnitPrice|Discount|Profit|
+-------+----------+--------+--------+------+--------+---------------+-----------+--------+---------+--------+------+
|CA-1003|2023-01-25|    Amit|Consumer|  East|Notebook|Office Supplies|      Paper|       3|      200|    0.05|   150|
+-------+----------+--------+--------+------+--------+---------------+-----------+--------+---------+--------+------+



# 5. GroupBy Region and show average profit.

In [23]:
# Mean profit per region; the dict form of agg yields the same avg(Profit) column.
df_spark.groupBy('Region').agg({'Profit': 'avg'}).show()

+------+-----------+
|Region|avg(Profit)|
+------+-----------+
| South|     4000.0|
|  East|      150.0|
|  West|    -1500.0|
| North|     1800.0|
+------+-----------+



# 6. Use withColumn to create TotalPrice = Quantity * UnitPrice .

In [24]:
from pyspark.sql.functions import expr

# Build the product as a Column expression, then attach it as TotalPrice.
total_price = df_spark["Quantity"] * df_spark["UnitPrice"]
df_spark = df_spark.withColumn("TotalPrice", total_price)

df_spark.select('OrderID', 'Quantity', 'UnitPrice', 'TotalPrice').show()

+-------+--------+---------+----------+
|OrderID|Quantity|UnitPrice|TotalPrice|
+-------+--------+---------+----------+
|CA-1001|       1|    55000|     55000|
|CA-1002|       2|    12000|     24000|
|CA-1003|       3|      200|       600|
|CA-1004|       1|    18000|     18000|
|CA-1005|       2|    20000|     40000|
+-------+--------+---------+----------+



# 7. Use when().otherwise() to classify Profit as:
'Profit' > 2000 → 'High'
'Profit' <= 0 → 'Loss'
else 'Medium'

In [25]:
from pyspark.sql.functions import when

# Conditions are checked in order: above 2000 is "High", zero or below is
# "Loss", and everything left over is "Medium".
profit = col("Profit")
profit_class = (
    when(profit > 2000, "High")
    .when(profit <= 0, "Loss")
    .otherwise("Medium")
)

df_spark = df_spark.withColumn("ProfitClass", profit_class)
df_spark.select('Profit', 'ProfitClass').show()

+------+-----------+
|Profit|ProfitClass|
+------+-----------+
|  5000|       High|
|  1800|     Medium|
|   150|     Medium|
| -1500|       Loss|
|  3000|       High|
+------+-----------+



# 8. Use drop() to remove SubCategory .

In [26]:
# Rebind df_spark to the DataFrame returned by drop(), i.e. without SubCategory.
df_spark = df_spark.drop('SubCategory')

# 9. Handle nulls in Discount using fillna(0.10) .

In [27]:
# Replace null discounts with 0.10, touching only the Discount column.
df_spark = df_spark.na.fill(0.10, subset=['Discount'])

# 10. Convert OrderDate to date type and extract year , month .

In [28]:
from pyspark.sql.functions import to_date, year, month

# Cast OrderDate to a date, then derive the calendar year and month from it.
# Each step is chained so Year and Month are computed from the converted column.
df_spark = (
    df_spark
    .withColumn("OrderDate", to_date(col("OrderDate")))
    .withColumn("Year", year(col("OrderDate")))
    .withColumn("Month", month(col("OrderDate")))
)
df_spark.select("OrderDate", "Year", "Month").show()

+----------+----+-----+
| OrderDate|Year|Month|
+----------+----+-----+
|2023-01-15|2023|    1|
|2023-02-20|2023|    2|
|2023-01-25|2023|    1|
|2023-03-01|2023|    3|
|2023-02-05|2023|    2|
+----------+----+-----+



# PART 3: Dask DataFrame Operations (Pandas Alternative)
1. Install Dask:

In [29]:
!pip install -q dask

# 2. Load the same superstore.csv :

In [30]:
import dask.dataframe as dd

# Open the same CSV as a Dask DataFrame, then preview the first rows.
df_dask = dd.read_csv('superstore.csv')
df_dask.head()

Unnamed: 0,OrderID,OrderDate,Customer,Segment,Region,Product,Category,SubCategory,Quantity,UnitPrice,Discount,Profit
0,CA-1001,2023-01-15,Ravi,Consumer,South,Laptop,Technology,Computers,1,55000,0.1,5000
1,CA-1002,2023-02-20,Priya,Corporate,North,Printer,Technology,Peripherals,2,12000,0.15,1800
2,CA-1003,2023-01-25,Amit,Consumer,East,Notebook,Office Supplies,Paper,3,200,0.05,150
3,CA-1004,2023-03-01,Anita,Home Office,West,Table,Furniture,Tables,1,18000,0.2,-1500
4,CA-1005,2023-02-05,Divya,Consumer,South,Phone,Technology,Phones,2,20000,0.0,3000


# 3. Do the following:
Compute average discount by category.

In [31]:
# Describe the grouped Discount column first, then materialise the per-category
# mean with compute().
discount_by_category = df_dask.groupby('Category')['Discount']
avg_discount = discount_by_category.mean().compute()
print(avg_discount)

Category
Furniture          0.200000
Office Supplies    0.050000
Technology         0.083333
Name: Discount, dtype: float64


# Filter orders with Quantity > 1 and high profit (Profit > 2000).

In [32]:
# Name each condition so the combined filter reads like the requirement.
has_multiple_items = df_dask['Quantity'] > 1
is_high_profit = df_dask['Profit'] > 2000

filtered = df_dask[has_multiple_items & is_high_profit]
filtered.compute()

Unnamed: 0,OrderID,OrderDate,Customer,Segment,Region,Product,Category,SubCategory,Quantity,UnitPrice,Discount,Profit
4,CA-1005,2023-02-05,Divya,Consumer,South,Phone,Technology,Phones,2,20000,0.0,3000


# Save filtered data to new CSV.

In [33]:
# single_file=True writes one CSV at exactly this path. The '*' partition
# placeholder only applies to multi-file output; here it would be kept
# literally and produce a file named 'filtered_output_*.csv'.
filtered.to_csv('filtered_output.csv', single_file=True)

['/content/filtered_output_*.csv']

# PART 4: JSON Handling (Complex Nested)
1. Create a nested JSON file:

In [35]:
import json

# Flat description of each sample order: (order id, customer name, segment, region, profit).
order_rows = [
    ("CA-1001", "Ravi", "Consumer", "South", 5000),
    ("CA-1002", "Priya", "Corporate", "North", 1800),
]

# Expand each tuple into the nested layout: customer and order details live in
# their own sub-objects.
data_json = [
    {
        "OrderID": order_id,
        "Customer": {"Name": name, "Segment": segment},
        "Details": {"Region": region, "Profit": profit},
    }
    for order_id, name, segment, region, profit in order_rows
]

# Serialise with a 4-space indent and write the document to orders.json.
with open("orders.json", "w") as json_file:
    json_file.write(json.dumps(data_json, indent=4))

print("JSON file saved as orders.json")

JSON file saved as orders.json


# 2. Load it using PySpark:

In [36]:
# The file holds one pretty-printed JSON array spread over many lines, so the
# reader must run in multiLine mode rather than one-record-per-line mode.
json_reader = spark.read.option("multiLine", True)
df_json = json_reader.json("orders.json")

df_json.printSchema()

# Dotted paths reach into the nested structs; the result columns are named
# after the leaf fields.
df_json.select("OrderID", "Customer.Name", "Details.Profit").show()

root
 |-- Customer: struct (nullable = true)
 |    |-- Name: string (nullable = true)
 |    |-- Segment: string (nullable = true)
 |-- Details: struct (nullable = true)
 |    |-- Profit: long (nullable = true)
 |    |-- Region: string (nullable = true)
 |-- OrderID: string (nullable = true)

+-------+-----+------+
|OrderID| Name|Profit|
+-------+-----+------+
|CA-1001| Ravi|  5000|
|CA-1002|Priya|  1800|
+-------+-----+------+

