# Lab Assignment 9

- Name - Aryan Gupta
- Roll No.- 230150003
- Date - 15 October, 2025
- Course - DA331 Big Data Analytics: Tools & Techniques


## Loading libraries and data


In [15]:

# Create sample dataframes and save them into an SQLite database for the SQL queries
import sqlite3
import pandas as pd
from datetime import datetime

# Sample data (from the Lab problem image)
customers = pd.DataFrame([
    {"CustomerID": 1, "FirstName": "John", "LastName": "Doe", "Email": "john@example.com", "PhoneNumber": "1234567890", "City": "New York", "Country": "USA"},
    {"CustomerID": 2, "FirstName": "Jane", "LastName": "Smith", "Email": "jane@example.com", "PhoneNumber": "0987654321", "City": "Los Angeles", "Country": "USA"},
    {"CustomerID": 3, "FirstName": "Bob", "LastName": "Brown", "Email": "bob@example.com", "PhoneNumber": "4567891230", "City": "Chicago", "Country": "USA"}
])

products = pd.DataFrame([
    {"ProductID": 1, "ProductName": "T-Shirt", "Category": "Apparel", "Price": 15.99},
    {"ProductID": 2, "ProductName": "Jeans",   "Category": "Apparel", "Price": 39.99},
    {"ProductID": 3, "ProductName": "Shoes",   "Category": "Footwear", "Price": 49.99},
    {"ProductID": 4, "ProductName": "Hat",     "Category": "Accessories", "Price": 9.99}
])

orders = pd.DataFrame([
    {"OrderID": 101, "CustomerID": 1, "OrderDate": "2024-11-01", "TotalAmount": 65.97},
    {"OrderID": 102, "CustomerID": 2, "OrderDate": "2024-11-02", "TotalAmount": 39.99},
    {"OrderID": 103, "CustomerID": 3, "OrderDate": "2024-11-03", "TotalAmount": 59.98}
])

orderDetails = pd.DataFrame([
    {"OrderDetailID": 1, "OrderID": 101, "ProductID": 1, "Quantity": 1, "UnitPrice": 15.99},
    {"OrderDetailID": 2, "OrderID": 101, "ProductID": 3, "Quantity": 1, "UnitPrice": 49.99},
    {"OrderDetailID": 3, "OrderID": 102, "ProductID": 2, "Quantity": 1, "UnitPrice": 39.99},
    {"OrderDetailID": 4, "OrderID": 103, "ProductID": 1, "Quantity": 2, "UnitPrice": 15.99},
    {"OrderDetailID": 5, "OrderID": 103, "ProductID": 4, "Quantity": 1, "UnitPrice": 9.99}
])

# SQLite file that the later query cells reopen through `db_path`.
# The path is relative to the current working directory; no directory is created here.
db_path = "./lab10.db"

# SQL table name -> source frame, in the order the tables are written and printed.
tables = {
    "Customers": customers,
    "Products": products,
    "Orders": orders,
    "OrderDetails": orderDetails,
}

conn = sqlite3.connect(db_path)
try:
    for table_name, frame in tables.items():
        # if_exists="replace" recreates the table, so re-running this cell
        # does not append duplicate rows; index=False keeps the pandas index
        # out of the stored columns.
        frame.to_sql(table_name, conn, if_exists="replace", index=False)
finally:
    # Close the handle even when one of the writes raises.
    conn.close()

# Show the initial tables
print("SQLite DB created at:", db_path)
for table_name, frame in tables.items():
    print(f"\n{table_name}:")
    print(frame)


SQLite DB created at: ./lab10.db

Customers:
   CustomerID FirstName LastName             Email PhoneNumber         City  \
0           1      John      Doe  john@example.com  1234567890     New York   
1           2      Jane    Smith  jane@example.com  0987654321  Los Angeles   
2           3       Bob    Brown   bob@example.com  4567891230      Chicago   

  Country  
0     USA  
1     USA  
2     USA  

Products:
   ProductID ProductName     Category  Price
0          1     T-Shirt      Apparel  15.99
1          2       Jeans      Apparel  39.99
2          3       Shoes     Footwear  49.99
3          4         Hat  Accessories   9.99

Orders:
   OrderID  CustomerID   OrderDate  TotalAmount
0      101           1  2024-11-01        65.97
1      102           2  2024-11-02        39.99
2      103           3  2024-11-03        59.98

OrderDetails:
   OrderDetailID  OrderID  ProductID  Quantity  UnitPrice
0              1      101          1         1      15.99
1              2      101          3         1      49.99
2              3      102          2         1      39.99
3              4      103          1         2      15.99
4              5      103          4         1       9.99

## Problem 1: Retrieve all customers from the USA


In [16]:

import sqlite3
import pandas as pd
# Problem 1 (SQL): every customer row whose Country is 'USA'.
sql = r"""SELECT *
FROM Customers
WHERE Country = 'USA';"""
conn = sqlite3.connect(db_path)
try:
    df = pd.read_sql_query(sql, conn)
finally:
    # Release the SQLite handle even when the query raises.
    conn.close()
print("Query:\n", sql)
display(df)


Query:
 SELECT *
FROM Customers
WHERE Country = 'USA';


Unnamed: 0,CustomerID,FirstName,LastName,Email,PhoneNumber,City,Country
0,1,John,Doe,john@example.com,1234567890,New York,USA
1,2,Jane,Smith,jane@example.com,0987654321,Los Angeles,USA
2,3,Bob,Brown,bob@example.com,4567891230,Chicago,USA


### MongoDB Query (customers collection)

```js
// Equality match on the Country field; returns the matching customer documents.
db.customers.find({ Country: "USA" });
```


## Problem 2: Get the total amount spent by each customer (Customer's Name and Total Amount Spent)


In [17]:

import sqlite3
import pandas as pd
# Problem 2 (SQL): total amount spent per customer, largest first.
# The LEFT JOIN keeps customers that have no orders; COALESCE turns their
# NULL sum into 0.
sql = r"""SELECT
  FirstName || ' ' || LastName AS CustomerName,
  COALESCE(SUM(TotalAmount), 0) AS TotalSpent
FROM Customers c
LEFT JOIN Orders o
  ON c.CustomerID = o.CustomerID
GROUP BY c.CustomerID, c.FirstName, c.LastName
ORDER BY TotalSpent DESC;"""
conn = sqlite3.connect(db_path)
try:
    df = pd.read_sql_query(sql, conn)
finally:
    # Release the SQLite handle even when the query raises.
    conn.close()
print("Query:\n", sql)
display(df)


Query:
 SELECT
  FirstName || ' ' || LastName AS CustomerName,
  COALESCE(SUM(TotalAmount), 0) AS TotalSpent
FROM Customers c
LEFT JOIN Orders o
  ON c.CustomerID = o.CustomerID
GROUP BY c.CustomerID, c.FirstName, c.LastName
ORDER BY TotalSpent DESC;


Unnamed: 0,CustomerName,TotalSpent
0,John Doe,65.97
1,Bob Brown,59.98
2,Jane Smith,39.99


### MongoDB Query (aggregation starting from `customers`)

```js
// Total spent per customer. Starting from `customers` keeps customers with
// no orders in the result (they get TotalSpent = 0).
db.customers.aggregate([
  {
    // Correlated sub-pipeline: sum TotalAmount over this customer's orders.
    // Produces a one-element array, or an empty array when there are no orders.
    $lookup: {
      from: "orders",
      let: { cid: "$CustomerID" },
      pipeline: [
        { $match: { $expr: { $eq: ["$CustomerID", "$$cid"] } } },
        { $group: { _id: null, total: { $sum: "$TotalAmount" } } },
      ],
      as: "orderSummary",
    },
  },
  {
    $addFields: {
      TotalSpent: {
        // Falls back to 0 when orderSummary is empty.
        $ifNull: [{ $arrayElemAt: ["$orderSummary.total", 0] }, 0],
      },
      CustomerName: { $concat: ["$FirstName", " ", "$LastName"] },
    },
  },
  {
    $project: { _id: 0, CustomerName: 1, TotalSpent: 1 },
  },
  // Highest spender first.
  { $sort: { TotalSpent: -1 } },
]);
```


## Problem 3: Find the most popular product by quantity sold (Product Name and total quantity sold)


In [18]:

import sqlite3
import pandas as pd
# Problem 3 (SQL): product with the highest total quantity sold.
# LIMIT 1 returns a single row; if several products tie for the top
# quantity, only one of them is shown.
sql = r"""SELECT p.ProductName,
       SUM(od.Quantity) AS TotalQuantitySold
FROM OrderDetails od
JOIN Products p
  ON od.ProductID = p.ProductID
GROUP BY p.ProductID, p.ProductName
ORDER BY TotalQuantitySold DESC
LIMIT 1;"""
conn = sqlite3.connect(db_path)
try:
    df = pd.read_sql_query(sql, conn)
finally:
    # Release the SQLite handle even when the query raises.
    conn.close()
print("Query:\n", sql)
display(df)


Query:
 SELECT p.ProductName,
       SUM(od.Quantity) AS TotalQuantitySold
FROM OrderDetails od
JOIN Products p
  ON od.ProductID = p.ProductID
GROUP BY p.ProductID, p.ProductName
ORDER BY TotalQuantitySold DESC
LIMIT 1;


Unnamed: 0,ProductName,TotalQuantitySold
0,T-Shirt,3


### MongoDB Query (aggregation on `orderDetails`)

```js
// Most popular product by total quantity sold.
db.orderDetails.aggregate([
  {
    // One group per ProductID with the summed Quantity.
    $group: {
      _id: "$ProductID",
      totalQuantity: { $sum: "$Quantity" },
    },
  },
  { $sort: { totalQuantity: -1 } },
  // Keep only the top group before the join, so a single product is looked up.
  { $limit: 1 },
  {
    $lookup: {
      from: "products",
      localField: "_id",
      foreignField: "ProductID",
      as: "product",
    },
  },
  // Flatten the one-element `product` array produced by $lookup.
  { $unwind: "$product" },
  {
    $project: {
      _id: 0,
      ProductName: "$product.ProductName",
      TotalQuantitySold: "$totalQuantity",
    },
  },
]);
```


## Problem 4: List all orders placed in November 2024


In [19]:

import sqlite3
import pandas as pd
# Problem 4 (SQL): orders placed in November 2024.
# OrderDate is stored as 'YYYY-MM-DD' text, so the half-open range
# ['2024-11-01', '2024-12-01') is compared as strings.
sql = r"""SELECT *
FROM Orders
WHERE OrderDate >= '2024-11-01'
  AND OrderDate <  '2024-12-01'
ORDER BY OrderDate;"""
conn = sqlite3.connect(db_path)
try:
    df = pd.read_sql_query(sql, conn)
finally:
    # Release the SQLite handle even when the query raises.
    conn.close()
print("Query:\n", sql)
display(df)


Query:
 SELECT *
FROM Orders
WHERE OrderDate >= '2024-11-01'
  AND OrderDate <  '2024-12-01'
ORDER BY OrderDate;


Unnamed: 0,OrderID,CustomerID,OrderDate,TotalAmount
0,101,1,2024-11-01,65.97
1,102,2,2024-11-02,39.99
2,103,3,2024-11-03,59.98


### MongoDB Query (if `OrderDate` is ISODate)

```js
// Half-open range: from 2024-11-01 (inclusive) to 2024-12-01 (exclusive), in UTC.
db.orders
  .find({
    OrderDate: {
      $gte: ISODate("2024-11-01T00:00:00Z"),
      $lt: ISODate("2024-12-01T00:00:00Z"),
    },
  })
  .sort({ OrderDate: 1 });
```

If `OrderDate` is instead stored as a `"YYYY-MM-DD"` string, match on the `2024-11-` prefix:

```js
// Anchored prefix match on the date string; sorting the strings ascending orders them by date.
db.orders.find({ OrderDate: { $regex: /^2024-11-/ } }).sort({ OrderDate: 1 });
```


## Problem 5: Find the average order amount


In [20]:

import sqlite3
import pandas as pd
# Problem 5 (SQL): average TotalAmount across all orders.
sql = r"""SELECT AVG(TotalAmount) AS AverageOrderAmount
FROM Orders;"""
conn = sqlite3.connect(db_path)
try:
    df = pd.read_sql_query(sql, conn)
finally:
    # Release the SQLite handle even when the query raises.
    conn.close()
print("Query:\n", sql)
display(df)


Query:
 SELECT AVG(TotalAmount) AS AverageOrderAmount
FROM Orders;


Unnamed: 0,AverageOrderAmount
0,55.313333


### MongoDB Query (aggregation)

```js
// Average TotalAmount across all orders.
db.orders.aggregate([
  {
    // _id: null puts every order into a single group.
    $group: {
      _id: null,
      AverageOrderAmount: { $avg: "$TotalAmount" },
    },
  },
  // Drop the placeholder _id so only the average is returned.
  { $project: { _id: 0, AverageOrderAmount: 1 } },
]);
```


## Problem 6: Show details of orders where the total amount is greater than $50


In [21]:

import sqlite3
import pandas as pd
# Problem 6 (SQL): orders with TotalAmount above 50, together with the
# name and email of the customer who placed each one, oldest first.
sql = r"""SELECT o.OrderID,
       o.OrderDate,
       o.TotalAmount,
       c.FirstName,
       c.LastName,
       c.Email
FROM Orders o
JOIN Customers c ON o.CustomerID = c.CustomerID
WHERE o.TotalAmount > 50
ORDER BY o.OrderDate;"""
conn = sqlite3.connect(db_path)
try:
    df = pd.read_sql_query(sql, conn)
finally:
    # Release the SQLite handle even when the query raises.
    conn.close()
print("Query:\n", sql)
display(df)


Query:
 SELECT o.OrderID,
       o.OrderDate,
       o.TotalAmount,
       c.FirstName,
       c.LastName,
       c.Email
FROM Orders o
JOIN Customers c ON o.CustomerID = c.CustomerID
WHERE o.TotalAmount > 50
ORDER BY o.OrderDate;


Unnamed: 0,OrderID,OrderDate,TotalAmount,FirstName,LastName,Email
0,101,2024-11-01,65.97,John,Doe,john@example.com
1,103,2024-11-03,59.98,Bob,Brown,bob@example.com


### MongoDB Query (detailed: orders + customer + items)

```js
// Orders with TotalAmount above 50, enriched with the customer and the
// order's line items.
db.orders.aggregate([
  // Filter first so only qualifying orders go through the joins.
  { $match: { TotalAmount: { $gt: 50 } } },
  {
    $lookup: {
      from: "customers",
      localField: "CustomerID",
      foreignField: "CustomerID",
      as: "customer",
    },
  },
  // Flatten the `customer` array; orders without a matching customer are dropped.
  { $unwind: "$customer" },
  {
    // Line items stay as an array (no $unwind), so each order remains one document.
    $lookup: {
      from: "orderDetails",
      localField: "OrderID",
      foreignField: "OrderID",
      as: "items",
    },
  },
  {
    $project: {
      _id: 0,
      OrderID: 1,
      OrderDate: 1,
      TotalAmount: 1,
      // Dotted keys build a nested `Customer` sub-document in the output.
      "Customer.FirstName": "$customer.FirstName",
      "Customer.LastName": "$customer.LastName",
      "Customer.Email": "$customer.Email",
      Items: "$items",
    },
  },
  { $sort: { OrderDate: 1 } },
]);
```


## Problem 7: Calculate total sales for each product category (Category and total sales revenue)


In [22]:

import sqlite3
import pandas as pd
# Problem 7 (SQL): sales revenue per product category, largest first.
# Revenue uses the UnitPrice recorded on each order line, not Products.Price.
sql = r"""SELECT p.Category,
       SUM(od.Quantity * od.UnitPrice) AS TotalSalesRevenue
FROM OrderDetails od
JOIN Products p
  ON od.ProductID = p.ProductID
GROUP BY p.Category
ORDER BY TotalSalesRevenue DESC;"""
conn = sqlite3.connect(db_path)
try:
    df = pd.read_sql_query(sql, conn)
finally:
    # Release the SQLite handle even when the query raises.
    conn.close()
print("Query:\n", sql)
display(df)


Query:
 SELECT p.Category,
       SUM(od.Quantity * od.UnitPrice) AS TotalSalesRevenue
FROM OrderDetails od
JOIN Products p
  ON od.ProductID = p.ProductID
GROUP BY p.Category
ORDER BY TotalSalesRevenue DESC;


Unnamed: 0,Category,TotalSalesRevenue
0,Apparel,87.96
1,Footwear,49.99
2,Accessories,9.99


### MongoDB Query (aggregation)

```js
// Sales revenue per product category, largest first.
db.orderDetails.aggregate([
  {
    // Attach the product document to each order line to obtain its Category.
    $lookup: {
      from: "products",
      localField: "ProductID",
      foreignField: "ProductID",
      as: "product",
    },
  },
  { $unwind: "$product" },
  {
    $group: {
      _id: "$product.Category",
      totalSales: {
        // Line revenue = Quantity * UnitPrice as recorded on the order line.
        $sum: { $multiply: ["$Quantity", "$UnitPrice"] },
      },
    },
  },
  {
    // Rename the group key and total to the same column names the SQL query uses.
    $project: {
      _id: 0,
      Category: "$_id",
      TotalSalesRevenue: "$totalSales",
    },
  },
  { $sort: { TotalSalesRevenue: -1 } },
]);
```


## Problem 8: List customers who have not placed any orders


In [23]:

import sqlite3
import pandas as pd
# Problem 8 (SQL): customers that have no orders.
# Anti-join: after the LEFT JOIN, customers without a matching order have
# NULL in every Orders column, so filtering on o.OrderID IS NULL keeps only them.
sql = r"""SELECT c.*
FROM Customers c
LEFT JOIN Orders o
  ON c.CustomerID = o.CustomerID
WHERE o.OrderID IS NULL;"""
conn = sqlite3.connect(db_path)
try:
    df = pd.read_sql_query(sql, conn)
finally:
    # Release the SQLite handle even when the query raises.
    conn.close()
print("Query:\n", sql)
display(df)


Query:
 SELECT c.*
FROM Customers c
LEFT JOIN Orders o
  ON c.CustomerID = o.CustomerID
WHERE o.OrderID IS NULL;


Unnamed: 0,CustomerID,FirstName,LastName,Email,PhoneNumber,City,Country


### MongoDB Query (aggregation using $lookup)

```js
// Customers that have not placed any orders.
db.customers.aggregate([
  {
    $lookup: {
      from: "orders",
      let: { cid: "$CustomerID" },
      pipeline: [
        { $match: { $expr: { $eq: ["$CustomerID", "$$cid"] } } },
        // One matching order is enough to know the customer has ordered.
        { $limit: 1 },
        { $project: { OrderID: 1 } },
      ],
      as: "orders",
    },
  },
  {
    // Keep only customers whose lookup found nothing.
    $match: { orders: { $size: 0 } },
  },
  {
    // Return the same customer columns as the SQL query's `SELECT c.*`,
    // including PhoneNumber.
    $project: {
      _id: 0,
      CustomerID: 1,
      FirstName: 1,
      LastName: 1,
      Email: 1,
      PhoneNumber: 1,
      City: 1,
      Country: 1,
    },
  },
]);
```
