In [1]:
import json
import os

from bson.decimal128 import Decimal128
from bson.objectid import ObjectId
from pymongo import MongoClient

In [2]:
class JSONEncoder(json.JSONEncoder):
    """JSON encoder that also accepts BSON ``ObjectId`` and ``Decimal128`` values."""

    def default(self, o):
        """Return ``str(o)`` for ObjectId/Decimal128 values; defer to the base encoder otherwise."""
        if isinstance(o, (ObjectId, Decimal128)):
            return str(o)
        # Anything else is handled (or rejected) by the stock encoder.
        return super().default(o)

## Data source

If you do not change the data URI (*course_cluster_uri*), you can execute most
of this notebook; however, you will not be able to write to the database.

To successfully execute the pipelines that end with the `$out` (*save*) stage in this notebook,
point to your own Atlas cluster into which you have imported the *retail.csv* dataset.


In [9]:
# Connection setup.
#
# The connection string is read from the COURSE_CLUSTER_URI environment
# variable so that no username/password is stored in the notebook itself.
# Set it before starting Jupyter, for example:
#   export COURSE_CLUSTER_URI="mongodb://<user>:<password>@<host>:27017"
#
# Read-only course cluster URI, kept for reference:
# course_cluster_uri = "mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin"
course_cluster_uri = os.environ.get("COURSE_CLUSTER_URI")
if not course_cluster_uri:
    # Fail early with an actionable message instead of a confusing
    # connection error further down the notebook.
    raise RuntimeError(
        "Set the COURSE_CLUSTER_URI environment variable to your MongoDB "
        "connection string before running this notebook."
    )
course_client = MongoClient(course_cluster_uri)

In [10]:
# Handle on the "retail" collection inside the "eco" database.
eco_db = course_client["eco"]
retail_col = eco_db["retail"]

In [11]:
# Compound grouping key: one group per invoice / customer / country combination.
invoice_key = {
    "InvoiceNo": "$InvoiceNo",
    "CustomerID": "$CustomerID",
    "Country": "$Country",
}

# Shape of each entry pushed onto the group's "Items" array.
line_item = {
    "StockCode": "$StockCode",
    "Description": "$Description",
    "Quantity": "$Quantity",
    "UnitPrice": "$UnitPrice",
}

# $group stage: collect the rows of an invoice into a single document,
# keeping the latest InvoiceDate seen within the group.
assemble = {
    "$group": {
        "_id": invoice_key,
        "InvoiceDate": {"$max": "$InvoiceDate"},
        "Items": {"$push": line_item},
    }
}

In [12]:
# $project stage: flatten the compound group key into top-level fields.
#
# InvoiceDate may arrive either as a top-level field (when the $group stage
# computes it as an accumulator) or nested inside the group key (when the
# $group stage includes it in "_id"). $ifNull takes the top-level value when
# present and otherwise falls back to the nested one, so the field is not
# silently dropped for either shape of input.
beautify = {
    "$project": {
        "_id": "$_id.InvoiceNo",
        "InvoiceDate": {"$ifNull": ["$InvoiceDate", "$_id.InvoiceDate"]},
        "CustomerID": "$_id.CustomerID",
        "Country": "$_id.Country",
        "Items": 1
    }
}

In [13]:
# Run the two-stage pipeline: group rows into invoices, then flatten the key.
invoice_pipeline = [assemble, beautify]
cursor = retail_col.aggregate(invoice_pipeline, allowDiskUse=True)

In [14]:
# Pull the first document from the aggregation cursor.
retail_doc = next(cursor)

In [15]:
# Serialise with the BSON-aware encoder defined above, then display it.
retail_json = json.dumps(retail_doc, cls=JSONEncoder, indent=4)
print(retail_json)

{
    "Items": [
        {
            "StockCode": "71053",
            "Description": "WHITE METAL LANTERN",
            "Quantity": 6,
            "UnitPrice": "3.39"
        },
        {
            "StockCode": "84406B",
            "Description": "CREAM CUPID HEARTS COAT HANGER",
            "Quantity": 8,
            "UnitPrice": "2.75"
        },
        {
            "StockCode": "85123A",
            "Description": "WHITE HANGING HEART T-LIGHT HOLDER",
            "Quantity": 6,
            "UnitPrice": "2.55"
        },
        {
            "StockCode": "84029G",
            "Description": "KNITTED UNION FLAG HOT WATER BOTTLE",
            "Quantity": 6,
            "UnitPrice": "3.39"
        },
        {
            "StockCode": "84029E",
            "Description": "RED WOOLLY HOTTIE WHITE HEART.",
            "Quantity": 6,
            "UnitPrice": "3.39"
        },
        {
            "StockCode": "22752",
            "Description": "SET 7 BABUSHKA NESTING BOXES",
   

In [16]:
# Value of a single entry of Items: quantity multiplied by unit price.
line_total = {"$multiply": ["$$this.Quantity", "$$this.UnitPrice"]}

# $addFields stage: fold the line totals of every entry in Items into
# TotalPrice, starting the running sum from Decimal128("0.00").
computed = {
    "$addFields": {
        "TotalPrice": {
            "$reduce": {
                "input": "$Items",
                "initialValue": Decimal128("0.00"),
                "in": {"$add": ["$$value", line_total]},
            }
        }
    }
}

In [17]:
# Same pipeline as before, extended with the TotalPrice computation.
priced_pipeline = [assemble, beautify, computed]
cursor = retail_col.aggregate(priced_pipeline, allowDiskUse=True)

In [18]:
# Pull the first document from the aggregation cursor.
retail_doc = next(cursor)

In [19]:
# Serialise with the BSON-aware encoder defined above, then display it.
retail_json = json.dumps(retail_doc, cls=JSONEncoder, indent=4)
print(retail_json)

{
    "Items": [
        {
            "StockCode": "71053",
            "Description": "WHITE METAL LANTERN",
            "Quantity": 6,
            "UnitPrice": "3.39"
        },
        {
            "StockCode": "84406B",
            "Description": "CREAM CUPID HEARTS COAT HANGER",
            "Quantity": 8,
            "UnitPrice": "2.75"
        },
        {
            "StockCode": "85123A",
            "Description": "WHITE HANGING HEART T-LIGHT HOLDER",
            "Quantity": 6,
            "UnitPrice": "2.55"
        },
        {
            "StockCode": "84029G",
            "Description": "KNITTED UNION FLAG HOT WATER BOTTLE",
            "Quantity": 6,
            "UnitPrice": "3.39"
        },
        {
            "StockCode": "84029E",
            "Description": "RED WOOLLY HOTTIE WHITE HEART.",
            "Quantity": 6,
            "UnitPrice": "3.39"
        },
        {
            "StockCode": "22752",
            "Description": "SET 7 BABUSHKA NESTING BOXES",
   

In [22]:
# $out stage: write the pipeline result to the "orders" collection.
target_collection = "orders"
save = {"$out": target_collection}

The following cell will **fail if you are not pointing** to your own Atlas cluster
where you have write privileges to the target collection.

In [23]:
# Full pipeline ending in the $out stage, which needs write access to the target collection.
export_pipeline = [assemble, beautify, computed, save]
cursor = retail_col.aggregate(export_pipeline, allowDiskUse=True)

In [24]:
# Redefinition of the $group stage for the failure demonstration below:
# InvoiceDate is now part of the grouping key instead of a separate
# accumulator field, so one InvoiceNo can end up in more than one group.
invoice_key_with_date = {
    "InvoiceNo": "$InvoiceNo",
    "CustomerID": "$CustomerID",
    "Country": "$Country",
    "InvoiceDate": {"$max": "$InvoiceDate"},
}

# Shape of each entry pushed onto the group's "Items" array.
line_item = {
    "StockCode": "$StockCode",
    "Description": "$Description",
    "Quantity": "$Quantity",
    "UnitPrice": "$UnitPrice",
}

assemble = {
    "$group": {
        "_id": invoice_key_with_date,
        "Items": {"$push": line_item},
    }
}

The following cell will show the expected duplicate-key error raised when the
`$out` stage inserts documents that share the same *_id*, if you are pointing to
your own Atlas cluster where you have write privileges.

In [25]:
# Re-run the export with the redefined $group stage; this run is expected to
# fail with the duplicate-key error shown in the output below.
failing_pipeline = [assemble, beautify, computed, save]
cursor = retail_col.aggregate(failing_pipeline, allowDiskUse=True)

OperationFailure: insert for $out failed: { connectionId: 49, err: "E11000 duplicate key error collection: eco.tmp.agg_out.3 index: _id_ dup key: { : "536591" }", code: 11000, codeName: "DuplicateKey", n: 0, ok: 1.0 }