# Mongo DB II

In [1]:
import json
import pymongo
import pprint

In [3]:
# this is a utility function we define so that find()/aggregate() results print nicely
def pretty_print(output_collection, n_to_print=3):
    """
    Pretty-print the first `n_to_print` documents of an iterable.

    output_collection: any iterable of documents (e.g. a pymongo cursor).
    n_to_print: how many documents to print (default 3).
        0 prints nothing. A value the running count can never reach,
        such as -1 or None, prints everything.
    """
    if n_to_print == 0:
        # without this guard the stop condition below is never met,
        # so asking for zero documents would print all of them
        return
    for count, doc in enumerate(output_collection, start=1):
        pprint.pprint(doc)        # nicely formats each document
        if count == n_to_print:
            return                # stop before pulling another document

# Aggregation Queries

Zips JSON from the MongoDB Aggregation Tutorial: https://www.mongodb.com/docs/manual/tutorial/aggregation-zip-code-data-set/

## Load `zips.json` into new `aggquerydb.zips`

For the sake of simplicity, we'll make a new collection `zips` in a new `aggquerydb` database.

In [4]:
# reimport/redefine as needed
import json
import pymongo
import pprint

def pretty_print(output_collection, n_to_print=3):
    """
    Pretty-print the first `n_to_print` documents of an iterable.

    output_collection: any iterable of documents (e.g. a pymongo cursor).
    n_to_print: how many documents to print (default 3).
        0 prints nothing. A value the running count can never reach,
        such as -1 or None, prints everything.
    """
    if n_to_print == 0:
        # without this guard the stop condition below is never met,
        # so asking for zero documents would print all of them
        return
    for count, doc in enumerate(output_collection, start=1):
        pprint.pprint(doc)        # nicely formats each document
        if count == n_to_print:
            return                # stop before pulling another document

In [5]:
# connect to the MongoDB server on localhost
client = pymongo.MongoClient('mongodb://localhost')
# last expression of the cell, so the database names are displayed below
client.list_database_names()

['admin', 'config', 'local', 'nobel_prizes']

In [8]:
# run this cell to make the new collection
# and insert zipcode documents

client.drop_database('aggquerydb') # if already exists
db = client.aggquerydb

# zips.json holds one JSON document per line. Parse every non-blank line and
# send a single bulk insert instead of one server round trip per document.
with open('../mongodb-retrieval/data/zips.json', encoding='utf-8') as f:
    db.zips.insert_many([json.loads(line) for line in f if line.strip()])

In [9]:
# an empty filter matches every document, so this is the size of the collection
db.zips.count_documents({})

29353

## `$group`

In [10]:
# Just the grouping, no filtering
output = db.zips.aggregate([
    # one result per state; totalPop adds up the pop field of its documents
    {"$group": {"_id": "$state", "totalPop": {"$sum": "$pop"}}},
])

pretty_print(output, n_to_print=5)

{'_id': 'NC', 'totalPop': 6628637}
{'_id': 'TN', 'totalPop': 4876457}
{'_id': 'MA', 'totalPop': 6016425}
{'_id': 'UT', 'totalPop': 1722850}
{'_id': 'KY', 'totalPop': 3675484}


In [12]:
# adding some filtering and sorting
output = db.zips.aggregate([
    # total population per state
    {"$group": {"_id": "$state", "totalPop": {"$sum": "$pop"}}},
    # filter on the computed total: keep states with at least 15 million
    {"$match": {"totalPop": {"$gte": 15000000}}},
    # largest total first
    {"$sort": {"totalPop": -1}},
])

pretty_print(output, n_to_print=10)

{'_id': 'CA', 'totalPop': 29754890}
{'_id': 'NY', 'totalPop': 17990402}
{'_id': 'TX', 'totalPop': 16984601}


# Aggregation Queries: Unwind/Lookup

Make a new collection, `inventory`.

In [13]:
# stay in same database
db = client.aggquerydb
# recreate as needed
db.drop_collection('inventory')

# five sample items; tags, dim and instock are arrays, and instock holds
# embedded {loc, qty} documents. The trailing ';' hides the insert result.
db.inventory.insert_many([
    {
        "item": "journal",
        "tags": ["blank", "red"],
        "dim": [14, 21],
        "instock": [{"loc": "A", "qty": 5}, {"loc": "C", "qty": 15}],
    },
    {
        "item": "notebook",
        "tags": ["red", "blank"],
        "dim": [14, 21],
        "instock": [{"loc": "C", "qty": 5}],
    },
    {
        "item": "paper",
        "tags": ["red", "blank", "plain"],
        "dim": [14, 21],
        "instock": [{"loc": "A", "qty": 60}, {"loc": "B", "qty": 15}],
    },
    {
        "item": "planner",
        "tags": ["blank", "red"],
        "dim": [22.85, 30],
        "instock": [{"loc": "A", "qty": 40}, {"loc": "B", "qty": 5}],
    },
    {
        "item": "postcard",
        "tags": ["blue"],
        "dim": [10, 15.25],
        "instock": [{"loc": "E", "qty": 15}, {"loc": "D", "qty": 35}],
    },
]);

In [14]:
# empty filter = every document; pretty_print shows the first 3 by default
pretty_print(db.inventory.find({}))

{'_id': ObjectId('6900f75e417f7dcbd6dedb10'),
 'dim': [14, 21],
 'instock': [{'loc': 'A', 'qty': 5}, {'loc': 'C', 'qty': 15}],
 'item': 'journal',
 'tags': ['blank', 'red']}
{'_id': ObjectId('6900f75e417f7dcbd6dedb11'),
 'dim': [14, 21],
 'instock': [{'loc': 'C', 'qty': 5}],
 'item': 'notebook',
 'tags': ['red', 'blank']}
{'_id': ObjectId('6900f75e417f7dcbd6dedb12'),
 'dim': [14, 21],
 'instock': [{'loc': 'A', 'qty': 60}, {'loc': 'B', 'qty': 15}],
 'item': 'paper',
 'tags': ['red', 'blank', 'plain']}


## Unwind example

In [15]:
# Notice, tags is no longer an array
output = db.inventory.aggregate([
    # one output document per element of tags; the other fields are repeated
    {"$unwind": "$tags"},
])

pretty_print(output, n_to_print=4)

{'_id': ObjectId('6900f75e417f7dcbd6dedb10'),
 'dim': [14, 21],
 'instock': [{'loc': 'A', 'qty': 5}, {'loc': 'C', 'qty': 15}],
 'item': 'journal',
 'tags': 'blank'}
{'_id': ObjectId('6900f75e417f7dcbd6dedb10'),
 'dim': [14, 21],
 'instock': [{'loc': 'A', 'qty': 5}, {'loc': 'C', 'qty': 15}],
 'item': 'journal',
 'tags': 'red'}
{'_id': ObjectId('6900f75e417f7dcbd6dedb11'),
 'dim': [14, 21],
 'instock': [{'loc': 'C', 'qty': 5}],
 'item': 'notebook',
 'tags': 'red'}
{'_id': ObjectId('6900f75e417f7dcbd6dedb11'),
 'dim': [14, 21],
 'instock': [{'loc': 'C', 'qty': 5}],
 'item': 'notebook',
 'tags': 'blank'}


In [17]:
# can be followed up with a projection if we want - just to show data at a glance
output = db.inventory.aggregate([
    {"$unwind": "$tags"},
    # hide _id and instock so each result fits on one line
    {"$project": {"_id": 0, "instock": 0}},
])

pretty_print(output, n_to_print=-1)

{'dim': [14, 21], 'item': 'journal', 'tags': 'blank'}
{'dim': [14, 21], 'item': 'journal', 'tags': 'red'}
{'dim': [14, 21], 'item': 'notebook', 'tags': 'red'}
{'dim': [14, 21], 'item': 'notebook', 'tags': 'blank'}
{'dim': [14, 21], 'item': 'paper', 'tags': 'red'}
{'dim': [14, 21], 'item': 'paper', 'tags': 'blank'}
{'dim': [14, 21], 'item': 'paper', 'tags': 'plain'}
{'dim': [22.85, 30], 'item': 'planner', 'tags': 'blank'}
{'dim': [22.85, 30], 'item': 'planner', 'tags': 'red'}
{'dim': [10, 15.25], 'item': 'postcard', 'tags': 'blue'}


In [18]:
# unwinding is often followed by a grouping: this allows us to operate on values within arrays
output = db.inventory.aggregate([
    # one document per instock entry
    {"$unwind": "$instock"},
    # then add up the qty of those entries for each item
    {"$group": {"_id": "$item", "totalqty": {"$sum": "$instock.qty"}}},
])

pretty_print(output, n_to_print=-1)

{'_id': 'paper', 'totalqty': 75}
{'_id': 'journal', 'totalqty': 20}
{'_id': 'planner', 'totalqty': 45}
{'_id': 'postcard', 'totalqty': 50}
{'_id': 'notebook', 'totalqty': 5}


In [19]:
# now our join; reducing attributes returned by using liberal projections
output = db.inventory.aggregate([
    # self-join: attach to each item, as "otheritems", the inventory documents
    # that share an instock.loc value with it (the item itself included)
    {"$lookup": {
        "from": "inventory",
        "localField": "instock.loc",
        "foreignField": "instock.loc",
        "as": "otheritems",
    }},
    # drop fields we do not need to see in the joined result
    {"$project": {"_id": 0, "tags": 0, "dim": 0, "otheritems._id": 0}},
])

pretty_print(output, n_to_print=2)

{'instock': [{'loc': 'A', 'qty': 5}, {'loc': 'C', 'qty': 15}],
 'item': 'journal',
 'otheritems': [{'dim': [14, 21],
                 'instock': [{'loc': 'A', 'qty': 5}, {'loc': 'C', 'qty': 15}],
                 'item': 'journal',
                 'tags': ['blank', 'red']},
                {'dim': [14, 21],
                 'instock': [{'loc': 'C', 'qty': 5}],
                 'item': 'notebook',
                 'tags': ['red', 'blank']},
                {'dim': [14, 21],
                 'instock': [{'loc': 'A', 'qty': 60}, {'loc': 'B', 'qty': 15}],
                 'item': 'paper',
                 'tags': ['red', 'blank', 'plain']},
                {'dim': [22.85, 30],
                 'instock': [{'loc': 'A', 'qty': 40}, {'loc': 'B', 'qty': 5}],
                 'item': 'planner',
                 'tags': ['blank', 'red']}]}
{'instock': [{'loc': 'C', 'qty': 5}],
 'item': 'notebook',
 'otheritems': [{'dim': [14, 21],
                 'instock': [{'loc': 'A', 'qty': 5}, {'loc': 'C'

In [20]:
# the same join without the $project stage, to show what the full result looks like
output = db.inventory.aggregate([
    {"$lookup": {
        "from": "inventory",
        "localField": "instock.loc",
        "foreignField": "instock.loc",
        "as": "otheritems",
    }},
])

pretty_print(output, n_to_print=2)

{'_id': ObjectId('6900f75e417f7dcbd6dedb10'),
 'dim': [14, 21],
 'instock': [{'loc': 'A', 'qty': 5}, {'loc': 'C', 'qty': 15}],
 'item': 'journal',
 'otheritems': [{'_id': ObjectId('6900f75e417f7dcbd6dedb10'),
                 'dim': [14, 21],
                 'instock': [{'loc': 'A', 'qty': 5}, {'loc': 'C', 'qty': 15}],
                 'item': 'journal',
                 'tags': ['blank', 'red']},
                {'_id': ObjectId('6900f75e417f7dcbd6dedb11'),
                 'dim': [14, 21],
                 'instock': [{'loc': 'C', 'qty': 5}],
                 'item': 'notebook',
                 'tags': ['red', 'blank']},
                {'_id': ObjectId('6900f75e417f7dcbd6dedb12'),
                 'dim': [14, 21],
                 'instock': [{'loc': 'A', 'qty': 60}, {'loc': 'B', 'qty': 15}],
                 'item': 'paper',
                 'tags': ['red', 'blank', 'plain']},
                {'_id': ObjectId('6900f75e417f7dcbd6dedb13'),
                 'dim': [22.85, 30],
     

# Multiple Attribute Grouping

In [21]:
# reimport/redefine as needed
import json
import pymongo
import pprint

def pretty_print(output_collection, n_to_print=3):
    """
    Pretty-print the first `n_to_print` documents of an iterable.

    output_collection: any iterable of documents (e.g. a pymongo cursor).
    n_to_print: how many documents to print (default 3).
        0 prints nothing. A value the running count can never reach,
        such as -1 or None, prints everything.
    """
    if n_to_print == 0:
        # without this guard the stop condition below is never met,
        # so asking for zero documents would print all of them
        return
    for count, doc in enumerate(output_collection, start=1):
        pprint.pprint(doc)        # nicely formats each document
        if count == n_to_print:
            return                # stop before pulling another document

In [22]:
# connect to the MongoDB server on localhost
client = pymongo.MongoClient('mongodb://localhost')
# last expression of the cell, so the database names are displayed below
client.list_database_names()

['admin', 'aggquerydb', 'config', 'local', 'nobel_prizes']

In [23]:
# work in the aggquerydb database that holds the zips collection
db = client.aggquerydb

In [24]:
# sanity check: an empty filter counts every document in zips
db.zips.count_documents({})

29353

In [25]:
# look at a single document to see which fields are available
db.zips.find_one()

{'_id': '01001',
 'city': 'AGAWAM',
 'loc': [-72.622739, 42.070206],
 'pop': 15338,
 'state': 'MA'}

1. What is this doing?

In [27]:
# Population per city, state combo
output = db.zips.aggregate([
    # the group key is a compound document, so each (state, city) pair gets its own total
    {"$group": {
        "_id": {"state": "$state", "city": "$city"},
        "pop": {"$sum": "$pop"},
    }},
])
pretty_print(output, n_to_print=10)

{'_id': {'city': 'CHERRY VALLEY', 'state': 'NY'}, 'pop': 1642}
{'_id': {'city': 'SUSQUEHANNA', 'state': 'PA'}, 'pop': 5137}
{'_id': {'city': 'STANVILLE', 'state': 'KY'}, 'pop': 309}
{'_id': {'city': 'KIRKMAN', 'state': 'IA'}, 'pop': 395}
{'_id': {'city': 'TAVERNIER', 'state': 'FL'}, 'pop': 6196}
{'_id': {'city': 'SOPER', 'state': 'OK'}, 'pop': 935}
{'_id': {'city': 'ELMORE CITY', 'state': 'OK'}, 'pop': 2520}
{'_id': {'city': 'FARMERSVILLE STA', 'state': 'NY'}, 'pop': 395}
{'_id': {'city': 'NEW FRANKEN', 'state': 'WI'}, 'pop': 2640}
{'_id': {'city': 'MEDIA', 'state': 'IL'}, 'pop': 484}


In [28]:
# Average city population per state
output = db.zips.aggregate([
    # first pass: total population of each (state, city) pair
    {"$group": {
        "_id": {"state": "$state", "city": "$city"},
        "pop": {"$sum": "$pop"},
    }},
    # second pass: average those city totals within each state
    {"$group": {
        "_id": "$_id.state",
        "avgCityPop": {"$avg": "$pop"},
    }},
])
pretty_print(output, n_to_print=10)

{'_id': 'KS', 'avgCityPop': 3819.884259259259}
{'_id': 'SD', 'avgCityPop': 1839.6746031746031}
{'_id': 'NJ', 'avgCityPop': 15775.89387755102}
{'_id': 'CA', 'avgCityPop': 27756.42723880597}
{'_id': 'ND', 'avgCityPop': 1645.0309278350514}
{'_id': 'VT', 'avgCityPop': 2315.8765432098767}
{'_id': 'CT', 'avgCityPop': 14674.625}
{'_id': 'WY', 'avgCityPop': 3384.5373134328356}
{'_id': 'ME', 'avgCityPop': 3006.4901960784314}
{'_id': 'IN', 'avgCityPop': 9271.130434782608}


2. What is this doing?

In [23]:
output = db.zips.aggregate([
    {"$group": {
        "_id": {"state": "$state", "city": "$city"},
        "pop": {"$sum": "$pop"},
    }},
    {"$sort": {"pop": -1}},
    {"$group": {
        "_id": "$_id.state",
        "bigCity": {"$first": "$_id.city"},
        "bigPop": {"$first": "$pop"},
    }},
    {"$sort": {"bigPop": -1}},
    {"$project": {"bigPop": 0}},
])

pretty_print(output, n_to_print=10)

{'_id': 'IL', 'bigCity': 'CHICAGO'}
{'_id': 'NY', 'bigCity': 'BROOKLYN'}
{'_id': 'CA', 'bigCity': 'LOS ANGELES'}
{'_id': 'TX', 'bigCity': 'HOUSTON'}
{'_id': 'PA', 'bigCity': 'PHILADELPHIA'}
{'_id': 'MI', 'bigCity': 'DETROIT'}
{'_id': 'AZ', 'bigCity': 'PHOENIX'}
{'_id': 'FL', 'bigCity': 'MIAMI'}
{'_id': 'MD', 'bigCity': 'BALTIMORE'}
{'_id': 'TN', 'bigCity': 'MEMPHIS'}


3. What is this doing?

In [27]:
output = db.zips.aggregate([
    {"$group": {
        "_id": {"state": "$state", "city": "$city"},
        "pop": {"$sum": "$pop"},
    }},
    {"$sort": {"pop": -1}},
    {"$group": {
        "_id": "$_id.state",
        "bigCity": {"$first": "$_id.city"},
        "bigPop": {"$first": "$pop"},
    }},
    {"$sort": {"bigPop": -1}},
    {"$project": {
        "_id": 0,
        "state": "$_id",
        "bigCityDeets": {"name": "$bigCity", "pop": "$bigPop"},
    }},
])

pretty_print(output, n_to_print=10)

{'bigCityDeets': {'name': 'CHICAGO', 'pop': 2452177}, 'state': 'IL'}
{'bigCityDeets': {'name': 'BROOKLYN', 'pop': 2300504}, 'state': 'NY'}
{'bigCityDeets': {'name': 'LOS ANGELES', 'pop': 2102295}, 'state': 'CA'}
{'bigCityDeets': {'name': 'HOUSTON', 'pop': 2095918}, 'state': 'TX'}
{'bigCityDeets': {'name': 'PHILADELPHIA', 'pop': 1610956}, 'state': 'PA'}
{'bigCityDeets': {'name': 'DETROIT', 'pop': 963243}, 'state': 'MI'}
{'bigCityDeets': {'name': 'PHOENIX', 'pop': 890853}, 'state': 'AZ'}
{'bigCityDeets': {'name': 'MIAMI', 'pop': 825232}, 'state': 'FL'}
{'bigCityDeets': {'name': 'BALTIMORE', 'pop': 733081}, 'state': 'MD'}
{'bigCityDeets': {'name': 'MEMPHIS', 'pop': 632837}, 'state': 'TN'}


# [Extra] Aggregation Pipeline Demos

Nobel laureate data from Kaggle: https://www.kaggle.com/datasets/imdevskp/nobel-prize/data

In [30]:
client = pymongo.MongoClient('mongodb://localhost')

client.drop_database('nobel_prizes') # if already exists
db = client.nobel_prizes

# run this cell to insert into the collection prizes
# (insert_many receives whatever prize.json parses to in a single call)
with open('data/prize.json', encoding='utf-8') as f:
    db.prizes.insert_many(json.load(f))

# confirm that the database and its collection now exist
print("databases", client.list_database_names())
print("collections in nobel_prizes database", db.list_collection_names())

databases ['admin', 'aggquerydb', 'config', 'local', 'nobel_prizes']
collections in nobel_prizes database ['prizes']


In [None]:
# A
output = db.prizes.aggregate([
    {"$group": {"_id": "$category", "awardyears": {"$sum": 1}}},
])
pretty_print(output, n_to_print=-1)

In [None]:
# B
output = db.prizes.aggregate([
    {"$group": {"_id": "$category", "awardyears": {"$sum": 1}}},
    {"$match": {"awardyears": {"$lt": 100}}},
])
pretty_print(output, n_to_print=-1)

In [None]:
# C
output = db.prizes.aggregate([
    {"$group": {"_id": "$category", "awardyears": {"$sum": 1}}},
    {"$match": {"awardyears": {"$lt": 100}}},
    {"$project": {"_id": 0, "awardyears": 1}},
])
pretty_print(output, n_to_print=-1)

In [None]:
# D
output = db.prizes.aggregate([
    {"$unwind": "$laureates"},
    {"$group": {"_id": "$category", "awards": {"$sum": 1}}},
])
pretty_print(output, n_to_print=-1)

In [None]:
# E
output = db.prizes.aggregate([
    {"$unwind": "$laureates"},
    {"$group": {
        "_id": {"category": "$category", "year": "$year"},
        "awards": {"$sum": 1},
    }},
])
pretty_print(output, n_to_print=10)

In [None]:
# F
output = db.prizes.aggregate([
    {"$unwind": "$laureates"},
    {"$group": {
        "_id": {"category": "$category", "year": "$year"},
        "awards": {"$sum": 1},
    }},
    {"$sort": {"awards": -1}},
])
pretty_print(output, n_to_print=15)

In [None]:
# G
output = db.prizes.aggregate([
    {"$unwind": "$laureates"},
    {"$group": {
        "_id": {"category": "$category", "year": "$year"},
        "awards": {"$sum": 1},
    }},
    {"$group": {
        "_id": "$_id.category",
        "avgawards": {"$avg": "$awards"},
    }},
])
pretty_print(output, n_to_print=-1)