# Lecture 18: MongoDB I

In [None]:
import json
import pymongo
import pprint

## Connect to Database on localhost

Nobel laureate data from Kaggle: https://www.kaggle.com/datasets/imdevskp/nobel-prize/data

In [None]:
client = pymongo.MongoClient('mongodb://localhost')
client.list_database_names()

In [None]:
# client.drop_database('nobel_prizes') # if already exists

Connect to an empty database called `nobel_prizes`

In [None]:
db = client.nobel_prizes
# db = client['nobel_prizes'] # also works
db

In [None]:
db.list_collection_names()

<br/><br/>Within this empty database, access an empty collection called `prizes`.

In [None]:
collection = db.prizes
# collection = db['prizes'] # also works
collection

On an empty collection, `find_one()` returns `None` (i.e., the cell displays nothing in a notebook); wrap the call in `print()` to see the `None` explicitly.

In [None]:
db.prizes.find_one({})

In [None]:
print(db.prizes.find_one({}))

## Load data from JSON 

In [None]:
# run this cell to insert into the collection prizes
# (note: running it again without dropping the database first will try to
# insert the same documents a second time)
with open('data/prize.json', encoding='utf-8') as prize_file:
    # json.load parses directly from the open file object
    db.prizes.insert_many(json.load(prize_file))

In [None]:
# the database and the collection are now both valid
client.list_database_names()

In [None]:
db.list_collection_names()

# Retrieval Queries

**Select** all field-value pairs to output.

In [None]:
db.prizes.find_one({})

PyMongo is smart: `find()` avoids querying the **full collection** up front by instead returning an iterator (here, a `Cursor`):

In [None]:
db.prizes.find({})

It's not recommended, but you could force a query of the full collection by converting the cursor to a list (e.g., `list(db.prizes.find({}))`).

Instead let's iterate over the collection a tiny bit so that we can see how to pretty print each document with `pprint`:

In [None]:
def pretty_print(output_collection, n_to_print=3):
    """
    Pretty-print documents from an iterable, one after another.

    output_collection: any iterable of documents (e.g., a pymongo Cursor).
    n_to_print: stop once this many documents have been printed.
        A value the running count never reaches (e.g., -1) prints
        every document.
    """
    for n_printed, document in enumerate(output_collection, start=1):
        pprint.pprint(document)   # nicely formats each document
        if n_printed == n_to_print:
            break

In [None]:
output = db.prizes.find({})
pretty_print(output)

# Selection (with Predicates)

In [None]:
# get a document that has the exact FV pair
db.prizes.find_one({"category": "chemistry"})

In [None]:
# the comma functions as the "and" operator
db.prizes.find_one({"category": "chemistry", "year": 2020})

## Dollar `$` Notation: Special MongoDB Keywords

(we defined the `pretty_print()` function earlier)

In [None]:
output = db.prizes.find({"$or": [{"category": "chemistry"}, {"year": 2020}]})

pretty_print(output, n_to_print=4)

In [None]:
output = db.prizes.find({"year": {"$gt": 2018}}, skip=6) # skip a bit down the collection just because

pretty_print(output)

## Dot `.` Notation: Traverse Trees

In [None]:
output = db.prizes.find({"laureates.0.surname": "Curie"})
pretty_print(output)

In [None]:
output = db.prizes.find({"laureates.1.surname": "Curie"})
pretty_print(output)

In [None]:
output = db.prizes.find({"laureates.surname": "Curie"})
pretty_print(output)

# Projection

In [None]:
output = db.prizes.find({}, {"year": 1, "category": 1})

pretty_print(output)

<br/><br/><br/><br/><br/>

## Projection Exercise

In [None]:
output = db.prizes.find({}, {"year": 1, "category": 1, "_id": 0})

pretty_print(output)

In [None]:
output = db.prizes.find({}, {"year": 0, "category": 0})

pretty_print(output)

In [None]:
# This projection mixes exclusion ("year": 0) with inclusion ("category": 1),
# which MongoDB rejects (only "_id" may be excluded alongside inclusions).
# find() returns a lazy cursor, so the error only surfaces once pretty_print
# starts iterating. Catch and display it so the demonstration stays visible
# and "Restart & Run All" can continue past this cell.
try:
    output = db.prizes.find({}, {"year": 0, "category": 1})

    pretty_print(output)
except pymongo.errors.OperationFailure as err:
    print(f"OperationFailure: {err}")

In [None]:
db.prizes.find_one({"overallMotivation": {"$exists": 1}})

# Sorting and Limits

In [None]:
# Peace prizes sorted by year, ascending; only year, category and the
# laureates' names are projected.
output = db.prizes.find(
    {"category": "peace"},
    {
        "_id": 0,
        "category": 1,
        "year": 1,
        "laureates.firstname": 1,
        "laureates.surname": 1,
    },
).sort("year", pymongo.ASCENDING)  # ascending is also the default direction

pretty_print(output)

In [None]:
# Same query as before, but sorted by year in descending order.
output = db.prizes.find(
    {"category": "peace"},
    {
        "_id": 0,
        "category": 1,
        "year": 1,
        "laureates.firstname": 1,
        "laureates.surname": 1,
    },
).sort("year", pymongo.DESCENDING)

pretty_print(output)

In [None]:
# Sort on several keys by passing a list of (field, direction) pairs:
# year ascending first, then category descending.
output = db.prizes.find(
    {"category": "peace"},
    {
        "_id": 0,
        "category": 1,
        "year": 1,
        "laureates.firstname": 1,
        "laureates.surname": 1,
    },
).sort(
    [("year", pymongo.ASCENDING), ("category", pymongo.DESCENDING)]
)

pretty_print(output)

In [None]:
# Same multi-key sort, then cap the result at two documents with limit().
output = db.prizes.find(
    {"category": "peace"},
    {
        "_id": 0,
        "category": 1,
        "year": 1,
        "laureates.firstname": 1,
        "laureates.surname": 1,
    },
).sort(
    [("year", pymongo.ASCENDING), ("category", pymongo.DESCENDING)]
).limit(2)

pretty_print(output)

# Aggregation Queries

Zips JSON from the MongoDB Aggregation Tutorial: https://www.mongodb.com/docs/manual/tutorial/aggregation-zip-code-data-set/

## Load `zips.json` into new `aggquerydb.zips`

For the sake of simplicity, we'll make a new collection `zips` in a new `aggquerydb` database.

In [None]:
# reimport/redefine as needed
import json
import pymongo
import pprint

def pretty_print(output_collection, n_to_print=3):
    """
    Pretty-print documents from an iterable, one after another.

    output_collection: any iterable of documents (e.g., a pymongo Cursor).
    n_to_print: stop once this many documents have been printed.
        A value the running count never reaches (e.g., -1) prints
        every document.
    """
    for n_printed, document in enumerate(output_collection, start=1):
        pprint.pprint(document)   # nicely formats each document
        if n_printed == n_to_print:
            break

In [None]:
client = pymongo.MongoClient('mongodb://localhost')
client.list_database_names()

In [None]:
# run this cell to make the new collection
# and insert zipcode documents

client.drop_database('aggquerydb') # if already exists
db = client.aggquerydb

# The file is parsed one JSON document per line. Iterate the file lazily
# (no readlines() copy) and skip blank lines, which json.loads would reject.
with open('data/zips.json', encoding='utf-8') as f:
    zip_docs = [json.loads(line) for line in f if line.strip()]

# One bulk insert instead of a separate server round trip per document.
# insert_many raises on an empty list, so only call it when there is data.
if zip_docs:
    db.zips.insert_many(zip_docs)

In [None]:
db.zips.count_documents({})

In [None]:
output = db.zips.aggregate([
    # sum the `pop` field over all documents that share the same `state`
    {"$group": {"_id": "$state", "totalPop": {"$sum": "$pop"}}},
    # keep only the groups whose total is at least 15 million
    {"$match": {"totalPop": {"$gte": 15_000_000}}},
    # order the remaining groups by total, descending
    {"$sort": {"totalPop": -1}},
])

pretty_print(output)

# Aggregation Queries: Unwind/Lookup

Make a new collection, `inventory`.

In [None]:
# stay in the aggquerydb database
db = client.aggquerydb

In [None]:
# Start from an empty `inventory` collection so that re-running this cell
# does not insert a second copy of every document (duplicates would inflate
# the quantities summed by the $group cell further down and repeat rows in
# the $unwind / $lookup results). Dropping a missing collection is harmless.
db.inventory.drop()

db.inventory.insert_many([
    {
        "item": "journal",
        "tags": ["blank", "red"],
        "dim": [14, 21],
        "instock": [{"loc": "A", "qty": 5}, {"loc": "C", "qty": 15}],
    },
    {
        "item": "notebook",
        "tags": ["red", "blank"],
        "dim": [14, 21],
        "instock": [{"loc": "C", "qty": 5}],
    },
    {
        "item": "paper",
        "tags": ["red", "blank", "plain"],
        "dim": [14, 21],
        "instock": [{"loc": "A", "qty": 60}, {"loc": "B", "qty": 15}],
    },
    {
        "item": "planner",
        "tags": ["blank", "red"],
        "dim": [22.85, 30],
        "instock": [{"loc": "A", "qty": 40}, {"loc": "B", "qty": 5}],
    },
    {
        "item": "postcard",
        "tags": ["blue"],
        "dim": [10, 15.25],
        "instock": [{"loc": "B", "qty": 15}, {"loc": "C", "qty": 35}],
    },
]);  # trailing semicolon suppresses the InsertManyResult repr in the notebook

In [None]:
output = db.inventory.aggregate([
    # $unwind outputs one document per element of the `tags` array
    {"$unwind": "$tags"},
    # hide `_id` and `instock` in the output
    {"$project": {"_id": 0, "instock": 0}},
])

# n_to_print=-1 prints every resulting document
pretty_print(output, n_to_print=-1)

In [None]:
output = db.inventory.aggregate([
    # one document per `instock` entry ...
    {"$unwind": "$instock"},
    # ... then add up the `qty` of those entries for each item
    {"$group": {"_id": "$item", "totalqty": {"$sum": "$instock.qty"}}},
])

# n_to_print=-1 prints every resulting document
pretty_print(output, n_to_print=-1)

In [None]:
output = db.inventory.aggregate([
    # join `inventory` with itself on `instock.loc`; the matching documents
    # are stored in a new array field named `otheritems`
    {
        "$lookup": {
            "from": "inventory",
            "localField": "instock.loc",
            "foreignField": "instock.loc",
            "as": "otheritems",
        }
    },
    # hide `_id`, `tags` and `dim` in the output
    {"$project": {"_id": 0, "tags": 0, "dim": 0}},
])

pretty_print(output, n_to_print=1)