# Mongo DB I : Retrieval Queries

In [1]:
import json
import pymongo
import pprint

## Connect to Database on localhost

Nobel laureate data from Kaggle: https://www.kaggle.com/datasets/imdevskp/nobel-prize/data

In [2]:
client = pymongo.MongoClient('mongodb://localhost')
client.list_database_names()

['admin', 'config', 'local']

In [3]:
# Start from a clean slate so the rest of the notebook always sees a freshly
# loaded database, even when it has been run before.
client.drop_database('nobel_prizes') # if already exists
# NOTE(review): 'aggquerydb' is not used anywhere else in this notebook --
# presumably it belongs to a companion notebook; confirm it is safe to drop here.
client.drop_database('aggquerydb') # if already exists

Connect to an empty database called `nobel_prizes`

In [4]:
db = client.nobel_prizes
db

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'nobel_prizes')

In [5]:
db.list_collection_names()

[]

<br/><br/>Within this empty database, access an empty collection called `prizes`.

In [6]:
collection = db.prizes
# collection = db['prizes'] # also works
collection

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'nobel_prizes'), 'prizes')

Because the collection is still empty, `find_one()` returns `None` (which a notebook displays as no output at all):

In [7]:
db.prizes.find_one({})

In [8]:
print(db.prizes.find_one({}))

None


## Load data from JSON 

In [9]:
# Run this cell to load the Nobel prize documents into the `prizes` collection.
# NOTE: insert_many() is not idempotent -- re-running this cell without first
# re-running the drop_database() cell above inserts every document a second time.
with open('data/prize.json', encoding='utf-8') as f:
    # json.load() parses straight from the file handle (no intermediate string)
    db.prizes.insert_many(json.load(f))

In [10]:
# the database and the collection are now both valid
client.list_database_names()

['admin', 'config', 'local', 'nobel_prizes']

In [11]:
db.list_collection_names()

['prizes']

# Retrieval Queries

**Select** all field-value pairs to output.

In [12]:
db.prizes.find_one({})

{'_id': ObjectId('68fa549f985bfa397937db78'),
 'year': 2020,
 'category': 'chemistry',
 'laureates': [{'id': '991',
   'firstname': 'Emmanuelle',
   'surname': 'Charpentier',
   'motivation': '"for the development of a method for genome editing"',
   'share': '2'},
  {'id': '992',
   'firstname': 'Jennifer A.',
   'surname': 'Doudna',
   'motivation': '"for the development of a method for genome editing"',
   'share': '2'}]}

PyMongo's `find()` does not fetch the **full collection** up front; instead it returns a lazy iterator (here, a `Cursor`) that retrieves documents as you iterate over it:

In [13]:
db.prizes.find({})

<pymongo.synchronous.cursor.Cursor at 0x798338363690>

In [14]:
docs = db.prizes.find()
next(docs)

{'_id': ObjectId('68fa549f985bfa397937db78'),
 'year': 2020,
 'category': 'chemistry',
 'laureates': [{'id': '991',
   'firstname': 'Emmanuelle',
   'surname': 'Charpentier',
   'motivation': '"for the development of a method for genome editing"',
   'share': '2'},
  {'id': '992',
   'firstname': 'Jennifer A.',
   'surname': 'Doudna',
   'motivation': '"for the development of a method for genome editing"',
   'share': '2'}]}

In [15]:
# remember Python iterators/generators!
next(docs)

{'_id': ObjectId('68fa549f985bfa397937db79'),
 'year': 2020,
 'category': 'economics',
 'laureates': [{'id': '995',
   'firstname': 'Paul',
   'surname': 'Milgrom',
   'motivation': '"for improvements to auction theory and inventions of new auction formats"',
   'share': '2'},
  {'id': '996',
   'firstname': 'Robert',
   'surname': 'Wilson',
   'motivation': '"for improvements to auction theory and inventions of new auction formats"',
   'share': '2'}]}

It's not recommended, but you could force the **full collection** to be fetched at once by casting the cursor to a list (e.g., `list(db.prizes.find({}))`).

Instead let's iterate over the collection a tiny bit so that we can see how to pretty print each document with `pprint`:

In [None]:
# list(db.prizes.find({}))

In [16]:
def pretty_print(output_collection, n_to_print=3):
    """
    Pretty-print the first `n_to_print` documents of an iterable.

    output_collection: any iterable of documents (e.g. a pymongo Cursor or a
        list of dicts). It is consumed lazily: no more than `n_to_print`
        items are pulled from it.
    n_to_print: maximum number of documents to print (default 3).
        Note: a negative value such as -1 (or None) prints everything;
        0 prints nothing.

    Returns None; the documents are written to stdout via pprint.
    """
    if n_to_print == 0:
        # the counter below starts at 1, so without this guard a request for
        # zero documents would never hit the stop condition and print them all
        return
    for n_printed, doc in enumerate(output_collection, start=1):
        pprint.pprint(doc)        # nicely formats each document
        if n_printed == n_to_print:
            return                # stop before pulling another document

In [17]:
output = db.prizes.find({})
pretty_print(output)

{'_id': ObjectId('68fa549f985bfa397937db78'),
 'category': 'chemistry',
 'laureates': [{'firstname': 'Emmanuelle',
                'id': '991',
                'motivation': '"for the development of a method for genome '
                              'editing"',
                'share': '2',
                'surname': 'Charpentier'},
               {'firstname': 'Jennifer A.',
                'id': '992',
                'motivation': '"for the development of a method for genome '
                              'editing"',
                'share': '2',
                'surname': 'Doudna'}],
 'year': 2020}
{'_id': ObjectId('68fa549f985bfa397937db79'),
 'category': 'economics',
 'laureates': [{'firstname': 'Paul',
                'id': '995',
                'motivation': '"for improvements to auction theory and '
                              'inventions of new auction formats"',
                'share': '2',
                'surname': 'Milgrom'},
               {'firstname': 'Robert',
 

# Selection (with Predicates)

In [18]:
# get a document that has the exact FV pair
db.prizes.find_one({"category": "chemistry"})

{'_id': ObjectId('68fa549f985bfa397937db78'),
 'year': 2020,
 'category': 'chemistry',
 'laureates': [{'id': '991',
   'firstname': 'Emmanuelle',
   'surname': 'Charpentier',
   'motivation': '"for the development of a method for genome editing"',
   'share': '2'},
  {'id': '992',
   'firstname': 'Jennifer A.',
   'surname': 'Doudna',
   'motivation': '"for the development of a method for genome editing"',
   'share': '2'}]}

In [19]:
# multiple field-value pairs in one filter document are implicitly ANDed:
# a document must match every pair (here: category is chemistry AND year is 2000)
db.prizes.find_one({"category": "chemistry", "year": 2000})

{'_id': ObjectId('68fa549f985bfa397937dbf0'),
 'year': 2000,
 'category': 'chemistry',
 'laureates': [{'id': '729',
   'firstname': 'Alan',
   'surname': 'Heeger',
   'motivation': '"for the discovery and development of conductive polymers"',
   'share': '3'},
  {'id': '730',
   'firstname': 'Alan',
   'surname': 'MacDiarmid',
   'motivation': '"for the discovery and development of conductive polymers"',
   'share': '3'},
  {'id': '731',
   'firstname': 'Hideki',
   'surname': 'Shirakawa',
   'motivation': '"for the discovery and development of conductive polymers"',
   'share': '3'}]}

## Dollar `$` Notation: Special MongoDB Keywords

(we defined the `pretty_print()` function earlier)

In [20]:
# `$or` takes a list of filter documents; a prize matches if ANY of them matches
chemistry_or_2020 = {
    "$or": [
        {"category": "chemistry"},
        {"year": 2020},
    ]
}
output = db.prizes.find(chemistry_or_2020)

pretty_print(output, n_to_print=4)

{'_id': ObjectId('68fa549f985bfa397937db78'),
 'category': 'chemistry',
 'laureates': [{'firstname': 'Emmanuelle',
                'id': '991',
                'motivation': '"for the development of a method for genome '
                              'editing"',
                'share': '2',
                'surname': 'Charpentier'},
               {'firstname': 'Jennifer A.',
                'id': '992',
                'motivation': '"for the development of a method for genome '
                              'editing"',
                'share': '2',
                'surname': 'Doudna'}],
 'year': 2020}
{'_id': ObjectId('68fa549f985bfa397937db79'),
 'category': 'economics',
 'laureates': [{'firstname': 'Paul',
                'id': '995',
                'motivation': '"for improvements to auction theory and '
                              'inventions of new auction formats"',
                'share': '2',
                'surname': 'Milgrom'},
               {'firstname': 'Robert',
 

In [21]:
# `$gt` is a comparison operator: keep prizes whose year is strictly greater than 2018
output = db.prizes.find({"year": {"$gt": 2018}}, skip=6) 
# skip=6 starts six matches into the result set, just to show different documents

pretty_print(output)

{'_id': ObjectId('68fa549f985bfa397937db7e'),
 'category': 'chemistry',
 'laureates': [{'firstname': 'John',
                'id': '976',
                'motivation': '"for the development of lithium-ion batteries"',
                'share': '3',
                'surname': 'Goodenough'},
               {'firstname': 'M. Stanley',
                'id': '977',
                'motivation': '"for the development of lithium-ion batteries"',
                'share': '3',
                'surname': 'Whittingham'},
               {'firstname': 'Akira',
                'id': '978',
                'motivation': '"for the development of lithium-ion batteries"',
                'share': '3',
                'surname': 'Yoshino'}],
 'year': 2019}
{'_id': ObjectId('68fa549f985bfa397937db7f'),
 'category': 'economics',
 'laureates': [{'firstname': 'Abhijit',
                'id': '982',
                'motivation': '"for their experimental approach to alleviating '
                              '

## Dot `.` Notation: Traverse Trees

In [23]:
# old query from before to set the stage
output = db.prizes.find({"category": "chemistry"})
pretty_print(output)

{'_id': ObjectId('68fa549f985bfa397937db78'),
 'category': 'chemistry',
 'laureates': [{'firstname': 'Emmanuelle',
                'id': '991',
                'motivation': '"for the development of a method for genome '
                              'editing"',
                'share': '2',
                'surname': 'Charpentier'},
               {'firstname': 'Jennifer A.',
                'id': '992',
                'motivation': '"for the development of a method for genome '
                              'editing"',
                'share': '2',
                'surname': 'Doudna'}],
 'year': 2020}
{'_id': ObjectId('68fa549f985bfa397937db7e'),
 'category': 'chemistry',
 'laureates': [{'firstname': 'John',
                'id': '976',
                'motivation': '"for the development of lithium-ion batteries"',
                'share': '3',
                'surname': 'Goodenough'},
               {'firstname': 'M. Stanley',
                'id': '977',
                'motivatio

In [25]:
# array positions are zero-indexed: `laureates.0` is the FIRST laureate of each prize
output = db.prizes.find({"laureates.0.surname": "Curie"})
pretty_print(output)

{'_id': ObjectId('68fa549f985bfa397937ddcd'),
 'category': 'chemistry',
 'laureates': [{'firstname': 'Marie',
                'id': '6',
                'motivation': '"in recognition of her services to the '
                              'advancement of chemistry by the discovery of '
                              'the elements radium and polonium, by the '
                              'isolation of radium and the study of the nature '
                              'and compounds of this remarkable element"',
                'share': '1',
                'surname': 'Curie'}],
 'year': 1911}


In [24]:
# `laureates.1` is the SECOND laureate of each prize
output = db.prizes.find({"laureates.1.surname": "Curie"})
pretty_print(output)

{'_id': ObjectId('67edba76a78cf4ba3fffc356'),
 'category': 'physics',
 'laureates': [{'firstname': 'Henri',
                'id': '4',
                'motivation': '"in recognition of the extraordinary services '
                              'he has rendered by his discovery of spontaneous '
                              'radioactivity"',
                'share': '2',
                'surname': 'Becquerel'},
               {'firstname': 'Pierre',
                'id': '5',
                'motivation': '"in recognition of the extraordinary services '
                              'they have rendered by their joint researches on '
                              'the radiation phenomena discovered by Professor '
                              'Henri Becquerel"',
                'share': '4',
                'surname': 'Curie'},
               {'firstname': 'Marie',
                'id': '6',
                'motivation': '"in recognition of the extraordinary services '
                  

In [27]:
# with no array index, a prize matches if ANY element of `laureates`
# has the surname "Curie", whatever its position in the array
output = db.prizes.find({"laureates.surname": "Curie"})
pretty_print(output)

{'_id': ObjectId('68fa549f985bfa397937ddcd'),
 'category': 'chemistry',
 'laureates': [{'firstname': 'Marie',
                'id': '6',
                'motivation': '"in recognition of her services to the '
                              'advancement of chemistry by the discovery of '
                              'the elements radium and polonium, by the '
                              'isolation of radium and the study of the nature '
                              'and compounds of this remarkable element"',
                'share': '1',
                'surname': 'Curie'}],
 'year': 1911}
{'_id': ObjectId('68fa549f985bfa397937ddf8'),
 'category': 'physics',
 'laureates': [{'firstname': 'Henri',
                'id': '4',
                'motivation': '"in recognition of the extraordinary services '
                              'he has rendered by his discovery of spontaneous '
                              'radioactivity"',
                'share': '2',
                'surname': 

# Projection

In [29]:
# the second argument to find() is the projection: 1 means "include this field";
# `_id` is returned as well unless it is explicitly excluded
output = db.prizes.find({}, {"year": 1, "category": 1})

pretty_print(output)

{'_id': ObjectId('68fa549f985bfa397937db78'),
 'category': 'chemistry',
 'year': 2020}
{'_id': ObjectId('68fa549f985bfa397937db79'),
 'category': 'economics',
 'year': 2020}
{'_id': ObjectId('68fa549f985bfa397937db7a'),
 'category': 'literature',
 'year': 2020}


In [30]:
# Alternate syntax: a list of field names to include
# (same result as {"year": 1, "category": 1})
output = db.prizes.find({}, ["year", "category"])

pretty_print(output)

{'_id': ObjectId('68fa549f985bfa397937db78'),
 'category': 'chemistry',
 'year': 2020}
{'_id': ObjectId('68fa549f985bfa397937db79'),
 'category': 'economics',
 'year': 2020}
{'_id': ObjectId('68fa549f985bfa397937db7a'),
 'category': 'literature',
 'year': 2020}


<br/><br/><br/>

## Projection Exercise

In [36]:
# Including some, explicitly excluding _id
output = db.prizes.find({}, {"year": 1, "category": 1, "_id": 0})

pretty_print(output)

{'category': 'chemistry', 'year': 2020}
{'category': 'economics', 'year': 2020}
{'category': 'literature', 'year': 2020}


In [37]:
# Excluding some: 0 means "drop this field"; every other field
# (including _id) is still returned
output = db.prizes.find({}, {"year": 0, "category": 0})

pretty_print(output)

{'_id': ObjectId('68fa549f985bfa397937db78'),
 'laureates': [{'firstname': 'Emmanuelle',
                'id': '991',
                'motivation': '"for the development of a method for genome '
                              'editing"',
                'share': '2',
                'surname': 'Charpentier'},
               {'firstname': 'Jennifer A.',
                'id': '992',
                'motivation': '"for the development of a method for genome '
                              'editing"',
                'share': '2',
                'surname': 'Doudna'}]}
{'_id': ObjectId('68fa549f985bfa397937db79'),
 'laureates': [{'firstname': 'Paul',
                'id': '995',
                'motivation': '"for improvements to auction theory and '
                              'inventions of new auction formats"',
                'share': '2',
                'surname': 'Milgrom'},
               {'firstname': 'Robert',
                'id': '996',
                'motivation': '"for imp

In [38]:
# Can't both include and exclude (except for _id).
# The error is the point of this cell, so catch and display it: an uncaught
# exception would stop "Run All" before the cells below get a chance to run.
try:
    output = db.prizes.find({}, {"year": 0, "category": 1})
    # find() is lazy, so the error surfaces here, when the cursor is first iterated
    pretty_print(output)
except pymongo.errors.OperationFailure as err:
    print(f"OperationFailure: {err}")

OperationFailure: Cannot do inclusion on field category in exclusion projection, full error: {'ok': 0.0, 'errmsg': 'Cannot do inclusion on field category in exclusion projection', 'code': 31253, 'codeName': 'Location31253'}

# Sorting and Limits

In [40]:
# Peace prizes only; keep the year, category and laureate names (no _id)
peace_filter = {"category": "peace"}
name_projection = {
    "_id": 0,
    "category": 1,
    "year": 1,
    "laureates.firstname": 1,
    "laureates.surname": 1,
}

# sort() with just a field name orders ascending (earliest year first)
output = db.prizes.find(peace_filter, name_projection).sort("year")

pretty_print(output)

{'category': 'peace',
 'laureates': [{'firstname': 'Henry', 'surname': 'Dunant'},
               {'firstname': 'Frédéric', 'surname': 'Passy'}],
 'year': 1901}
{'category': 'peace',
 'laureates': [{'firstname': 'Élie', 'surname': 'Ducommun'},
               {'firstname': 'Albert', 'surname': 'Gobat'}],
 'year': 1902}
{'category': 'peace',
 'laureates': [{'firstname': 'Randal', 'surname': 'Cremer'}],
 'year': 1903}


In [41]:
# Peace prizes only; keep the year, category and laureate names (no _id)
peace_filter = {"category": "peace"}
name_projection = {
    "_id": 0,
    "category": 1,
    "year": 1,
    "laureates.firstname": 1,
    "laureates.surname": 1,
}

# a direction of -1 sorts descending (latest year first)
peace_prizes = db.prizes.find(peace_filter, name_projection)
output = peace_prizes.sort("year", -1)

pretty_print(output)

{'category': 'peace',
 'laureates': [{'firstname': 'World Food Programme'}],
 'year': 2020}
{'category': 'peace',
 'laureates': [{'firstname': 'Abiy', 'surname': 'Ahmed Ali'}],
 'year': 2019}
{'category': 'peace',
 'laureates': [{'firstname': 'Denis', 'surname': 'Mukwege'},
               {'firstname': 'Nadia', 'surname': 'Murad'}],
 'year': 2018}


In [42]:
# Peace prizes only; keep the year, category and laureate names (no _id)
peace_filter = {"category": "peace"}
name_projection = {
    "_id": 0,
    "category": 1,
    "year": 1,
    "laureates.firstname": 1,
    "laureates.surname": 1,
}

# Compound sort: a list of (field, direction) pairs applied left to right.
# Every matched document has category "peace", so the second key only
# illustrates the syntax here -- it cannot reorder anything.
sort_keys = [("year", 1), ("category", -1)]
output = db.prizes.find(peace_filter, name_projection).sort(sort_keys)

pretty_print(output)

{'category': 'peace',
 'laureates': [{'firstname': 'Henry', 'surname': 'Dunant'},
               {'firstname': 'Frédéric', 'surname': 'Passy'}],
 'year': 1901}
{'category': 'peace',
 'laureates': [{'firstname': 'Élie', 'surname': 'Ducommun'},
               {'firstname': 'Albert', 'surname': 'Gobat'}],
 'year': 1902}
{'category': 'peace',
 'laureates': [{'firstname': 'Randal', 'surname': 'Cremer'}],
 'year': 1903}


In [43]:
# Peace prizes only; keep the year, category and laureate names (no _id)
peace_filter = {"category": "peace"}
name_projection = {
    "_id": 0,
    "category": 1,
    "year": 1,
    "laureates.firstname": 1,
    "laureates.surname": 1,
}
sort_keys = [("year", 1), ("category", -1)]

# limit(2) caps the sorted result at its first two documents
peace_prizes = db.prizes.find(peace_filter, name_projection)
output = peace_prizes.sort(sort_keys).limit(2)

pretty_print(output)

{'category': 'peace',
 'laureates': [{'firstname': 'Henry', 'surname': 'Dunant'},
               {'firstname': 'Frédéric', 'surname': 'Passy'}],
 'year': 1901}
{'category': 'peace',
 'laureates': [{'firstname': 'Élie', 'surname': 'Ducommun'},
               {'firstname': 'Albert', 'surname': 'Gobat'}],
 'year': 1902}


In [39]:
# Extra: `$exists` matches documents that contain the field at all;
# only some prizes carry an `overallMotivation`

db.prizes.find_one({"overallMotivation": {"$exists": 1}})

{'_id': ObjectId('68fa549f985bfa397937db82'),
 'year': 2019,
 'category': 'physics',
 'overallMotivation': '"for contributions to our understanding of the evolution of the universe and Earth’s place in the cosmos"',
 'laureates': [{'id': '973',
   'firstname': 'James',
   'surname': 'Peebles',
   'motivation': '"for theoretical discoveries in physical cosmology"',
   'share': '2'},
  {'id': '974',
   'firstname': 'Michel',
   'surname': 'Mayor',
   'motivation': '"for the discovery of an exoplanet orbiting a solar-type star"',
   'share': '4'},
  {'id': '975',
   'firstname': 'Didier',
   'surname': 'Queloz',
   'motivation': '"for the discovery of an exoplanet orbiting a solar-type star"',
   'share': '4'}]}