## MongoDB

### Packages 

In [1]:
%%bash
echo $VIRTUAL_ENV
pip3 install --upgrade --upgrade-strategy only-if-needed pymongo dnspython
# pip3 list | wc -l

/home/vagrant/venv/bdl03-1-jpy-3.8
Requirement already up-to-date: pymongo in /home/vagrant/venv/bdl03-1-jpy-3.8/lib/python3.8/site-packages (3.11.0)
Requirement already up-to-date: dnspython in /home/vagrant/venv/bdl03-1-jpy-3.8/lib/python3.8/site-packages (2.0.0)


### Global configuration

In [2]:
HOST_mongo = 'localhost:27017'
USER_mongo = ""
PASS_mongo = ""
if USER_mongo:
    credentials=f"{USER_mongo}:{PASS_mongo}@"
else:
    credentials=""

### pymongo

#### Just print

In [3]:
import pymongo
from pprint import pprint

client = pymongo.MongoClient(f"mongodb://{HOST_mongo}")
db = client['mflix']

project = {"$project": {
    "title": 1,
    "genre": 1,
    "viewerVotes": 1,
    "viewerRating": 1,
    "_id": 0,
}}

match = {"$match": {
    "viewerVotes": {"$gte": 1000},
    "runtime": {"$gte": 150},
}}

sort = {"$sort": {"year": -1, "viewerVotes": -1}}
limit = {"$limit": 5}
skip = {"$skip": 1000}
pipeline = [match, project, skip, sort, limit]

pprint(pipeline)

for doc in db.movies.aggregate(pipeline):
    pprint(doc)


[{'$match': {'runtime': {'$gte': 150}, 'viewerVotes': {'$gte': 1000}}},
 {'$project': {'_id': 0,
               'genre': 1,
               'title': 1,
               'viewerRating': 1,
               'viewerVotes': 1}},
 {'$skip': 1000},
 {'$sort': {'viewerVotes': -1, 'year': -1}},
 {'$limit': 5}]
{'genre': 'Crime, Drama, Romance',
 'title': 'Haider',
 'viewerRating': 8.4,
 'viewerVotes': 31553}
{'genre': 'Action, Crime, Thriller',
 'title': 'Baby',
 'viewerRating': 8.2,
 'viewerVotes': 26028}
{'genre': 'Comedy, Drama',
 'title': 'Bajrangi Bhaijaan',
 'viewerRating': 8.2,
 'viewerVotes': 25330}
{'genre': 'Drama',
 'title': 'Winter Sleep',
 'viewerRating': 8.3,
 'viewerVotes': 19960}
{'genre': 'Drama, Horror, Mystery',
 'title': 'The Blue Elephant',
 'viewerRating': 8.4,
 'viewerVotes': 16996}


#### Pandas

In [4]:
import pandas as pd

df = pd.DataFrame(client['mflix'].movies.aggregate(pipeline))
df

Unnamed: 0,title,genre,viewerRating,viewerVotes
0,Haider,"Crime, Drama, Romance",8.4,31553
1,Baby,"Action, Crime, Thriller",8.2,26028
2,Bajrangi Bhaijaan,"Comedy, Drama",8.2,25330
3,Winter Sleep,Drama,8.3,19960
4,The Blue Elephant,"Drama, Horror, Mystery",8.4,16996


In [5]:
group1 = {"$group": {
    "_id": {"state": "$state", "city": "$city"},
    "pop": {"$sum": "$pop"},
}}

sort = {"$sort": {"pop": 1}}

group2 = {"$group": {
    "_id": "$_id.state",
    "biggestCity":  {"$last": "$_id.city"},
    "biggestPop":   {"$last": "$pop"},
    "smallestCity": {"$first": "$_id.city"},
    "smallestPop":  {"$first": "$pop"},
}}

project = {"$project": {
    "_id": 0,
    "state": "$_id",
    "biggestCity":  {"name": "$biggestCity",
                     "pop": "$biggestPop"},
    "smallestCity": {"name": "$smallestCity",
                     "pop": "$smallestPop"},
}}

limit = {"$limit": 10}

#pipeline = [group1, sort]
#pipeline = [group1, sort, group2]
pipeline = [group1, sort, group2, project, limit]
#pipeline = [group1, sort, group2, project]


cursor = client['mongodb-examples'].zipcodes.aggregate(pipeline)
for doc in cursor:
    pprint(doc)

cursor = client['mongodb-examples'].zipcodes.aggregate(pipeline)
df = pd.DataFrame(cursor)
df

{'biggestCity': {'name': 'SAINT LOUIS', 'pop': 397802},
 'smallestCity': {'name': 'BENDAVIS', 'pop': 44},
 'state': 'MO'}
{'biggestCity': {'name': 'WICHITA', 'pop': 295115},
 'smallestCity': {'name': 'ARNOLD', 'pop': 0},
 'state': 'KS'}
{'biggestCity': {'name': 'LITTLE ROCK', 'pop': 192895},
 'smallestCity': {'name': 'TOMATO', 'pop': 0},
 'state': 'AR'}
{'biggestCity': {'name': 'LOS ANGELES', 'pop': 2102295},
 'smallestCity': {'name': 'ALLEGHANY', 'pop': 0},
 'state': 'CA'}
{'biggestCity': {'name': 'BRIDGEPORT', 'pop': 141638},
 'smallestCity': {'name': 'EAST KILLINGLY', 'pop': 25},
 'state': 'CT'}
{'biggestCity': {'name': 'ANCHORAGE', 'pop': 183987},
 'smallestCity': {'name': 'SLEETMUTE', 'pop': 0},
 'state': 'AK'}
{'biggestCity': {'name': 'NEWARK', 'pop': 275572},
 'smallestCity': {'name': 'IMLAYSTOWN', 'pop': 17},
 'state': 'NJ'}
{'biggestCity': {'name': 'INDIANAPOLIS', 'pop': 348868},
 'smallestCity': {'name': 'WESTPOINT', 'pop': 145},
 'state': 'IN'}
{'biggestCity': {'name': 'OMAH

Unnamed: 0,biggestCity,smallestCity,state
0,"{'name': 'WICHITA', 'pop': 295115}","{'name': 'ARNOLD', 'pop': 0}",KS
1,"{'name': 'SAINT LOUIS', 'pop': 397802}","{'name': 'BENDAVIS', 'pop': 44}",MO
2,"{'name': 'LOS ANGELES', 'pop': 2102295}","{'name': 'ALLEGHANY', 'pop': 0}",CA
3,"{'name': 'LITTLE ROCK', 'pop': 192895}","{'name': 'TOMATO', 'pop': 0}",AR
4,"{'name': 'BRIDGEPORT', 'pop': 141638}","{'name': 'EAST KILLINGLY', 'pop': 25}",CT
5,"{'name': 'NEWARK', 'pop': 275572}","{'name': 'IMLAYSTOWN', 'pop': 17}",NJ
6,"{'name': 'ANCHORAGE', 'pop': 183987}","{'name': 'SLEETMUTE', 'pop': 0}",AK
7,"{'name': 'OMAHA', 'pop': 358930}","{'name': 'LAKESIDE', 'pop': 5}",NE
8,"{'name': 'INDIANAPOLIS', 'pop': 348868}","{'name': 'WESTPOINT', 'pop': 145}",IN
9,"{'name': 'PORTLAND', 'pop': 63268}","{'name': 'BUSTINS ISLAND', 'pop': 0}",ME


In [6]:
group1   = { "$group":
             {
               "_id": { "state": "$state", "city": "$city" },
               "pop": { "$sum": "$pop" }
             }}
sort     = {"$sort": { "pop": 1 } }
group2   = { "$group":
             {
               "_id" : "$_id.state",
               "biggestCity":  { "$last": "$_id.city" },
               "biggestPop":   { "$last": "$pop" },
               "smallestCity": { "$first": "$_id.city" },
               "smallestPop":  { "$first": "$pop" }
             }}
project     = { "$project":
                { "_id": 0,
                  "state": "$_id",
                  "biggestCityName": "$biggestCity",
                  "biggestCityPop":   "$biggestPop" ,
                  "smallestCityName": "$smallestCity",
                  "smallestCityPop":  "$smallestPop" 
                }}
limit = { "$limit": 10}

#pipeline = [group1, sort]
#pipeline = [group1, sort, group2]
pipeline = [group1, sort, group2, project, limit]
#pipeline = [group1, sort, group2, project]


cursor = client['mongodb-examples'].zipcodes.aggregate(pipeline)
for doc in cursor:
    pprint(doc)

cursor = client['mongodb-examples'].zipcodes.aggregate(pipeline)
df = pd.DataFrame(cursor)
df

{'biggestCityName': 'SAINT LOUIS',
 'biggestCityPop': 397802,
 'smallestCityName': 'BENDAVIS',
 'smallestCityPop': 44,
 'state': 'MO'}
{'biggestCityName': 'WICHITA',
 'biggestCityPop': 295115,
 'smallestCityName': 'ARNOLD',
 'smallestCityPop': 0,
 'state': 'KS'}
{'biggestCityName': 'LITTLE ROCK',
 'biggestCityPop': 192895,
 'smallestCityName': 'TOMATO',
 'smallestCityPop': 0,
 'state': 'AR'}
{'biggestCityName': 'LOS ANGELES',
 'biggestCityPop': 2102295,
 'smallestCityName': 'ALLEGHANY',
 'smallestCityPop': 0,
 'state': 'CA'}
{'biggestCityName': 'BRIDGEPORT',
 'biggestCityPop': 141638,
 'smallestCityName': 'EAST KILLINGLY',
 'smallestCityPop': 25,
 'state': 'CT'}
{'biggestCityName': 'ANCHORAGE',
 'biggestCityPop': 183987,
 'smallestCityName': 'SLEETMUTE',
 'smallestCityPop': 0,
 'state': 'AK'}
{'biggestCityName': 'NEWARK',
 'biggestCityPop': 275572,
 'smallestCityName': 'IMLAYSTOWN',
 'smallestCityPop': 17,
 'state': 'NJ'}
{'biggestCityName': 'INDIANAPOLIS',
 'biggestCityPop': 348868,
 

Unnamed: 0,state,biggestCityName,biggestCityPop,smallestCityName,smallestCityPop
0,KS,WICHITA,295115,ARNOLD,0
1,MO,SAINT LOUIS,397802,BENDAVIS,44
2,CA,LOS ANGELES,2102295,ALLEGHANY,0
3,AR,LITTLE ROCK,192895,TOMATO,0
4,CT,BRIDGEPORT,141638,EAST KILLINGLY,25
5,NJ,NEWARK,275572,IMLAYSTOWN,17
6,AK,ANCHORAGE,183987,SLEETMUTE,0
7,NE,OMAHA,358930,LAKESIDE,5
8,IN,INDIANAPOLIS,348868,WESTPOINT,145
9,ME,PORTLAND,63268,BUSTINS ISLAND,0


In [7]:
pipeline = [group1, sort, group2, project]
cursor = client['mongodb-examples'].zipcodes.aggregate(pipeline)
df = pd.DataFrame(cursor)

In [8]:
df.sample(10)

Unnamed: 0,state,biggestCityName,biggestCityPop,smallestCityName,smallestCityPop
3,CA,LOS ANGELES,2102295,ALLEGHANY,0
24,VT,BURLINGTON,39127,UNIV OF VERMONT,0
42,SD,SIOUX FALLS,102046,ZEONA,8
30,SC,COLUMBIA,269521,QUINBY,0
21,OR,PORTLAND,518543,ODELL,0
32,DC,WASHINGTON,606879,PENTAGON,21
17,LA,NEW ORLEANS,496937,FORDOCHE,0
13,PA,PHILADELPHIA,1610956,HAMILTON,0
35,TN,MEMPHIS,632837,ALLRED,2
49,AZ,PHOENIX,890853,HUALAPAI,2
