## MongoDB, Atlas cloud, Pandas

In [1]:
# 2020-09, Bruno Grossniklaus, https://github.com/it-gro
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

import os
from pprint import pprint
from urllib.parse import quote_plus

import pandas as pd
import pymongo


# Display options. The fully-qualified "display.*" keys are used because the
# short forms ('precision', 'max_rows') can match more than one option in
# recent pandas releases (e.g. "styler.*" options), which raises OptionError.
pd.set_option('display.precision', 2)
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_colwidth', 30)
# To inspect an option: pd.describe_option('display.max_rows')

# Atlas cluster host and connection-string options.
HOST_mongo = 'cluster0.ky5nm.azure.mongodb.net/'
OPTIONS_mongo = '?retryWrites=true&w=majority'

# Credentials may be supplied through the environment so they do not have to
# live in the notebook. The previous hardcoded values remain as defaults so
# the notebook keeps running unchanged.
# NOTE(review): these look like shared demo credentials - confirm, and drop
# the defaults if they are not meant to be public.
USER_mongo = os.environ.get("MONGO_USER", "bdl1")
PASS_mongo = os.environ.get("MONGO_PASS", "bdl1")

# Username and password must be percent-escaped inside a MongoDB URI,
# otherwise characters such as '@', ':' or '/' break the connection string.
if USER_mongo:
    credentials = f"{quote_plus(USER_mongo)}:{quote_plus(PASS_mongo)}@"
else:
    credentials = ""

### Connect, dbs

In [2]:
# Assemble the SRV connection string from the configuration values, open the
# client, and list the cluster's databases as a DataFrame.
mongo_uri = f"mongodb+srv://{credentials}{HOST_mongo}{OPTIONS_mongo}"
client = pymongo.MongoClient(mongo_uri)
dbs = pd.DataFrame(client.list_databases())
dbs

Unnamed: 0,name,sizeOnDisk,empty
0,sample_airbnb,57400000.0,False
1,sample_geospatial,1410000.0,False
2,sample_mflix,45800000.0,False
3,sample_supplies,1340000.0,False
4,sample_training,74800000.0,False
5,sample_weatherdata,5150000.0,False
6,admin,283000.0,False
7,local,3060000000.0,False


### Collections

In [3]:
# Collection names of the sample_mflix database, one per row.
collection_names = client['sample_mflix'].list_collection_names()
coll = pd.DataFrame(collection_names, columns=["collection"])
coll

Unnamed: 0,collection
0,sessions
1,theaters
2,comments
3,movies
4,users


### sample_mflix.theaters

In [4]:
# Fetch a single document from the theaters collection to inspect its shape.
theaters = client['sample_mflix']['theaters'].find_one()

In [5]:
# Display the sample theater document (a nested dict) fetched by find_one().
theaters

{'_id': ObjectId('59a47286cfa9a3a73e51e73a'),
 'theaterId': 1019,
 'location': {'address': {'street1': '390 Northridge Mall',
   'city': 'Salinas',
   'state': 'CA',
   'zipcode': '93906'},
  'geo': {'type': 'Point', 'coordinates': [-121.65946, 36.715809]}}}

In [6]:
# Passing one nested document straight to DataFrame does not give one row per
# document: the keys of the nested `location` sub-document become the index
# and the scalar fields are repeated on every row (see the output below).
theater_doc = client['sample_mflix'].theaters.find_one()
pd.DataFrame(theater_doc)

Unnamed: 0,_id,theaterId,location
address,59a47286cfa9a3a73e51e73a,1019,{'street1': '390 Northridg...
geo,59a47286cfa9a3a73e51e73a,1019,"{'type': 'Point', 'coordin..."


In [7]:
# Minimal aggregation pipeline: a single $limit stage capping the result at
# ten documents, loaded into a DataFrame with one row per document.
limit = {"$limit": 10}
pipeline = [limit]
theaters_coll = client['sample_mflix'].theaters
pd.DataFrame(theaters_coll.aggregate(pipeline))

Unnamed: 0,_id,theaterId,location
0,59a47286cfa9a3a73e51e73a,1019,{'address': {'street1': '3...
1,59a47286cfa9a3a73e51e732,1014,{'address': {'street1': '1...
2,59a47286cfa9a3a73e51e750,1039,{'address': {'street1': '1...
3,59a47286cfa9a3a73e51e73e,1026,{'address': {'street1': '9...
4,59a47286cfa9a3a73e51e749,1034,{'address': {'street1': '3...
5,59a47286cfa9a3a73e51e74f,102,{'address': {'street1': '1...
6,59a47286cfa9a3a73e51e753,1044,{'address': {'street1': '6...
7,59a47286cfa9a3a73e51e769,105,{'address': {'street1': '1...
8,59a47286cfa9a3a73e51e730,1002,{'address': {'street1': '1...
9,59a47286cfa9a3a73e51e73f,1023,{'address': {'street1': '8...


In [8]:
# Stage: empty $match, i.e. a placeholder filter with no conditions.
match_1 = {"$match": {}}

# Stage: copy the nested location fields into flat top-level fields.
# Index 0 of the coordinates array is read as the longitude and index 1 as
# the latitude. The raw array could be kept as well with
#   "loc_geo_coord": "$location.geo.coordinates"
geo_coordinates = "$location.geo.coordinates"
flatten_1 = {
    "$addFields": {
        "loc_add_city": "$location.address.city",
        "loc_add_state": "$location.address.state",
        "loc_add_street1": "$location.address.street1",
        "loc_add_zipcode": "$location.address.zipcode",
        "loc_geo_coord_long": {"$arrayElemAt": [geo_coordinates, 0]},
        "loc_geo_coord_latt": {"$arrayElemAt": [geo_coordinates, 1]},
        "loc_geo_type": "$location.geo.type",
    }
}

# Stage: drop the original nested sub-document and the ObjectId.
project_1 = {"$project": {"location": 0, "_id": 0}}

# Stage: filter on the flattened fields. Further conditions to try:
#   "loc_add_zipcode": "50702"
#   "loc_geo_coord_long": -92.322624
#   "loc_geo_coord_long": {"$gte": -92.4, "$lte": -92.3}
match_2 = {
    "$match": {
        "loc_add_city": {"$regex": "."},
        "loc_add_state": {"$in": ["IA", "CA"]},
    }
}

# Defined for experimentation; not part of the pipeline executed below.
# Alternative ordering: [flatten_1, project_1, match_1, limit]
limit = {"$limit": 10}

pipeline = [match_1, flatten_1, project_1, match_2]
theaters_coll = client['sample_mflix'].theaters
cursor = theaters_coll.aggregate(pipeline)

# To inspect the raw documents instead, iterate the cursor and pprint(doc).
# Building the DataFrame consumes the cursor.
theaters = pd.DataFrame(cursor)

In [9]:
# Preview up to ten randomly chosen theaters. A fixed random_state makes the
# preview reproducible across runs, and capping n at the frame length avoids
# the ValueError that sample() raises when fewer than ten rows matched.
theaters.sample(n=min(10, len(theaters)), random_state=42)

Unnamed: 0,theaterId,loc_add_city,loc_add_state,loc_add_street1,loc_add_zipcode,loc_geo_coord_long,loc_geo_coord_latt,loc_geo_type
133,764,Sherman Oaks,CA,4500 Van Nuys Blvd,91403,-118.45,34.15,Point
33,1128,Vallejo,CA,1182 Admiral Callaghan Lane,94591,-122.21,38.14,Point
44,1533,San Diego,CA,11160 Rancho Carmel Drive,92128,-117.08,32.98,Point
137,8146,San Francisco,CA,275 S Airport Blvd,94080,-122.4,37.65,Point
115,352,San Diego,CA,9540 Mira Mesa Blvd,92126,-117.12,32.92,Point
61,120,Santa Rosa,CA,1950 Santa Rosa Ave,95407,-122.71,38.42,Point
97,1537,Montebello,CA,2415 Via Campo,90640,-118.12,34.03,Point
74,191,Chico,CA,2005 Forest Ave,95928,-121.8,39.73,Point
64,16,West Des Moines,IA,4100 University Ave,50265,-93.76,41.6,Point
41,21,Coralville,IA,1431 Coral Ridge Avenue,52241,-91.6,41.69,Point


In [10]:
# Stage: count the documents per flattened state field.
group_1 = {
    "$group": {
        "_id": {"state": "$loc_add_state"},
        "cnt": {"$sum": 1},
    }
}

# Stage: turn the compound _id back into a plain `state` field.
project_group_1 = {
    "$project": {
        "_id": 0,
        "state": "$_id.state",
        "cnt": "$cnt",
    }
}

# Same flatten-and-filter stages as before, followed by the aggregation.
flatten_and_filter = [match_1, flatten_1, project_1, match_2]
pipeline = flatten_and_filter + [group_1, project_group_1]

theaters_coll = client['sample_mflix'].theaters
cursor = theaters_coll.aggregate(pipeline)
theaters_agg = pd.DataFrame(cursor)
theaters_agg

Unnamed: 0,state,cnt
0,IA,14
1,CA,169
