## MongoDB, citibike trips

In [1]:
# Environment check: print the active virtualenv path and the installed
# pymongo / pandas versions so the outputs below can be tied to them.
! echo $VIRTUAL_ENV
! pip3 list | grep -E 'pymongo|pandas'
# One-time setup, intentionally left commented out; uncomment to install:
# ! pip3 install --upgrade --upgrade-strategy only-if-needed pymongo dnspython pandas

/home/grossnik/venv/bdl03-1-jpy-3.8
pandas              1.2.3  
pymongo             3.11.3 


In [2]:
# 2020-09, Bruno Grossniklaus, https://github.com/it-gro
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

from pprint import pprint
from urllib.parse import quote_plus

import pandas as pd
import pymongo

# pandas display options, addressed by their full "display.*" names.
# The bare abbreviations ('precision', 'max_rows', ...) are resolved by
# pattern match and raise OptionError on pandas releases where more than one
# option matches (e.g. once the "styler.*" options exist).
pd.set_option('display.precision', 2)
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_colwidth', 30)
# pd.describe_option('display.max_rows')
# pd.describe_option('display.precision')
# pd.describe_option('display.max_colwidth')

# MongoDB connection settings; the connect cell concatenates them into
# "mongodb://{credentials}{HOST_mongo}{OPTIONS_mongo}".
HOST_mongo = 'localhost'
OPTIONS_mongo = ''
# When options are needed, a "/" must separate the host from the "?" part:
# OPTIONS_mongo = '/?retryWrites=true&w=majority'
USER_mongo = ""
PASS_mongo = ""
if USER_mongo:
    # Percent-escape both parts so reserved characters (":", "@", "/", ...)
    # in the user name or password cannot corrupt the URI.
    credentials = f"{quote_plus(USER_mongo)}:{quote_plus(PASS_mongo)}@"
else:
    # Anonymous connection: no "user:pass@" prefix at all.
    credentials = ""

### Connect, dbs

In [3]:
# Assemble the connection URI from the config-cell values, then connect.
mongo_uri = "mongodb://" + credentials + HOST_mongo + OPTIONS_mongo
client = pymongo.MongoClient(mongo_uri)

# Tabulate what list_databases() returns, one row per entry.
dbs = pd.DataFrame(client.list_databases())
# dbs   # uncomment to display the table

### Collections

In [4]:
# Names of all collections in the "citibike" database, as a one-column table.
citibike_collections = client['citibike'].list_collection_names()
coll = pd.DataFrame(citibike_collections, columns=["collection"])
coll

Unnamed: 0,collection
0,trips


### citibike.trips

In [5]:
# Fetch a single document from citibike.trips to inspect its fields.
trips = client.citibike.trips.find_one()

In [6]:
# Show the sample document fetched by find_one() (a plain dict, not a frame).
trips

{'_id': ObjectId('572bb8222b288919b68adfa5'),
 'tripduration': None,
 'start station id': 279,
 'start station name': 'Peck Slip & Front St',
 'end station id': 268,
 'end station name': 'Howard St & Centre St',
 'bikeid': 22820,
 'usertype': 'Customer',
 'birth year': '',
 'gender': 0,
 'start station location': {'type': 'Point',
  'coordinates': [-74.00167, 40.707873]},
 'end station location': {'type': 'Point',
  'coordinates': [-73.99973337, 40.71910537]},
 'start time': datetime.datetime(2016, 1, 1, 17, 19, 38),
 'stop time': datetime.datetime(2016, 1, 1, 17, 29, 42)}

In [7]:
# Stage 1: keep only trips whose gender code is not 0.
match_1 = {"$match": {"gender": {"$ne": 0}}}

# Stage 2: count trips per (week number of the start time, gender code).
# NOTE(review): the key has no year component, so data spanning several
# years would pool equal week numbers -- confirm the collection covers a
# single year.
week_gender_key = {
    "week": {"$week": "$start time"},
    "gender": "$gender",
}
group_1 = {"$group": {
    "_id": week_gender_key,
    "total_drives": {"$sum": 1},
}}

# Stage 3: order the groups by their compound key.
sort_1 = {"$sort": {"_id": 1}}

# Stage 4: lift the key parts to top-level fields and drop "_id".
flat_fields = {
    "_id": 0,
    "gender": "$_id.gender",
    "week": "$_id.week",
    "total_drives": "$total_drives",
}
project_group_1 = {"$project": flat_fields}

# Optional stage for quick experiments; it is NOT part of the pipeline below.
# Append it to `pipeline` to cap the result at 10 documents.
limit = {"$limit": 10}
pipeline = [match_1, group_1, sort_1, project_group_1]

# Run the aggregation on the server and load the result cursor into a frame.
trips_collection = client['citibike'].trips
cursor = trips_collection.aggregate(pipeline)
trips_agg = pd.DataFrame(cursor)

In [8]:
# Preview up to 10 randomly chosen aggregated rows.
# - random_state pins the draw so Restart & Run All shows the same rows.
# - min(...) avoids the ValueError that sample() raises when asked for more
#   rows than the frame holds.
trips_agg.sample(n=min(10, len(trips_agg)), random_state=42)

Unnamed: 0,gender,week,total_drives
13,2,6,24514
26,1,13,100855
15,2,7,22938
9,2,4,9203
17,2,8,28372
27,2,13,29739
19,2,9,34814
1,2,0,5285
0,1,0,15290
22,1,11,136735


In [9]:
# Weekly trip counts for gender code 1 only; row labels of trips_agg are kept.
is_gender_1 = trips_agg["gender"].eq(1)
trips_gender_1 = trips_agg.loc[is_gender_1]
trips_gender_1

Unnamed: 0,gender,week,total_drives
0,1,0,15290
2,1,1,114644
4,1,2,116879
6,1,3,84488
8,1,4,36929
10,1,5,108121
12,1,6,94922
14,1,7,87528
16,1,8,102076
18,1,9,122137


In [10]:
# Weekly trip counts for gender code 2 only; row labels of trips_agg are kept.
is_gender_2 = trips_agg["gender"].eq(2)
trips_gender_2 = trips_agg.loc[is_gender_2]
trips_gender_2

Unnamed: 0,gender,week,total_drives
1,2,0,5285
3,2,1,32183
5,2,2,31862
7,2,3,22190
9,2,4,9203
11,2,5,29376
13,2,6,24514
15,2,7,22938
17,2,8,28372
19,2,9,34814


In [11]:
# Put both gender series side by side, one row per week. The outer join keeps
# a week even if only one of the two frames has it. Columns present in both
# inputs get pandas' default "_x" (left) / "_y" (right) suffixes.
trips_merged = pd.merge(
    trips_gender_1,
    trips_gender_2,
    how="outer",
    on="week",
)
trips_merged

Unnamed: 0,gender_x,week,total_drives_x,gender_y,total_drives_y
0,1,0,15290,2,5285
1,1,1,114644,2,32183
2,1,2,116879,2,31862
3,1,3,84488,2,22190
4,1,4,36929,2,9203
5,1,5,108121,2,29376
6,1,6,94922,2,24514
7,1,7,87528,2,22938
8,1,8,102076,2,28372
9,1,9,122137,2,34814


In [12]:
# Drop the now-constant gender columns and give the two count columns
# self-describing names.
# Written as a chain that rebinds `trips_merged` instead of mutating it in
# place, so the cell can be re-run safely: errors="ignore" skips columns that
# are already gone, and rename() leaves already-renamed columns untouched.
trips_merged = (
    trips_merged
    .drop(columns=["gender_x", "gender_y"], errors="ignore")
    .rename(columns={
        "total_drives_x": "total_drives_gender_1",
        "total_drives_y": "total_drives_gender_2",
    })
)

In [13]:
# Show the merged table: one row per week with the trip counts of both
# gender codes in separate columns.
trips_merged

Unnamed: 0,week,total_drives_gender_1,total_drives_gender_2
0,0,15290,5285
1,1,114644,32183
2,2,116879,31862
3,3,84488,22190
4,4,36929,9203
5,5,108121,29376
6,6,94922,24514
7,7,87528,22938
8,8,102076,28372
9,9,122137,34814
