## MongoDB, citibike trips

In [11]:
# 2021-03, Bruno Grossniklaus, https://github.com/it-gro
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

from urllib.parse import quote_plus

import pandas as pd
import pymongo

# Notebook-wide display options.
# The fully qualified "display.*" keys are used on purpose: pandas resolves
# option names by pattern matching, and short aliases such as 'precision' or
# 'max_rows' can match more than one option in newer pandas releases, which
# makes set_option raise OptionError.
pd.set_option('display.precision', 2)
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_colwidth', 30)
# To read the documentation of an option:
# pd.describe_option('display.max_rows')
# pd.describe_option('display.precision')
# pd.describe_option('display.max_colwidth')

# Connection settings for the MongoDB server used by this notebook.
HOST_mongo = 'localhost'
OPTIONS_mongo = ''
# OPTIONS_mongo = '?retryWrites=true&w=majority'
USER_mongo = ""
PASS_mongo = ""


def build_credentials(user, password):
    """Return the ``user:password@`` prefix for a MongoDB URI.

    Parameters
    ----------
    user : str
        User name; an empty value means "connect without authentication".
    password : str
        Password belonging to ``user``.

    Returns
    -------
    str
        ``""`` when ``user`` is empty, otherwise ``"<user>:<password>@"`` with
        both parts percent-encoded, so that characters such as ``@``, ``:``,
        ``/`` or ``%`` cannot be mistaken for URI delimiters.
    """
    if not user:
        return ""
    return f"{quote_plus(user)}:{quote_plus(password)}@"


credentials = build_credentials(USER_mongo, PASS_mongo)

In [2]:
# Build the connection URI. The connection-string grammar requires a "/"
# between the host list and any "?options" part, so add it when the configured
# options string does not already start with one. With empty options the URI is
# simply mongodb://[credentials]host.
mongo_options = OPTIONS_mongo
if mongo_options and not mongo_options.startswith("/"):
    mongo_options = f"/{mongo_options}"

client = pymongo.MongoClient(f"mongodb://{credentials}{HOST_mongo}{mongo_options}")

### Aggregation done entirely in MongoDB

In [7]:
# 2021, Matthias Wenger
#
# Count trips per week for gender codes 1 and 2. Every step below is a MongoDB
# aggregation stage; pandas only receives the final, already aggregated rows.


def _gender_flag(code):
    """Build a ``$cond`` expression that is 1 when ``gender`` equals ``code``, else 0."""
    return {"$cond": {"if": {"$eq": ["$gender", code]}, "then": 1, "else": 0}}


# Stage: attach derived fields to every trip document.
# "Date" is only consumed by the commented-out sanity checks further down.
add_calc_fields = {
    "$addFields": {
        "Date": {"$toDate": "$start time"},
        "weeks": {"$week": "$start time"},
        "gender1": _gender_flag(1),
        "gender2": _gender_flag(2),
    }
}

# Stage: one bucket per week number; summing the 0/1 flags counts the trips.
grouping = {
    "$group": {
        "_id": "$weeks",
        "Total1": {"$sum": "$gender1"},
        "Total2": {"$sum": "$gender2"},
        # "Dates": {'$addToSet': '$Date'},  # sanity check: which dates land in each week
    }
}

# Stage: expose the group key and the totals under their final column names.
project = {
    "$project": {
        "_id": 0,
        "week": "$_id",
        # "Dates": 1,
        "total_drives_gender_1": "$Total1",
        "total_drives_gender_2": "$Total2",
    }
}

# Available for ad-hoc use; note that it is NOT part of `pipeline` below.
limit = {"$limit": 13}

# Stage: order the result rows by ascending week number.
sorting = {"$sort": {"week": 1}}

pipeline = [
    add_calc_fields,
    grouping,
    project,
    sorting,
]

# Run the aggregation on citibike.trips and materialise the cursor as a DataFrame.
cursor = client["citibike"]["trips"].aggregate(pipeline)
df = pd.DataFrame(cursor)

In [8]:
# Show the aggregated frame: one row per week with the two per-gender trip totals.
df

Unnamed: 0,week,total_drives_gender_1,total_drives_gender_2
0,0,15290,5285
1,1,114644,32183
2,2,116879,31862
3,3,84488,22190
4,4,36929,9203
5,5,108121,29376
6,6,94922,24514
7,7,87528,22938
8,8,102076,28372
9,9,122137,34814


In [9]:
# Print row count, column dtypes and non-null counts of the aggregated frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 3 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   week                   14 non-null     int64
 1   total_drives_gender_1  14 non-null     int64
 2   total_drives_gender_2  14 non-null     int64
dtypes: int64(3)
memory usage: 464.0 bytes


In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
df.describe()

Unnamed: 0,week,total_drives_gender_1,total_drives_gender_2
count,14.0,14.0,14.0
mean,6.5,102196.36,29113.93
std,4.18,39376.48,12389.26
min,0.0,15290.0,5285.0
25%,3.25,89376.5,23332.0
50%,6.5,105098.5,29557.5
75%,9.75,120822.5,34156.25
max,13.0,164363.0,51175.0
