Final queries for visualizations

In [2]:
import pymongo
from pymongo import MongoClient, UpdateOne
import json
from calendar import monthrange
from datetime import datetime

In [3]:
# Create an instance of MongoClient
mongo = MongoClient(port=27017)

In [4]:
# Confirm list of databases
print(mongo.list_database_names())

['admin', 'chicago_bikes', 'config', 'local']


In [5]:
# Assign the database to a variable name
db = mongo.chicago_bikes

In [6]:
# Review collections in database
print(db.list_collection_names())

['divvy_rides_by_season', 'weather_daily', 'withoutStationName', 'divvy_rides_by_month', 'sig_prcp_yes', 'Top10StartStations', 'divvy_ridedata_merged', 'Top10Routes', 'sig_prcp_no', 'withLatLong', 'distinct_station_names', 'RouteDistance', 'withStationName', 'divvy_ridedata', 'Top10EndStations']


In [7]:
# Assign collections to variables 
divvy_rides = db['divvy_ridedata']
weather_daily = db['weather_daily']
divvy_ridedata_merged = db['divvy_ridedata_merged']
withoutStation = db['withoutStationName']
withStation = db['withStationName']
withLatLong = db['withLatLong']

In [8]:
# Count the number of documents in the collections 
print(divvy_rides.count_documents({}))
print(weather_daily.count_documents({}))
print(divvy_ridedata_merged.count_documents({}))
print(withoutStation.count_documents({}))
print(withStation.count_documents({}))
print(withLatLong.count_documents({}))

5667717
365
5667717
427449
4369360
5661859


In [9]:
# Review a document in each collection 
print(db.divvy_ridedata.find_one())
print(db.weather_daily.find_one())
print(db.divvy_ridedata_merged.find_one())
print(db.withoutStationName.find_one())
print(db.withStationName.find_one())
print(db.withLatLong.find_one())

{'_id': ObjectId('655f1914044b5b5ebbf90d67'), 'ride_id': 'A6CF8980A652D272', 'rideable_type': 'electric_bike', 'started_at': '2022-01-10 08:41:56', 'ended_at': '2022-01-10 08:46:17', 'start_station_name': 'Glenwood Ave & Touhy Ave', 'start_station_id': 525, 'end_station_name': 'Clark St & Touhy Ave', 'end_station_id': 'RP-007', 'start_lat': 42.012763, 'start_lng': -87.6659675, 'end_lat': 42.01256011541, 'end_lng': -87.6743671152, 'member_casual': 'casual', 'started_at_date': '2022-01-10', 'started_at_time': '08:41:56', 'ended_at_date': '2022-01-10', 'ended_at_time': '08:46:17'}
{'_id': ObjectId('655f19d5f1c6af1dd0f57470'), 'date': '2022-01-01', 'cloud_cover': 90.0, 'precipitation': 0.18, 'min_temp': 33.22, 'max_temp': 42.1, 'morning_temp': 42.1, 'afternoon_temp': 38.43, 'evening_temp': 35.24, 'night_temp': 38.44, 'max_windspeed': 15.01, 'prcp_inches': 0.0070866, 'sig_prcp': 'no', 'avg_temp': 38.59}
{'_id': ObjectId('655f1914044b5b5ebbf90d67'), 'end_lat': 42.01256011541, 'end_lng': -87.

In [10]:
# Use aggregation pipeline to find top ten start stations
pipeline = [
    {
        "$group": {
            "_id": "$start_station_name",
            "count": {"$sum": 1},
            "latitude": {"$first": "$end_lat"},
            "longitude": {"$first": "$end_lng"}
        }
    },
    {
        "$sort": {"count": -1}
    },
    {
        "$limit": 10
    },
    {   "$out": "Top10StartStations"
}
]
# Perform the aggregation
result = list(withStation.aggregate(pipeline))

# Assign results to variable 
Top10StartStations = db['Top10StartStations']

In [11]:
# Count the number of documents in the collection 
print(Top10StartStations.count_documents({}))

10


In [12]:
# Review a document in the collection 
print(db.Top10StartStations.find_one())

{'_id': 'Streeter Dr & Grand Ave', 'count': 71269, 'latitude': 41.880958, 'longitude': -87.616743}


In [13]:
# Use aggregation pipeline to find top ten end stations
pipeline = [
    {
        "$group": {
            "_id": "$end_station_name",
            "count": {"$sum": 1},
            "latitude": {"$first": "$end_lat"},
            "longitude": {"$first": "$end_lng"}
        }
    },
    {
        "$sort": {"count": -1}
    },
    {
        "$limit": 10
    },
    {   "$out": "Top10EndStations"
}
]
# Perform the aggregation
result = list(withStation.aggregate(pipeline))

# Assign to a variable
Top10EndStations = db['Top10EndStations']

In [14]:
# Count the number of documents in the collection 
print(Top10EndStations.count_documents({}))

10


In [15]:
# Review a document in the collection 
print(db.Top10EndStations.find_one())

{'_id': 'Streeter Dr & Grand Ave', 'count': 72540, 'latitude': 41.892278, 'longitude': -87.612043}


In [16]:
# Create a pipeline query to find the top ten bike routes (by start and end station)
pipeline = [
    {
        "$group": {
            "_id": { "Start Station": "$start_station_name", "End Station": "$end_station_name"},
            "count": {"$sum": 1},
            "start latitude": {"$first": "$start_lat"},
            "start longitude": {"$first": "$start_lng"},
            "end latitude": {"$first": "$end_lat"},
            "end longitude": {"$first": "$end_lng"}
        }
    },
    {"$sort": {"count": -1}
},
    {
        "$limit": 10
},
    {   "$out": "Top10Routes"
}
]
# Perform the aggregation
result = list(withStation.aggregate(pipeline))

# Assign to a variable
Top10Routes = db['Top10Routes']

In [17]:
# Count the number of documents in the collection 
print(Top10Routes.count_documents({}))

10


In [18]:
# Review a document in the collection 
print(db.Top10Routes.find_one())

{'_id': {'Start Station': 'Streeter Dr & Grand Ave', 'End Station': 'Streeter Dr & Grand Ave'}, 'count': 12202, 'start latitude': 41.892278, 'start longitude': -87.612043, 'end latitude': 41.892278, 'end longitude': -87.612043}


In [19]:
# Create a pipeline query to find docouments that have lat/long  
pipeline = [
    {
        "$match": {
            "$and": [
                { "start_lat": { "$ne": "" } },
                { "start_lng": { "$ne": "" } },
                { "end_lat": { "$ne": "" } },
                { "end_lng": { "$ne": "" } }
            ]
        }
    }, 
    {"$out": "withLatLong"}
]

# Perform the aggregation
result = list(divvy_ridedata_merged.aggregate(pipeline))

In [20]:
# Create a pipeline query to find distance of each route in descending order by length  
pipeline = [
    {
        "$addFields": {
            "start_lat": { "$toDouble": "$start_lat" },
            "start_lng": { "$toDouble": "$start_lng" },
            "end_lat": { "$toDouble": "$end_lat" },
            "end_lng": { "$toDouble": "$end_lng" }
        }
    },
    {
        "$addFields": {
            "distance": {
                "$sqrt": {
                    "$add": [
                        {
                            "$pow": [
                                { "$subtract": ["$end_lat", "$start_lat"] },
                                2
                            ]
                        },
                        {
                            "$pow": [
                                {
                                    "$multiply": [
                                        { "$subtract": ["$end_lng", "$start_lng"] },
                                        { "$cos": { "$avg": ["$start_lat", "$end_lat"] } }
                                    ]
                                },
                                2
                            ]
                        }
                    ]
                }
            }
        }
    },
    {
        "$sort": {"distance": -1}
    },
    {"$out": "RouteDistance"}
]

# Perform the aggregation
result = list(withLatLong.aggregate(pipeline))

# Assign to a variable
RouteDistance = db['RouteDistance']

In [21]:
# Count the number of documents in the collection 
print(RouteDistance.count_documents({}))

5661859


In [22]:
# Find the first 10 documents
documents = RouteDistance.find().sort("distance", -1).limit(10)

# Print the documents
for doc in documents:
    print(doc)

{'_id': ObjectId('655f19c25e397d4d01a2c9fc'), 'end_lat': 0.0, 'end_lng': 0.0, 'end_station_id': 'chargingstx07', 'end_station_name': 'Green St & Madison Ave*', 'ended_at': '2022-11-09 12:26:18', 'ended_at_date': '2022-11-09', 'ended_at_time': '12:26:18', 'member_casual': 'member', 'ride_id': 'E9495F1DC3475D41', 'rideable_type': 'classic_bike', 'start_lat': 41.884114, 'start_lng': -87.654264, 'start_station_id': 18062, 'start_station_name': 'Aberdeen St & Randolph St', 'started_at': '2022-11-09 12:21:55', 'started_at_date': '2022-11-09', 'started_at_time': '12:21:55', 'weather_data': {'_id': ObjectId('655f19d5f1c6af1dd0f575a8'), 'date': '2022-11-09', 'cloud_cover': 75.0, 'precipitation': 0.0, 'min_temp': 47.35, 'max_temp': 60.37, 'morning_temp': 51.44, 'afternoon_temp': 48.54, 'evening_temp': 51.37, 'night_temp': 51.82, 'max_windspeed': 13.8}, 'distance': 60.51865544715035}
{'_id': ObjectId('655f19c25e397d4d01a2cbfb'), 'end_lat': 0.0, 'end_lng': 0.0, 'end_station_id': 'chargingstx07', '

In [23]:
db["divvy_rides_by_month"].drop()

In [24]:
# Define the aggregation pipeline to pull rides by month 
pipeline = [
    {
        "$group": {
            "_id": {
                "year": {"$year": {"$toDate": "$started_at"}},
                "month": {"$month": {"$toDate": "$started_at"}}
            },
            "total_rides": {"$sum": 1}
        }
    },
    {
        "$project": {
            "_id": 0,
            "year": "$_id.year",
            "month": "$_id.month",
            "total_rides": 1
        }
    },
    {
        "$sort": {"year": 1, "month": 1}
    }
]

# Execute the aggregation pipeline and write to a new collection
divvy_rides_by_month = db["divvy_rides_by_month"]
divvy_rides_by_month.drop()  # Drop the collection
aggregated_result = divvy_ridedata_merged.aggregate(pipeline, allowDiskUse=True, collation=None)

for doc in aggregated_result:
    divvy_rides_by_month.insert_one(doc)

print("Aggregation result has been written to the new collection.")

# Query the divvy_rides_by_month collection
rides_by_month_cursor = divvy_rides_by_month.find({}, {"_id": 0, "year": 1, "month": 1, "total_rides": 1})

# Iterate through the results and print the information
for document in rides_by_month_cursor:
    print(f"Year: {document['year']}, Month: {document['month']}, Total Rides: {document['total_rides']}")

Aggregation result has been written to the new collection.
Year: 2022, Month: 1, Total Rides: 103770
Year: 2022, Month: 2, Total Rides: 115609
Year: 2022, Month: 3, Total Rides: 284042
Year: 2022, Month: 4, Total Rides: 371249
Year: 2022, Month: 5, Total Rides: 634858
Year: 2022, Month: 6, Total Rides: 769204
Year: 2022, Month: 7, Total Rides: 823488
Year: 2022, Month: 8, Total Rides: 785932
Year: 2022, Month: 9, Total Rides: 701339
Year: 2022, Month: 10, Total Rides: 558685
Year: 2022, Month: 11, Total Rides: 337735
Year: 2022, Month: 12, Total Rides: 181806


In [25]:
# Define the aggregation pipeline to define divvy rides by season 
pipeline = [
    {
        "$group": {
            "_id": {
                "year": "$year",
                "season": {
                    "$switch": {
                        "branches": [
                            {"case": {"$in": ["$month", [3, 4, 5]]}, "then": "Spring"},
                            {"case": {"$in": ["$month", [6, 7, 8]]}, "then": "Summer"},
                            {"case": {"$in": ["$month", [9, 10, 11]]}, "then": "Autumn"},
                            {"case": {"$in": ["$month", [12, 1, 2]]}, "then": "Winter"}
                        ],
                        "default": "Unknown"
                    }
                }
            },
            "total_rides": {"$sum": "$total_rides"}
        }
    },
    {
        "$sort": {"_id.year": 1, "_id.season": 1}
    }
]

# Execute the aggregation pipeline
divvy_rides_by_season = db["divvy_rides_by_season"]
divvy_rides_by_season.drop()  # Drop the collection
aggregated_result = list(divvy_rides_by_month.aggregate(pipeline, allowDiskUse=True, collation=None))

# Insert the aggregated documents into the new collection
for doc in aggregated_result:
    print("Inserting document:", doc)
    divvy_rides_by_season.insert_one(doc)

print("Aggregation by season result has been written to the new collection.")

Inserting document: {'_id': {'year': 2022, 'season': 'Autumn'}, 'total_rides': 1597759}
Inserting document: {'_id': {'year': 2022, 'season': 'Spring'}, 'total_rides': 1290149}
Inserting document: {'_id': {'year': 2022, 'season': 'Summer'}, 'total_rides': 2378624}
Inserting document: {'_id': {'year': 2022, 'season': 'Winter'}, 'total_rides': 401185}
Aggregation by season result has been written to the new collection.


In [26]:
print(db.list_collection_names())

['divvy_rides_by_season', 'weather_daily', 'withoutStationName', 'sig_prcp_yes', 'divvy_ridedata_merged', 'sig_prcp_no', 'distinct_station_names', 'Top10Routes', 'Top10EndStations', 'Top10StartStations', 'RouteDistance', 'withStationName', 'divvy_rides_by_month', 'withLatLong', 'divvy_ridedata']


In [27]:
from bson import ObjectId

# Get distinct station names along with start_lat and start_lng
distinct_station_data = db["withStationName"].aggregate([
    {
        "$group": {
            "_id": "$start_station_name",
            "start_lat": {"$first": "$start_lat"},
            "start_lng": {"$first": "$start_lng"}
        }
    }
])

collection_name = "distinct_station_names"
station_names = db[collection_name]

station_name_documents = []
for data in distinct_station_data:
    station_name_documents.append({
        "start_station_name": data["_id"],
        "start_lat": data["start_lat"],
        "start_lng": data["start_lng"],
        "_id": str(ObjectId())
    })

station_names.insert_many(station_name_documents)

print(f"{len(station_name_documents)} distinct station names imported into '{collection_name}' collection.")

1556 distinct station names imported into 'distinct_station_names' collection.


In [28]:
station_names.find_one()


{'_id': '655f2577804185fa68c89e9a',
 'start_station_name': 'Mason Ave & Roosevelt Rd',
 'start_lat': 41.87,
 'start_lng': -87.77}

In [46]:
# Debug: Look at the structure of divvy_ridedata_merged Access the database and collection
divvy_ridedata_merged_collection = db["divvy_ridedata_merged"]

# Find and print a few documents from the collection
cursor = divvy_ridedata_merged_collection.find().limit(5)  # Limiting to 5 documents for illustration

# Print each document
for document in cursor:
    print(document)

{'_id': ObjectId('655f1914044b5b5ebbf90d67'), 'end_lat': 42.01256011541, 'end_lng': -87.6743671152, 'end_station_id': 'RP-007', 'end_station_name': 'Clark St & Touhy Ave', 'ended_at': '2022-01-10 08:46:17', 'ended_at_date': '2022-01-10', 'ended_at_time': '08:46:17', 'member_casual': 'casual', 'ride_id': 'A6CF8980A652D272', 'rideable_type': 'electric_bike', 'start_lat': 42.012763, 'start_lng': -87.6659675, 'start_station_id': 525, 'start_station_name': 'Glenwood Ave & Touhy Ave', 'started_at': '2022-01-10 08:41:56', 'started_at_date': '2022-01-10', 'started_at_time': '08:41:56', 'weather_data': {'_id': ObjectId('655f19d5f1c6af1dd0f5747e'), 'date': '2022-01-10', 'cloud_cover': 17.0, 'precipitation': 0.24, 'min_temp': 9.5, 'max_temp': 20.84, 'morning_temp': 13.69, 'afternoon_temp': 9.73, 'evening_temp': 14.0, 'night_temp': 20.75, 'max_windspeed': 16.35, 'prcp_inches': 0.0094488, 'sig_prcp': 'no', 'avg_temp': 12.473333333333334}}
{'_id': ObjectId('655f1914044b5b5ebbf90d68'), 'end_lat': 42.

In [30]:
# Debug: Access the database and collection
divvy_ridedata_merged_collection = db["divvy_ridedata_merged"]

# Find and print the first few documents in divvy_ridedata_merged
for doc in divvy_ridedata_merged_collection.find().limit(5):
    print(doc)

{'_id': ObjectId('655f1914044b5b5ebbf90d67'), 'end_lat': 42.01256011541, 'end_lng': -87.6743671152, 'end_station_id': 'RP-007', 'end_station_name': 'Clark St & Touhy Ave', 'ended_at': '2022-01-10 08:46:17', 'ended_at_date': '2022-01-10', 'ended_at_time': '08:46:17', 'member_casual': 'casual', 'ride_id': 'A6CF8980A652D272', 'rideable_type': 'electric_bike', 'start_lat': 42.012763, 'start_lng': -87.6659675, 'start_station_id': 525, 'start_station_name': 'Glenwood Ave & Touhy Ave', 'started_at': '2022-01-10 08:41:56', 'started_at_date': '2022-01-10', 'started_at_time': '08:41:56', 'weather_data': {'_id': ObjectId('655f19d5f1c6af1dd0f5747e'), 'date': '2022-01-10', 'cloud_cover': 17.0, 'precipitation': 0.24, 'min_temp': 9.5, 'max_temp': 20.84, 'morning_temp': 13.69, 'afternoon_temp': 9.73, 'evening_temp': 14.0, 'night_temp': 20.75, 'max_windspeed': 16.35}}
{'_id': ObjectId('655f1914044b5b5ebbf90d68'), 'end_lat': 42.01256011541, 'end_lng': -87.6743671152, 'end_station_id': 'RP-007', 'end_sta

In [45]:
# Debug: Print weather_data for the first few documents in divvy_ridedata_merged
for document in divvy_ridedata_merged.find().limit(5):
    print(document.get('weather_data'))

{'_id': ObjectId('655f19d5f1c6af1dd0f5747e'), 'date': '2022-01-10', 'cloud_cover': 17.0, 'precipitation': 0.24, 'min_temp': 9.5, 'max_temp': 20.84, 'morning_temp': 13.69, 'afternoon_temp': 9.73, 'evening_temp': 14.0, 'night_temp': 20.75, 'max_windspeed': 16.35}
{'_id': ObjectId('655f19d5f1c6af1dd0f5747b'), 'date': '2022-01-13', 'cloud_cover': 75.0, 'precipitation': 0.0, 'min_temp': 30.72, 'max_temp': 39.83, 'morning_temp': 35.49, 'afternoon_temp': 30.72, 'evening_temp': 34.38, 'night_temp': 39.22, 'max_windspeed': 8.01}
{'_id': ObjectId('655f19d5f1c6af1dd0f57472'), 'date': '2022-01-04', 'cloud_cover': 4.0, 'precipitation': 0.0, 'min_temp': 18.03, 'max_temp': 30.36, 'morning_temp': 18.03, 'afternoon_temp': 20.97, 'evening_temp': 21.29, 'night_temp': 19.26, 'max_windspeed': 10.0}
{'_id': ObjectId('655f19d5f1c6af1dd0f57482'), 'date': '2022-01-20', 'cloud_cover': 20.0, 'precipitation': 0.0, 'min_temp': 6.44, 'max_temp': 20.73, 'morning_temp': 15.89, 'afternoon_temp': 10.58, 'evening_temp':

In [47]:
# Debug: Define the aggregation pipeline to pull rides by month
pipeline = [
    {"$match": {"weather_data.sig_prcp": {"$exists": True, "$eq": "yes"}}},
    {"$out": "sig_prcp_yes"}
]

# Perform the aggregation
result = list(divvy_ridedata_merged.aggregate(pipeline))

# Access the new collection
sig_prcp_yes_collection = db["sig_prcp_yes"]

# Check if the aggregation was successful
if result:
    print("Aggregation successful.")
else:
    print("No documents matched the aggregation criteria.")

# Check if the sig_prcp_yes collection is populated
count_documents = sig_prcp_yes_collection.count_documents({})
print(f"Number of documents in sig_prcp_yes collection: {count_documents}")

No documents matched the aggregation criteria.
Number of documents in sig_prcp_yes collection: 748440


In [48]:
# Define the aggregation pipeline to pull rides by month 

# Use aggregation pipeline to create a collection that contains start and end station names
pipeline = [
         {"$match": {"weather_data.sig_prcp": {"$exists": True, "$eq": "yes"},
                     }},
         {"$out": "sig_prcp_yes"}
         
]
# Perform the aggregation
result = list(divvy_ridedata_merged.aggregate(pipeline))
# Assign to a variable
sig_prcp_yes = db["sig_prcp_yes"]


In [49]:
# Debug: See if anything is in sig_prcp_yes
# Access the database and collection
sig_prcp_yes_collection = db["sig_prcp_yes"]

# Count the documents in the collection
document_count = sig_prcp_yes_collection.count_documents({})
print("Total number of documents in 'sig_prcp_yes':", document_count)

# Print a sample document
sample_document = sig_prcp_yes_collection.find_one()
print("Sample document in 'sig_prcp_yes':", sample_document)

Total number of documents in 'sig_prcp_yes': 748440
Sample document in 'sig_prcp_yes': {'_id': ObjectId('655f1914044b5b5ebbf90d70'), 'end_lat': 41.88338, 'end_lng': -87.64117, 'end_station_id': 'WL-012', 'end_station_name': 'Clinton St & Washington Blvd', 'ended_at': '2022-01-28 15:35:16', 'ended_at_date': '2022-01-28', 'ended_at_time': '15:35:16', 'member_casual': 'member', 'ride_id': '72DC25B2DD467EEF', 'rideable_type': 'classic_bike', 'start_lat': 41.878166, 'start_lng': -87.631929, 'start_station_id': 'TA1309000004', 'start_station_name': 'LaSalle St & Jackson Blvd', 'started_at': '2022-01-28 15:27:53', 'started_at_date': '2022-01-28', 'started_at_time': '15:27:53', 'weather_data': {'_id': ObjectId('655f19d5f1c6af1dd0f5748f'), 'date': '2022-01-28', 'cloud_cover': 100.0, 'precipitation': 6.18, 'min_temp': 15.89, 'max_temp': 32.04, 'morning_temp': 29.03, 'afternoon_temp': 26.51, 'evening_temp': 17.65, 'night_temp': 29.01, 'max_windspeed': 11.5, 'prcp_inches': 0.2433066, 'sig_prcp': '

In [50]:
# Define the aggregation pipeline to pull rides by month 

# Use aggregation pipeline to create a collection that contains start and end station names
pipeline = [
         {"$match": {"weather_data.sig_prcp": {"$exists": True, "$eq": "no"},
                     }},
         {"$out": "sig_prcp_no"}
         
]
# Perform the aggregation
result = list(divvy_ridedata_merged.aggregate(pipeline))
# Assign to a variable
sig_prcp_no = db["sig_prcp_no"]

In [51]:
# Create a new collection that shows average daily rides per month with precipitation 

pipeline = [
    {
        "$group": {
            "_id": {
                "year": {"$year": {"$toDate": "$started_at"}},
                "month": {"$month": {"$toDate": "$started_at"}}
            },
            "total_rides": {"$sum": 1}
        }
    },
    {
        "$project": {
            "_id": 0,
            "year": "$_id.year",
            "month": "$_id.month",
            "total_rides": 1
        }
    },
    {
        "$sort": {"year": 1, "month": 1}
    }
]

# Execute the aggregation pipeline and write to a new collection
sig_prcp_yes_month = db["sig_prcp_yes_month"]
#divvy_rides_by_month = db["divvy_rides_by_month"]
sig_prcp_yes_month.drop()  # Drop the collection
aggregated_result = sig_prcp_yes.aggregate(pipeline, allowDiskUse=True, collation=None)

for doc in aggregated_result:
    sig_prcp_yes_month.insert_one(doc)

# Function to insert number of days and sig_prcp count for each month
def update_num_days_and_sig_prcp_count(year, month):
    _, num_days = monthrange(year, month)
    query = {'year': year, 'month': month}
    update_query = {'$set': {'num_days': num_days}}
    sig_prcp_yes_month.update_one(query, update_query)

    # Count the number of days with sig_prcp = 'no' for the given month
    sig_prcp_count = weather_daily.count_documents({
        'date': {'$regex': f'^{year:04d}-{month:02d}'},  # Match the year and month in the date field
        'sig_prcp': 'yes'
    })
    update_query = {'$set': {'sig_prcp_count': sig_prcp_count}}
    sig_prcp_yes_month.update_one(query, update_query)

    # Calculate average rides per day
    # Added for loop to make sure values aren't null 
    result = sig_prcp_yes_month.find_one(query)
    if result is not None:
        total_rides = result['total_rides']
        sig_prcp_yes_month.update_one(query, update_query)
    else:
        total_rides = 0  # Assign a default value

    # Calculate and update average rides per day with no significant precipitation
    query = {'year': 2022, 'month': month}
    document = sig_prcp_yes_month.find_one(query)
    print(f"Document: {document}")

    # Changed to deal with null documents in March
    if document is not None and document['sig_prcp_count'] > 0:
        print(document)
        average_rides_per_day = document['total_rides'] / document['sig_prcp_count']
        update_query = {'$set': {'average_rides_per_day': average_rides_per_day}}
        sig_prcp_yes_month.update_one(query, update_query)

# Loop through each month in the year 2022
for month in range(1, 13):
    update_num_days_and_sig_prcp_count(2022, month)


Document: {'_id': ObjectId('6560a56bc3359531e70984cb'), 'total_rides': 9308, 'year': 2022, 'month': 1, 'num_days': 31, 'sig_prcp_count': 5}
{'_id': ObjectId('6560a56bc3359531e70984cb'), 'total_rides': 9308, 'year': 2022, 'month': 1, 'num_days': 31, 'sig_prcp_count': 5}
Document: {'_id': ObjectId('6560a56bc3359531e70984cc'), 'total_rides': 26036, 'year': 2022, 'month': 2, 'num_days': 28, 'sig_prcp_count': 8}
{'_id': ObjectId('6560a56bc3359531e70984cc'), 'total_rides': 26036, 'year': 2022, 'month': 2, 'num_days': 28, 'sig_prcp_count': 8}
Document: {'_id': ObjectId('6560a56bc3359531e70984cd'), 'total_rides': 71873, 'year': 2022, 'month': 3, 'num_days': 31, 'sig_prcp_count': 9}
{'_id': ObjectId('6560a56bc3359531e70984cd'), 'total_rides': 71873, 'year': 2022, 'month': 3, 'num_days': 31, 'sig_prcp_count': 9}
Document: {'_id': ObjectId('6560a56bc3359531e70984ce'), 'total_rides': 147085, 'year': 2022, 'month': 4, 'num_days': 30, 'sig_prcp_count': 11}
{'_id': ObjectId('6560a56bc3359531e70984ce'

In [52]:
# Create a new collection that shows average daily rides per month with no precipitation 

pipeline = [
    {
        "$group": {
            "_id": {
                "year": {"$year": {"$toDate": "$started_at"}},
                "month": {"$month": {"$toDate": "$started_at"}}
            },
            "total_rides": {"$sum": 1}
        }
    },
    {
        "$project": {
            "_id": 0,
            "year": "$_id.year",
            "month": "$_id.month",
            "total_rides": 1
        }
    },
    {
        "$sort": {"year": 1, "month": 1}
    }
]

# Execute the aggregation pipeline and write to a new collection
sig_prcp_no_month = db["sig_prcp_no_month"]
#divvy_rides_by_month = db["divvy_rides_by_month"]
sig_prcp_no_month.drop()  # Drop the collection
aggregated_result = sig_prcp_no.aggregate(pipeline, allowDiskUse=True, collation=None)

for doc in aggregated_result:
    sig_prcp_no_month.insert_one(doc)

def update_num_days_and_sig_prcp_count(year, month):
    _, num_days = monthrange(year, month)
    query = {'year': year, 'month': month}
    update_query = {'$set': {'num_days': num_days}}
    sig_prcp_no_month.update_one(query, update_query)

    # Count the number of days with sig_prcp = 'no' for the given month
    sig_prcp_count = weather_daily.count_documents({
        'date': {'$regex': f'^{year:04d}-{month:02d}'},
        'sig_prcp': 'no'
    })
    update_query = {'$set': {'sig_prcp_count': sig_prcp_count}}
    sig_prcp_no_month.update_one(query, update_query)

    # Calculate average rides per day
    document = sig_prcp_no_month.find_one(query)
    
    if document is not None and document['sig_prcp_count'] > 0:
        total_rides = document['total_rides']
        average_rides_per_day = total_rides / document['sig_prcp_count']
        update_query = {'$set': {'average_rides_per_day': average_rides_per_day}}
        sig_prcp_no_month.update_one(query, update_query)

# Loop through each month in the year 2022
for month in range(1, 13):
    update_num_days_and_sig_prcp_count(2022, month)


In [53]:
# Delete: Specify the year and month you want to check (e.g., March)
target_year = 2022
target_month = 3

# Query the "sig_prcp_no_month" collection for documents in the specified year and month
documents_in_march = sig_prcp_no_month.find({"year": target_year, "month": target_month})

# Print information about each document in March
for doc in documents_in_march:
    print(doc)

{'_id': ObjectId('6560a5a5c3359531e70984d9'), 'total_rides': 212169, 'year': 2022, 'month': 3, 'num_days': 31, 'sig_prcp_count': 22, 'average_rides_per_day': 9644.045454545454}


In [54]:
# Create a new collection to store documents with string _id
# sig_prcp_no_month_string.drop()
sig_prcp_no_month_string = db["sig_prcp_no_month_with_string_id"]

# Iterate through the documents in the original collection
for document in sig_prcp_no_month.find({}):
    document_id = document['_id']
    string_id = str(document_id)
    
    # Create a new document with the string _id and other fields
    new_document = {
        '_id': string_id,
        'year': document['year'],
        'month': document['month'],
        'total_rides': document['total_rides'],
        'num_days': document['num_days'],
        'sig_prcp_count': document['sig_prcp_count'],
        'average_rides_per_day': document['average_rides_per_day']
        # Include other fields from the original document
    }
    
    # Insert the new document into the new collection
    sig_prcp_no_month_string.insert_one(new_document)

print("Documents with string _id inserted into the new collection.")

Documents with string _id inserted into the new collection.


In [55]:
# Specify the year and month you want to check (e.g., March)
target_year = 2022
target_month = 3

# Query the "sig_prcp_yes_month_with_string_id" collection for documents in the specified year and month
documents_in_march = sig_prcp_no_month_string.find({"year": target_year, "month": target_month})

# Print information about each document in March
for doc in documents_in_march:
    print(doc)

{'_id': '6560a5a5c3359531e70984d9', 'year': 2022, 'month': 3, 'total_rides': 212169, 'num_days': 31, 'sig_prcp_count': 22, 'average_rides_per_day': 9644.045454545454}


In [56]:
# Create a new collection to store documents with string _id
# sig_prcp_yes_month_string.drop()
sig_prcp_yes_month_string = db["sig_prcp_yes_month_with_string_id"]

# Iterate through the documents in the original collection
for document in sig_prcp_yes_month.find({}):
    document_id = document['_id']
    string_id = str(document_id)
    
    # Create a new document with the string _id and other fields
    new_document = {
        '_id': string_id,
        'year': document['year'],
        'month': document['month'],
        'total_rides': document['total_rides'],
        'num_days': document['num_days'],
        'sig_prcp_count': document['sig_prcp_count'],
        'average_rides_per_day': document['average_rides_per_day']
        # Include other fields from the original document
    }
    
    # Insert the new document into the new collection
    sig_prcp_yes_month_string.insert_one(new_document)

print("Documents with string _id inserted into the new collection.")

Documents with string _id inserted into the new collection.


In [58]:
# Delete this code: Check for March
target_month = 3

# Query the "sig_prcp_yes_month_with_string_id" collection for documents in the specified year and month
documents_in_march = sig_prcp_yes_month_string.find({"year": target_year, "month": target_month})

# Print information about each document in March
for doc in documents_in_march:
    print(doc)

{'_id': '6560a56bc3359531e70984cd', 'year': 2022, 'month': 3, 'total_rides': 71873, 'num_days': 31, 'sig_prcp_count': 9, 'average_rides_per_day': 7985.888888888889}


In [59]:
# Retrieve all documents from the collection
all_documents = sig_prcp_yes_month_string.find({})

# Iterate through the documents and print them
for doc in all_documents:
    print(doc)

{'_id': '6560a56bc3359531e70984cb', 'year': 2022, 'month': 1, 'total_rides': 9308, 'num_days': 31, 'sig_prcp_count': 5, 'average_rides_per_day': 1861.6}
{'_id': '6560a56bc3359531e70984cc', 'year': 2022, 'month': 2, 'total_rides': 26036, 'num_days': 28, 'sig_prcp_count': 8, 'average_rides_per_day': 3254.5}
{'_id': '6560a56bc3359531e70984cd', 'year': 2022, 'month': 3, 'total_rides': 71873, 'num_days': 31, 'sig_prcp_count': 9, 'average_rides_per_day': 7985.888888888889}
{'_id': '6560a56bc3359531e70984ce', 'year': 2022, 'month': 4, 'total_rides': 147085, 'num_days': 30, 'sig_prcp_count': 11, 'average_rides_per_day': 13371.363636363636}
{'_id': '6560a56bc3359531e70984cf', 'year': 2022, 'month': 5, 'total_rides': 49842, 'num_days': 31, 'sig_prcp_count': 4, 'average_rides_per_day': 12460.5}
{'_id': '6560a56bc3359531e70984d0', 'year': 2022, 'month': 6, 'total_rides': 58888, 'num_days': 30, 'sig_prcp_count': 3, 'average_rides_per_day': 19629.333333333332}
{'_id': '6560a56bc3359531e70984d1', 'ye

In [60]:
# Retrieve all documents from the collection
all_documents = sig_prcp_no_month_string.find({})

# Iterate through the documents and print them
for doc in all_documents:
    print(doc)

{'_id': '6560a5a5c3359531e70984d7', 'year': 2022, 'month': 1, 'total_rides': 94462, 'num_days': 31, 'sig_prcp_count': 26, 'average_rides_per_day': 3633.153846153846}
{'_id': '6560a5a5c3359531e70984d8', 'year': 2022, 'month': 2, 'total_rides': 89573, 'num_days': 28, 'sig_prcp_count': 20, 'average_rides_per_day': 4478.65}
{'_id': '6560a5a5c3359531e70984d9', 'year': 2022, 'month': 3, 'total_rides': 212169, 'num_days': 31, 'sig_prcp_count': 22, 'average_rides_per_day': 9644.045454545454}
{'_id': '6560a5a5c3359531e70984da', 'year': 2022, 'month': 4, 'total_rides': 224164, 'num_days': 30, 'sig_prcp_count': 19, 'average_rides_per_day': 11798.105263157895}
{'_id': '6560a5a5c3359531e70984db', 'year': 2022, 'month': 5, 'total_rides': 585016, 'num_days': 31, 'sig_prcp_count': 27, 'average_rides_per_day': 21667.25925925926}
{'_id': '6560a5a5c3359531e70984dc', 'year': 2022, 'month': 6, 'total_rides': 710316, 'num_days': 30, 'sig_prcp_count': 27, 'average_rides_per_day': 26308.0}
{'_id': '6560a5a5c3

In [61]:
# Count the number of documents in the collections 
print(divvy_rides.count_documents({}))
print(weather_daily.count_documents({}))
print(divvy_ridedata_merged.count_documents({}))
print(withoutStation.count_documents({}))
print(withStation.count_documents({}))
print(withLatLong.count_documents({}))

5667717
365
5667717
427449
4369360
5661859
