### Import Divvy Bike Data: 

mongoimport --type csv -d chicago_bikes -c divvy_ridedata --headerline 202201-divvy-tripdata.csv<br>
mongoimport --type csv -d chicago_bikes -c divvy_ridedata --headerline 202202-divvy-tripdata.csv<br>
mongoimport --type csv -d chicago_bikes -c divvy_ridedata --headerline 202203-divvy-tripdata.csv<br>
mongoimport --type csv -d chicago_bikes -c divvy_ridedata --headerline 202204-divvy-tripdata.csv<br>
mongoimport --type csv -d chicago_bikes -c divvy_ridedata --headerline 202205-divvy-tripdata.csv<br>
mongoimport --type csv -d chicago_bikes -c divvy_ridedata --headerline 202206-divvy-tripdata.csv<br>
mongoimport --type csv -d chicago_bikes -c divvy_ridedata --headerline 202207-divvy-tripdata.csv<br>
mongoimport --type csv -d chicago_bikes -c divvy_ridedata --headerline 202208-divvy-tripdata.csv<br>
mongoimport --type csv -d chicago_bikes -c divvy_ridedata --headerline 202209-divvy-publictripdata.csv<br>
mongoimport --type csv -d chicago_bikes -c divvy_ridedata --headerline 202210-divvy-tripdata.csv<br>
mongoimport --type csv -d chicago_bikes -c divvy_ridedata --headerline 202211-divvy-tripdata.csv<br>
mongoimport --type csv -d chicago_bikes -c divvy_ridedata --headerline  202212-divvy-tripdata.csv<br>

### Import Weather Data: 
please run OpenWeather.ipynb first to create weather_daily.csv file and then import using the mongoimport statement below

mongoimport --type csv -d chicago_bikes -c weather_daily --headerline weather_daily.csv

In [1]:
# Import Dependencies
import pymongo
from pymongo import MongoClient, UpdateOne
import json
from calendar import monthrange
from datetime import datetime

In [2]:
# Create an instance of MongoClient
mongo = MongoClient(port=27017)

In [3]:
# check our list of collections
print(mongo.list_database_names())

['admin', 'autosaurus', 'chicago_bikes', 'classDB', 'config', 'epa', 'fruits_db', 'gardenDB', 'local', 'met', 'mongodbVSCodePlaygroundDB', 'petsitly_marketing', 'travel_db', 'uk_food']


In [4]:
# assign the database to a variable name
db = mongo.chicago_bikes

In [5]:
# review the collections in our new database
print(db.list_collection_names())

['distinct_station_names', 'divvy_ridedata_merged', 'RouteDistance', 'divvy_rides_by_season', 'withStationName', 'weather_daily', 'Top10EndStations', 'Top10Routes', 'sig_prcp_yes', 'sig_prcp_no', 'withoutStationName', 'withLatLong', 'divvy_ridedata', 'divvy_rides_by_month', 'Top10StartStations']


In [6]:
# review a document in the customer_list collection
print(db.divvy_ridedata.find_one())

{'_id': ObjectId('6544a1406281b360c1f8c053'), 'ride_id': 'C2F7DD78E82EC875', 'rideable_type': 'electric_bike', 'started_at': '2022-01-13 11:59:47', 'ended_at': '2022-01-13 12:02:44', 'start_station_name': 'Glenwood Ave & Touhy Ave', 'start_station_id': 525, 'end_station_name': 'Clark St & Touhy Ave', 'end_station_id': 'RP-007', 'start_lat': 42.0128005, 'start_lng': -87.665906, 'end_lat': 42.01256011541, 'end_lng': -87.6743671152, 'member_casual': 'casual', 'started_at_date': '2022-01-13', 'started_at_time': '11:59:47', 'ended_at_date': '2022-01-13', 'ended_at_time': '12:02:44'}


In [7]:
#Assign divvy_rides and weather_daily collections to variables for later use
divvy_rides = db['divvy_ridedata']
weather_daily = db['weather_daily']

In [8]:
# Convert precipitation field value to inches (currently in mm) in weather_daily collection
# Conversion factor: 1 mm = 0.03937 inches
mm_to_inches = 0.03937

In [9]:
# Iterate through documents and update values
for document in weather_daily.find():
    size_mm = document.get("precipitation")
    if size_mm is not None:
        size_inches = size_mm * mm_to_inches
        # Update the document with the new size value in inches
        weather_daily.update_one({"_id": document["_id"]}, {"$set": {"prcp_inches": size_inches}})

In [10]:
# Review a document in the weather_daily collection to confirm update
print(db.weather_daily.find_one())

{'_id': ObjectId('6544a21516ac0221c1a8c57f'), 'date': '2022-01-01', 'cloud_cover': 90.0, 'precipitation': 0.18, 'min_temp': 33.22, 'max_temp': 42.1, 'morning_temp': 42.1, 'afternoon_temp': 38.43, 'evening_temp': 35.24, 'night_temp': 38.44, 'max_windspeed': 15.01, 'prcp_inches': 0.0070866}


In [11]:
# Drop 'size_inches' field as it has been replaced with 'prcp_inches' and values match 
# Specify the field you want to drop
field_to_drop = "size_inches"

In [12]:
# Update documents to unset the specified field
weather_daily.update_many({}, {"$unset": {field_to_drop: 1}})

<pymongo.results.UpdateResult at 0x19a2e59c3d0>

In [13]:
# Review a document in the weather_daily collection to confirm drop
print(db.weather_daily.find_one())

{'_id': ObjectId('6544a21516ac0221c1a8c57f'), 'date': '2022-01-01', 'cloud_cover': 90.0, 'precipitation': 0.18, 'min_temp': 33.22, 'max_temp': 42.1, 'morning_temp': 42.1, 'afternoon_temp': 38.43, 'evening_temp': 35.24, 'night_temp': 38.44, 'max_windspeed': 15.01, 'prcp_inches': 0.0070866}


In [14]:
# Define the threshold value for significant precipitation comparison 
threshold = 0.1

In [15]:
# Update documents to add the new field "sig_prcp"
# Add a new field 'sig_prcp' and populate 'yes' values if 'prcp_inches' is greather than or equal .1 inches
weather_daily.update_many(
    {"prcp_inches": {"$gte": threshold}},
    {"$set": {"sig_prcp": "yes"}}
)

weather_daily.update_many(
    {"prcp_inches": {"$lt": threshold}},
    {"$set": {"sig_prcp": "no"}}
)

<pymongo.results.UpdateResult at 0x19a2e629270>

In [16]:
# Review a document in the weather_daily collection to confirm update
print(db.weather_daily.find_one())

{'_id': ObjectId('6544a21516ac0221c1a8c57f'), 'date': '2022-01-01', 'cloud_cover': 90.0, 'precipitation': 0.18, 'min_temp': 33.22, 'max_temp': 42.1, 'morning_temp': 42.1, 'afternoon_temp': 38.43, 'evening_temp': 35.24, 'night_temp': 38.44, 'max_windspeed': 15.01, 'prcp_inches': 0.0070866, 'sig_prcp': 'no'}


In [17]:
# Update documents to add the new field "avg_temp"
weather_daily.update_many(
    {},
    [
        {
            "$set": {
                "avg_temp": {
                    "$avg": ["$morning_temp", "$afternoon_temp", "$evening_temp"]
                }
            }
        }
    ]
)

<pymongo.results.UpdateResult at 0x19a2beaa410>

In [18]:
# Review a document in the weather_daily collection to confirm update
print(db.weather_daily.find_one())

{'_id': ObjectId('6544a21516ac0221c1a8c57f'), 'date': '2022-01-01', 'cloud_cover': 90.0, 'precipitation': 0.18, 'min_temp': 33.22, 'max_temp': 42.1, 'morning_temp': 42.1, 'afternoon_temp': 38.43, 'evening_temp': 35.24, 'night_temp': 38.44, 'max_windspeed': 15.01, 'prcp_inches': 0.0070866, 'sig_prcp': 'no', 'avg_temp': 38.59}


In [19]:
# Perform bulk update operation on 'started_at' field to split date and time into two separate fields
documents = divvy_rides.find({})
bulk_updates = []

for document in documents:
    original_value = document["started_at"]
    parts = original_value.split(" ")
    date_part = parts[0]
    time_part = parts[1]
    bulk_updates.append(
        pymongo.UpdateOne(
            {"_id": document["_id"]},
            {
                "$set": {
                    "started_at_date": date_part,
                    "started_at_time": time_part
                }
            }
        )
    )

# Execute bulk write operations
divvy_rides.bulk_write(bulk_updates)

<pymongo.results.BulkWriteResult at 0x19b1c1bcd60>

In [20]:
# Perform bulk update operation on 'ended_at' field to split date and time into two separate fields
documents = divvy_rides.find({})
bulk_updates = []

for document in documents:
    original_value = document["ended_at"]
    parts = original_value.split(" ")
    date_part = parts[0]
    time_part = parts[1]
    bulk_updates.append(
        pymongo.UpdateOne(
            {"_id": document["_id"]},
            {
                "$set": {
                    "ended_at_date": date_part,
                    "ended_at_time": time_part
                }
            }
        )
    )

# Execute bulk write operations
divvy_rides.bulk_write(bulk_updates)

<pymongo.results.BulkWriteResult at 0x19a307bc1c0>

In [21]:
#Assign an index to each collection for use in pipeline merging of the two collections
divvy_rides.create_index([("started_at_date", 1)])
weather_daily.create_index([("date", 1)])

'date_1'

In [22]:
#Create and run a pipeling to merge divvy_rides and weather_daily collections into divvy_ridedata_merged collection
pipeline = [
    {
        '$lookup': {
            'from': 'weather_daily',
            'localField': 'started_at_date',
            'foreignField': 'date',
            'as': 'weather_data'
        }
    },
    {
        '$unwind': {
            'path': '$weather_data',
            'preserveNullAndEmptyArrays': True
        }
    },
    {
        '$merge': {
            'into': 'divvy_ridedata_merged',  # Replace with your new collection name
            'whenMatched': 'merge',  # Merge documents with matching _id fields
            'whenNotMatched': 'insert'  # Insert documents that don't match
        }
    },
]

divvy_rides.aggregate(pipeline)
print("Update completed successfully.")

Update completed successfully.


In [23]:
#Assign the new merged collection to a variable and print a document from the collection
divvy_ridedata_merged = db['divvy_ridedata_merged']
divvy_ridedata_merged.find_one()

{'_id': ObjectId('6544a1406281b360c1f8c053'),
 'end_lat': 42.01256011541,
 'end_lng': -87.6743671152,
 'end_station_id': 'RP-007',
 'end_station_name': 'Clark St & Touhy Ave',
 'ended_at': '2022-01-13 12:02:44',
 'ended_at_date': '2022-01-13',
 'ended_at_time': '12:02:44',
 'member_casual': 'casual',
 'ride_id': 'C2F7DD78E82EC875',
 'rideable_type': 'electric_bike',
 'start_lat': 42.0128005,
 'start_lng': -87.665906,
 'start_station_id': 525,
 'start_station_name': 'Glenwood Ave & Touhy Ave',
 'started_at': '2022-01-13 11:59:47',
 'started_at_date': '2022-01-13',
 'started_at_time': '11:59:47',
 'weather_data': {'_id': ObjectId('6544a21516ac0221c1a8c58a'),
  'date': '2022-01-13',
  'cloud_cover': 75.0,
  'precipitation': 0.0,
  'min_temp': 30.72,
  'max_temp': 39.83,
  'morning_temp': 35.49,
  'afternoon_temp': 30.72,
  'evening_temp': 34.38,
  'night_temp': 39.22,
  'max_windspeed': 8.01,
  'prcp_inches': 0.0,
  'sig_prcp': 'no',
  'avg_temp': 33.53}}

In [24]:
# review the collections in our new database
print(db.list_collection_names())

['distinct_station_names', 'divvy_ridedata_merged', 'RouteDistance', 'divvy_rides_by_season', 'withStationName', 'weather_daily', 'Top10EndStations', 'Top10Routes', 'sig_prcp_yes', 'sig_prcp_no', 'withoutStationName', 'withLatLong', 'divvy_ridedata', 'divvy_rides_by_month', 'Top10StartStations']


In [25]:
# Use aggregation pipeline to create a collection that contains start and end station names
pipeline = [
         {"$match": {"start_station_name": {"$exists": True, "$ne": ""}, 
                     "end_station_name":{"$exists": True, "$ne": ""}}},

         {"$out": "withStationName"}
]

# Perform the aggregation
result = list(divvy_ridedata_merged.aggregate(pipeline))

In [26]:
#check to make sure that collections is updated
db.list_collection_names()

['distinct_station_names',
 'divvy_ridedata_merged',
 'withStationName',
 'RouteDistance',
 'divvy_rides_by_season',
 'weather_daily',
 'Top10EndStations',
 'Top10Routes',
 'sig_prcp_yes',
 'sig_prcp_no',
 'withoutStationName',
 'withLatLong',
 'divvy_ridedata',
 'divvy_rides_by_month',
 'Top10StartStations']

In [27]:
#check if there are any documents without start or end station names
print(db.withStationName.find_one({"start_station_name":""}))
print(db.withStationName.find_one({"end_station_name":""}))

None
None


In [28]:
# Use aggregation pipeline to create a collection that doesn't contain start and end station names
pipeline = [
         {"$match": {"start_station_name": {"$exists": True, "$eq": ""}, 
                     "end_station_name":{"$exists": True, "$eq": ""}}},

         {"$out": "withoutStationName"}
]

# Perform the aggregation
result = list(divvy_ridedata_merged.aggregate(pipeline))

In [29]:
#Check to see that the collection was added
db.list_collection_names()

['distinct_station_names',
 'divvy_ridedata_merged',
 'withStationName',
 'RouteDistance',
 'divvy_rides_by_season',
 'weather_daily',
 'Top10EndStations',
 'Top10Routes',
 'sig_prcp_yes',
 'sig_prcp_no',
 'withoutStationName',
 'withLatLong',
 'divvy_ridedata',
 'divvy_rides_by_month',
 'Top10StartStations']

In [30]:
#check if there are any documents without start or end station names
print(db.withoutStationName.find_one({"start_station_name":{"$ne":""}}))
print(db.withoutStationName.find_one({"end_station_name":{"$ne":""}}))

None
None


In [31]:
# Use aggregation pipeline to find top ten start stations 
pipeline = [
    {
        "$group": {
            "_id": "$start_station_name",
            "count": {"$sum": 1},
            "latitude": {"$first": "$end_lat"},
            "longitude": {"$first": "$end_lng"}
        }
    },
    {
        "$sort": {"count": -1}
    },
    {
        "$limit": 10
    },
    {   "$out": "Top10StartStations"
}
]

# Perform the aggregation
result = list(withStation.aggregate(pipeline))



NameError: name 'withStation' is not defined

In [32]:
# Assign collection to a variable
Top10StartStations = db['Top10StartStations']

In [33]:
# Check to see that the collection was added
db.list_collection_names()

['distinct_station_names',
 'divvy_ridedata_merged',
 'withStationName',
 'RouteDistance',
 'divvy_rides_by_season',
 'weather_daily',
 'Top10EndStations',
 'Top10Routes',
 'sig_prcp_yes',
 'sig_prcp_no',
 'withoutStationName',
 'withLatLong',
 'divvy_ridedata',
 'divvy_rides_by_month',
 'Top10StartStations']

In [34]:
#Pull a document from the collection
Top10StartStations.find_one()

{'_id': 'Streeter Dr & Grand Ave',
 'count': 68245,
 'latitude': 41.880958,
 'longitude': -87.616743}

In [None]:
# Use aggregation pipeline to find top ten end stations 
pipeline = [
    {
        "$group": {
            "_id": "$end_station_name",
            "count": {"$sum": 1},
            "latitude": {"$first": "$end_lat"},
            "longitude": {"$first": "$end_lng"}
        }
    },
    {
        "$sort": {"count": -1}
    },
    {
        "$limit": 10
    },
    {   "$out": "Top10EndStations"
}
]

# Perform the aggregation
result = list(withStation.aggregate(pipeline))



In [None]:
# Assign to a variable
Top10EndStations = db['Top10EndStations']

In [None]:
# Create a pipeline query to find the top ten bike routes (by start and end station)
pipeline = [
    {
        "$group": {
            "_id": { "Start Station": "$start_station_name", "End Station": "$end_station_name"},
            "count": {"$sum": 1},
            "start latitude": {"$first": "$start_lat"},
            "start longitude": {"$first": "$start_lng"},
            "end latitude": {"$first": "$end_lat"},
            "end longitude": {"$first": "$end_lng"}
        }
    },
    {"$sort": {"count": -1}
},
    {
        "$limit": 10
},
    {   "$out": "Top10Routes"
}
]
# Perform the aggregation
result = list(withStation.aggregate(pipeline))



In [None]:
# Assign to a variable
Top10Routes = db['Top10Routes']

In [None]:
# Check to see that the collection was added
db.list_collection_names()

In [None]:
# Review a document in the collection 
Top10Routes.find_one()

In [35]:
# Create a pipeline query to find docouments that have lat/long  
pipeline = [
    {
        "$match": {
            "$and": [
                { "start_lat": { "$ne": "" } },
                { "start_lng": { "$ne": "" } },
                { "end_lat": { "$ne": "" } },
                { "end_lng": { "$ne": "" } }
            ]
        }
    }, 
    {"$out": "withLatLong"}
]

# Perform the aggregation
result = list(divvy_ridedata_merged.aggregate(pipeline))



In [36]:
#Assign the collection to a variable
withLatLong = db['withLatLong']

In [37]:
# Create a pipeline query to find distance of each route in descending order by length  
pipeline = [
    {
        "$addFields": {
            "start_lat": { "$toDouble": "$start_lat" },
            "start_lng": { "$toDouble": "$start_lng" },
            "end_lat": { "$toDouble": "$end_lat" },
            "end_lng": { "$toDouble": "$end_lng" }
        }
    },
    {
        "$addFields": {
            "distance": {
                "$sqrt": {
                    "$add": [
                        {
                            "$pow": [
                                { "$subtract": ["$end_lat", "$start_lat"] },
                                2
                            ]
                        },
                        {
                            "$pow": [
                                {
                                    "$multiply": [
                                        { "$subtract": ["$end_lng", "$start_lng"] },
                                        { "$cos": { "$avg": ["$start_lat", "$end_lat"] } }
                                    ]
                                },
                                2
                            ]
                        }
                    ]
                }
            }
        }
    },
    {
        "$sort": {"distance": -1}
    },
    {"$out": "RouteDistance"}
]

# Perform the aggregation
result = list(withLatLong.aggregate(pipeline))



In [38]:
# Assign to a variable
RouteDistance = db['RouteDistance']

In [39]:
# Count the number of documents in the collection 
print(RouteDistance.count_documents({}))

5429682


In [40]:
# Find the first 10 documents
documents = RouteDistance.find().sort("distance", -1).limit(10)

# Print the documents
for doc in documents:
    print(doc)

{'_id': ObjectId('6544a1ade61f9c6d659ecc3a'), 'end_lat': 0.0, 'end_lng': 0.0, 'end_station_id': 'chargingstx07', 'end_station_name': 'Green St & Madison Ave*', 'ended_at': '2022-11-09 12:26:18', 'ended_at_date': '2022-11-09', 'ended_at_time': '12:26:18', 'member_casual': 'member', 'ride_id': 'E9495F1DC3475D41', 'rideable_type': 'classic_bike', 'start_lat': 41.884114, 'start_lng': -87.654264, 'start_station_id': 18062, 'start_station_name': 'Aberdeen St & Randolph St', 'started_at': '2022-11-09 12:21:55', 'started_at_date': '2022-11-09', 'started_at_time': '12:21:55', 'weather_data': {'_id': ObjectId('6544a21516ac0221c1a8c6b5'), 'date': '2022-11-09', 'cloud_cover': 75.0, 'precipitation': 0.0, 'min_temp': 47.35, 'max_temp': 60.37, 'morning_temp': 51.44, 'afternoon_temp': 48.54, 'evening_temp': 51.37, 'night_temp': 51.82, 'max_windspeed': 13.8, 'prcp_inches': 0.0, 'sig_prcp': 'no', 'avg_temp': 50.449999999999996}, 'distance': 60.51865544715035}
{'_id': ObjectId('6544a1ade61f9c6d659ece3a')

In [41]:
# Define the aggregation pipeline to pull rides by month 
pipeline = [
    {
        "$group": {
            "_id": {
                "year": {"$year": {"$toDate": "$started_at"}},
                "month": {"$month": {"$toDate": "$started_at"}}
            },
            "total_rides": {"$sum": 1}
        }
    },
    {
        "$project": {
            "_id": 0,
            "year": "$_id.year",
            "month": "$_id.month",
            "total_rides": 1
        }
    },
    {
        "$sort": {"year": 1, "month": 1}
    }
]

# Execute the aggregation pipeline and write to a new collection
divvy_rides_by_month = db["divvy_rides_by_month"]
divvy_rides_by_month.drop()  # Drop the collection
aggregated_result = divvy_ridedata_merged.aggregate(pipeline, allowDiskUse=True, collation=None)

for doc in aggregated_result:
    divvy_rides_by_month.insert_one(doc)

print("Aggregation result has been written to the new collection.")

Aggregation result has been written to the new collection.


In [42]:
# Define the aggregation pipeline to define divvy rides by season 
pipeline = [
    {
        "$group": {
            "_id": {
                "year": "$year",
                "season": {
                    "$switch": {
                        "branches": [
                            {"case": {"$in": ["$month", [3, 4, 5]]}, "then": "Spring"},
                            {"case": {"$in": ["$month", [6, 7, 8]]}, "then": "Summer"},
                            {"case": {"$in": ["$month", [9, 10, 11]]}, "then": "Autumn"},
                            {"case": {"$in": ["$month", [12, 1, 2]]}, "then": "Winter"}
                        ],
                        "default": "Unknown"
                    }
                }
            },
            "total_rides": {"$sum": "$total_rides"}
        }
    },
    {
        "$sort": {"_id.year": 1, "_id.season": 1}
    }
]

# Execute the aggregation pipeline
divvy_rides_by_season = db["divvy_rides_by_season"]
divvy_rides_by_season.drop()  # Drop the collection
aggregated_result = list(divvy_rides_by_month.aggregate(pipeline, allowDiskUse=True, collation=None))

# Insert the aggregated documents into the new collection
for doc in aggregated_result:
    print("Inserting document:", doc)
    divvy_rides_by_season.insert_one(doc)

print("Aggregation by season result has been written to the new collection.")

Inserting document: {'_id': {'year': 2022, 'season': 'Autumn'}, 'total_rides': 1597759}
Inserting document: {'_id': {'year': 2022, 'season': 'Spring'}, 'total_rides': 1057778}
Inserting document: {'_id': {'year': 2022, 'season': 'Summer'}, 'total_rides': 2378624}
Inserting document: {'_id': {'year': 2022, 'season': 'Winter'}, 'total_rides': 401185}
Aggregation by season result has been written to the new collection.


In [43]:
print(db.list_collection_names())

['distinct_station_names', 'withLatLong', 'divvy_ridedata_merged', 'withStationName', 'RouteDistance', 'weather_daily', 'Top10EndStations', 'Top10Routes', 'divvy_rides_by_season', 'sig_prcp_yes', 'sig_prcp_no', 'withoutStationName', 'divvy_ridedata', 'divvy_rides_by_month', 'Top10StartStations']


In [44]:
from bson import ObjectId

# Get distinct station names along with start_lat and start_lng
distinct_station_data = db["withStationName"].aggregate([
    {
        "$group": {
            "_id": "$start_station_name",
            "start_lat": {"$first": "$start_lat"},
            "start_lng": {"$first": "$start_lng"}
        }
    }
])

collection_name = "distinct_station_names"
station_names = db[collection_name]

station_name_documents = []
for data in distinct_station_data:
    station_name_documents.append({
        "start_station_name": data["_id"],
        "start_lat": data["start_lat"],
        "start_lng": data["start_lng"],
        "_id": str(ObjectId())
    })

station_names.insert_many(station_name_documents)

print(f"{len(station_name_documents)} distinct station names imported into '{collection_name}' collection.")

1635 distinct station names imported into 'distinct_station_names' collection.


In [45]:
#Find a station name from the collection
station_names.find_one()

{'_id': '6544b085cd1525c0862447b5',
 'start_station_name': 'Clinton St & Polk St',
 'start_lat': 41.87146651779,
 'start_lng': -87.6409491327}

In [46]:
# Use aggregation pipeline to create a collection that contains rides on days with significant precipitation
pipeline = [
         {"$match": {"weather_data.sig_prcp": {"$exists": True, "$eq": "yes"},
                     }},
         {"$out": "sig_prcp_yes"}
         
]
# Perform the aggregation
result = list(divvy_ridedata_merged.aggregate(pipeline))


In [47]:
# Assign to a variable
sig_prcp_yes = db["sig_prcp_yes"]

In [48]:
sig_prcp_yes.find_one()

{'_id': ObjectId('6544a1406281b360c1f8c056'),
 'end_lat': 41.88338,
 'end_lng': -87.64117,
 'end_station_id': 'WL-012',
 'end_station_name': 'Clinton St & Washington Blvd',
 'ended_at': '2022-01-28 15:35:16',
 'ended_at_date': '2022-01-28',
 'ended_at_time': '15:35:16',
 'member_casual': 'member',
 'ride_id': '72DC25B2DD467EEF',
 'rideable_type': 'classic_bike',
 'start_lat': 41.878166,
 'start_lng': -87.631929,
 'start_station_id': 'TA1309000004',
 'start_station_name': 'LaSalle St & Jackson Blvd',
 'started_at': '2022-01-28 15:27:53',
 'started_at_date': '2022-01-28',
 'started_at_time': '15:27:53',
 'weather_data': {'_id': ObjectId('6544a21516ac0221c1a8c599'),
  'date': '2022-01-28',
  'cloud_cover': 100.0,
  'precipitation': 6.18,
  'min_temp': 15.89,
  'max_temp': 32.04,
  'morning_temp': 29.03,
  'afternoon_temp': 26.51,
  'evening_temp': 17.65,
  'night_temp': 29.01,
  'max_windspeed': 11.5,
  'prcp_inches': 0.2433066,
  'sig_prcp': 'yes',
  'avg_temp': 24.396666666666665}}

In [49]:
# Use aggregation pipeline to create a collection that contains rides on days without significant precipitation
pipeline = [
         {"$match": {"weather_data.sig_prcp": {"$exists": True, "$eq": "no"},
                     }},
         {"$out": "sig_prcp_no"}
         
]
# Perform the aggregation
result = list(divvy_ridedata_merged.aggregate(pipeline))


In [50]:
# Assign to a variable
sig_prcp_no = db["sig_prcp_no"]

In [51]:
sig_prcp_no.find_one()

{'_id': ObjectId('6544a1406281b360c1f8c053'),
 'end_lat': 42.01256011541,
 'end_lng': -87.6743671152,
 'end_station_id': 'RP-007',
 'end_station_name': 'Clark St & Touhy Ave',
 'ended_at': '2022-01-13 12:02:44',
 'ended_at_date': '2022-01-13',
 'ended_at_time': '12:02:44',
 'member_casual': 'casual',
 'ride_id': 'C2F7DD78E82EC875',
 'rideable_type': 'electric_bike',
 'start_lat': 42.0128005,
 'start_lng': -87.665906,
 'start_station_id': 525,
 'start_station_name': 'Glenwood Ave & Touhy Ave',
 'started_at': '2022-01-13 11:59:47',
 'started_at_date': '2022-01-13',
 'started_at_time': '11:59:47',
 'weather_data': {'_id': ObjectId('6544a21516ac0221c1a8c58a'),
  'date': '2022-01-13',
  'cloud_cover': 75.0,
  'precipitation': 0.0,
  'min_temp': 30.72,
  'max_temp': 39.83,
  'morning_temp': 35.49,
  'afternoon_temp': 30.72,
  'evening_temp': 34.38,
  'night_temp': 39.22,
  'max_windspeed': 8.01,
  'prcp_inches': 0.0,
  'sig_prcp': 'no',
  'avg_temp': 33.53}}

In [56]:
# Create a new collection that shows average daily rides per month with precipitation 

pipeline = [
    {
        "$group": {
            "_id": {
                "year": {"$year": {"$toDate": "$started_at"}},
                "month": {"$month": {"$toDate": "$started_at"}}
            },
            "total_rides": {"$sum": 1}
        }
    },
    {
        "$project": {
            "_id": 0,
            "year": "$_id.year",
            "month": "$_id.month",
            "total_rides": 1
        }
    },
    {
        "$sort": {"year": 1, "month": 1}
    }
]

# Execute the aggregation pipeline and write to a new collection
sig_prcp_yes_month = db["sig_prcp_yes_month"]
#divvy_rides_by_month = db["divvy_rides_by_month"]
sig_prcp_yes_month.drop()  # Drop the collection
aggregated_result = sig_prcp_yes.aggregate(pipeline, allowDiskUse=True, collation=None)

for doc in aggregated_result:
    sig_prcp_yes_month.insert_one(doc)

# Function to insert number of days and sig_prcp count for each month
def update_num_days_and_sig_prcp_count(year, month):
    _, num_days = monthrange(year, month)
    query = {'year': year, 'month': month}
    update_query = {'$set': {'num_days': num_days}}
    sig_prcp_yes_month.update_one(query, update_query)

    # Count the number of days with sig_prcp = 'no' for the given month
# Count the number of days with sig_prcp = 'no' for the given month
    sig_prcp_count = weather_daily.count_documents({
        'date': {'$regex': f'^{year:04d}-{month:02d}'},  # Match the year and month in the date field
        'sig_prcp': 'yes'
    })
    update_query = {'$set': {'sig_prcp_count': sig_prcp_count}}
    sig_prcp_yes_month.update_one(query, update_query)

    # Calculate average rides per day
    total_rides = sig_prcp_yes_month.find_one(query)['total_rides']
    sig_prcp_yes_month.update_one(query, update_query)

    # Calculate and update average rides per day with no significant precipitation
    query = {'year': 2022, 'month': month}
    document = sig_prcp_yes_month.find_one(query)
    
    if document is not None and document['sig_prcp_count'] > 0:  # To avoid division by zero
        average_rides_per_day = document['total_rides'] / document['sig_prcp_count']
        update_query = {'$set': {'average_rides_per_day': average_rides_per_day}}
        sig_prcp_yes_month.update_one(query, update_query)

# Loop through each month in the year 2022
for month in range(1, 13):
    update_num_days_and_sig_prcp_count(2022, month)


TypeError: 'NoneType' object is not subscriptable

In [None]:
# Create a new collection that shows average daily rides per month with no precipitation 

pipeline = [
    {
        "$group": {
            "_id": {
                "year": {"$year": {"$toDate": "$started_at"}},
                "month": {"$month": {"$toDate": "$started_at"}}
            },
            "total_rides": {"$sum": 1}
        }
    },
    {
        "$project": {
            "_id": 0,
            "year": "$_id.year",
            "month": "$_id.month",
            "total_rides": 1
        }
    },
    {
        "$sort": {"year": 1, "month": 1}
    }
]

# Execute the aggregation pipeline and write to a new collection
sig_prcp_no_month = db["sig_prcp_no_month"]
#divvy_rides_by_month = db["divvy_rides_by_month"]
sig_prcp_no_month.drop()  # Drop the collection
aggregated_result = sig_prcp_no.aggregate(pipeline, allowDiskUse=True, collation=None)

for doc in aggregated_result:
    sig_prcp_no_month.insert_one(doc)

def update_num_days_and_sig_prcp_count(year, month):
    _, num_days = monthrange(year, month)
    query = {'year': year, 'month': month}
    update_query = {'$set': {'num_days': num_days}}
    sig_prcp_no_month.update_one(query, update_query)

    # Count the number of days with sig_prcp = 'no' for the given month
    sig_prcp_count = weather_daily.count_documents({
        'date': {'$regex': f'^{year:04d}-{month:02d}'},
        'sig_prcp': 'no'
    })
    update_query = {'$set': {'sig_prcp_count': sig_prcp_count}}
    sig_prcp_no_month.update_one(query, update_query)

    # Calculate average rides per day
    document = sig_prcp_no_month.find_one(query)
    
    if document['sig_prcp_count'] > 0:
        total_rides = document['total_rides']
        average_rides_per_day = total_rides / document['sig_prcp_count']
        update_query = {'$set': {'average_rides_per_day': average_rides_per_day}}
        sig_prcp_no_month.update_one(query, update_query)

# Loop through each month in the year 2022
for month in range(1, 13):
    update_num_days_and_sig_prcp_count(2022, month)


TypeError: 'NoneType' object is not subscriptable

In [None]:
# Create a new collection to store documents with string _id
sig_prcp_no_month_string.drop()
sig_prcp_no_month_string = db["sig_prcp_no_month_with_string_id"]

# Iterate through the documents in the original collection
for document in sig_prcp_no_month.find({}):
    document_id = document['_id']
    string_id = str(document_id)
    
    # Create a new document with the string _id and other fields
    new_document = {
        '_id': string_id,
        'year': document['year'],
        'month': document['month'],
        'total_rides': document['total_rides'],
        'num_days': document['num_days'],
        'sig_prcp_count': document['sig_prcp_count'],
        'average_rides_per_day': document['average_rides_per_day']
        # Include other fields from the original document
    }
    
    # Insert the new document into the new collection
    sig_prcp_no_month_string.insert_one(new_document)

print("Documents with string _id inserted into the new collection.")

NameError: name 'sig_prcp_no_month_string' is not defined

In [None]:
# Create a new collection to store documents with string _id
sig_prcp_yes_month_string.drop()
sig_prcp_yes_month_string = db["sig_prcp_yes_month_with_string_id"]

# Iterate through the documents in the original collection
for document in sig_prcp_yes_month.find({}):
    document_id = document['_id']
    string_id = str(document_id)
    
    # Create a new document with the string _id and other fields
    new_document = {
        '_id': string_id,
        'year': document['year'],
        'month': document['month'],
        'total_rides': document['total_rides'],
        'num_days': document['num_days'],
        'sig_prcp_count': document['sig_prcp_count'],
        'average_rides_per_day': document['average_rides_per_day']
        # Include other fields from the original document
    }
    
    # Insert the new document into the new collection
    sig_prcp_yes_month_string.insert_one(new_document)

print("Documents with string _id inserted into the new collection.")

NameError: name 'sig_prcp_yes_month_string' is not defined