In [55]:
import pymongo
from pymongo import MongoClient
from pprint import pprint
import json
import geopandas as gpd

In [56]:
# Connect to MongoDB on port 27017 and select the Divvy database
mongo = MongoClient(port=27017)
db = mongo["divvy_db"]

# Handle to the collection that holds the ride documents
divvy_rides = db.divvy_ride_data

# An empty filter matches every document, so this is the total ride count
document_count = divvy_rides.count_documents({})

# Report the collection name and its size, one per line
print(
    f"Collection Name: {divvy_rides.name}",
    f"Count of Documents: {document_count}",
    sep="\n",
)

Collection Name: divvy_ride_data
Count of Documents: 5667717


In [57]:
# Start from a clean slate: drop any previous copy of station_names
station_names = db['station_names']
station_names.drop()

# Collection that holds the raw ride documents
source_collection_name = "divvy_ride_data"
source_collection = db[source_collection_name]

# Index the grouping key; a bare field name creates an ascending index
source_collection.create_index("start_station_name")

# Build one document per distinct, non-blank start station name, carrying the
# "start_lat"/"start_lng" of the first ride encountered in each group, and
# write the result into the station_names collection.
pipeline = [
    # Skip rides whose start station name is the empty string
    {"$match": {"start_station_name": {"$ne": ""}}},
    # Group by station name; the name doubles as the document _id
    {"$group": {
        "_id": "$start_station_name",
        "station_name": {"$first": "$start_station_name"},
        "lat": {"$first": "$start_lat"},
        "lng": {"$first": "$start_lng"},
    }},
    # Persist the groups: merge into existing documents, insert new ones
    {"$merge": {
        "into": "station_names",
        "whenMatched": "merge",
        "whenNotMatched": "insert",
    }},
]

# Run the pipeline against the ride collection
source_collection.aggregate(pipeline)

<pymongo.command_cursor.CommandCursor at 0x123110f40>

In [64]:
# Re-bind the handle to the station_names collection
station_names = db.station_names

### Add Chicago Neighborhoods

In [58]:
# Load the GeoJSON file containing Chicago neighborhoods
neighborhoods = gpd.read_file("location_data/chicago_neighborhoods.geojson")


def find_neighborhood(lon, lat):
    """Return the neighborhood whose polygon contains the given point.

    Args:
        lon: Longitude (x coordinate) of the point.
        lat: Latitude (y coordinate) of the point.

    Returns:
        The ``pri_neigh`` value of the first row of ``neighborhoods`` whose
        geometry contains the point, or ``None`` if no row contains it or if
        either coordinate is missing.
    """
    # A station without coordinates cannot be located on the map.
    if lon is None or lat is None:
        return None

    # Build the point once rather than once per polygon.
    # NOTE(review): assumes the coordinates use the same CRS as the GeoJSON
    # file (plain lon/lat) -- confirm against the data source.
    point = gpd.points_from_xy([lon], [lat])[0]

    # Vectorised containment test; row order is preserved, so the first hit
    # is the same row a top-to-bottom scan would return.
    hits = neighborhoods.loc[neighborhoods["geometry"].contains(point), "pri_neigh"]
    return hits.iloc[0] if not hits.empty else None

In [59]:
# Tag every station with the neighborhood that contains its coordinates.
# Only the coordinate fields are fetched (``_id`` is always returned), and the
# updates are queued and sent in a single bulk request instead of one round
# trip per station while the cursor is still being read.
neighborhood_updates = [
    pymongo.UpdateOne(
        {"_id": station["_id"]},
        {"$set": {"neighborhood": find_neighborhood(station.get("lng"), station.get("lat"))}},
    )
    for station in station_names.find({}, {"lng": 1, "lat": 1})
]

# bulk_write rejects an empty request list, so skip it for an empty collection.
if neighborhood_updates:
    station_names.bulk_write(neighborhood_updates)

### Add Chicago Wards

In [61]:
# Load the GeoJSON file containing Chicago wards
wards = gpd.read_file("location_data/chicago_wards.geojson")


def find_ward(lon, lat):
    """Return the ward whose polygon contains the given point.

    Args:
        lon: Longitude (x coordinate) of the point.
        lat: Latitude (y coordinate) of the point.

    Returns:
        The ``ward`` value of the first row of ``wards`` whose geometry
        contains the point, or ``None`` if no row contains it or if either
        coordinate is missing.
    """
    # A station without coordinates cannot be located on the map.
    if lon is None or lat is None:
        return None

    # Build the point once rather than once per polygon.
    # NOTE(review): assumes the coordinates use the same CRS as the GeoJSON
    # file (plain lon/lat) -- confirm against the data source.
    point = gpd.points_from_xy([lon], [lat])[0]

    # Vectorised containment test; row order is preserved, so the first hit
    # is the same row a top-to-bottom scan would return.
    hits = wards.loc[wards["geometry"].contains(point), "ward"]
    return hits.iloc[0] if not hits.empty else None

In [65]:
# Tag every station with the ward that contains its coordinates.
# Only the coordinate fields are fetched (``_id`` is always returned), and the
# updates are queued and sent in a single bulk request instead of one round
# trip per station while the cursor is still being read.
ward_updates = [
    pymongo.UpdateOne(
        {"_id": station["_id"]},
        {"$set": {"ward": find_ward(station.get("lng"), station.get("lat"))}},
    )
    for station in station_names.find({}, {"lng": 1, "lat": 1})
]

# bulk_write rejects an empty request list, so skip it for an empty collection.
if ward_updates:
    station_names.bulk_write(ward_updates)

### Add Chicago Community Areas

In [69]:
# Load the GeoJSON file containing Chicago community areas
community_area = gpd.read_file("location_data/chicago_community_area.geojson")


def find_community_area(lon, lat):
    """Return the community area whose polygon contains the given point.

    Args:
        lon: Longitude (x coordinate) of the point.
        lat: Latitude (y coordinate) of the point.

    Returns:
        The ``community`` value of the first row of ``community_area`` whose
        geometry contains the point, or ``None`` if no row contains it or if
        either coordinate is missing.
    """
    # A station without coordinates cannot be located on the map.
    if lon is None or lat is None:
        return None

    # Build the point once rather than once per polygon.
    # NOTE(review): assumes the coordinates use the same CRS as the GeoJSON
    # file (plain lon/lat) -- confirm against the data source.
    point = gpd.points_from_xy([lon], [lat])[0]

    # Vectorised containment test; row order is preserved, so the first hit
    # is the same row a top-to-bottom scan would return.
    hits = community_area.loc[community_area["geometry"].contains(point), "community"]
    return hits.iloc[0] if not hits.empty else None

In [70]:
# Tag every station with the community area that contains its coordinates.
# Only the coordinate fields are fetched (``_id`` is always returned), and the
# updates are queued and sent in a single bulk request instead of one round
# trip per station while the cursor is still being read.
community_area_updates = [
    pymongo.UpdateOne(
        {"_id": station["_id"]},
        {"$set": {"community_area": find_community_area(station.get("lng"), station.get("lat"))}},
    )
    for station in station_names.find({}, {"lng": 1, "lat": 1})
]

# bulk_write rejects an empty request list, so skip it for an empty collection.
if community_area_updates:
    station_names.bulk_write(community_area_updates)

### Summarize station_names collection

In [71]:
# Re-acquire a handle to the station_names collection
station_names = db.station_names

# An empty filter matches every document, so this is the total station count
document_count = station_names.count_documents({})

# Report the collection name and its size, one per line
print(
    f"Collection Name: {station_names.name}",
    f"Count of Documents: {document_count}",
    sep="\n",
)

# Pretty-print one document as a sample of the collection's contents
document = station_names.find_one()
pprint(document)

Collection Name: station_names
Count of Documents: 1674
{'_id': '10101 S Stony Island Ave',
 'community_area': 'SOUTH DEERING',
 'lat': 41.71,
 'lng': -87.58,
 'neighborhood': 'South Deering',
 'station_name': '10101 S Stony Island Ave',
 'ward': '10'}
