In [1]:
# Import dependencies and libraries
import os
from datetime import datetime
from pprint import pprint

import pandas as pd
import requests
from pymongo import MongoClient

In [2]:
# Open a client connection to MongoDB on port 27017 (host left at the driver default).
mongo = MongoClient(port=27017)

# Handle to the "crimes" database.
db = mongo.get_database("crimes")

# Handle to the "chicago" collection inside that database.
chicago = db.get_collection("chicago")

In [3]:
# Collect the data from the API.
# The Socrata app token is read from the SOCRATA_APP_TOKEN environment variable
# rather than being written into the notebook, so the credential is never
# committed or shared along with the code.
url = "https://data.cityofchicago.org/resource/ijzp-q8t2.json"
app_token = os.environ.get("SOCRATA_APP_TOKEN")

# The token is sent in the X-App-Token header. When no token is configured the
# request is sent without one (NOTE(review): anonymous requests are expected to
# be throttled more aggressively - confirm against the Socrata documentation).
headers = {"X-App-Token": app_token} if app_token else {}

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, headers=headers, timeout=30)

# Stop here on an HTTP error instead of parsing an error payload as crime records.
response.raise_for_status()
list_of_crimes = response.json()

In [4]:
# Pretty-print the first returned record to inspect the shape of the data.
first_position = 0
item_to_view = list_of_crimes[first_position]
pprint(item_to_view)

{':@computed_region_43wa_7qmu': '45',
 ':@computed_region_6mkv_f3dw': '4299',
 ':@computed_region_8hcu_yrd4': '37',
 ':@computed_region_awaf_s7ux': '4',
 ':@computed_region_bdys_3d7i': '562',
 ':@computed_region_d3ds_rm58': '67',
 ':@computed_region_d9mm_jgwp': '25',
 ':@computed_region_rpca_8um6': '5',
 ':@computed_region_vrxf_vc4k': '26',
 'arrest': False,
 'beat': '1524',
 'block': '008XX N LONG AVE',
 'case_number': 'JG417579',
 'community_area': '25',
 'date': '2023-09-09T00:00:00.000',
 'description': 'ATTEMPT - AUTOMOBILE',
 'district': '015',
 'domestic': False,
 'fbi_code': '07',
 'id': '13205083',
 'iucr': '0920',
 'latitude': '41.896083371',
 'location': {'human_address': '{"address": "", "city": "", "state": "", '
                               '"zip": ""}',
              'latitude': '41.896083371',
              'longitude': '-87.760679089'},
 'location_description': 'STREET',
 'longitude': '-87.760679089',
 'primary_type': 'MOTOR VEHICLE THEFT',
 'updated_on': '2023-09-16

In [5]:
# Empty the chicago collection before loading new data.
# The notebook may be run at irregular intervals, so documents left over from
# an earlier fetch are removed to guarantee a clean starting point.
match_everything = {}
chicago.delete_many(match_everything)

<pymongo.results.DeleteResult at 0x1987912f3c0>

In [6]:
# Loop through the list of crimes and only add those that are not currently in the collection.
for crime_to_add in list_of_crimes:
    # find_one returns None when no stored document has this "id";
    # None is compared by identity, not equality.
    if chicago.find_one({"id": crime_to_add["id"]}) is None:
        chicago.insert_one(crime_to_add)
    else:
        print("Crime is already in the database collection.")


In [7]:
# Delete every document that has no community_area value.
query = dict(community_area=None)
chicago.delete_many(query)

<pymongo.results.DeleteResult at 0x1987bd506c0>

In [8]:
# Delete every document that has no latitude value.
query = dict(latitude=None)
chicago.delete_many(query)

<pymongo.results.DeleteResult at 0x1987bd50240>

In [9]:
# Delete every document that has no longitude value.
query = dict(longitude=None)
chicago.delete_many(query)

<pymongo.results.DeleteResult at 0x1987bd2e3c0>

In [10]:
# Split each record's timestamp into a separate date string and time string.
source_format = "%Y-%m-%dT%H:%M:%S.%f"
for record in chicago.find():
    occurred_at = datetime.strptime(record["date"], source_format)
    derived_fields = {
        "occurance_date": occurred_at.strftime("%m/%d/%Y"),  # e.g. 09/09/2023
        "occurance_time": occurred_at.strftime("%H:%M"),  # 24-hour clock
    }
    # Store the two derived strings on the document matched by its "id".
    chicago.update_one({"id": record["id"]}, {"$set": derived_fields})

In [11]:
# The data may come through with multiple dates. Filter out all days, except what is predicted to be the latest full day.
#
# distinct() gives no ordering guarantee (a sort placed on a cursor is not
# applied to it), and "MM/DD/YYYY" strings do not sort chronologically as text
# across a year boundary, so the values are parsed and sorted oldest -> newest here.
available_dates = sorted(
    chicago.distinct("occurance_date"),
    key=lambda value: datetime.strptime(value, "%m/%d/%Y"),
)
if not available_dates:
    # Nothing to select from: fail with a clear message rather than an IndexError.
    raise ValueError("No occurance_date values found in the chicago collection.")

# With more than two dates present, the second-newest date is selected
# (presumably because the newest day may still be incomplete - TODO confirm);
# otherwise the newest available date is used.
if len(available_dates) > 2:
    selected_date = available_dates[-2]
else:
    selected_date = available_dates[-1]

# Remove every document that did not occur on the selected date.
query = {"occurance_date": {"$ne": selected_date}}
chicago.delete_many(query)

<pymongo.results.DeleteResult at 0x1987c3ff240>

In [12]:
# Prefix one-character community_area values with "0" (for example "5" becomes "05").
for record in chicago.find():
    area = record["community_area"]
    if len(area) != 1:
        # Longer values are left exactly as they are.
        continue
    padded_area = "0{}".format(area)
    chicago.update_one(
        {"id": record["id"]},
        {"$set": {"community_area": padded_area}},
    )

In [13]:
# Create categories for the times.
# Each entry pairs the two-digit hour prefixes of a six-hour window with the
# label stored for it. A time matching none of the prefixes falls into the
# evening group, as the final "else" branch of an if/elif chain would.
time_groups = (
    (("00", "01", "02", "03", "04", "05"), "A. Midnight - 5:59AM"),
    (("06", "07", "08", "09", "10", "11"), "B. 6AM - 11:59AM"),
    (("12", "13", "14", "15", "16", "17"), "C. Noon - 5:59PM"),
)
default_time_group = "D. 6:00PM - 11:59PM"

for document in chicago.find():
    time_string = document["occurance_time"]
    # str.startswith accepts a tuple, so one call tests a whole window.
    time_group = next(
        (label for prefixes, label in time_groups if time_string.startswith(prefixes)),
        default_time_group,
    )
    # One update per document, whichever group was selected.
    chicago.update_one(
        {"id": document["id"]},
        {"$set": {"time_group": time_group}},
    )

In [14]:
# Group by community_area with subgroups of primary_types. Save results as a new collection called chicago_community_by_crime.
pipeline = [
    # Count documents for every (community_area, primary_type) pair.
    {
        "$group":{
            "_id":{
                "community_area":"$community_area",
                "primary_type":"$primary_type"
            },
            "count":{"$sum":1}
        }
    },
    # Collect the per-type counts under their community_area.
    {
        "$group":{
            "_id": "$_id.community_area",
            "crime":{
                "$push":{
                    "type":"$_id.primary_type",
                    "count":"$count"
                }
            }
        }
    },
    # Unwind, sort by crime type, and regroup so each "crime" array is
    # rebuilt from documents ordered by type.
    {
        "$unwind": "$crime"
    },
    {
        "$sort": {"crime.type": 1}
    },
    {
        "$group": {
            "_id": "$_id",
            "crime": {"$push": "$crime"}
        }
    },
    # Order the community areas, then write the result to its own collection.
    {
        "$sort": {"_id":1}
    },
    {
        "$out":"chicago_community_by_crime"
    }
]

# A pipeline ending in $out writes its documents to the target collection and
# hands back an empty cursor, so it is run for its side effect and the saved
# documents are then read back from the new collection.
chicago.aggregate(pipeline)
chicago_community_by_crime = list(db["chicago_community_by_crime"].find().sort("_id", 1))

In [15]:
# Group by primary_types with subgroups of community_area. Save results as a new collection called chicago_crime_by_community.
pipeline = [
    # Count documents for every (primary_type, community_area) pair.
    {
        "$group":{
            "_id":{
                "primary_type":"$primary_type",
                "community_area":"$community_area",
            },
            "count":{"$sum":1}
        }
    },
    # Collect the per-community counts under their primary_type.
    {
        "$group":{
            "_id": "$_id.primary_type",
            "communities":{
                "$push":{
                    "community":"$_id.community_area",
                    "count":"$count"
                }
            }
        }
    },
    # Unwind, sort by community, and regroup so each "communities" array is
    # rebuilt from documents ordered by community.
    {
        "$unwind": "$communities"
    },
    {
        "$sort": {"communities.community": 1}
    },
    {
        "$group": {
            "_id": "$_id",
            "communities": {"$push": "$communities"}
        }
    },
    # Order the crime types, then write the result to its own collection.
    {
        "$sort": {"_id":1}
    },
    {
        "$out":"chicago_crime_by_community"
    }
]

# A pipeline ending in $out writes its documents to the target collection and
# hands back an empty cursor, so it is run for its side effect and the saved
# documents are then read back from the new collection.
chicago.aggregate(pipeline)
chicago_crime_by_community = list(db["chicago_crime_by_community"].find().sort("_id", 1))

In [16]:
# Group by primary_types with subgroups of time_group. Save results as a new collection called chicago_crime_by_time.
pipeline = [
    # Count documents for every (primary_type, time_group) pair.
    {
        "$group":{
            "_id":{
                "primary_type":"$primary_type",
                "time_group":"$time_group",
            },
            "count":{"$sum":1}
        }
    },
    # Collect the per-time-group counts under their primary_type.
    {
        "$group":{
            "_id": "$_id.primary_type",
            "times":{
                "$push":{
                    "time":"$_id.time_group",
                    "count":"$count"
                }
            },
        }
    },
    # Unwind, sort by time group, and regroup so each "times" array is
    # rebuilt from documents ordered by time group.
    {
        "$unwind": "$times"
    },
    {
        "$sort": {"times.time": 1}
    },
    {
        "$group": {
            "_id": "$_id",
            "times": {"$push": "$times"}
        }
    },
    # Order the crime types, then write the result to its own collection.
    {
        "$sort": {"_id":1}
    },
    {
        "$out":"chicago_crime_by_time"
    }
]

# A pipeline ending in $out writes its documents to the target collection and
# hands back an empty cursor, so it is run for its side effect and the saved
# documents are then read back from the new collection.
chicago.aggregate(pipeline)
chicago_crime_by_time = list(db["chicago_crime_by_time"].find().sort("_id", 1))