### Fetching Crime Data

#### Functions for Fetching Data

In [None]:
from typing import Optional, List
from sodapy import Socrata
from pathlib import Path
from time import sleep
import requests
import csv


def write_results(data: List[dict], filepath: str, number: int) -> Path:
    """
    Write a batch of records to a CSV file whose name carries `number`.

    The output path is derived from `filepath` by inserting an underscore and
    the comma-grouped `number` before the extension, e.g. ``data/crimes.csv``
    with ``number=5000`` becomes ``data/crimes_5,000.csv``. The parent
    directory is created if it does not exist.

    Args:
        data (List[dict]): Records to write; each dict maps a column name to
            its value. Records may omit keys, in which case the
            corresponding cell is left empty.
        filepath (str): Base path used to derive the output file path.
        number (int): Number embedded in the output file name.

    Returns:
        Path: Path of the file that was written.

    Raises:
        ValueError: If `data` is empty, since no header can be derived.
    """
    if not data:
        raise ValueError("No data to write.")

    base_path = Path(filepath)
    new_filename = f"{base_path.stem}_{number:,}{base_path.suffix}"
    new_filepath = base_path.parent / new_filename
    # Make sure a missing output directory does not lose an already fetched batch.
    new_filepath.parent.mkdir(parents=True, exist_ok=True)

    # Header is the union of all keys in first-seen order, so a record that
    # lacks some keys cannot shift its values under the wrong columns.
    fieldnames = list(dict.fromkeys(key for item in data for key in item))

    # newline="" lets the csv module control line endings (avoids blank rows
    # on platforms that translate "\n").
    with open(new_filepath, mode="w", encoding="utf-8", newline="") as outfile:
        writer = csv.DictWriter(outfile, fieldnames=fieldnames, restval="")
        writer.writeheader()
        writer.writerows(data)
    return new_filepath


def save_data_as_csv(
    dataset_id: str,
    app_token: str,
    limit: int = 10**5,
    offset: int = 0,
    sleep_t: int = 15,
    filepath: str = "fetched_data/save.csv",
    columns: Optional[List[str]] = None,
) -> List[dict]:
    """
    Fetch data from the City of Chicago API and save it in batches.

    Pages of `limit` records are requested one after another, starting at
    `offset`. After every 10 successful requests the accumulated records are
    written to a CSV file via `write_results`. Records still pending when the
    API returns an empty page, or when an unexpected error stops the loop,
    are written as a final, smaller batch so that nothing fetched is lost.

    Args:
        dataset_id (str): The dataset identifier.
        app_token (str): The application token for API access.
        limit (int): Number of records per page (default: 100,000).
        offset (int): Will fetch data starting from the defined offset index (default is 0).
        sleep_t (int): How long (in seconds) to sleep between page requests,
            and before retrying after a read timeout.
        filepath (str): Path to the file, will be appended with offset point number.
        columns (Optional[List[str]]): A list of columns to be included in data.

    Returns:
        List[dict]: All records that were fetched and written to disk.
    """
    domain = "data.cityofchicago.org"
    client = Socrata(domain=domain, app_token=app_token)

    # The API takes the selected columns as one comma-separated string.
    select = ", ".join(columns) if columns else None

    temp_data = []  # records fetched since the last file was written
    all_data = []  # records already written to disk
    break_counter = 0  # successful requests since the last file was written
    try:
        while True:
            try:
                response = client.get(
                    dataset_identifier=dataset_id,
                    limit=limit,
                    offset=offset,
                    content_type="json",
                    select=select,
                )
                if not response:
                    print("No new data returned.")
                    if temp_data:
                        # Flush the pages fetched since the last write;
                        # otherwise the tail of the dataset would be dropped.
                        new_filepath = write_results(
                            data=temp_data, filepath=filepath, number=offset
                        )
                        print(
                            f"Total of {offset:,} entries were fetched. Results saved in {new_filepath}"
                        )
                        all_data.extend(temp_data)
                        temp_data = []
                    break
                temp_data.extend(response)
                offset = offset + limit
                break_counter += 1
                print(f"The first {offset} entries successfully fetched.")
                if break_counter >= 10:
                    # Write data once 10 requests have been accumulated.
                    new_filepath = write_results(
                        data=temp_data, filepath=filepath, number=offset
                    )
                    print(
                        f"Total of {offset:,} entries were fetched. Results saved in {new_filepath}"
                    )
                    all_data.extend(temp_data)
                    temp_data = []
                    break_counter = 0
                sleep(sleep_t)
            except requests.exceptions.ReadTimeout:
                # Offset was not advanced, so the same page is requested again.
                print(f"Read timeout occured, will try after {sleep_t} seconds.")
                sleep(sleep_t)
                continue
            except Exception as e:
                if temp_data:
                    new_filepath = write_results(
                        data=temp_data, filepath=filepath, number=offset
                    )
                    # Saved records are part of the result as well.
                    all_data.extend(temp_data)
                    temp_data = []
                    print(f"Some fetched data saved in {new_filepath}.\n{e}")
                print(f"Error occured {e}")
                break
    finally:
        # Release the underlying HTTP session regardless of how the loop ended.
        client.close()
    return all_data

#### Query and Save Crime Data from Socrata Platform

To query and save crime data, start with `starting_offset` set to 0 and pick a reasonable `limit` (the number of records per request). The function keeps requesting pages until no more results are returned and writes a CSV file after every 10 requests (for reference, querying through 500 pages takes about 25 minutes). If a run is interrupted, run it again with `starting_offset` set to the offset shown in the name of the last saved file, and repeat until there are no more results.

*Querying all data without any filters takes a significant amount of time (more than 4 hours).*

In [None]:
from dotenv import load_dotenv
import os

# Read variables from a local .env file so the API token stays out of the notebook.
load_dotenv()
app_token = os.getenv("APP_TOKEN")

# Identifier of the dataset to query on the data portal.
dataset_id = "ijzp-q8t2"

# Record index at which this run starts fetching.
starting_offset = 9_000_000

# Only these fields are requested from the API.
columns = [
    "id", "case_number", "date", "block",
    "iucr", "primary_type", "description", "location_description",
    "arrest", "domestic",
    "beat", "district", "ward", "community_area",
    "fbi_code",
    "latitude", "longitude",
]

# Fetch 5,000 records per request, pausing 5 seconds between requests;
# batches are saved as data/crimes_<offset>.csv files.
results = save_data_as_csv(
    dataset_id=dataset_id,
    app_token=app_token,
    columns=columns,
    offset=starting_offset,
    limit=5_000,
    sleep_t=5,
    filepath="data/crimes.csv",
)

#### Combining Data

Combine the data that was saved across multiple CSV files into a single CSV file.

In [None]:
import os
import csv

# Reading all data files
combined_name = "crimes_combined.csv"

# The combined output shares the "crimes_" prefix; skip it so that rerunning
# this cell does not feed the previous output back in and duplicate rows.
batch_files = [
    filename
    for filename in os.listdir("data/")
    if filename.startswith("crimes_") and filename != combined_name
]
# os.listdir order is arbitrary. For names that differ only in their
# comma-grouped number, sorting by (length, name) gives ascending numeric order.
batch_files.sort(key=lambda filename: (len(filename), filename))

data = []
for filename in batch_files:
    path = os.path.join("data/", filename)
    # The batch files were written as UTF-8; newline="" is what the csv
    # module expects so that embedded newlines are parsed correctly.
    with open(path, "r", encoding="utf-8", newline="") as infile:
        data.extend(csv.reader(infile))

# Removing duplicated header rows (every batch file starts with one) and
# skipping empty rows, which would otherwise fail on row[0].
data_cleaned = [
    row
    for i, row in enumerate(data)
    if row and (i == 0 or row[0] != "id")
]

# Writing data
with open(
    os.path.join("data/", combined_name), "w", encoding="utf-8", newline=""
) as outfile:
    writer = csv.writer(outfile)
    writer.writerows(data_cleaned)