In [1]:
# Standard-library modules
import json
import os
import sqlite3

# Third-party packages
import matplotlib.pyplot as plt
import pandas as pd
import requests
from IPython.core.interactiveshell import InteractiveShell

# Keep this setting in every notebook: it lets a single cell display
# output for more than one expression instead of only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Confirm that the setup cell ran to completion
print("Libraries imported successfully!")

Libraries imported successfully!


In [2]:
# Show where the kernel was started
print(os.getcwd())

# Preferred project folder. Later cells build file paths from `directory`,
# so it must always end up naming a folder that really exists.
directory = "/Users/leisha/Documents/DAEN 328"

if os.path.isdir(directory):
    os.chdir(directory)
else:
    # On another machine the folder above is missing and os.chdir would raise
    # FileNotFoundError; keep the notebook runnable by using the current folder.
    print(f"Directory not found: {directory}. Using the current working directory instead.")
    directory = os.getcwd()

# Verify the working directory now in effect
print(os.getcwd())

/Users/leisha/Documents/DAEN 328
/Users/leisha/Documents/DAEN 328


In [3]:
def fetch_api_data(api_url, output_file, batch_size=1000, num_records=None):
    """
    Download records from a paginated API and checkpoint them to a JSON file.

    Pages are requested with the ``$limit`` and ``$offset`` query parameters.
    After every page the complete list of records collected so far is written
    to ``output_file``, so an interrupted download can resume from that file
    the next time the function is called with the same path.

    Parameters:
    - api_url (str): The base URL of the API. It may already contain a query
      string; the paging parameters are merged into it.
    - output_file (str): Path to the JSON file used as the checkpoint.
    - batch_size (int): Number of records to request per call (default: 1000).
    - num_records (int or None): Maximum number of records to hold. The last
      request is shrunk so this limit is not overshot. If None, fetch until
      the API runs out of records.

    Returns:
    - list: Every record loaded from the checkpoint plus every record fetched.
      Records already in the checkpoint are never discarded, even if there
      are more of them than ``num_records``.

    Network and HTTP errors are not raised: they are printed, the loop stops,
    and whatever was collected up to that point is returned.

    NOTE(review): no sort order is requested, so stable paging depends on the
    server returning rows in a consistent order between calls - confirm for
    the API in use.
    """
    request_timeout = 60  # seconds; without a timeout a stalled connection hangs forever

    # Load previously saved records, if any, so the download can resume.
    all_data = []
    if os.path.exists(output_file):
        with open(output_file, "r", encoding="utf-8") as f:
            try:
                saved = json.load(f)
            except (json.JSONDecodeError, UnicodeDecodeError):
                saved = None
        # Only a JSON list can be extended with further records.
        if isinstance(saved, list):
            all_data = saved
            print(f"Resuming from {len(all_data)} records in {output_file}.")
        else:
            print(f"{output_file} is corrupted or empty. Starting fresh.")

    # Records already on disk are skipped by starting the offset after them.
    offset = len(all_data)
    print(f"Starting from offset {offset}...")

    while True:
        # Decide how many records this request may ask for.
        if num_records is None:
            limit = batch_size
        else:
            remaining = num_records - len(all_data)
            if remaining <= 0:
                print(f"Reached the specified number of records: {num_records}.")
                break
            limit = min(batch_size, remaining)

        print(f"Fetching records starting at offset {offset}...")

        # Fetch one page. Passing params lets requests do the URL encoding and
        # merge with any query string already present in api_url.
        try:
            response = requests.get(
                api_url,
                params={"$limit": limit, "$offset": offset},
                timeout=request_timeout,
            )
            response.raise_for_status()
            batch_data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers an undecodable JSON body on older requests versions.
            print(f"Error fetching data: {e}")
            break

        # Stop if no more data is returned
        if not batch_data:
            print("No more data to fetch.")
            break

        # Anything but a list (for example an error object) cannot be appended.
        if not isinstance(batch_data, list):
            print(f"Error fetching data: expected a JSON list, got {type(batch_data).__name__}.")
            break

        all_data.extend(batch_data)

        # Write to a temporary file and swap it in, so an interruption during
        # the write cannot leave a half-written checkpoint behind.
        temp_file = f"{output_file}.tmp"
        with open(temp_file, "w", encoding="utf-8") as f:
            json.dump(all_data, f, indent=2)
        os.replace(temp_file, output_file)
        print(f"Appended {len(batch_data)} records. Total records saved: {len(all_data)}")

        # The next page starts right after everything collected so far.
        offset = len(all_data)

        # A page shorter than requested means the dataset is exhausted.
        if len(batch_data) < limit:
            print("Reached the end of the dataset.")
            break

    print(f"Fetched a total of {len(all_data)} records. Data saved to {output_file}.")
    return all_data


In [9]:
# API URL for the NYC Open Data resource 4b4i-vvec. The records it returns
# carry taxi-trip fields such as vendorid and tpep_pickup_datetime.
api_url = "https://data.cityofnewyork.us/resource/4b4i-vvec.json"

# File that stores the downloaded JSON. It lives in `directory`, which is set
# in the working-directory cell; adjust that path for your own machine.
json_file_path = os.path.join(directory, "api_data.json")

# Remove any earlier download so this run starts from offset 0 instead of
# resuming from the records already in the file.
if os.path.exists(json_file_path):
    os.remove(json_file_path)
    print(f"Deleted existing file: {json_file_path}")

# Fetch the data: at most 80,000 records, 1,000 per request
api_data = fetch_api_data(api_url = api_url, output_file = json_file_path, batch_size = 1000, num_records = 80000)

# Verify the total number of records fetched
print(f"Total records fetched: {len(api_data)}")

# Display a sample of the data to inspect
if api_data:
    print("Sample data (first 5 records):")
    print(json.dumps(api_data[:5], indent=2))

Deleted existing file: /Users/leisha/Documents/DAEN 328/api_data.json
Starting from offset 0...
Fetching records starting at offset 0...
Appended 1000 records. Total records saved: 1000
Fetching records starting at offset 1000...
Appended 1000 records. Total records saved: 2000
Fetching records starting at offset 2000...
Appended 1000 records. Total records saved: 3000
Fetching records starting at offset 3000...
Appended 1000 records. Total records saved: 4000
Fetching records starting at offset 4000...
Appended 1000 records. Total records saved: 5000
Fetching records starting at offset 5000...
Appended 1000 records. Total records saved: 6000
Fetching records starting at offset 6000...
Appended 1000 records. Total records saved: 7000
Fetching records starting at offset 7000...
Appended 1000 records. Total records saved: 8000
Fetching records starting at offset 8000...
Appended 1000 records. Total records saved: 9000
Fetching records starting at offset 9000...
Appended 1000 records. Tot

In [10]:
# Read the downloaded JSON file into a DataFrame
df = pd.read_json(json_file_path)

# Display the first few rows to inspect the structure
print("Sample DataFrame (first 5 rows):")
print(df.head())

# Display information about the DataFrame's structure and data types.
# info() prints its report itself and returns None, so it is not wrapped
# in print(), which would add a stray "None" line to the output.
print("\nDataFrame Info:")
df.info()

# Display summary statistics for numeric columns
print("\nSummary Statistics for Numeric Columns:")
print(df.describe())

   vendorid     tpep_pickup_datetime    tpep_dropoff_datetime  \
0         2  2023-01-01T00:32:10.000  2023-01-01T00:40:36.000   
1         2  2023-01-01T00:55:08.000  2023-01-01T01:01:27.000   
2         2  2023-01-01T00:25:04.000  2023-01-01T00:37:49.000   
3         1  2023-01-01T00:03:48.000  2023-01-01T00:13:25.000   
4         2  2023-01-01T00:10:29.000  2023-01-01T00:21:19.000   

   passenger_count  trip_distance  ratecodeid store_and_fwd_flag  \
0                1           0.97           1                  N   
1                1           1.10           1                  N   
2                1           2.51           1                  N   
3                0           1.90           1                  N   
4                1           1.43           1                  N   

   pulocationid  dolocationid  payment_type  fare_amount  extra  mta_tax  \
0           161           141             2          9.3   1.00      0.5   
1            43           237             1     