In [3]:
import pandas as pd
import requests
import os
from dateutil.relativedelta import relativedelta
from datetime import datetime as dt
from typing import Dict

# Show up to 30 columns when a DataFrame is rendered.
pd.options.display.max_columns = 30

# Relative path of the data directory (not referenced by the cells visible here).
data_out = '../data/'

In [2]:
def get_taxi_data(workingDateStart: str, workingDateEnd: str) -> Dict:
    """
    Retrieves taxi trips from the City of Chicago open-data endpoint.

    Trips are selected by ``trip_start_timestamp``, from 00:00:00 on the start
    date through 23:59:59 on the end date (both bounds inclusive).

    Args:
        workingDateStart (str): Start date in 'YYYY-MM-DD' format.
        workingDateEnd (str): End date in 'YYYY-MM-DD' format.

    Returns:
        Dict: The decoded JSON body of the response. Callers in this notebook
            treat it as a list of trip records (they index and iterate it).

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status, instead of handing an error payload back as if it were data.
        requests.exceptions.Timeout: If the server does not respond in time.
    """
    taxi_url = "https://data.cityofchicago.org/resource/ajtu-isnz.json"

    # Pass the query as params so requests performs the URL encoding
    # (the filter contains spaces, quotes and comparison operators).
    params = {
        "$where": (
            f"trip_start_timestamp >= '{workingDateStart}T00:00:00' "
            f"AND trip_start_timestamp <= '{workingDateEnd}T23:59:59'"
        ),
        # Very large row limit; presumably meant as "no practical cap" - TODO confirm.
        "$limit": 213000000,
    }

    # Without a timeout a stalled connection would hang the notebook indefinitely.
    response_taxi = requests.get(taxi_url, params=params, timeout=120)
    response_taxi.raise_for_status()
    return response_taxi.json()

In [3]:
def get_weather_data(weather_start_date: str, weather_end_date: str) -> Dict:
    """
    Retrieves hourly weather data from the Open Meteo archive API.

    The location is fixed at latitude 41.85, longitude -87.65 (presumably
    Chicago, matching the taxi dataset). The hourly variables requested are
    temperature_2m, wind_speed_10m, precipitation and rain.

    Args:
        weather_start_date (str): The start date for the weather data in 'YYYY-MM-DD' format.
        weather_end_date (str): The end date for the weather data in 'YYYY-MM-DD' format.

    Returns:
        Dict: The decoded JSON body of the response.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status, instead of returning an error payload as if it were data.
        requests.exceptions.Timeout: If the server does not respond in time.

    Example:
        >>> data = get_weather_data('2024-05-01', '2024-05-05')
    """
    weather_url = "https://archive-api.open-meteo.com/v1/era5"
    params = {
        "latitude": 41.85,
        "longitude": -87.65,
        "start_date": weather_start_date,
        "end_date": weather_end_date,
        "hourly": "temperature_2m,wind_speed_10m,precipitation,rain",
    }

    # Without a timeout a stalled connection would hang the notebook indefinitely.
    response = requests.get(weather_url, params=params, timeout=60)
    # Fail here on an HTTP error rather than later on a missing 'hourly' key.
    response.raise_for_status()
    return response.json()

In [4]:
def get_minmax_timestamp(taxi_data: Dict) -> Dict:
    """
    Finds the earliest and latest trip start dates in a batch of taxi records.

    Args:
        taxi_data (Dict): A non-empty sequence of trip records. Each record must
            have a 'trip_start_timestamp' key holding an ISO-format timestamp
            string (anything ``datetime.fromisoformat`` accepts).

    Returns:
        Dict: ``{"start": ..., "end": ...}`` where the values are the minimum and
            maximum 'trip_start_timestamp' found, formatted as 'YYYY-MM-DD' strings.

    Raises:
        ValueError: If ``taxi_data`` is empty, since no date range can be derived.
        KeyError: If a record has no 'trip_start_timestamp' key.
    """
    if not taxi_data:
        raise ValueError("taxi_data is empty; cannot determine a timestamp range")

    # Parse every timestamp exactly once, then let min/max pick the bounds.
    timestamps = [dt.fromisoformat(item['trip_start_timestamp']) for item in taxi_data]

    return {
        "start": min(timestamps).strftime("%Y-%m-%d"),
        "end": max(timestamps).strftime("%Y-%m-%d"),
    }

In [5]:
# Take one "now" snapshot so every derived date comes from the same instant;
# separate dt.now() calls could straddle midnight and disagree with each other.
run_time = dt.now()

# One-month window ending today (not referenced by the cells visible here).
endDate = run_time.strftime("%Y-%m-%d")
startDate = (run_time - relativedelta(months=1)).strftime("%Y-%m-%d")

# Working window: the day two months ago and the day after it.
# Keep the datetime under its own name so workingDateStart is only ever a string.
working_start = run_time - relativedelta(months=2)
workingDateEnd = (working_start + relativedelta(days=1)).strftime("%Y-%m-%d")
workingDateStart = working_start.strftime("%Y-%m-%d")

# Storage keys for the raw payloads, named after the working start date
# (presumably object-storage keys within `bucket_name` - TODO confirm).
weather_bucket_key = f'raw-data/to-process/weather-data/weather_raw{workingDateStart}.json'
taxi_bucket_key = f"raw-data/to-process/taxi-data/taxi_raw{workingDateStart}.json"
bucket_name = 'chicago-taxi'

# Download the taxi trips for the working window and load them into a DataFrame.
taxi_data_api_call = get_taxi_data(workingDateStart, workingDateEnd)
taxi_df = pd.DataFrame(taxi_data_api_call)

# Determine the weather start and end dates from the trips actually returned.
# Compute the range once: each get_minmax_timestamp call parses every record.
taxi_date_range = get_minmax_timestamp(taxi_data_api_call)

weather_data_api_call = get_weather_data(
    taxi_date_range['start'],
    taxi_date_range['end']
)

# Output column name -> key of the matching series in the API's 'hourly' section.
hourly_column_sources = {
    'datetime': 'time',
    'temperature': 'temperature_2m',
    'wind': 'wind_speed_10m',
    'precipitation': 'precipitation',
    'rain': 'rain',
}

# Keep only the hourly series of interest, renamed to the shorter column names.
hourly_weather = weather_data_api_call['hourly']
weather_filtered = {
    column: hourly_weather[source_key]
    for column, source_key in hourly_column_sources.items()
}

weather_df = pd.DataFrame(weather_filtered)


In [7]:
# Sanity check: the latest hourly timestamp present in the weather frame.
# The 'datetime' column holds the API's 'time' values; the recorded output is an
# ISO-style string, so this is presumably a string comparison - TODO confirm dtype.
weather_df['datetime'].max()

'2024-03-06T23:00'

In [5]:
# NOTE(review): this import failed with ModuleNotFoundError in the recorded run, and
# OpenAI is not used in any cell visible here. Either install the package and move
# the import to the top import cell, or remove this cell.
from openai import OpenAI

ModuleNotFoundError: No module named 'openai'

ModuleNotFoundError: No module named 'openai'