In [None]:
from datetime import datetime
from dateutil.relativedelta import relativedelta
import json
import os

import pandas as pd
pd.set_option("display.max_columns", 30)

import requests


In [None]:
# Reference date of this run as YYYY-MM-DD (the zero-month offset leaves "now" unchanged).
current_datetime = datetime.now() - relativedelta(months=0)
formatted_datetime = current_datetime.strftime("%Y-%m-%d")

# JSON endpoint on the City of Chicago data portal that the taxi trips are read from.
url = "https://data.cityofchicago.org/resource/ajtu-isnz.json"

# The API token is optional: send the header only when the environment provides one.
app_token = os.environ.get("CHICAGO_API_TOKEN")
headers = {"X-App-Token": app_token} if app_token else {}

# Fetch the data. The timeout keeps the cell from hanging on a stalled connection,
# and raise_for_status() stops here on an HTTP error instead of passing an error
# payload on to the DataFrame construction.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()

# Decode the JSON body.
data = response.json()

In [None]:
# Load the decoded API response into a DataFrame.
taxi_trips = pd.DataFrame(data)

In [None]:
# Columns that are not needed downstream; only those actually present are dropped.
columns_to_drop = [
    "pickup_census_tract",
    "dropoff_census_tract",
    "pickup_centroid_location",
    "dropoff_centroid_location",
]
existing_columns = list(taxi_trips.columns.intersection(columns_to_drop))

# Drop the unneeded columns, remove rows with missing values, and give the
# community-area columns their *_id names (rename skips names that are absent).
taxi_trips = (
    taxi_trips
    .drop(columns=existing_columns)
    .dropna()
    .rename(columns={
        "pickup_community_area": "pickup_community_area_id",
        "dropoff_community_area": "dropoff_community_area_id",
    })
)

# New column: trip start floored to the hour (join key for the weather data).
if "trip_start_timestamp" in taxi_trips.columns:
    taxi_trips = taxi_trips.assign(
        datetime_for_weather=pd.to_datetime(
            taxi_trips["trip_start_timestamp"]
        ).dt.floor("h")
    )

In [None]:
# Preview the first rows of the cleaned trips.
taxi_trips.head()

In [None]:
def taxi_trips_transformations(taxi_trips: pd.DataFrame) -> pd.DataFrame:
    """
    Perform transformations with the taxi data.

    The input frame is left untouched; a new DataFrame is returned.

    Parameters
    ----------
    taxi_trips : pd.DataFrame
        The DataFrame holding the daily taxi trips. It must contain the
        ``trip_start_timestamp`` column.

    Returns
    -------
    pd.DataFrame
        The cleaned, transformed DataFrame holding the daily taxi trips.

    Raises
    ------
    KeyError
        If ``trip_start_timestamp`` is missing from ``taxi_trips``.
    """
    unused_columns = [
        "pickup_census_tract",
        "dropoff_census_tract",
        "pickup_centroid_location",
        "dropoff_centroid_location",
    ]

    cleaned = (
        taxi_trips
        # errors="ignore": columns that are not present are simply skipped, so the
        # function also works on a frame that has already been cleaned once.
        .drop(columns=unused_columns, errors="ignore")
        # Remove every row that has a missing value in any remaining column.
        .dropna()
        .rename(columns={
            "pickup_community_area": "pickup_community_area_id",
            "dropoff_community_area": "dropoff_community_area_id",
        })
    )

    # New column: trip start floored to the hour. Unparseable timestamps become NaT.
    return cleaned.assign(
        datetime_for_weather=pd.to_datetime(
            cleaned["trip_start_timestamp"], errors="coerce"
        ).dt.floor("h")
    )

In [None]:
# A bare last expression gets the notebook's rich table rendering; print() would
# fall back to plain text.
taxi_trips.head()

In [None]:
# Column dtypes of the trips, shown as the cell's output value.
taxi_trips.dtypes

In [None]:
def taxi_trips_transformations(taxi_trips: pd.DataFrame) -> pd.DataFrame:
    """
    Perform transformations with the taxi data.

    The input frame is left untouched; a new DataFrame is returned.

    Parameters
    ----------
    taxi_trips : pd.DataFrame
        The DataFrame holding the daily taxi trips. It must contain the
        ``trip_start_timestamp`` column.

    Returns
    -------
    pd.DataFrame
        The cleaned, transformed DataFrame holding the daily taxi trips.

    Raises
    ------
    TypeError
        If ``taxi_trips`` is not a pandas DataFrame.
    KeyError
        If ``trip_start_timestamp`` is missing from ``taxi_trips``.
    """
    if not isinstance(taxi_trips, pd.DataFrame):
        raise TypeError("taxi_trips is not a valid pandas DataFrame.")

    unused_columns = [
        "pickup_census_tract",
        "dropoff_census_tract",
        "pickup_centroid_location",
        "dropoff_centroid_location",
    ]

    cleaned = (
        taxi_trips
        # errors="ignore": columns that are not present are simply skipped, so the
        # function also works on a frame that has already been cleaned once.
        .drop(columns=unused_columns, errors="ignore")
        # Remove every row that has a missing value in any remaining column.
        .dropna()
        .rename(columns={
            "pickup_community_area": "pickup_community_area_id",
            "dropoff_community_area": "dropoff_community_area_id",
        })
    )

    # New column: trip start floored to the hour. Unparseable timestamps raise.
    return cleaned.assign(
        datetime_for_weather=pd.to_datetime(
            cleaned["trip_start_timestamp"]
        ).dt.floor("h")
    )

In [None]:
# Company master: one row per distinct company in the trips, with ids counted from 1.
distinct_companies = taxi_trips["company"].drop_duplicates().reset_index(drop=True)
company_ids = range(1, len(distinct_companies) + 1)

company_master = pd.DataFrame({"company_id": company_ids, "company": distinct_companies})

company_master.tail()


In [None]:
# Sample incoming company names used to try out the master-update logic.
new_company_data = [
    {"company": company_name}
    for company_name in ("6574 - Babylon Express Inc.", "X", "Y")
]

new_company_mapping = pd.DataFrame(new_company_data)

new_company_mapping


In [None]:
# Print every incoming company that the master does not contain yet.
known_companies = company_master["company"].values
for company in new_company_mapping["company"].values:
    if company in known_companies:
        continue
    print(company)


In [None]:
# Highest id currently in the master; ids for new companies continue after it.
company_max_id = company_master["company_id"].max()
company_max_id


In [None]:
# Collect the incoming companies that are missing from the master.
known_companies = company_master["company"].values
new_companies_list = [
    company
    for company in new_company_mapping["company"].values
    if company not in known_companies
]




In [None]:
# Number the new companies consecutively, starting right after the current maximum id.
new_company_ids = range(
    company_max_id + 1,
    company_max_id + 1 + len(new_companies_list),
)
new_companies_df = pd.DataFrame(
    {"company_id": new_company_ids, "company": new_companies_list}
)
new_companies_df

In [None]:
# Append the new companies to the existing master and renumber the row index.
updated_company_master = pd.concat([company_master, new_companies_df], ignore_index=True)

# The appended rows show up at the end.
updated_company_master.tail()


In [None]:
def update_company_master(taxi_trips: pd.DataFrame, company_master: pd.DataFrame) -> pd.DataFrame:
    """
    Extend the company master with the companies that appear only in the taxi data.

    Each new company is added exactly once, however many trips mention it, and gets
    the next free id after the current maximum. Neither input is modified.

    Parameters
    ----------
    taxi_trips : pd.DataFrame
        DataFrame holding the daily taxi trips; only its ``company`` column is read.
    company_master : pd.DataFrame
        DataFrame holding the company master data (``company_id``, ``company``).

    Returns
    -------
    pd.DataFrame
        A new DataFrame with the master rows followed by one row per new company,
        in order of first appearance in ``taxi_trips``.
    """
    # An empty master has no maximum id, so numbering starts from 1.
    company_max_id = 0 if company_master.empty else int(company_master["company_id"].max())

    # Distinct companies of the trips, in first-seen order, that the master lacks.
    # Deduplicating first prevents one company from receiving several ids.
    trip_companies = taxi_trips["company"].drop_duplicates()
    new_companies_list = trip_companies[
        ~trip_companies.isin(company_master["company"])
    ].tolist()

    if not new_companies_list:
        # Nothing to add: still hand back a fresh frame with a clean row index.
        return company_master.reset_index(drop=True)

    new_companies_df = pd.DataFrame({
        "company_id": range(company_max_id + 1, company_max_id + len(new_companies_list) + 1),
        "company": new_companies_list
    })

    return pd.concat([company_master, new_companies_df], ignore_index=True)


In [None]:
# Run update_company_master on the trips frame the master was built from.
test_df = update_company_master(taxi_trips=taxi_trips, company_master=company_master)
test_df


In [None]:
# Small hand-made trips frame holding only the company columns, used as test input.
taxi_trips_company_only = pd.DataFrame(
    [
        (1, "6574 - Babylon Express Inc."),
        (2, "X"),
        (3, "Y"),
    ],
    columns=["company_id", "company"],
)

taxi_trips_company_only


In [None]:
# taxi_trips_company_only DataFrame létrehozása
# Create the taxi_trips_company_only DataFrame: three sample rows of id and company name.
taxi_trips_company_only = pd.DataFrame(
    [
        (1, "6574 - Babylon Express Inc."),
        (2, "X"),
        (3, "Y"),
    ],
    columns=["company_id", "company"],
)

taxi_trips_company_only


In [None]:
# Run update_company_master on the sample companies.
updated_company_master = update_company_master(
    taxi_trips=taxi_trips_company_only,
    company_master=company_master
)

# New companies are appended to the end of the master.
updated_company_master.tail()


In [None]:
# Payment type master: one row per distinct payment type in the trips, ids counted from 1.
distinct_payment_types = taxi_trips["payment_type"].drop_duplicates().reset_index(drop=True)
payment_type_ids = range(1, len(distinct_payment_types) + 1)

payment_type_master = pd.DataFrame(
    {"payment_type_id": payment_type_ids, "payment_type": distinct_payment_types}
)

# Small hand-made trips frame holding only the payment type columns, used as test input.
taxi_trips_payment_type_only = pd.DataFrame(
    [
        (1, "Credit Card"),
        (2, "X"),
        (3, "Y"),
    ],
    columns=["payment_type_id", "payment_type"],
)

taxi_trips_payment_type_only


In [None]:
def update_payment_type_master(taxi_trips: pd.DataFrame, payment_type_master: pd.DataFrame) -> pd.DataFrame:
    """
    Extend the payment type master with new payment types if there are new payment types.

    Each new payment type is added exactly once, however many trips use it, and gets
    the next free id after the current maximum. Neither input is modified.

    Parameters
    ----------
    taxi_trips : pd.DataFrame
        DataFrame holding the daily taxi trips; only its ``payment_type`` column is read.
    payment_type_master : pd.DataFrame
        DataFrame holding the payment_type_master data
        (``payment_type_id``, ``payment_type``).

    Returns
    -------
    pd.DataFrame
        A new DataFrame with the master rows followed by one row per new payment
        type, in order of first appearance in ``taxi_trips``.
    """
    # Highest existing payment_type_id; an empty master starts numbering from 1.
    payment_type_max_id = (
        0 if payment_type_master.empty
        else int(payment_type_master["payment_type_id"].max())
    )

    # Distinct payment types of the trips, in first-seen order, missing from the master.
    # Deduplicating first prevents one payment type from receiving several ids.
    trip_payment_types = taxi_trips["payment_type"].drop_duplicates()
    new_payment_types_list = trip_payment_types[
        ~trip_payment_types.isin(payment_type_master["payment_type"])
    ].tolist()

    if not new_payment_types_list:
        # Nothing to add: still hand back a fresh frame with a clean row index.
        return payment_type_master.reset_index(drop=True)

    # Frame for the new payment types, numbered after the current maximum.
    new_payment_type_df = pd.DataFrame({
        "payment_type_id": range(payment_type_max_id + 1, payment_type_max_id + len(new_payment_types_list) + 1),
        "payment_type": new_payment_types_list
    })

    # Append to the existing master.
    return pd.concat([payment_type_master, new_payment_type_df], ignore_index=True)


In [None]:
# Run update_payment_type_master on the sample payment types.
updated_payment_type_master = update_payment_type_master(
    taxi_trips=taxi_trips_payment_type_only,
    payment_type_master=payment_type_master
)

In [None]:
# A bare last expression gets the notebook's rich table rendering; print() would
# fall back to plain text.
updated_payment_type_master

In [None]:
def update_master(taxi_trips: pd.DataFrame, master: pd.DataFrame, id_column: str, value_column: str) -> pd.DataFrame:
    """
    Extend the master DataFrame with new values if there are any.

    Each new value is added exactly once, however many trips contain it, and gets
    the next free id after the current maximum. Neither input is modified.

    Parameters
    ----------
    taxi_trips : pd.DataFrame
        DataFrame holding the daily taxi trips; only ``value_column`` is read.
    master : pd.DataFrame
        DataFrame holding the master data.
    id_column : str
        The id column of the master DataFrame.
    value_column : str
        The value column to compare and extend; present in both DataFrames.

    Returns
    -------
    pd.DataFrame
        A new DataFrame with the master rows followed by one row per new value,
        in order of first appearance in ``taxi_trips``.
    """
    # An empty master has no maximum id, so numbering starts from 1.
    max_id = 0 if master.empty else int(master[id_column].max())

    # Distinct values of the trips, in first-seen order, that the master lacks.
    # Deduplicating first prevents one value from receiving several ids.
    trip_values = taxi_trips[value_column].drop_duplicates()
    new_values_list = trip_values[~trip_values.isin(master[value_column])].tolist()

    if not new_values_list:
        # Nothing to add: still hand back a fresh frame with a clean row index.
        return master.reset_index(drop=True)

    new_values_df = pd.DataFrame({
        id_column: range(max_id + 1, max_id + len(new_values_list) + 1),
        value_column: new_values_list
    })

    return pd.concat([master, new_values_df], ignore_index=True)


In [None]:
# Run the generic update_master for payment types on the full trips frame.
test_payment_type_master = update_master(
    taxi_trips=taxi_trips,
    master=payment_type_master,
    id_column="payment_type_id",
    value_column="payment_type"
)


In [None]:
# Display the payment type master returned by update_master.
test_payment_type_master

In [None]:
# Run the generic update_master for companies on the sample company frame.
test_company_master = update_master(
    taxi_trips=taxi_trips_company_only,
    master=company_master,
    id_column="company_id",
    value_column="company"
)

# Inspect the last rows of the result.
test_company_master.tail()


In [None]:
def update_taxi_trips_with_master_data(
    taxi_trips: pd.DataFrame, 
    payment_type_master: pd.DataFrame, 
    company_master: pd.DataFrame
) -> pd.DataFrame:
    """
    Replace the payment type and company names of the trips with their master ids.

    The trips are merged with the payment type master on ``payment_type`` and then
    with the company master on ``company``, both with ``DataFrame.merge``'s default
    join type; afterwards the two string columns are removed.

    Parameters
    ----------
    taxi_trips : pd.DataFrame
        The DataFrame with the daily taxi trips.
    payment_type_master : pd.DataFrame
        The payment type master table.
    company_master : pd.DataFrame
        The company master table.

    Returns
    -------
    pd.DataFrame
        The taxi trips data, with only payment_type_id and company_id, no string columns.
    """
    # Look up payment_type_id first, then company_id, by matching on the name columns.
    with_payment_type_id = taxi_trips.merge(payment_type_master, on="payment_type")
    with_both_ids = with_payment_type_id.merge(company_master, on="company")

    # The ids now carry the information, so the name columns can go.
    return with_both_ids.drop(columns=["payment_type", "company"])

In [None]:
# Swap the payment type and company names of the trips for their master ids.
taxi_trips_id = update_taxi_trips_with_master_data(
    taxi_trips=taxi_trips,
    payment_type_master=payment_type_master,
    company_master=company_master
)

# Show up to five random rows. Capping at the frame length avoids the ValueError
# that sample(5) raises when fewer than five trips are left.
taxi_trips_id.sample(min(5, len(taxi_trips_id)))


In [None]:
def transform_weather_data(weather_data: dict) -> pd.DataFrame:
    """
    Make transformations on the daily weather API response.

    Parameters
    ----------
    weather_data : dict
        The decoded JSON response of the weather API. Its ``"hourly"`` entry must
        hold equally long ``time``, ``temperature_2m``, ``wind_speed_10m``,
        ``rain`` and ``precipitation`` sequences.

    Returns
    -------
    pd.DataFrame
        One row per hourly reading with the columns ``datetime`` (parsed to
        datetimes), ``temperature``, ``wind_speed``, ``rain`` and ``precipitation``.

    Raises
    ------
    KeyError
        If ``"hourly"`` or one of the required hourly series is missing.
    """
    hourly = weather_data["hourly"]

    # Output column name -> name of the hourly series in the API response.
    column_sources = {
        "datetime": "time",
        "temperature": "temperature_2m",
        "wind_speed": "wind_speed_10m",
        "rain": "rain",
        "precipitation": "precipitation",
    }

    weather_df = pd.DataFrame(
        {column: hourly[source] for column, source in column_sources.items()}
    )

    # The API delivers the timestamps as strings; parse them for later joins.
    weather_df["datetime"] = pd.to_datetime(weather_df["datetime"])

    return weather_df


In [None]:
# Test: fetch one day of hourly weather and run it through transform_weather_data.

# The requested day is the date two months before today.
current_datetime = datetime.now() - relativedelta(months=2)
formatted_datetime = current_datetime.strftime("%Y-%m-%d")

url = "https://archive-api.open-meteo.com/v1/era5"

params = {
    # NOTE(review): presumably the coordinates of Chicago — confirm.
    "latitude": 41.85,
    "longitude": -87.65,
    "start_date": formatted_datetime,
    "end_date": formatted_datetime,
    "hourly": "temperature_2m,wind_speed_10m,rain,precipitation"
}

# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of failing later
# inside transform_weather_data.
response = requests.get(url, params=params, timeout=30)
response.raise_for_status()

weather_data = response.json()

weather_data_df = transform_weather_data(weather_data)

weather_data_df.head()
