In [58]:
from datetime import datetime
from dateutil.relativedelta import relativedelta
import json
import os

import pandas as pd
pd.set_option("display.max_columns", 30)

import requests


### A képen a következő lépések szerepelnek egy ETL-pipeline feladatlistájaként:

Get the data from S3 – töltsd le a nyers adatokat az S3-ból.

Weather data transformations – alakítsd át, tisztítsd és normalizáld az időjárás adatokat.

Taxi trips transformations – végezd el a taxi adatok átalakítását és tisztítását.

Update payment_type_master – frissítsd a payment_type_master táblát (egyedi payment típusok listája).

Update company_master – frissítsd a company_master táblát (egyedi taxi cégek listája).

Update taxi_trips with company and payment_type ids – cseréld le a taxi trip adatokban a szöveges értékeket (pl. Cash, Flash Cab) a legfrissebb master táblákból származó ID-kra.

Upload weather data to S3 – töltsd vissza a feldolgozott időjárás adatokat az S3-ba.

Upload taxi data to S3 – töltsd vissza a feldolgozott taxi adatokat az S3-ba.

Upload the newest payment_type_master and company_master – töltsd vissza a frissített payment_type_master és company_master táblákat is.

Ez a pipeline tehát: S3 → transform → master lookup → ID-mapping → S3 visszatöltés.



In [59]:
# Reference date of this run. months=0 leaves "now" unchanged; the offset is
# presumably there so the run can be shifted back for backfills (TODO confirm).
current_datetime = datetime.now() - relativedelta(months=0)
formatted_datetime = current_datetime.strftime("%Y-%m-%d")
# NOTE(review): formatted_datetime is not used by the request in this cell, so
# the API call is not filtered by date.

# Chicago taxi trips dataset endpoint (JSON resource, no trip filter applied).
url = "https://data.cityofchicago.org/resource/ajtu-isnz.json"

# Optional app token, read from the environment so it is never hard-coded.
headers = {"X-App-Token": os.environ.get("CHICAGO_API_TOKEN")}

# Always bound the request: without a timeout a stalled connection would hang
# the notebook indefinitely.
response = requests.get(url, headers=headers, timeout=30)

# Fail loudly on 4xx/5xx instead of passing an error payload on as "data".
response.raise_for_status()

# Decode the JSON body into Python objects.
data = response.json()

In [60]:
# Build the raw taxi trips frame from the decoded JSON payload of the API response.
taxi_trips = pd.DataFrame(data)


In [61]:
# Drop the unused columns, but only those that are actually present in the payload.
columns_to_drop = ["pickup_census_tract", "dropoff_census_tract", "pickup_centroid_location", "dropoff_centroid_location"]
existing_columns = [col for col in columns_to_drop if col in taxi_trips.columns]

if existing_columns:
    taxi_trips.drop(existing_columns, axis=1, inplace=True)

# Remove every row that has at least one missing value.
taxi_trips.dropna(inplace=True)

# Rename the community area columns to their "_id" form. rename() skips labels
# that are not present, so no per-column existence check is needed.
taxi_trips.rename(
    columns={
        "pickup_community_area": "pickup_community_area_id",
        "dropoff_community_area": "dropoff_community_area_id",
    },
    inplace=True,
)

# New column: trip start rounded down to the hour (join key for the weather data).
# "h" is the current hourly alias; the uppercase "H" spelling is deprecated.
if "trip_start_timestamp" in taxi_trips.columns:
    taxi_trips["datetime_for_weather"] = pd.to_datetime(
        taxi_trips["trip_start_timestamp"]
    ).dt.floor("h")


In [62]:
# Preview the first rows of the cleaned frame.
taxi_trips.head()

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_community_area_id,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather
1,64f5092e40e9a251e3e1d30f615ed034c8e4abf6,e5e1bb9c3329c0f9bd1f291cb9bbbb016731c148fefca8...,2025-08-01T00:00:00.000,2025-08-01T00:15:00.000,1188,14.04,76,35.5,6.0,0,4,46.0,Credit Card,Globe Taxi,41.980264315,-87.913624596,22,41.92276062,-87.699155343,2025-08-01
3,84dd63f755a6ec0a351410de9436a2692af4dc90,2780ead18beaa862cc67315ddabd9d1acaadcd6da82eba...,2025-08-01T00:00:00.000,2025-08-01T00:30:00.000,1527,7.05,3,21.25,0.0,0,0,21.25,Cash,Flash Cab,41.96581197,-87.655878786,15,41.954027649,-87.763399032,2025-08-01
4,21aa949b4b5c4f904c7cd035da14905d471c398f,5883034bc7d5b5ec14dd9d5249efe4d57ac89543713a98...,2025-08-01T00:00:00.000,2025-08-01T00:15:00.000,628,3.3,8,13.92,0.0,0,0,14.42,Mobile,Flash Cab,41.899602111,-87.633308037,7,41.922686284,-87.649488729,2025-08-01
5,4f40228bacef6a4d5975812055bfeee054bc24b5,d511072131b602026bdb9faa5491d15c3af8d62dc00659...,2025-08-01T00:00:00.000,2025-08-01T00:00:00.000,480,0.9,28,6.5,0.0,0,0,6.5,Cash,Transit Administrative Center Inc,41.874005383,-87.66351755,28,41.874005383,-87.66351755,2025-08-01
6,c5f5131a14d82d8bb49fd50161222995b01123ab,07800b5117d6e914644f00471b596f2a38426759f6f64c...,2025-08-01T00:00:00.000,2025-08-01T00:00:00.000,8,0.0,32,120.0,24.1,0,0,144.6,Credit Card,Flash Cab,41.884987192,-87.620992913,32,41.880994471,-87.632746489,2025-08-01


In [63]:
def taxi_trips_transformations(taxi_trips: pd.DataFrame) -> pd.DataFrame:
    """
    Perform transformations with the taxi data.

    The input frame is left untouched; a new, transformed frame is returned.

    Parameters
    ----------
    taxi_trips : pd.DataFrame
        The DataFrame holding the daily taxi trips. It must contain the
        ``trip_start_timestamp`` column.

    Returns
    -------
    pd.DataFrame
        The cleaned, transformed DataFrame holding the daily taxi trips.

    Raises
    ------
    KeyError
        If ``trip_start_timestamp`` is missing from ``taxi_trips``.

    Notes
    -----
    NOTE(review): a later cell of this notebook defines a function with the
    same name; once that cell runs it replaces this definition.
    """
    unused_columns = [
        "pickup_census_tract", "dropoff_census_tract",
        "pickup_centroid_location", "dropoff_centroid_location",
    ]

    # Every step returns a new frame, so the caller's DataFrame is never mutated.
    transformed = (
        taxi_trips
        # errors="ignore": tolerate payloads where some of these columns are absent.
        .drop(columns=unused_columns, errors="ignore")
        # Remove every row that has at least one missing value.
        .dropna()
        .rename(columns={
            "pickup_community_area": "pickup_community_area_id",
            "dropoff_community_area": "dropoff_community_area_id",
        })
    )

    # New column: trip start rounded down to the hour. Unparseable timestamps
    # become NaT (errors="coerce"). "h" replaces the deprecated "H" alias.
    transformed["datetime_for_weather"] = pd.to_datetime(
        transformed["trip_start_timestamp"], errors="coerce"
    ).dt.floor("h")

    return transformed


In [64]:
# Preview the first rows. A bare last expression renders the notebook's rich
# table instead of the line-wrapped plain text that print() produces.
taxi_trips.head()

                                    trip_id  \
1  64f5092e40e9a251e3e1d30f615ed034c8e4abf6   
3  84dd63f755a6ec0a351410de9436a2692af4dc90   
4  21aa949b4b5c4f904c7cd035da14905d471c398f   
5  4f40228bacef6a4d5975812055bfeee054bc24b5   
6  c5f5131a14d82d8bb49fd50161222995b01123ab   

                                             taxi_id     trip_start_timestamp  \
1  e5e1bb9c3329c0f9bd1f291cb9bbbb016731c148fefca8...  2025-08-01T00:00:00.000   
3  2780ead18beaa862cc67315ddabd9d1acaadcd6da82eba...  2025-08-01T00:00:00.000   
4  5883034bc7d5b5ec14dd9d5249efe4d57ac89543713a98...  2025-08-01T00:00:00.000   
5  d511072131b602026bdb9faa5491d15c3af8d62dc00659...  2025-08-01T00:00:00.000   
6  07800b5117d6e914644f00471b596f2a38426759f6f64c...  2025-08-01T00:00:00.000   

        trip_end_timestamp trip_seconds trip_miles pickup_community_area_id  \
1  2025-08-01T00:15:00.000         1188      14.04                       76   
3  2025-08-01T00:30:00.000         1527       7.05                      

In [65]:
# Inspect the column dtypes. In the recorded output every API field is `object`
# (numeric-looking columns included); only datetime_for_weather is datetime64[ns].
print(taxi_trips.dtypes)

trip_id                               object
taxi_id                               object
trip_start_timestamp                  object
trip_end_timestamp                    object
trip_seconds                          object
trip_miles                            object
pickup_community_area_id              object
fare                                  object
tips                                  object
tolls                                 object
extras                                object
trip_total                            object
payment_type                          object
company                               object
pickup_centroid_latitude              object
pickup_centroid_longitude             object
dropoff_community_area_id             object
dropoff_centroid_latitude             object
dropoff_centroid_longitude            object
datetime_for_weather          datetime64[ns]
dtype: object


In [66]:
def taxi_trips_transformations(taxi_trips: pd.DataFrame) -> pd.DataFrame:
    """
    Perform transformations with the taxi data.

    The input frame is left untouched; a new, transformed frame is returned.

    Parameters
    ----------
    taxi_trips : pd.DataFrame
        The DataFrame holding the daily taxi trips. It must contain the
        ``trip_start_timestamp`` column.

    Returns
    -------
    pd.DataFrame
        The cleaned, transformed DataFrame holding the daily taxi trips.

    Raises
    ------
    TypeError
        If ``taxi_trips`` is not a pandas DataFrame.
    KeyError
        If ``trip_start_timestamp`` is missing from ``taxi_trips``.

    Notes
    -----
    NOTE(review): this replaces an earlier definition of the same name in
    this notebook.
    """

    if not isinstance(taxi_trips, pd.DataFrame):
        raise TypeError("taxi_trips is not a valid pandas DataFrame.")

    unused_columns = [
        "pickup_census_tract", "dropoff_census_tract",
        "pickup_centroid_location", "dropoff_centroid_location",
    ]

    # Every step returns a new frame, so the caller's DataFrame is never mutated.
    transformed = (
        taxi_trips
        # errors="ignore": tolerate payloads where some of these columns are absent.
        .drop(columns=unused_columns, errors="ignore")
        # Remove every row that has at least one missing value.
        .dropna()
        .rename(
            columns={
                "pickup_community_area": "pickup_community_area_id",
                "dropoff_community_area": "dropoff_community_area_id",
            }
        )
    )

    # New column: trip start rounded down to the hour (join key for weather data).
    # "h" replaces the deprecated "H" frequency alias.
    transformed["datetime_for_weather"] = pd.to_datetime(
        transformed["trip_start_timestamp"]
    ).dt.floor("h")

    return transformed


In [67]:
# Display the current state of taxi_trips (the recorded output is a truncated view).
taxi_trips

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_community_area_id,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather
1,64f5092e40e9a251e3e1d30f615ed034c8e4abf6,e5e1bb9c3329c0f9bd1f291cb9bbbb016731c148fefca8...,2025-08-01T00:00:00.000,2025-08-01T00:15:00.000,1188,14.04,76,35.5,6,0,4,46,Credit Card,Globe Taxi,41.980264315,-87.913624596,22,41.92276062,-87.699155343,2025-08-01 00:00:00
3,84dd63f755a6ec0a351410de9436a2692af4dc90,2780ead18beaa862cc67315ddabd9d1acaadcd6da82eba...,2025-08-01T00:00:00.000,2025-08-01T00:30:00.000,1527,7.05,3,21.25,0,0,0,21.25,Cash,Flash Cab,41.96581197,-87.655878786,15,41.954027649,-87.763399032,2025-08-01 00:00:00
4,21aa949b4b5c4f904c7cd035da14905d471c398f,5883034bc7d5b5ec14dd9d5249efe4d57ac89543713a98...,2025-08-01T00:00:00.000,2025-08-01T00:15:00.000,628,3.3,8,13.92,0,0,0,14.42,Mobile,Flash Cab,41.899602111,-87.633308037,7,41.922686284,-87.649488729,2025-08-01 00:00:00
5,4f40228bacef6a4d5975812055bfeee054bc24b5,d511072131b602026bdb9faa5491d15c3af8d62dc00659...,2025-08-01T00:00:00.000,2025-08-01T00:00:00.000,480,0.9,28,6.5,0,0,0,6.5,Cash,Transit Administrative Center Inc,41.874005383,-87.66351755,28,41.874005383,-87.66351755,2025-08-01 00:00:00
6,c5f5131a14d82d8bb49fd50161222995b01123ab,07800b5117d6e914644f00471b596f2a38426759f6f64c...,2025-08-01T00:00:00.000,2025-08-01T00:00:00.000,8,0,32,120,24.1,0,0,144.6,Credit Card,Flash Cab,41.884987192,-87.620992913,32,41.880994471,-87.632746489,2025-08-01 00:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
993,a63c2a5451fcd092f837f99f237db32d3b4310bf,00f011f177d14242f8a1eb4315abe2f7b82be989761ed1...,2025-07-31T22:30:00.000,2025-07-31T23:00:00.000,1968,17.73,76,44.25,9.75,0,4,58.5,Credit Card,Flash Cab,41.97907082,-87.903039661,8,41.892507781,-87.626214906,2025-07-31 22:00:00
995,19e7aada53e319e12b66592d70596c046a338699,8307cf9433f0293eee99c6944aeab484521d9cd9b1fce5...,2025-07-31T22:30:00.000,2025-07-31T22:30:00.000,4,0,7,30,6.1,0,0,36.6,Credit Card,City Service,41.922686284,-87.649488729,6,41.944226601,-87.655998182,2025-07-31 22:00:00
997,91a7839ea2e025f75758338379dcf3b8268fa03c,bf66d006c4634bdc6c98ee6d505187ed750e647a6b4d1c...,2025-07-31T22:30:00.000,2025-07-31T22:30:00.000,5,0,38,3.25,0,0,0,3.25,Cash,Flash Cab,41.812948939,-87.617859676,38,41.812948939,-87.617859676,2025-07-31 22:00:00
998,a5cc5ae2e9dd8af426d5f6342fe9187dc9bc8671,68e1a2ea570e86d7ca0402907273138fa5aede37ae81ee...,2025-07-31T22:30:00.000,2025-07-31T22:30:00.000,7,0,73,83,12.52,0,0,96.02,Credit Card,5 Star Taxi,41.717493036,-87.648895072,73,41.717493036,-87.648895072,2025-07-31 22:00:00
