### EDA: exploratory data analysis

In [1]:
from datetime import datetime
from dateutil.relativedelta import relativedelta

import pandas as pd
pd.set_option('display.max_columns', 30)

import requests

In [2]:
current_datetime = datetime.now() - relativedelta(months = 2)
formatted_datetime = current_datetime.strftime("%Y-%m-%d")

In [3]:
url = (f"https://data.cityofchicago.org/resource/ajtu-isnz.json?"
	   f"$where=trip_start_timestamp >= '{formatted_datetime}T00:00:00' "
	   f"AND trip_start_timestamp <= '{formatted_datetime}T23:59:59'"
	   f"&$limit=30000")
response = requests.get(url)
data = response.json()
taxi_trips = pd.DataFrame(data)

In [4]:
taxi_trips.describe()

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,pickup_centroid_location,dropoff_centroid_latitude,dropoff_centroid_longitude,dropoff_centroid_location,pickup_census_tract,dropoff_census_tract
count,25515,25515,25515,25515,25511,25515,24789,23441,25482.0,25482,25482,25482,25482.0,25515,25515,24792.0,24792.0,24792,23573.0,23573.0,23573,12946,12673
unique,25515,2580,96,111,4141,2485,77,77,2724.0,1185,37,192,3717.0,7,33,173.0,173.0,173,201.0,201.0,201,97,129
top,2439d42ac59524ce144ca6e5b4c1467db3da4094,8da9e1d18757022c6a6a614fc2d38483e38aae441feff5...,2025-09-25T17:15:00.000,2025-09-25T18:00:00.000,0,0,8,8,9.5,0,0,0,3.25,Credit Card,Flash Cab,41.97907082,-87.903039661,"{'type': 'Point', 'coordinates': [-87.90303966...",41.880994471,-87.632746489,"{'type': 'Point', 'coordinates': [-87.63274648...",17031980000,17031839100
freq,1,47,507,476,290,1893,5727,6111,914.0,14016,25263,18786,349.0,8821,4787,2429.0,2429.0,2429,1847.0,1847.0,1847,2429,1847


In [5]:
taxi_trips[taxi_trips['fare'].isna()].sample(5)

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,pickup_centroid_location,dropoff_centroid_latitude,dropoff_centroid_longitude,dropoff_centroid_location,pickup_census_tract,dropoff_census_tract
15439,1cb4eb645ae6db4ee899b41799b6f71ba7a4f1e1,5e48748b582fe3c16baf5ac371a27773e8aef2927f3344...,2025-09-25T13:15:00.000,2025-09-25T13:15:00.000,600,1.72,8,28,,,,,,Cash,Sun Taxi,41.89503345,-87.619710672,"{'type': 'Point', 'coordinates': [-87.61971067...",41.885300022,-87.642808466,"{'type': 'Point', 'coordinates': [-87.64280846...",17031081401.0,17031280100.0
10841,bdc31b417fc32b82b82844aed28f796a56e7c021,19447854f6105afc7307d18998685917b36bbbb2c8a7e5...,2025-09-25T15:45:00.000,2025-09-25T15:45:00.000,17,0.0,32,32,,,,,,Cash,Tac - Yellow Cab Association,41.880994471,-87.632746489,"{'type': 'Point', 'coordinates': [-87.63274648...",41.880994471,-87.632746489,"{'type': 'Point', 'coordinates': [-87.63274648...",17031839100.0,17031839100.0
8269,761da9083924693d18b364f2e13fcf3484c2700a,574b161e75a7dda9dc1f63751af8b83c982a3a76f5fc88...,2025-09-25T17:15:00.000,2025-09-25T17:30:00.000,973,1.12,32,8,,,,,,Cash,Taxicab Insurance Agency Llc,41.880994471,-87.632746489,"{'type': 'Point', 'coordinates': [-87.63274648...",41.890922026,-87.618868355,"{'type': 'Point', 'coordinates': [-87.61886835...",17031839100.0,17031081403.0
18538,28c44659b5ec652764bc696ae37b8c679327f121,6b00c6e07523bf0964046af1bf69575f25c6ea393f288d...,2025-09-25T11:00:00.000,2025-09-25T11:45:00.000,2054,9.64,15,32,,,,,,Cash,5 Star Taxi,41.954027649,-87.763399032,"{'type': 'Point', 'coordinates': [-87.76339903...",41.878865584,-87.625192142,"{'type': 'Point', 'coordinates': [-87.62519214...",,
532,61e8dc78d944d14bcbf76ba579760cfdd2fb368b,180309cc29892cd16348ef74d7dd6f33af87e585a29b62...,2025-09-25T23:00:00.000,2025-09-26T01:15:00.000,8722,12.96,56,32,,,,,,Cash,Taxicab Insurance Agency Llc,41.785998518,-87.750934289,"{'type': 'Point', 'coordinates': [-87.75093428...",41.884987192,-87.620992913,"{'type': 'Point', 'coordinates': [-87.62099291...",17031980100.0,17031320100.0


## adattranszformációk

In [6]:
# NaN értékek kezelése
taxi_trips = taxi_trips.drop(['pickup_census_tract', 'dropoff_census_tract'], axis = 1)
taxi_trips = taxi_trips.drop(['pickup_centroid_location', 'dropoff_centroid_location'], axis = 1)

In [7]:
# eldobjuk azokat a sorokat, amelyekben null van
taxi_trips.dropna(inplace = True)

In [8]:
taxi_trips.info()

<class 'pandas.core.frame.DataFrame'>
Index: 23186 entries, 0 to 25513
Data columns (total 19 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   trip_id                     23186 non-null  object
 1   taxi_id                     23186 non-null  object
 2   trip_start_timestamp        23186 non-null  object
 3   trip_end_timestamp          23186 non-null  object
 4   trip_seconds                23186 non-null  object
 5   trip_miles                  23186 non-null  object
 6   pickup_community_area       23186 non-null  object
 7   dropoff_community_area      23186 non-null  object
 8   fare                        23186 non-null  object
 9   tips                        23186 non-null  object
 10  tolls                       23186 non-null  object
 11  extras                      23186 non-null  object
 12  trip_total                  23186 non-null  object
 13  payment_type                23186 non-null  object


In [9]:
# oszlopok átnevezése
taxi_trips = taxi_trips.rename(
		columns = {'pickup_community_area' : 'pickup_community_area_id', 'dropoff_community_area' : 'dropoff_community_area_id'})

In [10]:
# időjárás adatok segéd oszlop - adott trip start timestamp órára lekerekített értéke
taxi_trips['trip_start_timestamp'] = pd.to_datetime(taxi_trips['trip_start_timestamp'])
taxi_trips['datetime_for_weather'] = taxi_trips['trip_start_timestamp'].dt.floor('h')  # órára lefelé kerekít

## időjárás adatok összefűzése

In [11]:
current_datetime = datetime.now() - relativedelta(months = 2)  # datetime típus
formatted_datetime = current_datetime.strftime("%Y-%m-%d")  # string from time formázás

url = "https://archive-api.open-meteo.com/v1/era5"

params = {
	'latitude' : 41.85,  # chicago
	'longitude' : -87.65,
	'start_date' : formatted_datetime,
	'end_date' : formatted_datetime,
	'hourly' : 'temperature_2m,wind_speed_10m,rain,precipitation'
}

response = requests.get(url, params = params)
data = response.json()

# api lekérdezés adatai metaadatok nélkül
weather_data = {
	'datetime' : data['hourly']['time'],
	'temperature' : data['hourly']['temperature_2m'],
	'wind_speed' : data['hourly']['wind_speed_10m'],
	'rain' : data['hourly']['rain'],
	'precipitation' : data['hourly']['precipitation']}

weather_df = pd.DataFrame(weather_data)
weather_df['datetime'] = pd.to_datetime(weather_df['datetime'])

In [12]:
taxi_trips_weather = taxi_trips.merge(weather_df, left_on = 'datetime_for_weather', right_on = 'datetime')

## adattípus konverziók

In [13]:
# eredeti verzió tárolása
taxi_trips_original = taxi_trips.copy()

In [14]:
data_types = {
	"trip_end_timestamp" : "datetime64[ns]",
	"trip_seconds" : "int32",
	"trip_miles" : "float",
	"pickup_community_area_id" : "int8",  # -127/+127
	"dropoff_community_area_id" : "int8",
	"fare" : "float",
	"tips" : "float",
	"tolls" : "float",
	"extras" : "float",
	"trip_total" : "float"
}

taxi_trips = taxi_trips.astype(data_types)

In [15]:
taxi_trips.describe()

Unnamed: 0,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,dropoff_community_area_id,fare,tips,tolls,extras,trip_total,datetime_for_weather
count,23186,23186,23186.0,23186.0,23186.0,23186.0,23186.0,23186.0,23186.0,23186.0,23186.0,23186
mean,2025-09-25 14:22:57.676183808,2025-09-25 14:44:01.930475008,1267.212844,5.796197,31.752997,25.428405,20.885929,2.464223,0.009276,1.082012,24.742484,2025-09-25 14:00:29.345294592
min,2025-09-25 00:00:00,2025-09-25 00:00:00,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,2025-09-25 00:00:00
25%,2025-09-25 10:30:00,2025-09-25 10:45:00,496.0,1.11,8.0,8.0,8.5,0.0,0.0,0.0,10.0,2025-09-25 10:00:00
50%,2025-09-25 14:45:00,2025-09-25 15:00:00,884.0,2.58,28.0,28.0,13.44,0.0,0.0,0.0,15.415,2025-09-25 14:00:00
75%,2025-09-25 18:15:00,2025-09-25 18:30:00,1705.0,10.41,38.0,32.0,30.75,3.05,0.0,0.0,32.5,2025-09-25 18:00:00
max,2025-09-25 23:45:00,2025-09-26 08:00:00,57709.0,136.27,77.0,77.0,4950.0,96.25,13.75,75.0,4950.0,2025-09-25 23:00:00
std,,,1364.229964,6.33582,24.175527,19.589923,36.779152,3.916222,0.198876,3.670387,38.580855,


#### memóriahasználat

In [16]:
# összeadja, mennyi memóriát használ a df
orig_mem = taxi_trips_original.memory_usage(deep = True).sum()
opt_mem = taxi_trips.memory_usage(deep = True).sum()

reduction = (orig_mem - opt_mem) / orig_mem * 100

print(f'Original memory usage: {orig_mem} bytes\n'
	  f'Optimized memory usage: {opt_mem} bytes\n'
	  f'Memory reduced by: {reduction} %')

Original memory usage: 27688946 bytes
Optimized memory usage: 16722198 bytes
Memory reduced by: 39.60695361968635 %


## sanity check

In [17]:
# legutolsó trip
taxi_trips[taxi_trips['trip_end_timestamp'] == taxi_trips['trip_end_timestamp'].max()]

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,dropoff_community_area_id,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather
9336,2bc7c2d41bfd627f81ef9aba5245d77bafa9624d,faf67da63df4278d1cac829b7a0fc440a84d09942841f1...,2025-09-25 16:45:00,2025-09-26 08:00:00,55350,1.72,28,32,12.0,0.0,0.0,0.0,12.0,Cash,Sun Taxi,41.879255084,-87.642648998,41.877406123,-87.621971652,2025-09-25 16:00:00


In [18]:
# leghosszabb trip
taxi_trips[taxi_trips['trip_seconds'] == taxi_trips['trip_seconds'].max()]

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,dropoff_community_area_id,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather
13846,24e14602041ff73ccf62d6386db574888ecf6d3c,54da4e28c09d723164b27064b3a681139550240b96513a...,2025-09-25 14:15:00,2025-09-26 06:15:00,57709,18.99,76,76,50.5,0.0,0.0,4.0,54.5,Cash,Sun Taxi,41.97907082,-87.903039661,41.97907082,-87.903039661,2025-09-25 14:00:00


In [19]:
# legdrágább út
taxi_trips[taxi_trips['fare'] == taxi_trips['fare'].max()]

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,dropoff_community_area_id,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather
22157,534d603d8122f7f5b8bd48d0cec3441b41dccf41,3cdbbd850dee58fd415cce1b7af4256839cd5e6523328c...,2025-09-25 08:30:00,2025-09-25 09:00:00,1898,0.0,42,56,4950.0,0.0,0.0,0.0,4950.0,Cash,Tac - Yellow Cab Association,41.77887686,-87.594925439,41.79259236,-87.769615453,2025-09-25 08:00:00


## Dimenziótáblák létrehozása

### payment type dimension

In [20]:
dim_payment_type = taxi_trips['payment_type'].drop_duplicates().reset_index(drop = True)

In [21]:
dim_payment_type = pd.DataFrame({
	'payment_type_id': range(1, len(dim_payment_type)+1),
	'payment_type': dim_payment_type
})

### company dimension

In [22]:
dim_company = taxi_trips['company'].drop_duplicates().reset_index(drop = True)

In [23]:
dim_company = pd.DataFrame({
	'company_id': range(1, len(dim_company)+1),
	'company': dim_company
})

## visszahelyettesítés az eredeti adatokba

In [24]:
fact_taxi_trips = taxi_trips.merge(dim_payment_type, on = 'payment_type', how = 'left')
fact_taxi_trips = fact_taxi_trips.merge(dim_company, on = 'company', how = 'left')

In [25]:
fact_taxi_trips.drop(['payment_type', 'company'], axis = 1, inplace = True)