### EDA: exploratory data analysis

In [28]:
from datetime import datetime
from dateutil.relativedelta import relativedelta

import pandas as pd
pd.set_option('display.max_columns', 30)

import requests

In [29]:
current_datetime = datetime.now() - relativedelta(months = 2)
formatted_datetime = current_datetime.strftime("%Y-%m-%d")

In [30]:
url = (f"https://data.cityofchicago.org/resource/ajtu-isnz.json?"
	   f"$where=trip_start_timestamp >= '{formatted_datetime}T00:00:00' "
	   f"AND trip_start_timestamp <= '{formatted_datetime}T23:59:59'"
	   f"&$limit=30000")
response = requests.get(url)
data = response.json()
taxi_trips = pd.DataFrame(data)

## adattranszformáció, időjárás és dátum dimenziók segédoszlopai

In [31]:
# NaN értékek kezelése
taxi_trips = taxi_trips.drop(['pickup_census_tract', 'dropoff_census_tract'], axis = 1)
taxi_trips = taxi_trips.drop(['pickup_centroid_location', 'dropoff_centroid_location'], axis = 1)

In [32]:
# eldobjuk azokat a sorokat, amelyekben null van
taxi_trips.dropna(inplace = True)

In [33]:
# oszlopok átnevezése
taxi_trips = taxi_trips.rename(
		columns = {'pickup_community_area' : 'pickup_community_area_id', 'dropoff_community_area' : 'dropoff_community_area_id'})

In [34]:
# időjárás adatok segéd oszlop - adott trip start timestamp órára lekerekített értéke
taxi_trips['trip_start_timestamp'] = pd.to_datetime(taxi_trips['trip_start_timestamp'])
taxi_trips['datetime_for_weather'] = taxi_trips['trip_start_timestamp'].dt.floor('h')  # órára lefelé kerekít

In [35]:
taxi_trips['trip_start_date'] = pd.to_datetime(taxi_trips['trip_start_timestamp']).dt.date

## időjárás adatok összefűzése

In [36]:
current_datetime = datetime.now() - relativedelta(months = 2)  # datetime típus
formatted_datetime = current_datetime.strftime("%Y-%m-%d")  # string from time formázás

url = "https://archive-api.open-meteo.com/v1/era5"

params = {
	'latitude' : 41.85,  # chicago
	'longitude' : -87.65,
	'start_date' : formatted_datetime,
	'end_date' : formatted_datetime,
	'hourly' : 'temperature_2m,wind_speed_10m,rain,precipitation'
}

response = requests.get(url, params = params)
data = response.json()

# api lekérdezés adatai metaadatok nélkül
weather_data = {
	'datetime' : data['hourly']['time'],
	'temperature' : data['hourly']['temperature_2m'],
	'wind_speed' : data['hourly']['wind_speed_10m'],
	'rain' : data['hourly']['rain'],
	'precipitation' : data['hourly']['precipitation']}

weather_df = pd.DataFrame(weather_data)
weather_df['datetime'] = pd.to_datetime(weather_df['datetime'])

In [37]:
taxi_trips_weather = taxi_trips.merge(weather_df, left_on = 'datetime_for_weather', right_on = 'datetime')

## adattípus konverziók

In [38]:
# eredeti verzió tárolása
taxi_trips_original = taxi_trips.copy()

In [39]:
data_types = {
	"trip_end_timestamp" : "datetime64[ns]",
	"trip_seconds" : "int32",
	"trip_miles" : "float",
	"pickup_community_area_id" : "int8",  # -127/+127
	"dropoff_community_area_id" : "int8",
	"fare" : "float",
	"tips" : "float",
	"tolls" : "float",
	"extras" : "float",
	"trip_total" : "float"
}

taxi_trips = taxi_trips.astype(data_types)

In [40]:
taxi_trips.describe()

Unnamed: 0,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,dropoff_community_area_id,fare,tips,tolls,extras,trip_total,datetime_for_weather
count,23186,23186,23186.0,23186.0,23186.0,23186.0,23186.0,23186.0,23186.0,23186.0,23186.0,23186
mean,2025-09-25 14:22:57.676183808,2025-09-25 14:44:01.930475008,1267.212844,5.796197,31.752997,25.428405,20.885929,2.464223,0.009276,1.082012,24.742484,2025-09-25 14:00:29.345294592
min,2025-09-25 00:00:00,2025-09-25 00:00:00,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,2025-09-25 00:00:00
25%,2025-09-25 10:30:00,2025-09-25 10:45:00,496.0,1.11,8.0,8.0,8.5,0.0,0.0,0.0,10.0,2025-09-25 10:00:00
50%,2025-09-25 14:45:00,2025-09-25 15:00:00,884.0,2.58,28.0,28.0,13.44,0.0,0.0,0.0,15.415,2025-09-25 14:00:00
75%,2025-09-25 18:15:00,2025-09-25 18:30:00,1705.0,10.41,38.0,32.0,30.75,3.05,0.0,0.0,32.5,2025-09-25 18:00:00
max,2025-09-25 23:45:00,2025-09-26 08:00:00,57709.0,136.27,77.0,77.0,4950.0,96.25,13.75,75.0,4950.0,2025-09-25 23:00:00
std,,,1364.229964,6.33582,24.175527,19.589923,36.779152,3.916222,0.198876,3.670387,38.580855,


#### memóriahasználat

In [41]:
# összeadja, mennyi memóriát használ a df
orig_mem = taxi_trips_original.memory_usage(deep = True).sum()
opt_mem = taxi_trips.memory_usage(deep = True).sum()

reduction = (orig_mem - opt_mem) / orig_mem * 100

print(f'Original memory usage: {orig_mem} bytes\n'
	  f'Optimized memory usage: {opt_mem} bytes\n'
	  f'Memory reduced by: {reduction} %')

Original memory usage: 28616386 bytes
Optimized memory usage: 17649638 bytes
Memory reduced by: 38.323315879230876 %


## sanity check

In [42]:
# legutolsó trip
taxi_trips[taxi_trips['trip_end_timestamp'] == taxi_trips['trip_end_timestamp'].max()]

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,dropoff_community_area_id,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather,trip_start_date
9336,2bc7c2d41bfd627f81ef9aba5245d77bafa9624d,faf67da63df4278d1cac829b7a0fc440a84d09942841f1...,2025-09-25 16:45:00,2025-09-26 08:00:00,55350,1.72,28,32,12.0,0.0,0.0,0.0,12.0,Cash,Sun Taxi,41.879255084,-87.642648998,41.877406123,-87.621971652,2025-09-25 16:00:00,2025-09-25


In [43]:
# leghosszabb trip
taxi_trips[taxi_trips['trip_seconds'] == taxi_trips['trip_seconds'].max()]

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,dropoff_community_area_id,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather,trip_start_date
13846,24e14602041ff73ccf62d6386db574888ecf6d3c,54da4e28c09d723164b27064b3a681139550240b96513a...,2025-09-25 14:15:00,2025-09-26 06:15:00,57709,18.99,76,76,50.5,0.0,0.0,4.0,54.5,Cash,Sun Taxi,41.97907082,-87.903039661,41.97907082,-87.903039661,2025-09-25 14:00:00,2025-09-25


In [44]:
# legdrágább út
taxi_trips[taxi_trips['fare'] == taxi_trips['fare'].max()]

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,dropoff_community_area_id,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather,trip_start_date
22157,534d603d8122f7f5b8bd48d0cec3441b41dccf41,3cdbbd850dee58fd415cce1b7af4256839cd5e6523328c...,2025-09-25 08:30:00,2025-09-25 09:00:00,1898,0.0,42,56,4950.0,0.0,0.0,0.0,4950.0,Cash,Tac - Yellow Cab Association,41.77887686,-87.594925439,41.79259236,-87.769615453,2025-09-25 08:00:00,2025-09-25


## Dimenziótáblák létrehozása

### payment type dimension

In [45]:
dim_payment_type = taxi_trips['payment_type'].drop_duplicates().reset_index(drop = True)

In [46]:
dim_payment_type = pd.DataFrame({
	'payment_type_id' : range(1, len(dim_payment_type) + 1),
	'payment_type' : dim_payment_type
})

### company dimension

In [47]:
dim_company = taxi_trips['company'].drop_duplicates().reset_index(drop = True)

In [48]:
dim_company = pd.DataFrame({
	'company_id' : range(1, len(dim_company) + 1),
	'company' : dim_company
})

## visszahelyettesítés az eredeti adatokba

In [49]:
fact_taxi_trips = taxi_trips.merge(dim_payment_type, on = 'payment_type', how = 'left')
fact_taxi_trips = fact_taxi_trips.merge(dim_company, on = 'company', how = 'left')

In [50]:
fact_taxi_trips.drop(['payment_type', 'company'], axis = 1, inplace = True)

In [51]:
# dim_payment_type.to_csv('dim_payment_type.csv', index = False)
# dim_company.to_csv('dim_company.csv', index = False)

## Update logika a payment dimenziótábláknak

In [52]:
# új adat szimulálása
dummy_payment_type_data = [{'payment_type' : 'Credit Card'},
						   {'payment_type' : 'Z'},
						   {'payment_type' : 'Y'},
						   {'payment_type' : 'Z'}]
dummy_payment_type_data_df = pd.DataFrame(dummy_payment_type_data)
# ismétlődések eltávolítása
todays_payment_types = pd.DataFrame(dummy_payment_type_data_df['payment_type'].unique(), columns = ['payment_type'])

# új típusok, amik nincsenek benne a dimben
new_payment_types = todays_payment_types[~todays_payment_types['payment_type'].isin(dim_payment_type['payment_type'])]

if not new_payment_types.empty :
	max_id = dim_payment_type['payment_type_id'].max()
	new_payment_types['payment_type_id'] = range(max_id + 1, max_id + 1 + len(new_payment_types))
	dim_payment_type = pd.concat([dim_payment_type, new_payment_types], ignore_index = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_payment_types['payment_type_id'] = range(max_id + 1, max_id + 1 + len(new_payment_types))


In [53]:
# új adat szimulálása
dummy_payment_type_data = [{'payment_type' : 'Credit Card'},
						   {'payment_type' : 'X'},
						   {'payment_type' : 'Y'},
						   {'payment_type' : 'X'}]
dummy_payment_type_data_df = pd.DataFrame(dummy_payment_type_data)
# ismétlődések eltávolítása
todays_payment_types = pd.DataFrame(dummy_payment_type_data_df['payment_type'].unique(), columns = ['payment_type'])

# új típusok, amik nincsenek benne a dimben
new_payment_types = todays_payment_types[~todays_payment_types['payment_type'].isin(dim_payment_type['payment_type'])]

if not new_payment_types.empty :
	max_id = dim_payment_type['payment_type_id'].max()
	new_payment_types['payment_type_id'] = range(max_id + 1, max_id + 1 + len(new_payment_types))
	dim_payment_type = pd.concat([dim_payment_type, new_payment_types], ignore_index = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_payment_types['payment_type_id'] = range(max_id + 1, max_id + 1 + len(new_payment_types))


## Company dimenziótábla update logika

In [54]:
# új adat szimulálása
dummy_company_data = [{'company' : 'Metro Jet Taxi A.'},
					  {'company' : 'X'},
					  {'company' : 'Y'},
					  {'company' : 'X'}]
dummy_company_df = pd.DataFrame(dummy_company_data)
# ismétlődések eltávolítása
todays_companies = pd.DataFrame(dummy_company_df['company'].unique(), columns = ['company'])

# új típusok, amik nincsenek benne a dimben
new_companies = todays_companies[~todays_companies['company'].isin(dim_company['company'])]

if not new_companies.empty :
	max_id = dim_company['company_id'].max()
	new_companies['company_id'] = range(max_id + 1, max_id + 1 + len(new_companies))
	dim_company = pd.concat([dim_company, new_companies], ignore_index = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_companies['company_id'] = range(max_id + 1, max_id + 1 + len(new_companies))
