In [1]:
from io import StringIO
import json
import os

from dotenv import load_dotenv
load_dotenv()

import boto3
import pandas as pd

In [2]:
# read the AWS key pair from environment variables; each value is None when its variable is unset
aws_access_key = os.environ.get("AWS_ACCESS_KEY")
aws_secret_key = os.environ.get("AWS_SECRET_KEY")

In [3]:
# S3 client built with the key pair held in aws_access_key / aws_secret_key
s3 = boto3.client(
	"s3",
	aws_access_key_id = aws_access_key,
	aws_secret_access_key = aws_secret_key,
)

In [4]:
# bucket that holds the transformed data
bucket = "cubix-chicago-taxi-ab"

# dimensions stored as a single CSV object each
dim_community_areas_path = "transformed_data/dim_community_areas/dim_community_areas.csv"
dim_company_path = "transformed_data/dim_company/dim_company.csv"
dim_date_path = "transformed_data/dim_date/dim_date.csv"
dim_payment_type_path = "transformed_data/dim_payment_type/dim_payment_type.csv"

# key prefixes under which several CSV objects are listed and read one by one
dim_weather_path = "transformed_data/dim_weather/"
fact_taxi_trips_path = "transformed_data/fact_taxi_trips/"

In [5]:
def read_file_from_s3(s3, bucket: str, key: str, file_format: str = "csv") :
	"""
	Read a CSV or JSON object from an S3 bucket.

	:param s3:              S3 client; only its ``get_object`` method is used
	:param bucket:          name of the S3 bucket where the file is stored
	:param key:             path of the object within the S3 bucket
	:param file_format:     "csv" or "json"
	:return:                a pandas DataFrame for "csv", the parsed JSON value for "json"
	:raises ValueError:     if file_format is neither "csv" nor "json"
	"""
	# validate first so an unsupported format never costs a download
	if file_format not in ("csv", "json") :
		raise ValueError(f"Unsupported file format: {file_format}")

	response = s3.get_object(Bucket = bucket, Key = key)
	# the whole object body is read into memory and decoded as UTF-8 text
	content = response['Body'].read().decode('utf-8')

	if file_format == 'csv' :
		return pd.read_csv(StringIO(content))

	return json.loads(content)

In [6]:
# print(bucket)
# print(dim_payment_type_path)

In [7]:
# load the four single-file dimension tables, in the same order as the names on the left
dim_community_areas, dim_company, dim_date, dim_payment_type = (
	read_file_from_s3(s3, bucket, dim_path)
	for dim_path in (
		dim_community_areas_path,
		dim_company_path,
		dim_date_path,
		dim_payment_type_path,
	)
)

In [8]:
# print the first row of every dimension table as a quick sanity check
for dim_table in (dim_community_areas, dim_company, dim_date, dim_payment_type) :
	print(dim_table.head(1))

   area_code community_area
0          1    Rogers Park
   company_id    company
0           1  Flash Cab
         date  year  month  day  day_of_week  is_weekend
0  2025-01-01  2025      1    1            3       False
   payment_type_id payment_type
0                1       Prcard


In [9]:
# accumulators for the per-file DataFrames read in the following cells
fact_taxi_trips_list, dim_weather_list = [], []

In [10]:
# read every CSV under transformed_data/fact_taxi_trips from S3 and collect the DataFrames in a list
# start from an empty list so that re-running this cell does not append the same files twice
fact_taxi_trips_list = []

# a single listing call returns at most 1000 keys, so walk every page of results
taxi_pages = s3.get_paginator("list_objects_v2").paginate(Bucket = bucket, Prefix = fact_taxi_trips_path)

for page in taxi_pages :
	# a page carries no "Contents" entry when nothing matches the prefix
	for file in page.get("Contents", []) :
		taxi_key = file["Key"]
		taxi_raw_file_name = taxi_key.split("/")[-1]

		if taxi_raw_file_name.split(".")[-1] == "csv" :
			# read by the listed key itself, which stays correct for keys nested below the prefix
			taxi_trip_daily = read_file_from_s3(s3, bucket, taxi_key)

			fact_taxi_trips_list.append(taxi_trip_daily)
			print(f"{taxi_raw_file_name} has been added")

taxi_2025-08-04.csv has been added
taxi_2025-08-05.csv has been added
taxi_2025-08-06.csv has been added
taxi_2025-08-07.csv has been added
taxi_2025-08-08.csv has been added
taxi_2025-08-09.csv has been added
taxi_2025-08-10.csv has been added
taxi_2025-08-11.csv has been added
taxi_2025-08-12.csv has been added
taxi_2025-08-13.csv has been added
taxi_2025-09-25.csv has been added
taxi_2025-09-27.csv has been added
taxi_2025-09-28.csv has been added
taxi_2025-09-29.csv has been added
taxi_2025-09-30.csv has been added
taxi_2025-10-01.csv has been added
taxi_2025-10-02.csv has been added
taxi_2025-10-03.csv has been added
taxi_2025-10-04.csv has been added
taxi_2025-10-13.csv has been added
taxi_2025-10-15.csv has been added


In [11]:
# stack the per-file taxi trip DataFrames into one frame with a fresh 0..n-1 index
fact_taxi_trips = pd.concat(objs = fact_taxi_trips_list, axis = 0, ignore_index = True)

In [12]:
# read every CSV under transformed_data/dim_weather from S3 and collect the DataFrames in a list
# start from an empty list so that re-running this cell does not append the same files twice
dim_weather_list = []

# a single listing call returns at most 1000 keys, so walk every page of results
weather_pages = s3.get_paginator("list_objects_v2").paginate(Bucket = bucket, Prefix = dim_weather_path)

for page in weather_pages :
	# a page carries no "Contents" entry when nothing matches the prefix
	for file in page.get("Contents", []) :
		weather_key = file["Key"]
		weather_raw_file_name = weather_key.split("/")[-1]

		if weather_raw_file_name.split(".")[-1] == "csv" :
			# read by the listed key itself, which stays correct for keys nested below the prefix
			weather_daily = read_file_from_s3(s3, bucket, weather_key)

			dim_weather_list.append(weather_daily)
			print(f"{weather_raw_file_name} has been added")

weather_2025-09-30.csv has been added


In [13]:
# stack the per-file weather DataFrames into one frame with a fresh 0..n-1 index
dim_weather = pd.concat(objs = dim_weather_list, axis = 0, ignore_index = True)

In [14]:
# summarize the combined weather table: row count, columns, non-null counts and dtypes
dim_weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   datetime       24 non-null     object 
 1   temperature    24 non-null     float64
 2   wind_speed     24 non-null     float64
 3   rain           24 non-null     float64
 4   precipitation  24 non-null     float64
dtypes: float64(4), object(1)
memory usage: 1.1+ KB


## Create the data model

In [15]:
# start the merging with the fact taxi trips and dim weather
# the left join keeps every trip, leaving the weather columns empty where no weather row matches
fact_taxi_trips_full = (
	fact_taxi_trips
	.merge(dim_weather, how = "left", left_on = "datetime_for_weather", right_on = "datetime")
	.drop(columns = ["datetime", "datetime_for_weather"])
)

fact_taxi_trips_full.head()

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,fare,tips,tolls,...,pickup_centroid_longitude,dropoff_community_area_id,dropoff_centroid_latitude,dropoff_centroid_longitude,payment_type_id,company_id,temperature,wind_speed,rain,precipitation
0,158a42d78de596df02ceb20cfa22a4c5a72a4144,b3ee94a13b61037620cbbfc6a4a106d2ad628a06f0988f...,2025-08-04 23:45:00,2025-08-04T23:45:00.000,60,0.0,6,25.0,0.0,0.0,...,-87.655998,6,41.944227,-87.655998,3,4,,,,
1,0e10ee7b114d289f4ffaea92f5fe9a1cf394d24a,6c1e4e8e25a1b47575b359c5a0844cf23c50e540a86ecd...,2025-08-04 23:45:00,2025-08-04T23:45:00.000,427,1.2,8,6.75,0.0,0.0,...,-87.633308,8,41.899602,-87.633308,2,1,,,,
2,07187497f109f28d568a52280e6c40c89747e295,fc68d8f57ebef02a03efc9ed6c5ebd4488403fa6458091...,2025-08-04 23:45:00,2025-08-05T00:00:00.000,951,4.95,6,19.7,0.0,0.0,...,-87.655998,24,41.901207,-87.676356,4,11,,,,
3,04e399aaf339cfccde8eab32020dc6334452cbb2,b77a2dcc078698ea493d4d703014076e4272dc7d8b420e...,2025-08-04 23:45:00,2025-08-04T23:45:00.000,420,0.0,33,0.0,0.0,0.0,...,-87.620335,33,41.857184,-87.620335,2,4,,,,
4,019730e0f5b40188b4ebb68c21d05527699f1b99,00c152d49563ae56e208fde25f6c65b08511d0f50259d1...,2025-08-04 23:45:00,2025-08-05T00:15:00.000,1238,16.99,76,41.75,9.25,0.0,...,-87.913625,28,41.874005,-87.663518,4,5,,,,


In [16]:
# continue merging into the full taxi trips df: swap company_id for the company name
# pandas' default inner join applies, so trips whose company_id is missing from dim_company are dropped
fact_taxi_trips_full = (
	fact_taxi_trips_full
	.merge(dim_company, on = "company_id")
	.drop(columns = ["company_id"])
)

In [17]:
# swap payment_type_id for the payment type name
# pandas' default inner join applies, so trips without a matching payment_type_id are dropped
fact_taxi_trips_full = (
	fact_taxi_trips_full
	.merge(dim_payment_type, on = "payment_type_id")
	.drop(columns = ["payment_type_id"])
)

In [18]:
# resolve the pickup area id to its name, then remove the id columns used for the join
fact_taxi_trips_full = (
	fact_taxi_trips_full
	.merge(dim_community_areas, left_on = "pickup_community_area_id", right_on = "area_code")
	.drop(columns = ["pickup_community_area_id", "area_code"])
	.rename(columns = {"community_area" : "pickup_community_area_name"})
)

In [19]:
# resolve the dropoff area id to its name, then remove the id columns used for the join
fact_taxi_trips_full = (
	fact_taxi_trips_full
	.merge(dim_community_areas, left_on = "dropoff_community_area_id", right_on = "area_code")
	.drop(columns = ["dropoff_community_area_id", "area_code"])
	.rename(columns = {"community_area" : "dropoff_community_area_name"})
)

In [20]:
# parse both sides of the upcoming date join as datetimes so the keys compare by value, not as strings
dim_date["date"] = pd.to_datetime(dim_date["date"])
fact_taxi_trips_full["trip_start_timestamp"] = pd.to_datetime(fact_taxi_trips_full["trip_start_timestamp"])
# normalize() resets the time of day to midnight and stays a datetime column,
# avoiding the slow round trip through Python date objects that .dt.date followed by to_datetime takes
fact_taxi_trips_full["trip_start_date"] = fact_taxi_trips_full["trip_start_timestamp"].dt.normalize()

In [None]:
# add the calendar attributes from dim_date by matching on the trip start day (inner join, the pandas default)
fact_taxi_trips_full = fact_taxi_trips_full.merge(
	dim_date,
	how = "inner",
	left_on = "trip_start_date",
	right_on = "date",
)

In [21]:
# the two join keys of the date merge are no longer needed
fact_taxi_trips_full = fact_taxi_trips_full.drop(["trip_start_date", "date"], axis = "columns")

In [22]:
# display the fully merged trips table (the rendering is truncated to the first and last rows)
fact_taxi_trips_full

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,fare,tips,tolls,extras,...,precipitation,company,payment_type,pickup_community_area_name,dropoff_community_area_name,year,month,day,day_of_week,is_weekend
0,158a42d78de596df02ceb20cfa22a4c5a72a4144,b3ee94a13b61037620cbbfc6a4a106d2ad628a06f0988f...,2025-08-04 23:45:00,2025-08-04T23:45:00.000,60,0.00,25.00,0.00,0.0,0.0,...,,Transit Administrative Center Inc,Credit Card,Lake View,Lake View,2025,8,4,1,False
1,0e10ee7b114d289f4ffaea92f5fe9a1cf394d24a,6c1e4e8e25a1b47575b359c5a0844cf23c50e540a86ecd...,2025-08-04 23:45:00,2025-08-04T23:45:00.000,427,1.20,6.75,0.00,0.0,0.0,...,,Flash Cab,Cash,Near North Side,Near North Side,2025,8,4,1,False
2,07187497f109f28d568a52280e6c40c89747e295,fc68d8f57ebef02a03efc9ed6c5ebd4488403fa6458091...,2025-08-04 23:45:00,2025-08-05T00:00:00.000,951,4.95,19.70,0.00,0.0,0.0,...,,Taxicab Insurance Agency Llc,Mobile,Lake View,West Town,2025,8,4,1,False
3,04e399aaf339cfccde8eab32020dc6334452cbb2,b77a2dcc078698ea493d4d703014076e4272dc7d8b420e...,2025-08-04 23:45:00,2025-08-04T23:45:00.000,420,0.00,0.00,0.00,0.0,0.0,...,,Transit Administrative Center Inc,Cash,Near South Side,Near South Side,2025,8,4,1,False
4,019730e0f5b40188b4ebb68c21d05527699f1b99,00c152d49563ae56e208fde25f6c65b08511d0f50259d1...,2025-08-04 23:45:00,2025-08-05T00:15:00.000,1238,16.99,41.75,9.25,0.0,4.0,...,,City Service,Mobile,O'Hare[11],Near West Side,2025,8,4,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394279,53a3dba257e80add0a12c5fcdc17cc40ed64e78e,3faa8cd4b08a27471c13be24573d831fea1445947afb30...,2025-10-15 00:00:00,2025-10-15T00:45:00.000,1942,28.29,67.00,14.70,0.0,6.0,...,,Chicago Independents,Credit Card,O'Hare[11],Near North Side,2025,10,15,3,False
394280,4a429e20742d21693c9b4e0385491509af341034,2e82e26afb77e809fe4a44b02a152bdc079623600ae1b7...,2025-10-15 00:00:00,2025-10-15T00:15:00.000,1140,11.80,30.25,0.00,0.0,4.0,...,,Transit Administrative Center Inc,Cash,Garfield Ridge,Near North Side,2025,10,15,3,False
394281,145ef86e849ff92ff8a425dc2332605a60d54e5c,ffda53354c610fd3af1aee46d723028a49014e35f7280c...,2025-10-15 00:00:00,2025-10-15T00:15:00.000,246,0.94,5.50,0.00,0.0,0.0,...,,Flash Cab,Cash,Near North Side,Near North Side,2025,10,15,3,False
394282,497b4f347e35cf3b2518bcacd54390acb4595889,e2c349c7cbb608d552aa0b5814031943f13641ef9e50d8...,2025-10-15 00:00:00,2025-10-15T00:15:00.000,1140,14.40,36.50,8.10,0.0,4.0,...,,Transit Administrative Center Inc,Credit Card,O'Hare[11],Lincoln Park,2025,10,15,3,False
