In [1]:
from io import StringIO
import json
import os

import boto3
import pandas as pd

In [4]:
# AWS credentials come from the process environment; each is None when its variable is unset.
aws_access_key = os.environ.get("AWS_ACCESS_KEY")
aws_secret_key = os.environ.get("AWS_SECRET_KEY")

In [5]:
# Show only whether the access key was found; displaying the value itself would
# store the credential in the saved notebook output.
aws_access_key is not None

In [6]:
# S3 client built with the access key / secret key values read earlier in the notebook.
s3 = boto3.client(
    "s3",
    aws_access_key_id=aws_access_key,
    aws_secret_access_key=aws_secret_key,
)

In [7]:
# Bucket that holds the transformed data.
bucket = "cubix-chicago-taxi-ab"

# Object keys that each point at a single CSV file.
dim_community_areas_path = "transformed_data/dim_community_areas/dim_community_areas.csv"
dim_company_path = "transformed_data/dim_company/dim_company.csv"
dim_date_path = "transformed_data/dim_date/dim_date.csv"
dim_payment_type_path = "transformed_data/dim_payment_type/dim_payment_type.csv"

# Key prefixes (trailing slash); the files under them are listed and read one by one.
dim_weather_path = "transformed_data/dim_weather/"
fact_taxi_trips_path = "transformed_data/fact_taxi_trips/"

In [8]:
def read_file_from_s3(s3, bucket: str, key: str, file_format: str = "csv"):
    """
    Read a CSV or JSON object from an S3 bucket and parse it.

    :param s3:              S3 client exposing ``get_object(Bucket=..., Key=...)``
    :param bucket:          name of the S3 bucket where the file is stored
    :param key:             path (object key) within the S3 bucket
    :param file_format:     "csv" or "json"
    :return:                a pandas DataFrame for "csv"; the decoded JSON value for "json"
    :raises ValueError:     if ``file_format`` is neither "csv" nor "json"
    """
    # Validate before touching S3: an unsupported format should fail immediately
    # with ValueError instead of after (or instead of) a download attempt.
    if file_format not in ("csv", "json"):
        raise ValueError(f"Unsupported file format: {file_format}")

    response = s3.get_object(Bucket=bucket, Key=key)
    # The whole object body is read into memory and decoded as UTF-8 text.
    content = response["Body"].read().decode("utf-8")

    if file_format == "csv":
        return pd.read_csv(StringIO(content))
    return json.loads(content)

In [9]:
# Never print credential values: notebook outputs are saved with the file and get
# shared or committed. Report only whether each credential was found.
print(f"AWS_ACCESS_KEY set: {aws_access_key is not None}")
print(f"AWS_SECRET_KEY set: {aws_secret_key is not None}")
print(bucket)
print(dim_payment_type_path)

None
None
cubix-chicago-taxi-ab
transformed_data/dim_payment_type/dim_payment_type.csv


In [12]:
# Load the four single-file tables; file_format is left at its "csv" default.
dim_community_areas = read_file_from_s3(s3, bucket=bucket, key=dim_community_areas_path)
dim_company = read_file_from_s3(s3, bucket=bucket, key=dim_company_path)
dim_date = read_file_from_s3(s3, bucket=bucket, key=dim_date_path)
dim_payment_type = read_file_from_s3(s3, bucket=bucket, key=dim_payment_type_path)

NoCredentialsError: Unable to locate credentials

In [11]:
# Print the first row of each loaded table as a quick sanity check.
for dim_frame in (dim_community_areas, dim_company, dim_date, dim_payment_type):
    print(dim_frame.head(1))

NameError: name 'dim_community_areas' is not defined

In [None]:
# Accumulators for the per-file DataFrames; two separate list objects.
fact_taxi_trips_list, dim_weather_list = [], []

In [None]:
# Start from an empty list so re-running this cell does not append every file a second time.
fact_taxi_trips_list = []

# A single list_objects call returns at most 1,000 keys; the paginator walks every page.
taxi_pages = s3.get_paginator("list_objects_v2").paginate(Bucket=bucket, Prefix=fact_taxi_trips_path)

for page in taxi_pages:
    # "Contents" is absent from a page when nothing matches the prefix.
    for file in page.get("Contents", []):
        taxi_key = file["Key"]
        taxi_raw_file_name = taxi_key.split("/")[-1]

        if taxi_raw_file_name.endswith(".csv"):
            # Read by the listed key itself rather than rebuilding it from prefix + file name.
            taxi_trip_daily = read_file_from_s3(s3, bucket, taxi_key)

            fact_taxi_trips_list.append(taxi_trip_daily)
            print(f"{taxi_raw_file_name} has been added")

In [None]:
# Stack the per-file frames into one table with a fresh 0..n-1 index.
fact_taxi_trips = pd.concat(objs=fact_taxi_trips_list, ignore_index=True)

In [None]:
# Start from an empty list so re-running this cell does not append every file a second time.
dim_weather_list = []

# A single list_objects call returns at most 1,000 keys; the paginator walks every page.
weather_pages = s3.get_paginator("list_objects_v2").paginate(Bucket=bucket, Prefix=dim_weather_path)

for page in weather_pages:
    # "Contents" is absent from a page when nothing matches the prefix.
    for file in page.get("Contents", []):
        weather_key = file["Key"]
        weather_raw_file_name = weather_key.split("/")[-1]

        if weather_raw_file_name.endswith(".csv"):
            # Read by the listed key itself rather than rebuilding it from prefix + file name.
            weather_daily = read_file_from_s3(s3, bucket, weather_key)

            dim_weather_list.append(weather_daily)
            print(f"{weather_raw_file_name} has been added")

In [None]:
# Stack the per-file frames into one table with a fresh 0..n-1 index.
dim_weather = pd.concat(objs=dim_weather_list, ignore_index=True)

In [None]:
# Print a summary of the concatenated weather table (columns, dtypes, non-null counts).
dim_weather.info()

## Create the data model

In [None]:
# Join the weather table onto the trips, then discard both join-key columns.
# NOTE(review): merge defaults to an inner join, so trips with no matching
# weather row are dropped — confirm that is intended.
fact_taxi_trips_full = fact_taxi_trips.merge(
    dim_weather,
    left_on="datetime_for_weather",
    right_on="datetime",
).drop(columns=["datetime", "datetime_for_weather"])

fact_taxi_trips_full.head()

In [None]:
# Join the company table on company_id, then drop the key column.
fact_taxi_trips_full = (
    fact_taxi_trips_full
    .merge(dim_company, on="company_id")
    .drop(columns=["company_id"])
)

In [None]:
# Join the payment type table on payment_type_id, then drop the key column.
fact_taxi_trips_full = (
    fact_taxi_trips_full
    .merge(dim_payment_type, on="payment_type_id")
    .drop(columns=["payment_type_id"])
)

In [None]:
# Join the community area table on the pickup area id, drop both key columns,
# and rename the joined community_area column to mark it as the pickup side.
fact_taxi_trips_full = (
    fact_taxi_trips_full
    .merge(dim_community_areas, left_on="pickup_community_area_id", right_on="area_code")
    .drop(columns=["pickup_community_area_id", "area_code"])
    .rename(columns={"community_area": "pickup_community_area_name"})
)

In [None]:
# Join the community area table again, this time on the dropoff area id, drop
# both key columns, and rename community_area to mark it as the dropoff side.
fact_taxi_trips_full = (
    fact_taxi_trips_full
    .merge(dim_community_areas, left_on="dropoff_community_area_id", right_on="area_code")
    .drop(columns=["dropoff_community_area_id", "area_code"])
    .rename(columns={"community_area": "dropoff_community_area_name"})
)

In [None]:
# Parse both sides of the upcoming date join as datetimes.
dim_date["date"] = pd.to_datetime(dim_date["date"])

trip_start_ts = pd.to_datetime(fact_taxi_trips_full["trip_start_timestamp"])
fact_taxi_trips_full["trip_start_timestamp"] = trip_start_ts
# Keep only the calendar date of each trip start, converted back to a datetime column.
fact_taxi_trips_full["trip_start_date"] = pd.to_datetime(trip_start_ts.dt.date)

In [None]:
# Join the date table on the trip start date, then drop both join-key columns.
fact_taxi_trips_full = (
    fact_taxi_trips_full
    .merge(dim_date, left_on="trip_start_date", right_on="date")
    .drop(columns=["trip_start_date", "date"])
)