In [1]:
import os
import pandas as pd
import boto3
from io import StringIO
from dotenv import load_dotenv

# Show up to 50 columns when a DataFrame is displayed.
pd.options.display.max_columns = 50

In [2]:
# Load variables via python-dotenv, then read the AWS credentials from the
# process environment. Either value is None when its variable is not set.
load_dotenv()
aws_access_key = os.environ.get('AWS_ACCESS_KEY')
aws_secret_key = os.environ.get('AWS_SECRET_KEY')

In [3]:
def read_csv_from_s3(bucket: str, path: str, filename: str, s3_client=None) -> pd.DataFrame:
    """
    Reads a CSV file from an Amazon S3 bucket into a Pandas DataFrame.

    Parameters:
    - bucket (str): The name of the Amazon S3 bucket.
    - path (str): The key prefix under which the CSV file is stored. It is
      concatenated with `filename` as-is, so it must include its trailing '/'.
    - filename (str): The name of the CSV file to be read.
    - s3_client (optional): An existing S3 client, i.e. any object exposing
      `get_object(Bucket=..., Key=...)`. When omitted, a new boto3 client is
      created from the module-level `aws_access_key` / `aws_secret_key`.
      Passing a client avoids building a new one for every file.

    Returns:
    - pd.DataFrame: A Pandas DataFrame containing the data read from the CSV file.

    Note:
    - Requires the boto3 library only when `s3_client` is not supplied.
    - Assumes the CSV file is UTF-8 encoded.
    - The whole object is read into memory before it is parsed.
    """
    if s3_client is None:
        s3_client = boto3.client(
            's3',
            aws_access_key_id=aws_access_key,
            aws_secret_access_key=aws_secret_key,
        )

    key = f'{path}{filename}'

    # Separate, descriptive names instead of re-binding the built-in `object`.
    response = s3_client.get_object(Bucket=bucket, Key=key)
    csv_text = response['Body'].read().decode('utf-8')

    return pd.read_csv(StringIO(csv_text))

In [4]:
# Bucket used for every read in this notebook.
bucket = 'chicago-taxi'

# Each dataset is described by a key prefix (ending in '/') and a file name.
# The master / dimension tables each live in one fixed file.
payment_type_path, payment_type_file = 'transformed_data/payment_type/', 'payment_type_master.csv'
community_areas_path, community_areas_file = 'transformed_data/community_areas/', 'community_areas_master.csv'
company_path, company_file = 'transformed_data/company/', 'company_master.csv'
date_path, date_file = 'transformed_data/date/', 'date_dim.csv'

# Weather and taxi-trip data have no fixed file name: the name is left empty.
weather_path, weather_file = 'transformed_data/weather/', ''
taxi_trips_path, taxi_trips_file = 'transformed_data/taxi_trips/', ''

In [5]:
# Module-level S3 client, built with the credentials read earlier.
s3 = boto3.client(
    's3',
    aws_access_key_id=aws_access_key,
    aws_secret_access_key=aws_secret_key,
)

# Load the four single-file tables (arguments are bucket, path, filename).
community_areas = read_csv_from_s3(bucket, community_areas_path, community_areas_file)
company = read_csv_from_s3(bucket, company_path, company_file)
date = read_csv_from_s3(bucket, date_path, date_file)
payment_type = read_csv_from_s3(bucket, payment_type_path, payment_type_file)

In [6]:
# Two independent, empty accumulators for the per-file DataFrames.
trips_list, weather_list = [], []

In [7]:
# Taxi data transformation and loading
# Start from an empty list on every run so that re-executing this cell does
# not append the same files a second time.
trips_list = []

# A single list call returns at most 1000 keys; the paginator walks every page.
taxi_pages = s3.get_paginator('list_objects_v2').paginate(Bucket=bucket, Prefix=taxi_trips_path)
for page in taxi_pages:
    # A page carries no 'Contents' entry when nothing matches the prefix.
    for file in page.get('Contents', []):
        taxi_trip_key = file['Key']
        # Key relative to the prefix; read_csv_from_s3 joins path + filename
        # back together, so nested keys are preserved.
        filename = taxi_trip_key[len(taxi_trips_path):]
        # Skip the folder placeholder and anything that is not a CSV file.
        if not filename.strip() or not filename.endswith('.csv'):
            continue
        trip = read_csv_from_s3(bucket=bucket, path=taxi_trips_path, filename=filename)
        trips_list.append(trip)
        print(f'{filename} has been added.')

taxi_2024-03-15.csv has been added.
taxi_2024-03-16.csv has been added.
taxi_2024-03-17.csv has been added.


In [8]:
# Stack the per-file trip frames row-wise into one frame with a fresh index.
trips_df = pd.concat(objs=trips_list, axis=0, ignore_index=True)

In [9]:
# Preview the first five trip rows.
trips_df.head(n=5)

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,dropoff_community_area_id,fare,tips,tolls,extras,trip_total,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather,payment_type_id,company_id
0,00c6d0cc3d95795a2b9cb3fcbb3afa8eed897ac7,3ae83cc261cea27eafc3d9b18bbc93100c03762e8b6971...,2024-03-15T23:45:00.000,2024-03-16T00:15:00.000,1680,1.0,76,32,42.25,9.55,0.0,5.0,56.8,41.979071,-87.90304,41.884987,-87.620993,2024-03-15 23:00:00,1,2
1,fd729619c5458746ffe602406d2f86cb4059ee35,3c814d3baedca9be4de8ddb5547c7dec404a08e148740e...,2024-03-15T23:45:00.000,2024-03-16T00:15:00.000,1569,18.06,76,8,44.5,14.7,0.0,4.0,63.7,41.980264,-87.913625,41.899602,-87.633308,2024-03-15 23:00:00,1,1
2,f8aef1517ec3953a3805d0912f92f994e1245205,9de14279ac4dc5696c73c13b07b0aaf2b1a1796dda9f4c...,2024-03-15T23:45:00.000,2024-03-15T23:45:00.000,336,1.64,22,24,10.0,2.0,0.0,0.0,12.0,41.922761,-87.699155,41.901207,-87.676356,2024-03-15 23:00:00,4,8
3,efa623a2405100c731e62efc223d3c02ddee3d8a,8a999732f0972dda5aa358ad377427f0cb844b5ec246a9...,2024-03-15T23:45:00.000,2024-03-16T00:00:00.000,1370,5.51,38,8,18.5,0.0,0.0,1.0,19.5,41.812949,-87.61786,41.899602,-87.633308,2024-03-15 23:00:00,3,3
4,ebbf769b25db8056d8ffc0b27f982ac1102eab4e,7ff3ea8c15d902e432f0f3ca3aab1d5f20bff4c4fedfb5...,2024-03-15T23:45:00.000,2024-03-15T23:45:00.000,600,2.3,6,8,9.0,2.1,0.0,1.0,12.1,41.944227,-87.655998,41.899602,-87.633308,2024-03-15 23:00:00,1,2


In [10]:
# Weather data transformation and loading
# Start from an empty list on every run so that re-executing this cell does
# not append the same files a second time.
weather_list = []

# A single list call returns at most 1000 keys; the paginator walks every page.
weather_pages = s3.get_paginator('list_objects_v2').paginate(Bucket=bucket, Prefix=weather_path)
for page in weather_pages:
    # A page carries no 'Contents' entry when nothing matches the prefix.
    for file in page.get('Contents', []):
        weather_key = file['Key']
        # Key relative to the prefix; read_csv_from_s3 joins path + filename
        # back together, so nested keys are preserved.
        filename = weather_key[len(weather_path):]
        # Skip the folder placeholder and anything that is not a CSV file.
        if not filename.strip() or not filename.endswith('.csv'):
            continue
        weather = read_csv_from_s3(bucket=bucket, path=weather_path, filename=filename)
        weather_list.append(weather)
        print(f'{filename} has been added.')


weather_2024-03-15.csv has been added.
weather_2024-03-16.csv has been added.
weather_2024-03-17.csv has been added.


In [11]:
# Stack the per-file weather frames row-wise into one frame with a fresh index.
weather_df = pd.concat(objs=weather_list, axis=0, ignore_index=True)

In [12]:
# Preview the first five weather rows.
weather_df.head(n=5)

Unnamed: 0,datetime,temperature,wind,precipitation,rain
0,2024-03-15 00:00:00,6.5,36.4,0.1,0.1
1,2024-03-15 01:00:00,5.8,36.1,0.0,0.0
2,2024-03-15 02:00:00,4.8,30.5,0.0,0.0
3,2024-03-15 03:00:00,4.7,29.3,0.0,0.0
4,2024-03-15 04:00:00,4.4,27.1,0.0,0.0


In [13]:
# Preview the community-area lookup table.
community_areas.head(n=5)

Unnamed: 0,Area code,Area name
0,1,Rogers Park
1,2,West Ridge
2,3,Uptown
3,4,Lincoln Square
4,5,North Center


In [14]:
# Preview the company lookup table.
company.head(n=5)

Unnamed: 0,company_id,company
0,1,Sun Taxi
1,2,Taxi Affiliation Services
2,3,Flash Cab
3,4,Blue Ribbon Taxi Association
4,5,5 Star Taxi


In [15]:
# Preview the date dimension table.
date.head(n=5)

Unnamed: 0,date,year,month,month name,day,day of week,day of name,is_weekend,is_holiday
0,2023-01-01,2023,1,January,1,7,Sunday,True,True
1,2023-01-02,2023,1,January,2,1,Monday,False,False
2,2023-01-03,2023,1,January,3,2,Tuesday,False,False
3,2023-01-04,2023,1,January,4,3,Wednesday,False,False
4,2023-01-05,2023,1,January,5,4,Thursday,False,False


In [16]:
# Preview the payment-type lookup table.
payment_type.head(n=5)

Unnamed: 0,payment_type_id,payment_type
0,1,Credit Card
1,2,Cash
2,3,Prcard
3,4,Mobile
4,5,Unknown


#### Join data

In [17]:
# Attach the matching weather row to every trip. The inner join keeps only
# trips whose 'datetime_for_weather' has a matching weather row.
# validate='many_to_one' makes pandas raise MergeError if the same 'datetime'
# appears more than once in weather_df (for example when the concatenated
# files overlap), instead of silently duplicating trips.
trips_full_df = pd.merge(
    trips_df,
    weather_df,
    left_on='datetime_for_weather',
    right_on='datetime',
    how='inner',
    validate='many_to_one',
)
# The weather-side 'datetime' column holds the same value, so the trip-side
# join key is dropped.
trips_full_df = trips_full_df.drop(columns='datetime_for_weather')

In [18]:
# Replace the company_id key with the company name from the lookup table.
trips_full_df = (
    trips_full_df
    .merge(company, on='company_id', how='inner')
    .drop(columns='company_id')
)

In [19]:
# Replace the payment_type_id key with the payment type name from the lookup table.
trips_full_df = (
    trips_full_df
    .merge(payment_type, on='payment_type_id', how='inner')
    .drop(columns='payment_type_id')
)

In [20]:
# Resolve the pickup area id to its name: join on the area code, drop both
# key columns, and label the name column as the pickup area.
trips_full_df = (
    trips_full_df
    .merge(community_areas, left_on='pickup_community_area_id', right_on='Area code', how='inner')
    .drop(columns=['pickup_community_area_id', 'Area code'])
    .rename(columns={'Area name': 'pickup_community_area'})
)

In [21]:
# Preview the joined frame after the pickup-area join.
trips_full_df.head(n=5)

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,dropoff_community_area_id,fare,tips,tolls,extras,trip_total,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime,temperature,wind,precipitation,rain,company,payment_type,pickup_community_area
0,00c6d0cc3d95795a2b9cb3fcbb3afa8eed897ac7,3ae83cc261cea27eafc3d9b18bbc93100c03762e8b6971...,2024-03-15T23:45:00.000,2024-03-16T00:15:00.000,1680,1.0,32,42.25,9.55,0.0,5.0,56.8,41.979071,-87.90304,41.884987,-87.620993,2024-03-15 23:00:00,4.7,9.8,0.0,0.0,Taxi Affiliation Services,Credit Card,O'Hare
1,ac079f75688c4e0a8b01c2548807d31f817bdedc,196d0cab116da01f1cb8685842d0ce1052d6e0b368d7d5...,2024-03-15T23:45:00.000,2024-03-16T00:15:00.000,1680,0.9,8,40.0,9.3,0.0,6.0,55.3,41.979071,-87.90304,41.892508,-87.626215,2024-03-15 23:00:00,4.7,9.8,0.0,0.0,Taxi Affiliation Services,Credit Card,O'Hare
2,78d3c2efc2305fc04ee277ed8caa5d54d5c2123d,b5a0dd68699c0bb8cfe22442f438e3291ad2f57aa69241...,2024-03-15T23:45:00.000,2024-03-16T00:15:00.000,1560,16.3,8,40.5,7.7,0.0,4.0,52.2,41.979071,-87.90304,41.907492,-87.63576,2024-03-15 23:00:00,4.7,9.8,0.0,0.0,Taxi Affiliation Services,Credit Card,O'Hare
3,39dc5203647427362c1c001601b81b73a76cffca,42dc625d3d8ac379a07acaf3a3bea2ac319ad318629df1...,2024-03-15T23:45:00.000,2024-03-16T00:15:00.000,1680,13.5,6,34.75,11.75,0.0,4.0,50.5,41.980264,-87.913625,41.944227,-87.655998,2024-03-15 23:00:00,4.7,9.8,0.0,0.0,Taxi Affiliation Services,Credit Card,O'Hare
4,37046939504e67d740f2a6d22dfb0f4337694806,83a03e93bf25ce653700ec8360c79b5f393ab5c9167f31...,2024-03-15T23:45:00.000,2024-03-16T00:15:00.000,1740,15.4,7,39.5,8.8,0.0,4.0,52.3,41.980264,-87.913625,41.922686,-87.649489,2024-03-15 23:00:00,4.7,9.8,0.0,0.0,Taxi Affiliation Services,Credit Card,O'Hare


In [23]:
# Resolve the dropoff area id to its name: join on the area code, drop both
# key columns, and label the name column as the dropoff area.
trips_full_df = (
    trips_full_df
    .merge(community_areas, left_on='dropoff_community_area_id', right_on='Area code', how='inner')
    .drop(columns=['dropoff_community_area_id', 'Area code'])
    .rename(columns={'Area name': 'dropoff_community_area'})
)

In [25]:
# Parse the 'date' column into datetimes, then show the resulting dtypes.
date = date.assign(date=pd.to_datetime(date['date']))
date.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1827 entries, 0 to 1826
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   date         1827 non-null   datetime64[ns]
 1   year         1827 non-null   int64         
 2   month        1827 non-null   int64         
 3   month name   1827 non-null   object        
 4   day          1827 non-null   int64         
 5   day of week  1827 non-null   int64         
 6   day of name  1827 non-null   object        
 7   is_weekend   1827 non-null   bool          
 8   is_holiday   1827 non-null   bool          
dtypes: bool(2), datetime64[ns](1), int64(4), object(2)
memory usage: 103.6+ KB


In [26]:
# Parse the trip start timestamp into datetimes.
# NOTE(review): 'trip_end_timestamp' is not converted here and stays as text;
# confirm whether that is intended.
trips_full_df = trips_full_df.assign(
    trip_start_timestamp=pd.to_datetime(trips_full_df['trip_start_timestamp'])
)

In [30]:
# Calendar date of the trip start, kept as a datetime at midnight.
# .dt.normalize() zeroes the time part directly, avoiding the round trip
# through Python date objects (.dt.date followed by pd.to_datetime).
trips_full_df['trip_start_date'] = trips_full_df['trip_start_timestamp'].dt.normalize()

In [33]:
# Add the calendar attributes from the date table to each trip, matching the
# trip start date against 'date'.
trips_full_df = trips_full_df.merge(
    date,
    left_on='trip_start_date',
    right_on='date',
    how='inner',
)

In [35]:
# Remove the 'date' column brought in by the date join, then preview the result.
trips_full_df = trips_full_df.drop(columns=['date'])
trips_full_df.head(n=5)

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,fare,tips,tolls,extras,trip_total,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime,temperature,wind,precipitation,rain,company,payment_type,pickup_community_area,dropoff_community_area,trip_start_date,year,month,month name,day,day of week,day of name,is_weekend,is_holiday
0,00c6d0cc3d95795a2b9cb3fcbb3afa8eed897ac7,3ae83cc261cea27eafc3d9b18bbc93100c03762e8b6971...,2024-03-15 23:45:00,2024-03-16T00:15:00.000,1680,1.0,42.25,9.55,0.0,5.0,56.8,41.979071,-87.90304,41.884987,-87.620993,2024-03-15 23:00:00,4.7,9.8,0.0,0.0,Taxi Affiliation Services,Credit Card,O'Hare,Loop,2024-03-15,2024,3,March,15,5,Friday,False,False
1,cf1470edb5fc3c790282405d7b20adc84451a4e3,515dbaaba624daeb95c3dfefb93bfc1764b99ed2ff96b7...,2024-03-15 22:30:00,2024-03-15T23:00:00.000,1920,24.6,60.0,13.7,0.0,8.0,81.7,41.979071,-87.90304,41.880994,-87.632746,2024-03-15 22:00:00,4.9,12.0,0.0,0.0,Taxi Affiliation Services,Credit Card,O'Hare,Loop,2024-03-15,2024,3,March,15,5,Friday,False,False
2,a4656ce1644f3c657e2be72b09a994797456b3ce,b3ee94a13b61037620cbbfc6a4a106d2ad628a06f0988f...,2024-03-15 22:00:00,2024-03-15T22:45:00.000,2700,17.7,44.5,12.5,0.0,5.0,62.0,41.979071,-87.90304,41.880994,-87.632746,2024-03-15 22:00:00,4.9,12.0,0.0,0.0,Taxi Affiliation Services,Credit Card,O'Hare,Loop,2024-03-15,2024,3,March,15,5,Friday,False,False
3,4b708c1a295f2c1e90a047074fdca0e5d5025514,c4b8125a4c89c864b1f5dfc240ff17752f46dc27f574cf...,2024-03-15 22:00:00,2024-03-15T22:45:00.000,2640,1.1,44.25,3.0,0.0,5.0,52.25,41.979071,-87.90304,41.884987,-87.620993,2024-03-15 22:00:00,4.9,12.0,0.0,0.0,Taxi Affiliation Services,Credit Card,O'Hare,Loop,2024-03-15,2024,3,March,15,5,Friday,False,False
4,1272df92a26cbed397fe070054553105ecfa5107,268102b5c5c93024c9b7ea7629e1d5d4b04cea3388961a...,2024-03-15 21:30:00,2024-03-15T22:15:00.000,2820,17.6,44.5,0.5,0.0,4.0,49.0,41.979071,-87.90304,41.884987,-87.620993,2024-03-15 21:00:00,5.1,14.8,0.0,0.0,Taxi Affiliation Services,Credit Card,O'Hare,Loop,2024-03-15,2024,3,March,15,5,Friday,False,False
