In [None]:
from io import StringIO
import os

import boto3
import pandas as pd

# Show up to 50 columns when a wide DataFrame is rendered in a cell output.
pd.options.display.max_columns = 50

In [None]:
# AWS credentials are read from the environment so that no secret is stored in
# the notebook itself; each value is None when its variable is not set.
aws_access_key_id = os.environ.get("AWS_ACCESS_KEY")
aws_secret_key = os.environ.get('AWS_SECRET_KEY')

In [None]:
def read_csv_from_s3(bucket: str, path: str, filename: str, s3_client=None) -> pd.DataFrame:
    """
    Reads a CSV file from an S3 bucket and returns it as a pandas DataFrame.

    The object key is the plain concatenation ``path + filename``; no separator
    is inserted, so ``path`` must already end with '/' when it names a folder.

    Parameters:
        bucket (str): The name of the S3 bucket.
        path (str): The key prefix within the S3 bucket where the file is located.
        filename (str): The name of the CSV file.
        s3_client: Optional, already constructed S3 client to reuse across calls.
            When omitted, a new client is created from the notebook-level
            ``aws_access_key_id`` and ``aws_secret_key`` values.

    Returns:
        pd.DataFrame: DataFrame containing the contents of the CSV file.

    Raises:
        UnicodeDecodeError: If the object body is not valid UTF-8.
    """
    # Only build a client when the caller did not supply one, so loops that
    # read many files can share a single client.
    if s3_client is None:
        s3_client = boto3.client(
            's3',
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_key,
        )

    object_key = f'{path}{filename}'

    response = s3_client.get_object(Bucket=bucket, Key=object_key)
    # The whole body is read into memory and decoded before parsing.
    csv_text = response['Body'].read().decode('utf-8')

    return pd.read_csv(StringIO(csv_text))

In [None]:
# Build the client with the same explicit credentials as every other cell,
# instead of relying on whatever default credentials happen to be configured.
s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_key)
bucket = 'cubix-chicago-taxi-a8i9zm'

# Key prefixes of the dimension (lookup) tables inside the bucket.
community_area_path = 'transformed_data/community_areas/'
company_path = 'transformed_data/company/'
date_path = 'transformed_data/date/'
payment_type_path = 'transformed_data/payment_type/'

# Each dimension is stored as a single CSV file under its prefix.
community_area = read_csv_from_s3(bucket=bucket, path=community_area_path, filename='community_areas_master.csv')
company = read_csv_from_s3(bucket=bucket, path=company_path, filename='company_master.csv')
date = read_csv_from_s3(bucket=bucket, path=date_path, filename='date_dimension.csv')
payment_type = read_csv_from_s3(bucket=bucket, path=payment_type_path, filename='payment_type_master.csv')


In [None]:
s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_key)
taxi_trips_path = 'transformed_data/taxi_trips/'
trips_list = []

# Load every taxi-trip CSV stored under the prefix.
# A paginator is used because a single list call returns at most 1,000 keys;
# pages without a 'Contents' entry (nothing under the prefix) are skipped
# instead of raising KeyError.
taxi_trip_pages = s3.get_paginator('list_objects_v2').paginate(Bucket=bucket, Prefix=taxi_trips_path)

for page in taxi_trip_pages:
    for file in page.get('Contents', []):
        taxi_trip_key = file['Key']

        # Only CSV objects are data files; this check also skips a "folder"
        # placeholder key that ends with '/'.
        if not taxi_trip_key.endswith('.csv'):
            continue

        # Name relative to the prefix, so a key nested below the prefix is
        # read from its real location rather than from the prefix root.
        filename = taxi_trip_key[len(taxi_trips_path):]

        trip = read_csv_from_s3(bucket=bucket, path=taxi_trips_path, filename=filename)

        trips_list.append(trip)
        print(f'{filename} has been added.')

In [None]:
# Stack the per-file trip frames row-wise into one table with a new 0..n-1 index.
trips = pd.concat(objs=trips_list, axis=0, ignore_index=True)

In [None]:
s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_key)
weather_path = 'transformed_data/weather/'
weather_list = []

# Load every weather CSV stored under the prefix.
# A paginator is used because a single list call returns at most 1,000 keys;
# pages without a 'Contents' entry (nothing under the prefix) are skipped
# instead of raising KeyError.
weather_pages = s3.get_paginator('list_objects_v2').paginate(Bucket=bucket, Prefix=weather_path)

for page in weather_pages:
    for file in page.get('Contents', []):
        weather_key = file['Key']

        # Only CSV objects are data files; this check also skips a "folder"
        # placeholder key that ends with '/'.
        if not weather_key.endswith('.csv'):
            continue

        # Name relative to the prefix, so a key nested below the prefix is
        # read from its real location rather than from the prefix root.
        filename = weather_key[len(weather_path):]

        # Per-file frame; kept under its own name so it is not confused with
        # the combined weather table built from weather_list.
        weather_part = read_csv_from_s3(bucket=bucket, path=weather_path, filename=filename)

        weather_list.append(weather_part)
        print(f'{filename} has been added.')

In [None]:
# Stack the per-file weather frames row-wise into one table with a new 0..n-1 index.
weather = pd.concat(objs=weather_list, axis=0, ignore_index=True)

In [116]:
# Denormalise the trips: attach weather and every dimension table, dropping
# each join key once it has been resolved. All joins are inner joins, so a
# trip without a match in any of the tables is removed from the result.
trips_full = (
    trips
    # Weather observation for the trip's weather timestamp.
    .merge(weather, left_on='datetime_for_weather', right_on='datetime', how='inner')
    .drop(columns=['datetime'])
    # Pickup community area: id -> name.
    .merge(community_area, left_on='pickup_community_area_id', right_on='area_code', how='inner')
    .drop(columns=['pickup_community_area_id', 'area_code'])
    .rename(columns={'community_name': 'pickup_community_area_name'})
    # Dropoff community area: id -> name (same lookup table, second pass).
    .merge(community_area, left_on='dropoff_community_area_id', right_on='area_code', how='inner')
    .drop(columns=['dropoff_community_area_id', 'area_code'])
    .rename(columns={'community_name': 'dropoff_community_area_name'})
    # Company and payment type lookups.
    .merge(company, on='company_id', how='inner')
    .drop(columns=['company_id'])
    .merge(payment_type, on='payment_type_id', how='inner')
    .drop(columns=['payment_type_id'])
)

In [117]:
# Parse the key column of the date dimension so it can be joined on datetime values.
date['date'] = pd.to_datetime(date['date'])

trips_full['trip_start_timestamp'] = pd.to_datetime(trips_full['trip_start_timestamp'])

# Midnight of the day the trip started. normalize() stays in the datetime64
# dtype, avoiding a round trip through Python date objects.
trips_full['trip_start_date'] = trips_full['trip_start_timestamp'].dt.normalize()

# Attach the date dimension columns, then drop the duplicated join key.
# NOTE: this cell reassigns trips_full, so re-running it without first
# re-running the previous cell merges the date columns a second time.
trips_full = (
    trips_full
    .merge(date, left_on='trip_start_date', right_on='date', how='inner')
    .drop(columns=['date'])
)

In [118]:
# Preview the first five rows of the fully joined trips table.
trips_full.head(n=5)

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,fare,tips,tolls,extras,trip_total,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather,temperature,wind_speed,precipitation,rain,pickup_community_area_name,dropoff_community_area_name,company,payment_type,trip_start_date,year,month,day,day_of_week,is_weekend
0,00c6d0cc3d95795a2b9cb3fcbb3afa8eed897ac7,3ae83cc261cea27eafc3d9b18bbc93100c03762e8b6971...,2024-03-15 23:45:00,2024-03-16T00:15:00.000,1680,1.0,42.25,9.55,0.0,5.0,56.8,41.979071,-87.90304,41.884987,-87.620993,2024-03-15 23:00:00,4.7,9.8,0.0,0.0,O'Hare,Loop,Taxi Affiliation Services,Credit Card,2024-03-15,2024,3,15,5,False
1,fd729619c5458746ffe602406d2f86cb4059ee35,3c814d3baedca9be4de8ddb5547c7dec404a08e148740e...,2024-03-15 23:45:00,2024-03-16T00:15:00.000,1569,18.06,44.5,14.7,0.0,4.0,63.7,41.980264,-87.913625,41.899602,-87.633308,2024-03-15 23:00:00,4.7,9.8,0.0,0.0,O'Hare,Near North Side,Sun Taxi,Credit Card,2024-03-15,2024,3,15,5,False
2,f8aef1517ec3953a3805d0912f92f994e1245205,9de14279ac4dc5696c73c13b07b0aaf2b1a1796dda9f4c...,2024-03-15 23:45:00,2024-03-15T23:45:00.000,336,1.64,10.0,2.0,0.0,0.0,12.0,41.922761,-87.699155,41.901207,-87.676356,2024-03-15 23:00:00,4.7,9.8,0.0,0.0,Logan Square,West Town,City Service,Mobile,2024-03-15,2024,3,15,5,False
3,efa623a2405100c731e62efc223d3c02ddee3d8a,8a999732f0972dda5aa358ad377427f0cb844b5ec246a9...,2024-03-15 23:45:00,2024-03-16T00:00:00.000,1370,5.51,18.5,0.0,0.0,1.0,19.5,41.812949,-87.61786,41.899602,-87.633308,2024-03-15 23:00:00,4.7,9.8,0.0,0.0,Grand Boulevard,Near North Side,Flash Cab,Prcard,2024-03-15,2024,3,15,5,False
4,ebbf769b25db8056d8ffc0b27f982ac1102eab4e,7ff3ea8c15d902e432f0f3ca3aab1d5f20bff4c4fedfb5...,2024-03-15 23:45:00,2024-03-15T23:45:00.000,600,2.3,9.0,2.1,0.0,1.0,12.1,41.944227,-87.655998,41.899602,-87.633308,2024-03-15 23:00:00,4.7,9.8,0.0,0.0,Lake View,Near North Side,Taxi Affiliation Services,Credit Card,2024-03-15,2024,3,15,5,False
