In [2]:
import json, os, requests
from dateutil.relativedelta import relativedelta 
from datetime import datetime
import pandas as pd
# Show up to 30 columns when a DataFrame is rendered in the notebook.
pd.options.display.max_columns = 30

In [3]:
'''
1. get the data from S3
2. weather_data transformations
3. taxi_trips transformations
4. update payment_type_master
5. update company_master
6. update taxi_trips with company and payment_type ids (replace the string values with ids from the latest master tables)
7. upload weather_data to S3
8. upload taxi data to S3
9. upload the newest payment_type_master and company_master
'''

'\n1. get the data from S3\n2. weather_data transformations\n3. taxi_trips transformations\n4. update payment_type_master\n5. update company_master\n6. update taxi_trips with company and payment_type ids (replace the string values with ids from the latest master tables)\n7. upload weather_data to S3\n8. upload taxi data to S3\n9. upload the newest payment_type_master and company_master\n'

### taxi_trips transformation code

In [4]:
# Target day for the query: the date exactly two months before the moment this cell runs.
current_datetime = datetime.now() - relativedelta(months=2)

formatted_datetime = current_datetime.strftime('%Y-%m-%d')

# Request every trip whose start timestamp falls on that day (00:00:00-23:59:59),
# capped at 30000 rows by `$limit`. The resulting string is unchanged; it is only
# split across lines for readability.
url = (
    "https://data.cityofchicago.org/resource/ajtu-isnz.json"
    f"?$where=trip_start_timestamp >= '{formatted_datetime}T00:00:00.000'"
    f" AND trip_start_timestamp <= '{formatted_datetime}T23:59:59.000'"
    "&$limit=30000"
)
# The app token is read from the environment so it is never stored in the notebook.
headers = {'X-App-Token': os.environ.get('CHICAGO_API_TOKEN')}

# TLS certificate verification is left at its default (enabled). If a proxy with a
# private CA is in the way, pass `verify='/path/to/ca-bundle.pem'` rather than
# disabling verification. The timeout keeps a stalled connection from hanging the kernel.
response = requests.get(url, headers=headers, timeout=60)

# Fail loudly on a 4xx/5xx answer instead of parsing an error payload as trip data.
response.raise_for_status()

data = response.json()



In [5]:
# Load the parsed JSON response into a DataFrame.
taxi_trips = pd.DataFrame(data=data)

In [6]:
# Remove the census-tract and centroid-location columns.
# `errors='ignore'` keeps this cell re-runnable (a second run no longer raises KeyError)
# and tolerates a response in which one of these columns is absent.
taxi_trips = taxi_trips.drop(
    columns=[
        'pickup_census_tract',
        'dropoff_census_tract',
        'pickup_centroid_location',
        'dropoff_centroid_location',
    ],
    errors='ignore',
)

In [7]:
# Discard, in place, every row that has at least one missing value.
taxi_trips.dropna(axis=0, how='any', inplace=True)

In [8]:
# Add an `_id` suffix to the two community-area columns (renamed in place).
taxi_trips.rename(
    columns=dict(
        pickup_community_area='pickup_community_area_id',
        dropoff_community_area='dropoff_community_area_id',
    ),
    inplace=True,
)

In [12]:
# Parse the trip start time and round it down to the full hour; stored in a new
# `datetime_for_weather` column.
taxi_trips['datetime_for_weather'] = (
    pd.to_datetime(taxi_trips['trip_start_timestamp'])
    .dt.floor(freq='h')
)

In [10]:
# `trip_start_timestamp` comes out of the JSON response as plain strings, and the `.dt`
# accessor only works on datetime-like values, so the column is parsed with
# `pd.to_datetime` before being rounded down to the hour.
taxi_trips["datetime_for_weather"] = pd.to_datetime(taxi_trips['trip_start_timestamp']).dt.floor('h')

#### taxi_trips transformation function


In [16]:
def taxi_trips_transformations(taxi_trips: pd.DataFrame) -> pd.DataFrame:
    """
    Perform transformations on a DataFrame containing taxi trip data.

    The input DataFrame is not modified; a new DataFrame is returned.

    Parameters:
    - taxi_trips (pd.DataFrame): A pandas DataFrame containing taxi trip data.
      It must contain a 'trip_start_timestamp' column.

    Returns:
    - pd.DataFrame: A new DataFrame with the following transformations applied, in order:
        - Columns 'pickup_census_tract', 'dropoff_census_tract', 'pickup_centroid_location',
          and 'dropoff_centroid_location' are dropped. A column that is already absent is
          reported with a printed warning and otherwise ignored.
        - Rows with missing values are dropped.
        - Columns 'pickup_community_area' and 'dropoff_community_area' are renamed to
          'pickup_community_area_id' and 'dropoff_community_area_id' respectively.
        - A new column 'datetime_for_weather' is created, containing the
          'trip_start_timestamp' value rounded down to the hour.

    Raises:
    - TypeError: If taxi_trips is not a pandas DataFrame.
    - KeyError: If the 'trip_start_timestamp' column is missing.
    - ValueError: If 'trip_start_timestamp' contains values that cannot be parsed as datetimes.
    """
    if not isinstance(taxi_trips, pd.DataFrame):
        raise TypeError('taxi_trips is not a valid pandas DataFrame')

    columns_to_drop = [
        'pickup_census_tract',
        'dropoff_census_tract',
        'pickup_centroid_location',
        'dropoff_centroid_location',
    ]

    # Tell the user about expected-but-absent columns; their absence is not fatal
    # because the drop below uses errors='ignore'.
    for column in columns_to_drop:
        if column not in taxi_trips.columns:
            print(f"Warning: Column '{column}' not found in DataFrame.")

    # Single pass, no in-place mutation: every step returns a new frame, so the caller's
    # DataFrame stays untouched and calling the function again on the same input is safe.
    # The columns are dropped before dropna() so that missing values in those columns
    # do not cause rows to be discarded.
    return (
        taxi_trips
        .drop(columns=columns_to_drop, errors='ignore')
        .dropna()
        .rename(columns={
            'pickup_community_area': 'pickup_community_area_id',
            'dropoff_community_area': 'dropoff_community_area_id',
        })
        .assign(
            datetime_for_weather=lambda trips: pd.to_datetime(trips['trip_start_timestamp']).dt.floor('h')
        )
    )
