In [1]:
import pandas as pd
import requests
import os
from dateutil.relativedelta import relativedelta
from datetime import datetime as dt
import json

pd.set_option('display.max_columns',30)

In [2]:
"""
Steps to do

1. Get the data from S3
2. weather data transformations
3. taxi trips transformation - Done
4. Update payment type - Done
5. Update company - Done
6. update taxi trips with company and payment types (replace str values with ids from the latest master tables) - Done
7. upload the weather to S3
8. upload the taxi data to S3
9. upload the newest payment type and company

"""

'\nSteps to do\n\n1. Get the data from S3\n2. weather data transformations\n3. Do the transformation like in 06 notebooks\n4. Update payment type\n5. Update company\n6. update taxi trips with company and paymant types (replace str values with ids from the latest master tables)\n7. upload the weather to S3\n8. upload the taxi data to S3\n9. upload the newest payment type and company\n\n'

#### Taxi trips transformation code

In [3]:
# Query window: the single calendar day exactly two months before today.
start_date = (dt.now() - relativedelta(months=2)).strftime("%Y-%m-%d")
url_new = f"https://data.cityofchicago.org/resource/ajtu-isnz.json?$where=trip_start_timestamp >= '{start_date}T00:00:00' AND trip_start_timestamp <= '{start_date}T23:59:59'&$limit=213000000"
# A timeout keeps the cell from hanging forever on a stalled connection.
response_taxi = requests.get(url_new, timeout=120)
# Fail loudly on an HTTP error instead of loading an error payload into the DataFrame.
response_taxi.raise_for_status()
taxi_data = response_taxi.json()

In [4]:
# Load the decoded JSON response into a DataFrame.
taxi_trips = pd.DataFrame(taxi_data)

In [5]:
# Re-assign instead of mutating in place so the cell can be re-run safely;
# errors='ignore' tolerates columns that are already gone or absent from the response.
taxi_trips = taxi_trips.drop(
    columns=['pickup_census_tract', 'dropoff_census_tract',
             'pickup_centroid_location', 'dropoff_centroid_location'],
    errors='ignore',
).dropna()

In [6]:
# Mark the community-area columns as ids; re-assignment avoids in-place mutation.
taxi_trips = taxi_trips.rename(columns={
    'pickup_community_area': 'pickup_community_area_id',
    'dropoff_community_area': 'dropoff_community_area_id',
})

In [7]:
# Floor (not round) each trip start to the top of its hour; lowercase 'h' is the
# hourly alias that newer pandas versions expect ('H' is deprecated).
taxi_trips['datetime_for_weather'] = pd.to_datetime(taxi_trips['trip_start_timestamp']).dt.floor('h')

In [8]:
taxi_trips.head()

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,dropoff_community_area_id,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather
0,cfa0129694e7ae52df28e1bd6c2930e19bb17b1f,cce8c54f19d692acc5dd45d391a8bb964d15af67f314d1...,2024-03-11T23:45:00.000,2024-03-11T23:45:00.000,60,0.3,5,5,4.0,0,0,0,4.0,Cash,Taxi Affiliation Services,41.947791586,-87.683834942,41.947791586,-87.683834942,2024-03-11 23:00:00
4,b64c0785cf2bb9c826106aabe14750a5a0256d37,6e6b305564917024d22c3818670644f3ac4a45fd200490...,2024-03-11T23:45:00.000,2024-03-12T00:15:00.000,1680,0.0,56,8,34.5,8,0,5,47.5,Credit Card,Taxi Affiliation Services,41.785998518,-87.750934289,41.89503345,-87.619710672,2024-03-11 23:00:00
6,a62e5f0d8c1d23c5d1d833300c8b593b424c02c0,00aeef49089b36ed668920fac1119a26fb25d1b48102a4...,2024-03-11T23:45:00.000,2024-03-12T00:00:00.000,780,0.2,76,76,14.5,3,0,4,21.5,Credit Card,Taxi Affiliation Services,41.980264315,-87.913624596,41.980264315,-87.913624596,2024-03-11 23:00:00
7,9d4d76a8b082e72e03528a0baac3f7e42e1c7d2c,9492b268e840fcd19b554ae0d61ab86a48eee56b7fba98...,2024-03-11T23:45:00.000,2024-03-12T00:00:00.000,953,8.77,33,3,23.5,0,0,0,23.5,Prcard,City Service,41.857183858,-87.620334624,41.96581197,-87.655878786,2024-03-11 23:00:00
9,94db3b78a9c94026f94460b7e429cda6ebd68056,b5d74b92d646ed1edaff1b165977d2a4a5d06879202484...,2024-03-11T23:45:00.000,2024-03-12T00:00:00.000,1374,12.26,76,2,31.75,0,0,0,31.75,Cash,Sun Taxi,41.980264315,-87.913624596,42.001571027,-87.695012589,2024-03-11 23:00:00


#### Taxi trip transformation function

In [9]:
def taxi_trips_transformations(taxi_trips: pd.DataFrame) -> pd.DataFrame:
    """
    Clean a raw taxi trips DataFrame and add the hourly key used for weather matching.

    The input frame is left untouched; a new DataFrame is returned, so the
    function can be called repeatedly on the same raw data.

    Parameters:
    - taxi_trips (pd.DataFrame): DataFrame containing taxi trips data. Must contain a
      'trip_start_timestamp' column that ``pd.to_datetime`` can parse.

    Returns:
    - pd.DataFrame: New DataFrame with the census-tract and centroid-location columns
      removed (when present), rows containing NaN values dropped, the community-area
      columns renamed to ``*_id``, and a 'datetime_for_weather' column holding the
      trip start floored (not rounded) to the hour.

    Raises:
    - TypeError: If taxi_trips is not a pandas DataFrame.
    """
    if not isinstance(taxi_trips, pd.DataFrame):
        raise TypeError('taxi_trips is not a valid data frame')

    unused_columns = [
        'pickup_census_tract', 'dropoff_census_tract',
        'pickup_centroid_location', 'dropoff_centroid_location',
    ]
    id_column_names = {
        'pickup_community_area': 'pickup_community_area_id',
        'dropoff_community_area': 'dropoff_community_area_id',
    }

    return (
        taxi_trips
        # errors='ignore': a column missing from the input must not abort the transformation.
        .drop(columns=unused_columns, errors='ignore')
        .dropna()
        .rename(columns=id_column_names)
        # Lowercase 'h' is the hourly alias; uppercase 'H' is deprecated in newer pandas.
        .assign(datetime_for_weather=lambda trips: pd.to_datetime(trips['trip_start_timestamp']).dt.floor('h'))
    )

#### Company update code

In [10]:
# Distinct company names in order of first appearance, numbered from 1.
distinct_companies = taxi_trips['company'].drop_duplicates().reset_index(drop=True)
company_ids = range(1, len(distinct_companies) + 1)
company_master = pd.DataFrame({'company_id': company_ids, 'company': distinct_companies})

In [11]:
company_master.tail()

Unnamed: 0,company_id,company
25,26,2733 - 74600 Benny Jona
26,27,6574 - Babylon Express Inc.
27,28,3556 - 36214 RC Andrews Cab
28,29,5167 - 71969 5167 Taxi Inc
29,30,Petani Cab Corp


In [12]:
# Hand-made sample used to try out the "find new companies" logic.
new_company_data = [{'company': name} for name in ('Petani Cab Corp', 'X', 'Y')]

new_company_mapping = pd.DataFrame(new_company_data)

In [13]:
new_company_mapping

Unnamed: 0,company
0,Petani Cab Corp
1,X
2,Y


In [14]:
# Highest id currently in the master; ids for new companies continue after it.
company_max_id = company_master['company_id'].max()

In [15]:
# Names already in the master and the candidate names, each looked up once.
known_companies = company_master['company'].values
candidate_companies = new_company_mapping['company'].values

# Explicit loop: keep every candidate that the master does not contain yet.
new_companies = []
for company in candidate_companies:
    if company in known_companies:
        continue
    new_companies.append(company)

# The same filter written as a single comprehension.
new_companies_on_line = [name for name in candidate_companies if name not in known_companies]

In [16]:
# Number the new companies consecutively, starting right after the current maximum id.
first_new_company_id = company_max_id + 1
new_companies_df = pd.DataFrame({
    'company_id': range(first_new_company_id, first_new_company_id + len(new_companies_on_line)),
    'company': new_companies_on_line,
})

In [44]:
new_companies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   company_id  2 non-null      int64 
 1   company     2 non-null      object
dtypes: int64(1), object(1)
memory usage: 164.0+ bytes


In [18]:
# Append the newly numbered companies below the existing master rows and rebuild the index.
updated_company_master = pd.concat([company_master, new_companies_df], ignore_index=True)

In [19]:
updated_company_master.tail()

Unnamed: 0,company_id,company
27,28,3556 - 36214 RC Andrews Cab
28,29,5167 - 71969 5167 Taxi Inc
29,30,Petani Cab Corp
30,31,X
31,32,Y


#### Create company master update function


In [20]:
def update_company_master(taxi_trips : pd.DataFrame, company_master: pd.DataFrame) -> pd.DataFrame:
    """
    Update the company master dataframe with new companies from the taxi trips dataframe.

    Each company that is not yet in the master is added exactly once, however many
    trips mention it, and receives the next free ``company_id``. Neither input
    frame is modified.

    Args:
        taxi_trips (pd.DataFrame): The dataframe containing taxi trips data; only its
            'company' column is read.
        company_master (pd.DataFrame): The dataframe containing the current company master
            data, with 'company_id' and 'company' columns.

    Returns:
        pd.DataFrame: The updated company master dataframe with new companies added,
        on a fresh 0..n-1 index.
    """
    # Distinct names in order of first appearance, so ids are assigned deterministically
    # and a company seen in many trips does not produce many master rows.
    trip_companies = taxi_trips['company'].drop_duplicates()
    new_companies = trip_companies[~trip_companies.isin(company_master['company'])].tolist()

    if not new_companies:
        return company_master.reset_index(drop=True)

    # max() of an empty master is NaN, so numbering starts at 1 in that case.
    first_new_id = 1 if company_master.empty else int(company_master['company_id'].max()) + 1

    new_companies_df = pd.DataFrame({
        'company_id' : range(first_new_id, first_new_id + len(new_companies)),
        'company' : new_companies
    })

    return pd.concat([company_master, new_companies_df], ignore_index=True)

In [24]:
# Minimal stand-in for taxi_trips: one real company name plus two made-up ones.
taxi_trips_company_only_for_test = pd.DataFrame(
    [(1, '5167 - 71969 5167 Taxi Inc'), (2, 'X'), (3, 'Y')],
    columns=['company_id', 'company'],
)

test_df = update_company_master(
    taxi_trips=taxi_trips_company_only_for_test,
    company_master=company_master,
)

In [26]:
test_df.tail(3)

Unnamed: 0,company_id,company
29,30,Petani Cab Corp
30,31,X
31,32,Y


#### Create payment type update function

In [45]:
# If a new payment type shows up, the master table has to be updated.
distinct_payment_types = taxi_trips['payment_type'].drop_duplicates().reset_index(drop=True)
payment_type_ids = range(1, len(distinct_payment_types) + 1)
payment_type_master = pd.DataFrame({
    'payment_type_id': payment_type_ids,
    'payment_type': distinct_payment_types,
})

# Minimal stand-in for taxi_trips used to exercise the payment-type update.
taxi_trips_payment_only_for_test = pd.DataFrame(
    [(1, 'Credit Card'), (2, 'F'), (3, 'G')],
    columns=['payment_type_id', 'payment_type'],
)

taxi_trips_payment_only_for_test

Unnamed: 0,payment_type_id,payment_type
0,1,Credit Card
1,2,F
2,3,G


In [48]:
def update_payment_master(taxi_trips : pd.DataFrame, payment_type_master: pd.DataFrame) -> pd.DataFrame:
    """Update the payment master with new payment types from taxi trips.

    Each payment type that is not yet in the master is added exactly once, however
    many trips use it, and receives the next free ``payment_type_id``. Neither
    input frame is modified.

    Args:
        taxi_trips (pd.DataFrame): DataFrame containing taxi trips data; only its
            'payment_type' column is read.
        payment_type_master (pd.DataFrame): DataFrame containing existing payment types,
            with 'payment_type_id' and 'payment_type' columns.

    Returns:
        pd.DataFrame: Updated payment master DataFrame on a fresh 0..n-1 index.
    """
    # Distinct values in order of first appearance, so a payment type used by many
    # trips yields a single new master row.
    trip_payment_types = taxi_trips['payment_type'].drop_duplicates()
    new_payments = trip_payment_types[~trip_payment_types.isin(payment_type_master['payment_type'])].tolist()

    if not new_payments:
        return payment_type_master.reset_index(drop=True)

    # max() of an empty master is NaN, so numbering starts at 1 in that case.
    first_new_id = 1 if payment_type_master.empty else int(payment_type_master['payment_type_id'].max()) + 1

    new_payments_df = pd.DataFrame({
        'payment_type_id' : range(first_new_id, first_new_id + len(new_payments)),
        'payment_type' : new_payments
    })

    return pd.concat([payment_type_master, new_payments_df], ignore_index=True)

In [49]:
test_df = update_payment_master(taxi_trips=taxi_trips_payment_only_for_test, payment_type_master=payment_type_master)

In [50]:
test_df

Unnamed: 0,payment_type_id,payment_type
0,1,Cash
1,2,Credit Card
2,3,Prcard
3,4,Mobile
4,5,Unknown
5,6,No Charge
6,7,Dispute
7,8,F
8,9,G


#### Unify the two update functions

In [51]:
# The goal is to avoid duplicating the code.
def update_master(taxi_trips : pd.DataFrame, master: pd.DataFrame, id_col_name:str, value_col_name:str) -> pd.DataFrame:
    """Update the master dataframe with new values from the taxi trips dataframe.

    Each value that is not yet in the master is added exactly once, however many
    trips contain it, and receives the next free id. Neither input frame is modified.

    Args:
        taxi_trips (pd.DataFrame): The dataframe containing new values; only its
            ``value_col_name`` column is read.
        master (pd.DataFrame): The master dataframe to be updated.
        id_col_name (str): The name of the column in the master dataframe that contains IDs.
        value_col_name (str): The name of the column (present in both dataframes) that
            contains values.

    Returns:
        pd.DataFrame: The updated master dataframe with new values added, on a fresh
        0..n-1 index.
    """
    # Distinct values in order of first appearance, so a value shared by many trips
    # yields a single new master row.
    trip_values = taxi_trips[value_col_name].drop_duplicates()
    new_values = trip_values[~trip_values.isin(master[value_col_name])].tolist()

    if not new_values:
        return master.reset_index(drop=True)

    # max() of an empty master is NaN, so numbering starts at 1 in that case.
    first_new_id = 1 if master.empty else int(master[id_col_name].max()) + 1

    new_data_df = pd.DataFrame({
        id_col_name : range(first_new_id, first_new_id + len(new_values)),
        value_col_name : new_values
    })

    return pd.concat([master, new_data_df], ignore_index=True)

In [54]:
test_df = update_master(taxi_trips=taxi_trips_payment_only_for_test, master=payment_type_master, id_col_name='payment_type_id', value_col_name='payment_type')

In [55]:
test_df

Unnamed: 0,payment_type_id,payment_type
0,1,Cash
1,2,Credit Card
2,3,Prcard
3,4,Mobile
4,5,Unknown
5,6,No Charge
6,7,Dispute
7,8,F
8,9,G


In [57]:
test_df = update_master(taxi_trips=taxi_trips_company_only_for_test, master=company_master, id_col_name='company_id', value_col_name='company')

In [58]:
test_df

Unnamed: 0,company_id,company
0,1,Taxi Affiliation Services
1,2,City Service
2,3,Sun Taxi
3,4,Flash Cab
4,5,Blue Ribbon Taxi Association
5,6,Taxicab Insurance Agency Llc
6,7,U Taxicab
7,8,Globe Taxi
8,9,"Taxicab Insurance Agency, LLC"
9,10,Choice Taxi Association


#### Update taxi_trips with the most recent company_master and payment_type codes

In [62]:
# Swap the names for ids: the two merges attach payment_type_id and company_id to the trips.
# Afterwards the original text fields (payment_type, company) can be dropped.
taxi_trips_id = (
    taxi_trips
    .merge(payment_type_master, on='payment_type')
    .merge(company_master, on='company')
)


In [63]:
taxi_trips_id.head(5)

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,dropoff_community_area_id,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather,payment_type_id,company_id
0,cfa0129694e7ae52df28e1bd6c2930e19bb17b1f,cce8c54f19d692acc5dd45d391a8bb964d15af67f314d1...,2024-03-11T23:45:00.000,2024-03-11T23:45:00.000,60,0.3,5,5,4.0,0,0,0,4.0,Cash,Taxi Affiliation Services,41.947791586,-87.683834942,41.947791586,-87.683834942,2024-03-11 23:00:00,1,1
1,680e8a2ce6851e57ebd71bcf78d743172c80e12e,791d74df896226a452a8e223e2ec9fa0df7d80bb7ca180...,2024-03-11T23:45:00.000,2024-03-12T00:15:00.000,1800,10.7,23,1,29.5,0,0,0,29.5,Cash,Taxi Affiliation Services,41.900069603,-87.720918238,42.009622881,-87.670166857,2024-03-11 23:00:00,1,1
2,f840b110665d743e31dd1826dc72781715a8ad21,16a529a73f019759aa8f1b563ab8511b26a91d65659188...,2024-03-11T23:45:00.000,2024-03-12T00:00:00.000,360,0.0,28,32,6.0,0,0,0,6.0,Cash,Taxi Affiliation Services,41.874005383,-87.66351755,41.878865584,-87.625192142,2024-03-11 23:00:00,1,1
3,f4addc8e66fb9024419ee60bae36e0878a79d4fd,5cdde36a39ded2651da1686c7813baf589dac6eb873894...,2024-03-11T23:45:00.000,2024-03-11T23:45:00.000,120,0.5,8,8,4.5,0,0,0,4.5,Cash,Taxi Affiliation Services,41.899602111,-87.633308037,41.899602111,-87.633308037,2024-03-11 23:00:00,1,1
4,81b84987a8757908af6116aac6ec514f317e10d8,c9867d006415cbc16529555f98cdeb44cb53aeaf1d9ae7...,2024-03-11T23:30:00.000,2024-03-11T23:45:00.000,1560,17.6,76,28,44.5,0,0,5,49.5,Cash,Taxi Affiliation Services,41.97907082,-87.903039661,41.88528132,-87.6572332,2024-03-11 23:00:00,1,1


In [64]:
# Only the ids are kept; re-assigning with errors='ignore' keeps the cell re-runnable.
taxi_trips_id = taxi_trips_id.drop(columns=['payment_type', 'company'], errors='ignore')

In [65]:
taxi_trips_id.sample(5)

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,dropoff_community_area_id,fare,tips,tolls,extras,trip_total,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather,payment_type_id,company_id
1310,5544591fb9f88a97ac77c9d85883c55a014b83af,3665a72ee495b03f4dae72307dc6e5e58e21518f77d8e6...,2024-03-11T17:30:00.000,2024-03-11T17:45:00.000,660,1.3,8,32,8.0,4.4,0,0.0,12.4,41.89503345,-87.619710672,41.880994471,-87.632746489,2024-03-11 17:00:00,2,1
477,a5921c7039e595d1e76bc968886c989aa2f94e85,9776fb6202fccea3740fb24e33dc804041e071cd66ff7c...,2024-03-11T13:30:00.000,2024-03-11T14:00:00.000,1380,0.6,56,38,28.75,0.0,0,4.0,32.75,41.79259236,-87.769615453,41.812948939,-87.617859676,2024-03-11 13:00:00,1,1
1736,3ba4be909f8320ca80482accbf754297e993d61b,3665a72ee495b03f4dae72307dc6e5e58e21518f77d8e6...,2024-03-11T10:45:00.000,2024-03-11T11:00:00.000,1440,10.8,28,42,28.75,5.85,0,0.0,34.6,41.874005383,-87.66351755,41.77887686,-87.594925439,2024-03-11 10:00:00,2,1
4386,97b179a9c20b1768a0e448ed2b8878611983cbce,67d91d01b3ae537bcc8d9a8dbea66bd361ef3caf0b7396...,2024-03-11T11:30:00.000,2024-03-11T11:30:00.000,334,2.49,6,8,9.25,0.0,0,0.0,9.25,41.944226601,-87.655998182,41.899602111,-87.633308037,2024-03-11 11:00:00,1,5
4887,884ce355e7ce8811173c85ee0f54dbd59740fe16,ab31561a5548b1d49b0352bb8d00d18c0e136d5f674c1c...,2024-03-11T20:15:00.000,2024-03-11T20:45:00.000,2393,14.66,76,77,39.75,0.0,0,5.5,45.25,41.980264315,-87.913624596,41.9867118,-87.663416405,2024-03-11 20:00:00,1,4


In [67]:
def update_taxi_trips_with_master_data(taxi_trips: pd.DataFrame, payment_type_master: pd.DataFrame, company_master: pd.DataFrame) -> pd.DataFrame:
    """Replace the payment type and company names of the trips with their master ids.

    Parameters:
    taxi_trips (pd.DataFrame): Trips data with 'payment_type' and 'company' columns.
    payment_type_master (pd.DataFrame): Master data for payment types, joined on 'payment_type'.
    company_master (pd.DataFrame): Master data for companies, joined on 'company'.

    Returns:
    pd.DataFrame: New DataFrame carrying the columns of both master tables in place of
    the 'payment_type' and 'company' text columns. Both joins use the default (inner)
    merge, so trips whose value is missing from a master table are not returned.
    """
    with_payment_ids = taxi_trips.merge(payment_type_master, on='payment_type')
    with_all_ids = with_payment_ids.merge(company_master, on='company')
    # The text columns are redundant once the ids are attached.
    with_all_ids.drop(['payment_type', 'company'], axis=1, inplace=True)
    return with_all_ids

In [68]:
test_df = update_taxi_trips_with_master_data(taxi_trips=taxi_trips, payment_type_master=payment_type_master, company_master=company_master)
test_df.sample(5)

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,dropoff_community_area_id,fare,tips,tolls,extras,trip_total,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather,payment_type_id,company_id
1217,55b9f8170678bb31232768cd83b74ad2dcd0781a,6c87f1d023a9d7146ae81b6aa4648bf98cec52cceb9f08...,2024-03-11T18:30:00.000,2024-03-11T18:30:00.000,0,0.0,28,28,3.25,10.25,0,30.5,44.0,41.874005383,-87.66351755,41.874005383,-87.66351755,2024-03-11 18:00:00,2,1
134,ad0022a17319da0e28afcac59d7b829cf793adbd,f8f44c9d76773e3c761356078d786b151d2f206685454f...,2024-03-11T18:45:00.000,2024-03-11T19:15:00.000,1740,0.5,76,12,27.25,0.0,0,5.0,32.25,41.980264315,-87.913624596,41.993930128,-87.758353588,2024-03-11 18:00:00,1,1
160,474c8473e848b98f1f0672d21b2a7c734e0e67ff,650abe6b4d67899fb67b2631a830ea90b5626ec5724ad7...,2024-03-11T18:30:00.000,2024-03-11T18:30:00.000,300,0.5,32,32,5.0,0.0,0,0.0,5.0,41.880994471,-87.632746489,41.880994471,-87.632746489,2024-03-11 18:00:00,1,1
3062,a2a39ec7493939f990ea746e20a562db1c797532,aa4c3beb61a579d4f214d3f92395a4f7bc7245a2cb21eb...,2024-03-11T13:15:00.000,2024-03-11T13:30:00.000,551,2.85,32,33,10.25,0.0,0,1.0,11.25,41.884987192,-87.620992913,41.859349715,-87.617358006,2024-03-11 13:00:00,1,3
12281,6c333dc9e5702b0c8310862468d3b4f77a5e8caa,1626953f5dfefe1c62e73836adb90d16258a243e46f69e...,2024-03-11T22:30:00.000,2024-03-11T23:00:00.000,1560,14.5,76,6,37.0,8.3,0,4.0,49.8,41.980264315,-87.913624596,41.944226601,-87.655998182,2024-03-11 22:00:00,2,8


#### Weather transformation function

In [2]:
def transform_weather_data(weather_data: dict) -> pd.DataFrame:
    """Transform a decoded weather JSON payload into a pandas DataFrame.

    Parameters:
    weather_data (dict): Decoded JSON payload. It must contain an 'hourly' mapping with
        the keys 'time', 'temperature_2m', 'wind_speed_10m', 'precipitation' and 'rain'.

    Returns:
    pd.DataFrame: One row per entry of the hourly series, with the columns 'datetime'
    (parsed with ``pd.to_datetime``), 'temperature', 'wind', 'precipitation' and 'rain'.

    Raises:
    KeyError: If 'hourly' or one of the expected series is missing from the payload.
    """
    hourly = weather_data['hourly']

    # Output column name -> key of the series inside the 'hourly' section.
    column_sources = {
        'datetime': 'time',
        'temperature': 'temperature_2m',
        'wind': 'wind_speed_10m',
        'precipitation': 'precipitation',
        'rain': 'rain',
    }

    weather_df = pd.DataFrame({column: hourly[source] for column, source in column_sources.items()})
    weather_df['datetime'] = pd.to_datetime(weather_df['datetime'])
    return weather_df


In [3]:
#Test
url = "https://archive-api.open-meteo.com/v1/era5"
# Same single-day window as the taxi query: the day two months before today.
startDate = (dt.now() - relativedelta(months=2)).strftime('%Y-%m-%d')

params = {
    "latitude" : 41.85,
    "longitude" : -87.65,
    "start_date" : startDate,
    "end_date" : startDate,
    "hourly" : "temperature_2m,wind_speed_10m,precipitation,rain"
}


# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, params=params, timeout=60)
# Fail loudly on an HTTP error instead of passing an error payload to the transformation.
response.raise_for_status()
weather_data = response.json()

weather_data_df = transform_weather_data(weather_data=weather_data)

In [4]:
weather_data_df.sample(5)

Unnamed: 0,datetime,temperature,wind,precipitation,rain
19,2024-03-12 19:00:00,17.9,28.7,0.0,0.0
20,2024-03-12 20:00:00,18.8,28.4,0.0,0.0
6,2024-03-12 06:00:00,10.7,28.0,0.0,0.0
5,2024-03-12 05:00:00,11.1,29.1,0.0,0.0
18,2024-03-12 18:00:00,16.6,28.3,0.0,0.0
