In [1]:
import pandas as pd
# Load the raw trip records from the local CSV copy.
# Remote alternative, left disabled:
# df = pd.read_csv("https://storage.googleapis.com/uber-7730061615-project/uber_data.csv")
df = pd.read_csv(filepath_or_buffer="./data/uber_data.csv")
# df.head()

In [21]:
# Check whether the fields are in the proper format (dtype)
# df.info()

In [2]:
# Drop exact duplicate rows, renumber the remaining rows from 0 and expose
# that positional index as the `trip_id` column.
df = (
    df.drop_duplicates()
    .reset_index(drop=True)
    .assign(trip_id=lambda trips: trips.index)
)

In [3]:
# Parse the pickup and drop-off timestamp columns with pd.to_datetime so the
# `.dt` accessor can be used on them in later cells.
df = df.assign(
    tpep_pickup_datetime=lambda trips: pd.to_datetime(trips['tpep_pickup_datetime']),
    tpep_dropoff_datetime=lambda trips: pd.to_datetime(trips['tpep_dropoff_datetime']),
)
# df.info()

In [4]:
# Datetime dimension: one row per trip, keyed by `datetime_id` (the row
# position), with calendar parts derived from the pickup and drop-off
# timestamps. Column order: timestamps, id, pick_*, drop_*.
datetime_dim = (
    df[['tpep_pickup_datetime', 'tpep_dropoff_datetime']]
    .reset_index(drop=True)
    .assign(
        datetime_id=lambda dim: dim.index,
        pick_year=lambda dim: dim['tpep_pickup_datetime'].dt.year,
        pick_month=lambda dim: dim['tpep_pickup_datetime'].dt.month,
        pick_day=lambda dim: dim['tpep_pickup_datetime'].dt.day,
        pick_hour=lambda dim: dim['tpep_pickup_datetime'].dt.hour,
        pick_weekday=lambda dim: dim['tpep_pickup_datetime'].dt.weekday,
        drop_year=lambda dim: dim['tpep_dropoff_datetime'].dt.year,
        drop_month=lambda dim: dim['tpep_dropoff_datetime'].dt.month,
        drop_day=lambda dim: dim['tpep_dropoff_datetime'].dt.day,
        drop_hour=lambda dim: dim['tpep_dropoff_datetime'].dt.hour,
        drop_weekday=lambda dim: dim['tpep_dropoff_datetime'].dt.weekday,
    )
)
# datetime_dim.head()

In [5]:
# Passenger-count and trip-distance dimensions: one row per trip, each keyed
# by its row position, with the id column placed first.
passenger_count_dim = (
    df[['passenger_count']]
    .reset_index(drop=True)
    .assign(passenger_count_id=lambda dim: dim.index)
    [['passenger_count_id', 'passenger_count']]
)

trip_distance_dim = (
    df[['trip_distance']]
    .reset_index(drop=True)
    .assign(trip_distance_id=lambda dim: dim.index)
    [['trip_distance_id', 'trip_distance']]
)

In [6]:
# Pickup and drop-off location dimensions: latitude/longitude per trip, each
# keyed by its row position (the id column comes last).
pickup_location_dim = (
    df[['pickup_latitude', 'pickup_longitude']]
    .reset_index(drop=True)
    .assign(pickup_location_id=lambda dim: dim.index)
)

dropoff_location_dim = (
    df[['dropoff_latitude', 'dropoff_longitude']]
    .reset_index(drop=True)
    .assign(dropoff_location_id=lambda dim: dim.index)
)
# dropoff_location_dim.head()

In [7]:
# Rate-code dimension: the numeric RatecodeID per trip plus its readable name.
# Codes missing from the lookup are left as NaN by `Series.map`.
rate_code_type = {
    1: "Standard rate",
    2: "JFK",
    3: "Newark",
    4: "Nassau or Westchester",
    5: "Negotiated fare",
    6: "Group ride",
}

rate_code_dim = (
    df[['RatecodeID']]
    .reset_index(drop=True)
    .assign(
        rate_code_id=lambda dim: dim.index,
        rate_code_name=lambda dim: dim['RatecodeID'].map(rate_code_type),
    )
)
# rate_code_dim.head()

In [8]:
# Payment-type dimension: the numeric payment_type per trip plus its readable
# name, keyed by row position. Column order: id, code, name.
payment_type_name = {
    1: "Credit card",
    2: "Cash",
    3: "No charge",
    4: "Dispute",
    5: "Unknown",
    6: "Voided trip",
}

payment_type_dim = (
    df[['payment_type']]
    .reset_index(drop=True)
    .assign(
        payment_type_id=lambda dim: dim.index,
        payment_type_name=lambda dim: dim['payment_type'].map(payment_type_name),
    )
    [['payment_type_id', 'payment_type', 'payment_type_name']]
)

In [9]:
# Build the fact table: join every dimension to the trips on
# trip_id == <dimension id>, then keep only the keys and the measures.
fact_table = (
    df.merge(passenger_count_dim, how='inner', left_on='trip_id', right_on='passenger_count_id')
    .merge(trip_distance_dim, how='inner', left_on='trip_id', right_on='trip_distance_id')
    .merge(rate_code_dim, how='inner', left_on='trip_id', right_on='rate_code_id')
    .merge(pickup_location_dim, how='inner', left_on='trip_id', right_on='pickup_location_id')
    .merge(dropoff_location_dim, how='inner', left_on='trip_id', right_on='dropoff_location_id')
    .merge(datetime_dim, how='inner', left_on='trip_id', right_on='datetime_id')
    .merge(payment_type_dim, how='inner', left_on='trip_id', right_on='payment_type_id')
    .loc[:, [
        'trip_id', 'VendorID', 'datetime_id', 'passenger_count_id',
        'trip_distance_id', 'rate_code_id', 'store_and_fwd_flag',
        'pickup_location_id', 'dropoff_location_id', 'payment_type_id',
        'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
        'improvement_surcharge', 'total_amount',
    ]]
)

In [11]:
# Convert every table of the star schema to a plain dictionary
# (column -> {row index -> value}).
# `fact_table` is included so this dictionary carries the same keys as the one
# returned by the `transform` block further down, whose exporter reads
# `data['fact_table']`.
fact_dict = {
    "datetime_dim": datetime_dim.to_dict(orient="dict"),
    "passenger_count_dim": passenger_count_dim.to_dict(orient="dict"),
    "trip_distance_dim": trip_distance_dim.to_dict(orient="dict"),
    "rate_code_dim": rate_code_dim.to_dict(orient="dict"),
    "pickup_location_dim": pickup_location_dim.to_dict(orient="dict"),
    "dropoff_location_dim": dropoff_location_dim.to_dict(orient="dict"),
    "payment_type_dim": payment_type_dim.to_dict(orient="dict"),
    "fact_table": fact_table.to_dict(orient="dict"),
}

# print(fact_dict)

In [None]:
if 'transformer' not in globals():
    from mage_ai.data_preparation.decorators import transformer
if 'test' not in globals():
    from mage_ai.data_preparation.decorators import test

import pandas as pd

@transformer
def transform(df, *args, **kwargs):
    """
    Turn raw trip records into the dimension and fact tables of a star schema.

    Duplicate rows are dropped, every remaining row receives a ``trip_id``
    equal to its row position, and the pickup/drop-off timestamp columns are
    parsed with ``pd.to_datetime``. Each dimension keeps one row per trip and
    its id column equals ``trip_id``.

    Args:
        df: The output from the upstream parent block; a DataFrame holding the
            trip columns referenced in this block.
        args: The output from any additional upstream blocks (unused here).
        kwargs: Additional keyword arguments (unused here).

    Returns:
        dict: Maps each table name (``datetime_dim``, ``passenger_count_dim``,
        ``trip_distance_dim``, ``rate_code_dim``, ``pickup_location_dim``,
        ``dropoff_location_dim``, ``payment_type_dim``, ``fact_table``) to that
        table serialised with ``DataFrame.to_dict(orient="dict")``.
    """
    # drop_duplicates() returns a new frame, so the caller's `df` is not modified.
    trips = df.drop_duplicates().reset_index(drop=True)
    # Assign dataframe index to trip_id field
    trips['trip_id'] = trips.index
    for _, source in _TIMESTAMP_COLUMNS:
        trips[source] = pd.to_datetime(trips[source])

    # Datetime dimension: timestamps, id, then pick_* and drop_* calendar parts.
    datetime_dim = _keyed_dimension(
        trips, [source for _, source in _TIMESTAMP_COLUMNS], 'datetime_id'
    )
    for prefix, source in _TIMESTAMP_COLUMNS:
        parts = datetime_dim[source].dt
        for part in _DATETIME_PARTS:
            datetime_dim[prefix + '_' + part] = getattr(parts, part)

    # Passenger-count and trip-distance dimensions (id column first).
    passenger_count_dim = _keyed_dimension(trips, ['passenger_count'], 'passenger_count_id')
    passenger_count_dim = passenger_count_dim[['passenger_count_id', 'passenger_count']]
    trip_distance_dim = _keyed_dimension(trips, ['trip_distance'], 'trip_distance_id')
    trip_distance_dim = trip_distance_dim[['trip_distance_id', 'trip_distance']]

    # Pickup and drop-off location dimensions (id column last).
    pickup_location_dim = _keyed_dimension(
        trips, ['pickup_latitude', 'pickup_longitude'], 'pickup_location_id'
    )
    dropoff_location_dim = _keyed_dimension(
        trips, ['dropoff_latitude', 'dropoff_longitude'], 'dropoff_location_id'
    )

    # Rate-code dimension; codes missing from the lookup map to NaN.
    rate_code_dim = _keyed_dimension(trips, ['RatecodeID'], 'rate_code_id')
    rate_code_dim['rate_code_name'] = rate_code_dim['RatecodeID'].map(_RATE_CODE_NAMES)

    # Payment-type dimension; the name is added before the final reordering.
    payment_type_dim = _keyed_dimension(trips, ['payment_type'], 'payment_type_id')
    payment_type_dim['payment_type_name'] = payment_type_dim['payment_type'].map(_PAYMENT_TYPE_NAMES)
    payment_type_dim = payment_type_dim[['payment_type_id', 'payment_type', 'payment_type_name']]

    # Merge all the dimensions into Fact table
    fact_table = _build_fact_table(trips, [
        (passenger_count_dim, 'passenger_count_id'),
        (trip_distance_dim, 'trip_distance_id'),
        (rate_code_dim, 'rate_code_id'),
        (pickup_location_dim, 'pickup_location_id'),
        (dropoff_location_dim, 'dropoff_location_id'),
        (datetime_dim, 'datetime_id'),
        (payment_type_dim, 'payment_type_id'),
    ])

    tables = {
        "datetime_dim": datetime_dim,
        "passenger_count_dim": passenger_count_dim,
        "trip_distance_dim": trip_distance_dim,
        "rate_code_dim": rate_code_dim,
        "pickup_location_dim": pickup_location_dim,
        "dropoff_location_dim": dropoff_location_dim,
        "payment_type_dim": payment_type_dim,
        "fact_table": fact_table,
    }
    return {name: table.to_dict(orient="dict") for name, table in tables.items()}


# (column prefix, source timestamp column) pairs used for the datetime dimension.
_TIMESTAMP_COLUMNS = (
    ('pick', 'tpep_pickup_datetime'),
    ('drop', 'tpep_dropoff_datetime'),
)

# `.dt` attributes copied into the datetime dimension, in output column order.
_DATETIME_PARTS = ('year', 'month', 'day', 'hour', 'weekday')

# RatecodeID -> readable name.
_RATE_CODE_NAMES = {
    1: "Standard rate",
    2: "JFK",
    3: "Newark",
    4: "Nassau or Westchester",
    5: "Negotiated fare",
    6: "Group ride",
}

# payment_type -> readable name.
_PAYMENT_TYPE_NAMES = {
    1: "Credit card",
    2: "Cash",
    3: "No charge",
    4: "Dispute",
    5: "Unknown",
    6: "Voided trip",
}

# Columns kept in the fact table, in output order.
_FACT_COLUMNS = [
    'trip_id', 'VendorID', 'datetime_id', 'passenger_count_id',
    'trip_distance_id', 'rate_code_id', 'store_and_fwd_flag',
    'pickup_location_id', 'dropoff_location_id', 'payment_type_id',
    'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
    'improvement_surcharge', 'total_amount',
]


def _keyed_dimension(trips, columns, key):
    """Copy `columns` of `trips` into a new frame and append its row position as column `key`."""
    dimension = trips[columns].reset_index(drop=True)
    dimension[key] = dimension.index
    return dimension


def _build_fact_table(trips, keyed_dimensions):
    """Join each (dimension, key) pair to `trips` on trip_id == key and keep the fact columns."""
    fact = trips
    for dimension, key in keyed_dimensions:
        # Only the key column is merged: the dimension's attribute columns
        # already exist on `trips` and would otherwise be carried through
        # every later merge as `_x`/`_y` duplicates that are never selected.
        fact = fact.merge(dimension[[key]], left_on='trip_id', right_on=key)
    return fact[_FACT_COLUMNS]


@test
def test_output(output, *args) -> None:
    """
    Template code for testing the output of the block.
    """
    assert output is not None, 'The output is undefined'


In [None]:
# Exporter
from mage_ai.settings.repo import get_repo_path
from mage_ai.io.bigquery import BigQuery
from mage_ai.io.config import ConfigFileLoader
from pandas import DataFrame
from os import path

if 'data_exporter' not in globals():
    from mage_ai.data_preparation.decorators import data_exporter


@data_exporter
def export_data_to_big_query(data, **kwargs) -> None:
    """
    Export the fact table to a BigQuery warehouse.

    Connection settings are read from the 'default' profile of
    'io_config.yaml' in the directory returned by ``get_repo_path()``.

    Docs: https://docs.mage.ai/design/data-loading#bigquery

    Args:
        data: Mapping with a ``'fact_table'`` entry that ``DataFrame`` can
            consume (presumably the dictionary returned by the transformer
            block -- verify against the pipeline wiring).
        kwargs: Additional keyword arguments (unused here).
    """
    destination_table = 'project-uber-418412.dataset_1.fact_table'
    settings_file = path.join(get_repo_path(), 'io_config.yaml')
    settings_profile = 'default'

    warehouse = BigQuery.with_config(ConfigFileLoader(settings_file, settings_profile))
    fact_frame = DataFrame(data['fact_table'])
    warehouse.export(
        fact_frame,
        destination_table,
        if_exists='replace',  # resolution policy when the table name already exists
    )
