We will load the input data as a DataFrame:

In [1]:
import json
import os
from urllib.parse import quote

import pandas as pd
import psycopg2
from sqlalchemy import bindparam, create_engine, text
from tqdm import tqdm

# Read the database credentials from the JSON file one directory up.
with open('../credentials.json') as f:
    data = json.load(f)

# Connection settings, keyed by the keyword names handed to psycopg2.connect().
psql_config = dict(
    dbname=data['db_name'],
    user=data['db_user'],
    password=data['db_pwd'],
    host=data['db_host'],
    port=5432,
)


def get_psql_connection():
    """Open a new psycopg2 connection using the module-level ``psql_config``.

    Returns:
        The connection object returned by ``psycopg2.connect``.

    Raises:
        Exception: whatever ``psycopg2.connect`` raised. The error is printed
            first and then re-raised unchanged.
    """
    try:
        return psycopg2.connect(**psql_config)
    except Exception as e:
        print("Error connecting to the database:", e)
        # Re-raise the real error. Falling through to ``return conn`` with
        # ``conn`` unbound would hide it behind an UnboundLocalError.
        raise

In [2]:
# Percent-encode the credentials before placing them in the database URL so
# that characters such as "@", ":" or "/" are not misread as URL delimiters.
db_user = quote(str(psql_config["user"]), safe="")
db_password = quote(str(psql_config["password"]), safe="")
engine = create_engine(
    f'postgresql://{db_user}:{db_password}@{psql_config["host"]}/{psql_config["dbname"]}'
)

# Load a sample of the source table to inspect its columns.
df = pd.read_sql("SELECT * FROM m024.citi_bike_data LIMIT 100000;", engine)

df.head()

Unnamed: 0,tripduration,starttime,stoptime,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_station_id,end_station_name,end_station_latitude,end_station_longitude,bikeid,usertype,birth_year,gender,processed
0,255,2018-06-16 23:31:13.806,2018-06-16 23:35:29.752,3042,Fulton St & Utica Ave,40.679427,-73.929891,3052,Lewis Ave & Madison St,40.686312,-73.935775,32666,Subscriber,1982,1,False
1,810,2018-06-17 07:25:36.853,2018-06-17 07:39:07.141,3042,Fulton St & Utica Ave,40.679427,-73.929891,344,Monroe St & Bedford Ave,40.685144,-73.953809,28845,Subscriber,1975,2,False
2,1126,2018-06-17 08:10:41.355,2018-06-17 08:29:28.151,3042,Fulton St & Utica Ave,40.679427,-73.929891,3418,Plaza St West & Flatbush Ave,40.675021,-73.971115,25445,Subscriber,1993,2,False
3,1804,2018-06-17 09:45:50.627,2018-06-17 10:15:55.036,3042,Fulton St & Utica Ave,40.679427,-73.929891,258,DeKalb Ave & Vanderbilt Ave,40.689407,-73.968855,25887,Customer,1989,1,False
4,1769,2018-06-17 09:46:23.363,2018-06-17 10:15:53.184,3042,Fulton St & Utica Ave,40.679427,-73.929891,258,DeKalb Ave & Vanderbilt Ave,40.689407,-73.968855,21410,Customer,1988,2,False


Now that we have the data, we will process it in batches.
For each batch, we will apply some validation and transform the source data into fact and dimension data.

In [None]:
from math import radians, cos, sin, sqrt, atan2
# Rows fetched per extract query; also the OFFSET stride between batches.
BATCH_SIZE = 100_000

# Function to calculate distance using Haversine formula
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    Args:
        lat1: latitude of the first point, in degrees.
        lon1: longitude of the first point, in degrees.
        lat2: latitude of the second point, in degrees.
        lon2: longitude of the second point, in degrees.

    Returns:
        The haversine distance in km, using an Earth radius of 6371 km.
        A NaN coordinate yields NaN.
    """
    R = 6371  # Earth radius in km
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    dlat, dlon = lat2 - lat1, lon2 - lon1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Floating-point rounding can push ``a`` marginally above 1 for
    # (near-)antipodal points, which would make sqrt(1 - a) raise ValueError.
    # A plain comparison (rather than min/max) leaves NaN untouched.
    if a > 1.0:
        a = 1.0
    return R * 2 * atan2(sqrt(a), sqrt(1 - a))

# Extract Data
def extract_data(offset):
    """Fetch one batch of unprocessed rows from ``m024.citi_bike_data``.

    Args:
        offset: number of rows (ordered by ``starttime``) to skip before the
            batch begins.

    Returns:
        DataFrame with at most ``BATCH_SIZE`` rows.
    """
    batch_sql = f"""
    SELECT * FROM m024.citi_bike_data 
    WHERE processed = FALSE 
    ORDER BY starttime 
    LIMIT {BATCH_SIZE} OFFSET {offset}
    """
    batch = pd.read_sql(sql=batch_sql, con=engine)
    return batch

# Look up dimension table rows and return a key -> ID mapping
def get_dimension_id(df, table_name, lookup_col, return_col):
    """Look up dimension-table values for the distinct keys found in ``df``.

    Args:
        df: DataFrame holding the keys to look up.
        table_name: dimension table to query.
        lookup_col: column read from ``df`` and also used as the key column
            of ``table_name``.
        return_col: column of ``table_name`` whose values are returned.

    Returns:
        dict mapping each ``lookup_col`` value found in the table to its
        ``return_col`` value; empty when ``df`` holds no non-null keys.
    """
    lookup_values = df[lookup_col].dropna().drop_duplicates().tolist()
    if not lookup_values:
        # "IN ()" is not valid SQL, and there is nothing to look up anyway.
        return {}

    # An expanding bind parameter renders one placeholder per list element.
    # Table and column names cannot be bound, so they are interpolated and
    # must come from trusted code, never from user input.
    query = text(
        f"SELECT {lookup_col}, {return_col} FROM {table_name} "
        f"WHERE {lookup_col} IN :lookup_values"
    ).bindparams(bindparam("lookup_values", expanding=True))
    mapping = pd.read_sql(query, engine, params={"lookup_values": lookup_values})
    # Select by position so the result is still two Series when
    # lookup_col and return_col name the same column.
    return dict(zip(mapping.iloc[:, 0], mapping.iloc[:, 1]))

# Load data into dimension tables
def load_dimension_data(df, table_name, cols):
    """Append the distinct ``cols`` rows of ``df`` to a dimension table.

    Args:
        df: source DataFrame.
        table_name: target table; rows are appended to it.
        cols: list of column names to take from ``df``.
    """
    distinct_rows = df[cols].drop_duplicates()
    distinct_rows.to_sql(
        table_name,
        con=engine,
        if_exists='append',
        index=False,
        method='multi',
    )

# Transform Data
def transform_data(df):
    """Turn a batch of source trips into fact rows, loading dimensions on the way.

    Steps: fill missing ``usertype``/``gender`` values, compute the trip
    distance with ``haversine``, append the batch's dimension values via
    ``load_dimension_data``, then resolve dimension IDs via
    ``get_dimension_id``.

    Args:
        df: DataFrame of rows from ``m024.citi_bike_data``. It is not
            modified; the transformation works on a copy.

    Returns:
        DataFrame with the columns ``tripduration``, ``distance``,
        ``start_time_id``, ``end_time_id``, ``start_station_id``,
        ``end_station_id``, ``bike_id``, ``user_type_id``, ``gender_type_id``
        and ``user_birthyear_id``.
    """
    # fillna without inplace returns a new frame, so the caller's data stays
    # untouched. The source column is ``usertype`` (all lowercase); a
    # ``userType`` key matches no column and would leave the gaps unfilled.
    df = df.fillna(value={'usertype': 'Unknown', 'gender': 10000})

    # Calculate trip distance
    df['distance'] = df.apply(lambda row: haversine(
        row['start_station_latitude'], row['start_station_longitude'],
        row['end_station_latitude'], row['end_station_longitude']
    ), axis=1)

    # The source has separate start_* and end_* station columns, while the
    # station dimension has one row per station: stack both sides under the
    # dimension's column names.
    station_cols = ['station_id', 'station_name', 'latitude', 'longitude']
    station_frames = []
    for side in ('start', 'end'):
        side_cols = [
            f'{side}_station_id',
            f'{side}_station_name',
            f'{side}_station_latitude',
            f'{side}_station_longitude',
        ]
        station_frames.append(
            df[side_cols].rename(columns=dict(zip(side_cols, station_cols)))
        )
    stations = pd.concat(station_frames, ignore_index=True)

    # Load dimension tables
    # NOTE(review): rows are appended without checking what the tables already
    # hold, so keys seen in earlier batches are presumably inserted again --
    # confirm whether the tables enforce uniqueness.
    load_dimension_data(stations, 'station_dimension', station_cols)
    # NOTE(review): start and stop times go to the same table under two
    # different column names; verify this matches the time_dimension schema.
    load_dimension_data(df, 'time_dimension', ['starttime'])
    load_dimension_data(df, 'time_dimension', ['stoptime'])
    load_dimension_data(df, 'bike_dimension', ['bikeid'])
    load_dimension_data(df, 'user_type_dimension', ['usertype'])
    load_dimension_data(df, 'gender_dimension', ['gender'])
    load_dimension_data(df, 'user_birthyear_dimension', ['birth_year'])

    # Get dimension table mappings
    time_map = get_dimension_id(df, 'time_dimension', 'starttime', 'time_id')
    stop_time_map = get_dimension_id(df, 'time_dimension', 'stoptime', 'time_id')
    bike_map = get_dimension_id(df, 'bike_dimension', 'bikeid', 'bike_id')
    user_type_map = get_dimension_id(df, 'user_type_dimension', 'usertype', 'user_type_id')
    gender_map = get_dimension_id(df, 'gender_dimension', 'gender', 'gender_type_id')
    birth_year_map = get_dimension_id(df, 'user_birthyear_dimension', 'birth_year', 'user_birthyear_id')

    # Map dimension table IDs
    df['start_time_id'] = df['starttime'].map(time_map)
    df['end_time_id'] = df['stoptime'].map(stop_time_map)
    # station_dimension.station_id is filled from these same source ids (see
    # the station load above), so start_station_id / end_station_id already
    # hold the dimension key and need no lookup. A lookup keyed on
    # start_station_id / end_station_id would query columns that the station
    # load never writes.
    df['bike_id'] = df['bikeid'].map(bike_map)
    df['user_type_id'] = df['usertype'].map(user_type_map)
    df['gender_type_id'] = df['gender'].map(gender_map)
    df['user_birthyear_id'] = df['birth_year'].map(birth_year_map)

    return df[['tripduration', 'distance', 'start_time_id', 'end_time_id', 'start_station_id', 'end_station_id', 'bike_id', 'user_type_id', 'gender_type_id', 'user_birthyear_id']]

# Load fact table
def load_fact_table(df):
    """Append ``df`` to the ``trip_fact`` table, 10000 rows per chunk.

    Args:
        df: fact rows to write; the DataFrame index is not stored.
    """
    to_sql_options = {
        'con': engine,
        'if_exists': 'append',
        'index': False,
        'method': 'multi',
        'chunksize': 10000,
    }
    df.to_sql('trip_fact', **to_sql_options)

# Update processed records
def update_processed():
    """Mark every still-unprocessed row of ``m024.citi_bike_data`` as processed.

    The UPDATE runs inside ``engine.begin()``, i.e. in a transaction that is
    committed when the ``with`` block exits without an error.
    """
    # NOTE(review): this flags *all* rows with processed = FALSE, not only the
    # rows of a particular batch -- rows that were never loaded into the fact
    # table are flagged too. Confirm that this is only called once every
    # pending row has been loaded.
    # text() is required: SQLAlchemy 2.0 rejects plain strings passed to
    # Connection.execute().
    statement = text(
        "UPDATE m024.citi_bike_data SET processed = TRUE WHERE processed = FALSE"
    )
    with engine.begin() as conn:
        conn.execute(statement)

# Process batch
def process_batch(offset):
    """Extract, transform and load the batch that starts at ``offset``.

    Does nothing when the extract returns no rows. Source rows are not
    flagged as processed here: ``update_processed`` flags every unprocessed
    row, so calling it per batch would hide the rows of the remaining batches
    from their ``processed = FALSE`` queries. ``main`` calls it once at the
    end instead.

    Args:
        offset: row offset of the batch within the unprocessed rows.
    """
    df = extract_data(offset)
    if df.empty:
        return
    df_transformed = transform_data(df)
    load_fact_table(df_transformed)
    print(f"Processed batch {offset}-{offset+BATCH_SIZE}")


# Run ETL
def main():
    """Run the ETL over all unprocessed rows using four worker processes.

    Counts the unprocessed rows, splits them into ``BATCH_SIZE`` offsets,
    processes the batches in a ``multiprocessing.Pool`` and, once every batch
    has been loaded, flags the source rows as processed.
    """
    total_query = "SELECT COUNT(*) FROM m024.citi_bike_data WHERE processed = FALSE"
    # int() turns the numpy integer returned by pandas into a plain int.
    total_records = int(pd.read_sql(total_query, engine).iloc[0, 0])
    if total_records == 0:
        return
    offsets = list(range(0, total_records, BATCH_SIZE))

    from multiprocessing import Pool

    # The count query above left a pooled connection in this process. Dispose
    # of it before the workers are created so that each worker opens its own
    # connection instead of inheriting and sharing the parent's socket.
    engine.dispose()
    with Pool(4) as pool:
        pool.map(process_batch, offsets)

    # Flag the rows only after all batches were loaded; pool.map raises if a
    # batch failed, in which case nothing is flagged.
    # NOTE(review): rows inserted into the source table while the run is in
    # progress would be flagged without having been loaded, and ties on
    # starttime can shift rows between OFFSET windows -- confirm the source
    # is static during a run.
    update_processed()

# Entry point: run the ETL only when this file is executed directly, not when
# it is imported.
if __name__ == "__main__":
    main()

