We will load the input data as dataframe:

In [13]:
import os
from sqlalchemy import create_engine
from tqdm import tqdm
import psycopg2
import json
import pandas as pd
import numpy as np

with open('../credentials.json') as f:
    data = json.load(f)
    psql_config = {
    'dbname': data['db_name'],
    'user': data['db_user'],
    'password': data['db_pwd'],
    'host': data['db_host'],
    'port': 5432
}


def get_psql_connection():
    try:
        conn = psycopg2.connect(**psql_config)
    except Exception as e:
        print("Error connecting to the database:", e)
    return conn

In [2]:
engine = create_engine(f'postgresql://{psql_config["user"]}:{psql_config["password"]}@{psql_config["host"]}/{psql_config["dbname"]}')

df = pd.read_sql("SELECT * FROM m024.citi_bike_data LIMIT 1000;", engine)

df.head()

Unnamed: 0,tripduration,starttime,stoptime,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_station_id,end_station_name,end_station_latitude,end_station_longitude,bikeid,usertype,birth_year,gender,processed
0,176,2018-02-08 18:19:27.370,2018-02-08 18:22:23.376,314,Cadman Plaza West & Montague St,40.69383,-73.990539,391,Clark St & Henry St,40.697601,-73.993446,21643,Subscriber,1986,2,False
1,258,2018-02-08 18:47:03.281,2018-02-08 18:51:21.659,314,Cadman Plaza West & Montague St,40.69383,-73.990539,407,Henry St & Poplar St,40.700469,-73.991454,32405,Subscriber,1971,2,False
2,389,2018-02-08 18:51:08.796,2018-02-08 18:57:38.442,314,Cadman Plaza West & Montague St,40.69383,-73.990539,3407,Union St & Nevins St,40.679098,-73.987655,32214,Subscriber,1979,1,False
3,162,2018-02-08 18:51:49.820,2018-02-08 18:54:32.161,314,Cadman Plaza West & Montague St,40.69383,-73.990539,406,Hicks St & Montague St,40.695128,-73.995951,21566,Subscriber,1962,1,False
4,616,2018-02-08 19:08:12.738,2018-02-08 19:18:28.853,314,Cadman Plaza West & Montague St,40.69383,-73.990539,3407,Union St & Nevins St,40.679098,-73.987655,16798,Subscriber,1957,1,False


Now that we have the data, we will start processing it in bunch.
For each chunk, we will apply some validation and transform the source data into fact and dimension data

In [15]:
from math import radians, cos, sin, sqrt, atan2
from psycopg2.extras import execute_values
BATCH_SIZE = 10

# Function to calculate distance using Haversine formula
def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # Earth radius in km
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    dlat, dlon = lat2 - lat1, lon2 - lon1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    return R * 2 * atan2(sqrt(a), sqrt(1 - a))

# Extract Data
def extract_data(offset):
    query = f"""
    SELECT * FROM m024.citi_bike_data 
    WHERE processed = FALSE 
    ORDER BY starttime 
    LIMIT {BATCH_SIZE} OFFSET {offset}
    """
    return pd.read_sql(query, engine)

# Load data into dimension tables
def load_dimension_data(df, table_name, cols, db_cols):
    df = df[cols].drop_duplicates()
    try:
        with get_psql_connection() as conn:
            with conn.cursor() as cur:
                statement = f"""
                INSERT INTO m024.p_{table_name} ({', '.join(db_cols)})
                VALUES %s 
                ON CONFLICT DO NOTHING;"""
                data_tuples = [tuple(row) for row in df.to_numpy()]
                    
                # Execute batch insert
                execute_values(cur, statement, data_tuples)
                
                # Commit changes
                conn.commit() 
                print(f"Data inserted/updated successfully in {table_name}")
    except Exception as e:
        print(f"Error in inserting/updating data for table {table_name}- {e}")

# Load dimension tables and return mapping IDs
def get_dimension_id(df, table_name, lookup_col, db_lookup_col,db_return_col):
    lookup_values = df[lookup_col].drop_duplicates().tolist()
    query = f"SELECT {db_lookup_col}, {db_return_col} FROM m024.p_{table_name} WHERE {db_lookup_col} IN %s"
    mapping = pd.read_sql(query, engine, params=(tuple(lookup_values),))
    return dict(zip(mapping[db_lookup_col], mapping[db_return_col]))

def get_time_dimension_id(df, table_name, lookup_col, db_lookup_cols, db_return_col):
    lookup_values = df[lookup_col].drop_duplicates().tolist()

    # Build query based on date components to avoid precision issues with timestamps
    query = f"""
    SELECT {', '.join(db_lookup_cols)}, {db_return_col}
    FROM m024.p_{table_name}
    WHERE ({', '.join(db_lookup_cols)}) IN %s
    """
    lookup_tuples = [
        (row['year'], row['month'], row['day'], row['hour']) for _, row in df.iterrows()
    ]
    
    mapping = pd.read_sql(query, engine, params=(tuple(lookup_tuples),))
    return dict(zip(mapping[db_lookup_cols], mapping[db_return_col]))


In [None]:
# Transform Data
def transform_data(df):
    #transform gender and load the dimension
    df.gender = df.gender.map(dict(zip([1, 2],['Male','Female']))).fillna('Unknown')
    load_dimension_data(df, 'gender_dimension', ['gender'],['gender_type'])
    df.usertype = df.usertype.apply(lambda x: x if x in ['Subscriber', 'Customer'] else 'Unknown')
    load_dimension_data(df, 'user_type_dimension', ['usertype'],['user_type'])
    #df['birth_year'] = df['birth_year'].apply(lambda x: int(x) if isinstance(x, (int, np.int64)) else 0)
    df.birth_year = df.birth_year.apply(lambda x: x if x > 1940 and x <2013 else 0) # Assuming you need to be atleast 5 to ride the bike
    load_dimension_data(df, 'user_birthyear_dimension', ['birth_year'],['user_birthyear'])

    # Clean station names
    df['start_station_name'] = df['start_station_name'].str.strip().fillna('Unknown')
    df['end_station_name'] = df['end_station_name'].str.strip().fillna('Unknown')

    # Validate Latitude and Longitude
    df['start_station_latitude'] = df['start_station_latitude'].apply(
        lambda x: x if -90 <= x <= 90 else None
    )
    df['start_station_longitude'] = df['start_station_longitude'].apply(
        lambda x: x if -180 <= x <= 180 else None
    )
    df['end_station_latitude'] = df['end_station_latitude'].apply(
        lambda x: x if -90 <= x <= 90 else None
    )
    df['end_station_longitude'] = df['end_station_longitude'].apply(
        lambda x: x if -180 <= x <= 180 else None
    )

    # For missing latitude/longitude values
    df['start_station_latitude'].fillna('Unknown', inplace=True)
    df['start_station_longitude'].fillna('Unknown', inplace=True)
    df['end_station_latitude'].fillna('Unknown', inplace=True)
    df['end_station_longitude'].fillna('Unknown', inplace=True)
    load_dimension_data(df, 'station_dimension',
                         ['start_station_id', 'start_station_name', 'start_station_latitude', 'start_station_longitude'],
                           ['station_key', 'station_name', 'latitude', 'longitude'])
    load_dimension_data(df, 'station_dimension',
                         ['end_station_id', 'end_station_name', 'end_station_latitude', 'end_station_longitude'],
                           ['station_key', 'station_name', 'latitude', 'longitude'])
    
    # Fill missing values
    df['starttime'].fillna('1.1.1900', inplace=True)
    df['stoptime'].fillna('1.1.1900', inplace=True)

    # Convert to datetime format
    df['starttime_dt'] = pd.to_datetime(df['starttime'])
    df['stoptime_dt'] = pd.to_datetime(df['stoptime'])

    # Extract fields for start time
    df['start_date'] = df['starttime_dt'].dt.date
    df['start_year'] = df['starttime_dt'].dt.year
    df['start_month'] = df['starttime_dt'].dt.month
    df['start_day'] = df['starttime_dt'].dt.day
    df['start_hour'] = df['starttime_dt'].dt.hour

    # Extract fields for stop time
    df['stop_date'] = df['stoptime_dt'].dt.date
    df['stop_year'] = df['stoptime_dt'].dt.year
    df['stop_month'] = df['stoptime_dt'].dt.month
    df['stop_day'] = df['stoptime_dt'].dt.day
    df['stop_hour'] = df['stoptime_dt'].dt.hour

    load_dimension_data(df, 'time_dimension',
                         ['starttime', 'start_date', 'start_year', 'start_month', 'start_day', 'start_hour'],
                         ['time', 'date', 'year', 'month', 'day', 'hour'])
    
    load_dimension_data(df, 'time_dimension',
                         ['stoptime', 'stop_date', 'stop_year', 'stop_month', 'stop_day', 'stop_hour'],
                         ['time', 'date', 'year', 'month', 'day', 'hour'])
    
    load_dimension_data(df, 'bike_dimension', ['bikeid'], ['bike_id'])

    df['starttime_dt'] = pd.to_datetime(df['starttime'])
    df['stoptime_dt'] = pd.to_datetime(df['stoptime'])

    # Extract fields for start time
    df['start_date'] = df['starttime_dt'].dt.date
    df['start_year'] = df['starttime_dt'].dt.year
    df['start_month'] = df['starttime_dt'].dt.month
    df['start_day'] = df['starttime_dt'].dt.day
    df['start_hour'] = df['starttime_dt'].dt.hour

    # Extract fields for stop time
    df['stop_date'] = df['stoptime_dt'].dt.date
    df['stop_year'] = df['stoptime_dt'].dt.year
    df['stop_month'] = df['stoptime_dt'].dt.month
    df['stop_day'] = df['stoptime_dt'].dt.day
    df['stop_hour'] = df['stoptime_dt'].dt.hour

    load_dimension_data(df, 'time_dimension',
                         ['starttime', 'start_date', 'start_year', 'start_month', 'start_day', 'start_hour'],
                         ['time', 'date', 'year', 'month', 'day', 'hour'])

    load_dimension_data(df, 'time_dimension',
                         ['stoptime', 'stop_date', 'stop_year', 'stop_month', 'stop_day', 'stop_hour'],
                         ['time', 'date', 'year', 'month', 'day', 'hour'])
    # Calculate trip distance
    df['distance'] = df.apply(lambda row: haversine(
        row['start_station_latitude'], row['start_station_longitude'], 
        row['end_station_latitude'], row['end_station_longitude']
    ), axis=1)

    # Get dimension table mappings
    station_map = get_dimension_id(df, 'station_dimension', 'start_station_id', 'station_key','station_id')
    end_station_map = get_dimension_id(df, 'station_dimension', 'end_station_id', 'station_key','station_id')
    start_time_map = get_dimension_id(df, 'time_dimension', 'starttime', 'time', 'time_id')
    stop_time_map = get_dimension_id(df, 'time_dimension', 'stoptime', 'time', 'time_id')
    user_type_map = get_dimension_id(df, 'user_type_dimension', 'usertype', 'user_type','user_type_id')
    gender_map = get_dimension_id(df, 'gender_dimension', 'gender', 'gender_type','gender_id')
    birth_year_map = get_dimension_id(df, 'user_birthyear_dimension', 'birth_year', 'user_birthyear', 'user_birthyear_id')

    # Map dimension table IDs
    df['start_time_id'] = df['starttime'].map(start_time_map)
    df['end_time_id'] = df['stoptime'].map(stop_time_map)
    df['start_station_id'] = df['start_station_id'].map(station_map)
    df['end_station_id'] = df['end_station_id'].map(end_station_map)
    df['bike_id'] = df['bikeid']
    df['user_type_id'] = df['usertype'].map(user_type_map)
    df['gender_type_id'] = df['gender'].map(gender_map)
    df['user_birthyear_id'] = df['birth_year'].map(birth_year_map)
    df['duration'] = df['tripduration']

    return df[['duration', 'distance', 'start_time_id', 'end_time_id', 'start_station_id', 'end_station_id', 'bike_id', 'user_type_id', 'gender_type_id', 'user_birthyear_id']]

# Load fact table
def load_fact_table(df):
    df.to_sql('m024.p_trip_fact', con=engine, if_exists='append', index=False, method='multi', chunksize=10000)

# Update processed records
def update_processed(df):
    query = "UPDATE m024.citi_bike_data SET processed = TRUE WHERE processed = FALSE"
    
    try:
        with get_psql_connection() as conn:
            with conn.cursor() as cur:
                # Execute the update statement
                cur.execute(query)
                conn.commit()  # Commit the changes
                print("Successfully updated the processed flag.")
    except Exception as e:
        print(f"Error in updating processed flag: {e}")

# Process batch
def process_batch(offset):
    df = extract_data(offset)
    if not df.empty:
        df_transformed = transform_data(df)
        load_fact_table(df_transformed)
        update_processed(df)
        print(f"Processed batch {offset}-{offset+BATCH_SIZE}")


process_batch(0)


Data inserted/updated successfully in gender_dimension
Data inserted/updated successfully in user_type_dimension
Error in inserting/updating data for table user_birthyear_dimension- can't adapt type 'numpy.int64'


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['start_station_latitude'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['start_station_longitude'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object o

Data inserted/updated successfully in station_dimension
Data inserted/updated successfully in station_dimension
Data inserted/updated successfully in time_dimension
Data inserted/updated successfully in time_dimension


ObjectNotExecutableError: Not an executable object: 'UPDATE m024.citi_bike_data SET processed = TRUE WHERE processed = FALSE'