Due to large volume of input data, we cannot upload it to Github.
As a workaround, we will load the data into Postgres DB locally.
As a first step, we will load the required dependencies and create the database connection.

In [None]:
import os
from sqlalchemy import create_engine
from tqdm import tqdm
import psycopg2
import json
import pandas as pd

with open('../credentials.json') as f:
    data = json.load(f)
    psql_config = {
    'dbname': data['db_name'],
    'user': data['db_user'],
    'password': data['db_pwd'],
    'host': data['db_host'],
    'port': 5432
}


def get_psql_connection():
    try:
        conn = psycopg2.connect(**psql_config)
    except Exception as e:
        print("Error connecting to the database:", e)
    return conn

Next, we define and execute the functions to create the staging table and truncate it incase it already exists.

In [None]:
def truncate_citibike_data_table():
    try:
        statement = """truncate table m024._bike_data;"""
        with get_psql_connection() as conn:
            with conn.cursor() as cur:
                cur.execute(statement)
            conn.commit()  
            print(f"citibike_data table truncated successfully")
    except Exception as e:
        print(f"Error in truncating citibike table - {e}")

def create_citibike_data_table():
    try:
        statement = """create table if not exists m024.citi_bike_data
(
    tripduration            integer,
    starttime               timestamp,
    stoptime                timestamp,
    start_station_id        integer,
    start_station_name      varchar(255),
    start_station_latitude  double precision,
    start_station_longitude double precision,
    end_station_id          integer,
    end_station_name        varchar(255),
    end_station_latitude    double precision,
    end_station_longitude   double precision,
    bikeid                  integer,
    usertype                varchar(50),
    birth_year              integer,
    gender                  integer,
    processed               boolean
);
        """
        with get_psql_connection() as conn:
            with conn.cursor() as cur:
                cur.execute(statement)
            conn.commit()  
            print(f"citibike_data table created successfully/already exists")
    except Exception as e:
        print(f"Error in creating citibike table - {e}")

def insert_data_to_citibike_table(df):
    try:
        with get_psql_connection() as conn:
            with conn.cursor() as cur:
                # Prepare the insert statement
                insert_query = """
                INSERT INTO m024.citi_bike_data (
                    tripduration, starttime, stoptime, start_station_id, start_station_name, 
                    start_station_latitude, start_station_longitude, end_station_id, 
                    end_station_name, end_station_latitude, end_station_longitude, bikeid, 
                    usertype, birth_year, gender
                ) 
                VALUES %s
                ON CONFLICT DO NOTHING;
                """
                # Convert DataFrame rows into tuples
                rows = [
                    (
                        row['tripduration'], row['starttime'], row['stoptime'], row['start station id'], 
                        row['start station name'], row['start station latitude'], row['start station longitude'], row['end station id'],
                        row['end station name'], row['end station latitude'], row['end station longitude'],
                        row['bikeid'], row['usertype'], row['birth year'], row['gender']
                    )
                    for _, row in df.iterrows()
                ]

                # Use psycopg2.extras.execute_values for efficient bulk insert
                from psycopg2.extras import execute_values
                execute_values(cur, insert_query, rows)
            
            conn.commit()
            #print(f"Data inserted successfully")
    except Exception as e:
        print(f"Error in inserting data: {e}") 

def load_processed_files(log_file="processed_files.csv"):
    """Load the processed files list from the log file."""
    if os.path.exists(log_file):
        processed_files_df = pd.read_csv(log_file)
        return set(processed_files_df['file_name'].tolist())
    else:
        # If no log file exists, return an empty set
        return set()

def save_processed_file(file_name, log_file="processed_files.csv"):
    """Save the processed file to the log."""
    processed_files_df = pd.DataFrame({'file_name': [file_name]})
    if os.path.exists(log_file):
        processed_files_df.to_csv(log_file, mode='a', header=False, index=False)
    else:
        processed_files_df.to_csv(log_file, index=False)

In [4]:
engine = create_engine(f'postgresql://{psql_config["user"]}:{psql_config["password"]}@{psql_config["host"]}/{psql_config["dbname"]}')

create_citibike_data_table()
truncate_citibike_data_table()

citibike_data table created successfully/already exists
Error in truncating citibike table - relation "m024._bike_data" does not exist



Finally, we will start processing the csv files one by one and maintain a log to track the already processed files to avoid re-processing them.

In [None]:
folder_path="../2018-citibike-tripdata"
log_file="processed_files.csv"

csv_files = sorted([file for file in os.listdir(folder_path) if file.endswith("tripdata.csv")])
processed_files = load_processed_files(log_file)
for file in tqdm(csv_files, desc="Processing CSV files in alphabetical order"):
    try:
        # Skip already processed files
        if file in processed_files:
            print(f"Skipping already processed file: {file}")
            continue

        file_path = os.path.join(folder_path, file)
        
        # Read the CSV file
        print(f"Processing file: {file_path}")
        df = pd.read_csv(file_path,on_bad_lines='skip')

        required_columns = [
            'tripduration', 'starttime', 'stoptime', 'start station id', 
            'start station name', 'start station latitude', 
            'start station longitude', 'end station id', 'end station name', 
            'end station latitude', 'end station longitude'
        ]
        if not set(required_columns).issubset(df.columns):
            raise ValueError(f"Missing columns in {file_path}. Required Columns: {required_columns} and columns in file {df.columns}")
        insert_data_to_citibike_table(df)
        # After processing, log the file as processed
        save_processed_file(file, log_file)
    except Exception as e:
        print(f"Error processing file {file}: {e}")