In [None]:
import os
from io import StringIO

import pandas as pd
import psycopg2
from psycopg2 import sql
from psycopg2.extras import execute_values
from sqlalchemy import create_engine, text

# Define constants
# The input path and connection settings can be overridden with RESALE_*
# environment variables so that credentials and machine-specific paths do not
# have to live in source control. The fallbacks are the original hard-coded
# values, so behaviour is unchanged when no variable is set.
CSV_FILE_PATH = os.environ.get(
    'RESALE_CSV_PATH',
    'C:/Users/yvonn/Downloads/Resale_Flat_Txn20241201.csv',
)
DB_USER = os.environ.get('RESALE_DB_USER', 'postgres')
# NOTE(review): the 'admin' fallback is kept only for backward compatibility;
# set RESALE_DB_PASS outside of local development.
DB_PASS = os.environ.get('RESALE_DB_PASS', 'admin')
DB_HOST = os.environ.get('RESALE_DB_HOST', 'localhost')
# Kept as a string because it is interpolated directly into the database URL.
DB_PORT = os.environ.get('RESALE_DB_PORT', '5432')
DB_NAME = os.environ.get('RESALE_DB_NAME', 'Data_Gov_Project')

# Define date range for filtering (both bounds are compared inclusively)
START_DATE = pd.Timestamp('2023-10-01')
END_DATE = pd.Timestamp('2024-09-30')

def load_data_from_csv(file_path, start_date=None, end_date=None):
    """Load resale transactions from a CSV file and filter them by date range.

    The CSV only carries a year-month value in its 'month' column. Each row is
    given a synthetic day within that month: the first of the month plus
    ``row index % days_in_month`` days, so rows are spread over the month.

    Args:
        file_path: Path (or file-like object) accepted by ``pandas.read_csv``.
        start_date: Inclusive lower bound for 'resale_date'. Defaults to the
            module-level START_DATE.
        end_date: Inclusive upper bound for 'resale_date'. Defaults to the
            module-level END_DATE.

    Returns:
        A DataFrame sorted by 'resale_date' with the columns resale_date,
        town_name, flat_type, block_no, street_name, storey_range,
        floor_area_sqm, flat_model, lease_commence_year, remaining_lease and
        resale_price.

    Raises:
        KeyError: If an expected column is missing from the CSV.
        ValueError: If a 'month' value does not match the '%Y-%m' format.
    """
    # Resolve the defaults at call time so the module constants stay the
    # fallback without being baked into the signature.
    if start_date is None:
        start_date = START_DATE
    if end_date is None:
        end_date = END_DATE

    df = pd.read_csv(file_path)

    # Convert 'month' to the first day of that month.
    month_start = pd.to_datetime(df['month'], format='%Y-%m')

    # Spread rows across the days of their month. This is the vectorized
    # equivalent of picking element (index % days_in_month) from a per-row
    # daily date_range, without building one date_range per row.
    row_label = pd.Series(df.index, index=df.index)
    day_offset = row_label % month_start.dt.days_in_month
    df['resale_date'] = month_start + pd.to_timedelta(day_offset, unit='D')

    # Convert numeric columns to appropriate types
    df['floor_area_sqm'] = pd.to_numeric(df['floor_area_sqm'], errors='coerce')
    df['resale_price'] = pd.to_numeric(df['resale_price'], errors='coerce')

    # A numeric 'lease_commence_date' already holds the year. Passing integers
    # to pd.to_datetime would interpret them as nanoseconds since the epoch and
    # yield 1970 for every row, so only non-numeric columns are date-parsed.
    lease = df['lease_commence_date']
    if pd.api.types.is_numeric_dtype(lease):
        lease_year = lease
    else:
        lease_year = pd.to_datetime(lease, errors='coerce').dt.year
    # Nullable integer dtype keeps missing years as <NA> instead of turning
    # the whole column into floats (which would serialize as e.g. "1979.0").
    df['lease_commence_year'] = (
        pd.to_numeric(lease_year, errors='coerce').round().astype('Int64')
    )

    # Rename columns for PostgreSQL compatibility
    processed_df = df.rename(columns={
        'town': 'town_name',
        'block': 'block_no',
    })

    # Select relevant columns for PostgreSQL, excluding 'resale_id'
    processed_df = processed_df[['resale_date', 'town_name', 'flat_type', 'block_no',
                                 'street_name', 'storey_range', 'floor_area_sqm',
                                 'flat_model', 'lease_commence_year', 'remaining_lease',
                                 'resale_price']]

    # Keep only rows inside the inclusive date range; rows with a missing
    # date compare False on both sides and are dropped.
    in_range = ((processed_df['resale_date'] >= start_date) &
                (processed_df['resale_date'] <= end_date))
    filtered_df = processed_df[in_range].sort_values('resale_date')

    print(f"Filtered records count: {len(filtered_df)}")
    print("Unique resale prices:", filtered_df['resale_price'].unique())

    return filtered_df

def load_data_to_postgres(data_frame):
    """Bulk-load the provided DataFrame into the resale_flat_txn table.

    The rows are streamed with COPY into a temporary staging table and then
    inserted into resale_flat_txn ordered by resale_date. Everything runs in a
    single transaction, so a failure leaves the target table untouched.

    Args:
        data_frame: DataFrame containing the columns resale_date, town_name,
            flat_type, block_no, street_name, storey_range, floor_area_sqm,
            flat_model, lease_commence_year, remaining_lease and resale_price.

    Raises:
        KeyError: If data_frame lacks one of the expected columns.
        Database errors from SQLAlchemy/psycopg2 propagate to the caller after
        the transaction has been rolled back.
    """
    columns = [
        'resale_date', 'town_name', 'flat_type', 'block_no', 'street_name',
        'storey_range', 'floor_area_sqm', 'flat_model', 'lease_commence_year',
        'remaining_lease', 'resale_price',
    ]
    column_list = ', '.join(columns)

    # Select the columns explicitly so the serialized field order always
    # matches the COPY column list, whatever order the caller's frame uses.
    staged = data_frame[columns].copy()
    # A float year column would be written as "1979.0", which the INTEGER
    # staging column rejects; the nullable integer dtype writes "1979" and
    # leaves missing values empty.
    staged['lease_commence_year'] = staged['lease_commence_year'].astype('Int64')

    # Serialize as real CSV. In COPY's csv format an unquoted empty field is
    # NULL, which is what pandas writes for missing values, and quoted fields
    # are understood. (The text format expects \N for NULL and has no notion
    # of CSV quoting.)
    buffer = StringIO()
    staged.to_csv(buffer, header=False, index=False, date_format='%Y-%m-%d')
    buffer.seek(0)

    engine = create_engine(f'postgresql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}')
    try:
        with engine.connect() as conn:
            # Start a transaction
            with conn.begin():
                # Create a temporary table
                conn.execute(text("""
                    CREATE TEMPORARY TABLE temp_resale_flat_txn (
                        resale_date DATE,
                        town_name VARCHAR(255),
                        flat_type VARCHAR(50),
                        block_no VARCHAR(50),
                        street_name VARCHAR(255),
                        storey_range VARCHAR(50),
                        floor_area_sqm NUMERIC,
                        flat_model VARCHAR(100),
                        lease_commence_year INTEGER,
                        remaining_lease VARCHAR(100),
                        resale_price NUMERIC
                    )
                """))

                # COPY needs the raw DBAPI cursor; it shares the connection
                # (and therefore the transaction and temp table) with `conn`.
                # The context manager closes the cursor when the copy is done.
                with conn.connection.cursor() as cursor:
                    cursor.copy_expert(
                        f"COPY temp_resale_flat_txn ({column_list}) "
                        "FROM STDIN WITH (FORMAT csv)",
                        buffer,
                    )

                # Insert data from temporary table to the main table
                conn.execute(text("""
                    INSERT INTO resale_flat_txn (
                        resale_date, town_name, flat_type, block_no, street_name,
                        storey_range, floor_area_sqm, flat_model, lease_commence_year,
                        remaining_lease, resale_price
                    )
                    SELECT 
                        resale_date, town_name, flat_type, block_no, street_name,
                        storey_range, floor_area_sqm, flat_model, lease_commence_year,
                        remaining_lease, resale_price
                    FROM temp_resale_flat_txn
                    ORDER BY resale_date
                """))

                # Drop the temporary table
                conn.execute(text("DROP TABLE temp_resale_flat_txn"))
    finally:
        # Release pooled connections even when the load fails.
        engine.dispose()

    print(f"Successfully loaded {len(data_frame)} records to PostgreSQL.")

def main():
    """Run the pipeline: read and filter the CSV, then load it into PostgreSQL."""
    print("Starting the script...")

    # Feed the filtered transactions straight into the database loader.
    load_data_to_postgres(load_data_from_csv(CSV_FILE_PATH))

    print("Script completed.")


# Only run the pipeline when executed as a script, not when imported.
if __name__ == "__main__":
    main()

Starting the script...
Filtered records count: 27980
Unique resale prices: [ 341800.  388000.  395000. ... 1200888.  534388.  458988.]
Successfully loaded 27980 records to PostgreSQL.
Script completed.
