In [None]:
import os # For environment-variable configuration overrides

import pandas as pd # For data manipulation
import psycopg2 # For PostgreSQL connectivity
from psycopg2 import sql # For SQL queries
from psycopg2.extras import execute_values # For bulk insert
from sqlalchemy import create_engine # For database connectivity

# Define constants
# Every setting below may be overridden with an environment variable of the
# same name, so credentials and machine-specific paths do not have to be edited
# into the source. The fallback values are the original hard-coded settings.
CSV_FILE_PATH = os.environ.get(
    'CSV_FILE_PATH',
    '/Users/shawnwee/teams notes_Generation SCTP JDE 05/Week 5 Interim Project/ResaleflatpricesbasedonregistrationdatefromJan2017onwards.csv',
)                                                        # Update your file path
DB_USER = os.environ.get('DB_USER', 'postgres')          # PostgreSQL username
DB_PASS = os.environ.get('DB_PASS', 'admin')             # PostgreSQL password
DB_HOST = os.environ.get('DB_HOST', 'localhost')         # Database host
DB_PORT = os.environ.get('DB_PORT', '5432')              # Database port
DB_NAME = os.environ.get('DB_NAME', 'data_gov_project')  # PostgreSQL database name

# Define date range for filtering (both bounds are inclusive where they are used)
START_DATE = pd.Timestamp('2023-10-01')
END_DATE = pd.Timestamp('2024-09-30')

def _lease_commence_year(raw):
    """Return the lease commencement year for each value in *raw*.

    Values that are already numeric years (e.g. ``1979``) are used as-is.
    They must NOT go through ``pd.to_datetime``: it reads bare integers as
    nanoseconds since the epoch, which turns every year into 1970. Values
    that are not numeric (e.g. ``'1979-01-01'``) are parsed as dates and
    their year is extracted. Anything unparseable becomes NaN.
    """
    years = pd.to_numeric(raw, errors='coerce')

    # Only values that are present but not numeric need the date parser.
    needs_date_parse = years.isna() & raw.notna()
    if needs_date_parse.any():
        parsed_years = pd.to_datetime(
            raw.where(needs_date_parse), errors='coerce'
        ).dt.year
        years = years.where(~needs_date_parse, parsed_years)

    # Keep an integer dtype whenever there are no missing values.
    if years.notna().all():
        years = years.astype('int64')
    return years


def load_data_from_csv(file_path, start_date=None, end_date=None):
    """Load resale transactions from a CSV file and filter them by date range.

    Args:
        file_path: Path (or file-like object) accepted by ``pd.read_csv``.
        start_date: Inclusive lower bound for ``resale_date``. Defaults to the
            module-level ``START_DATE``.
        end_date: Inclusive upper bound for ``resale_date``. Defaults to the
            module-level ``END_DATE``.

    Returns:
        A DataFrame with the columns expected by the ``resale_flat_txn``
        insert, restricted to rows whose ``resale_date`` lies in the range.
        Rows whose ``month`` could not be parsed are dropped by the filter.
    """
    if start_date is None:
        start_date = START_DATE
    if end_date is None:
        end_date = END_DATE

    df = pd.read_csv(file_path)

    # Convert 'month' (YYYY-MM) to a datetime representing resale_date
    df['resale_date'] = pd.to_datetime(df['month'], format='%Y-%m', errors='coerce')

    # Convert numeric columns to appropriate types
    df['floor_area_sqm'] = pd.to_numeric(df['floor_area_sqm'], errors='coerce')
    df['resale_price'] = pd.to_numeric(df['resale_price'], errors='coerce')
    df['lease_commence_year'] = _lease_commence_year(df['lease_commence_date'])

    # resale_id is the 1-based row position in the full CSV. It is assigned
    # before filtering, so the ids of the returned rows need not start at 1.
    df['resale_id'] = range(1, len(df) + 1)

    # Rename columns for PostgreSQL compatibility
    processed_df = df.rename(columns={'town': 'town_name', 'block': 'block_no'})

    # Select relevant columns, in the order used by the insert statement
    processed_df = processed_df[['resale_id', 'resale_date', 'town_name', 'flat_type', 'block_no',
                                 'street_name', 'storey_range', 'floor_area_sqm',
                                 'flat_model', 'lease_commence_year', 'remaining_lease',
                                 'resale_price']]

    # Keep rows inside the inclusive date range (NaT never matches)
    in_range = processed_df['resale_date'].between(start_date, end_date)
    filtered_df = processed_df[in_range].copy()

    print(f"Filtered records count: {len(filtered_df)}")
    print("Unique resale prices:", filtered_df['resale_price'].unique())

    return filtered_df

def load_data_to_postgres(data_frame):
    """Bulk-insert the provided DataFrame into the ``resale_flat_txn`` table.

    The DataFrame's columns must be in the same order as the column list of
    the insert statement below. Missing values (NaN/NaT) are sent as SQL NULL.

    Args:
        data_frame: DataFrame of rows to insert.

    Returns:
        None. Success or failure of the insert is reported on stdout; an
        insert error is printed rather than raised, and the transaction is
        rolled back. A failure to connect still raises.
    """
    # Prepare the insert query; execute_values expands the single %s into
    # the batched VALUES list.
    insert_query = sql.SQL("""
        INSERT INTO resale_flat_txn (resale_id, resale_date, town_name, flat_type, block_no, street_name, 
                                      storey_range, floor_area_sqm, flat_model, lease_commence_year, 
                                      remaining_lease, resale_price)
        VALUES %s
    """)

    # Build the row tuples before opening a connection so a conversion
    # problem cannot leak one. Casting to object lets NaN/NaT be replaced by
    # None, which the driver stores as NULL.
    cleaned = data_frame.astype(object).where(data_frame.notna(), None)
    data_tuples = list(cleaned.itertuples(index=False, name=None))

    conn = psycopg2.connect(host=DB_HOST, port=DB_PORT, database=DB_NAME,
                            user=DB_USER, password=DB_PASS)
    try:
        # `with conn` commits on success and rolls back on error; the cursor
        # context closes the cursor. Neither closes the connection itself.
        with conn, conn.cursor() as cur:
            # Insert in batches using execute_values
            execute_values(cur, insert_query, data_tuples, template=None, page_size=1000)
        print(f"Successfully loaded {len(data_tuples)} records to PostgreSQL.")
    except Exception as e:
        print(f"Error loading data into PostgreSQL: {e}")
    finally:
        conn.close()

def main():
    """Run the full pipeline: read and filter the CSV, then load it into PostgreSQL."""
    print("Starting the script...")

    # Extract the in-range resale records and hand them straight to the loader.
    load_data_to_postgres(load_data_from_csv(CSV_FILE_PATH))

    print("Script completed.")

# Execute the script only when this file is run directly; importing it as a
# module does not trigger the load.
if __name__ == "__main__":
    main()

Starting the script...
Filtered records count: 27980
Unique resale prices: [ 341800.  388000.  395000. ... 1200888.  534388.  458988.]
Successfully loaded 27980 records to PostgreSQL.
Script completed.
