In [None]:
#Insert Property Values

In [2]:
# Environment Setting Up
!pip install psycopg2-binary sqlalchemy pandas
from sqlalchemy import create_engine, text
from sqlalchemy import create_engine
from sqlalchemy import text
import pandas as pd
from decimal import Decimal



In [4]:
# Connect to PostgreSQL 
engine = create_engine("postgresql+psycopg2://postgres:123@localhost:5432/5310Group")
conn = engine.connect()

In [54]:
# Create Tables
create_table_sql = """
DROP TABLE IF EXISTS neighborhoods, properties, house_locations, sale CASCADE;

CREATE TABLE neighborhoods (
    neighborhood_id SERIAL PRIMARY KEY,
    neighborhood_name VARCHAR(100),
    borough_id INT
);

CREATE TABLE properties (
    property_id SERIAL PRIMARY KEY,
    tax_class_present INT,
    building_class_present VARCHAR(10),
    year_built INT,
    residential_units INT,
    commercial_units INT,
    total_units INT,
    land_square_feet INT,
    gross_square_feet INT
);

CREATE TABLE house_locations (
    house_location_id SERIAL PRIMARY KEY,
    property_id INT REFERENCES properties(property_id) ON DELETE CASCADE,
    location_id INT REFERENCES unified_locations(location_id),
    address VARCHAR(255),
    block INT,
    neighborhood_id INT REFERENCES neighborhoods(neighborhood_id)
);

CREATE TABLE sale (
    sale_id SERIAL PRIMARY KEY,
    property_id INT REFERENCES properties(property_id),
    sale_price BIGINT,
    sale_date DATE
);

"""

# to execute
with engine.begin() as conn:
    conn.execute(text(create_table_sql))

In [4]:
#Insert Values

In [62]:
# Read Data 
property_df = pd.read_excel("/Users/celine/Desktop/property.xlsx")
# Normalization names
property_df.columns = property_df.columns.str.lower().str.strip()
# Clean data
property_df["latitude"] = property_df["latitude"].apply(lambda x: Decimal(str(round(float(x), 6))))
property_df["longitude"] = property_df["longitude"].apply(lambda x: Decimal(str(round(float(x), 6))))
property_df["zipcode"] = property_df["zipcode"].astype(str).str.zfill(5)

In [64]:
# Step 1：Insert data into unified_locations

# Extract the deduplicated coordinates
location_df = property_df[["latitude", "longitude", "zipcode"]].drop_duplicates()

# Insert into unified_locations table and construct location_id mapping
location_id_map = {}

with engine.begin() as conn:
    for _, row in location_df.iterrows():
        result = conn.execute(text("""
            INSERT INTO unified_locations (latitude, longitude, zipcode)
            VALUES (:lat, :lon, :zip)
            ON CONFLICT (latitude, longitude) DO NOTHING
            RETURNING location_id
        """), {
            "lat": row["latitude"],
            "lon": row["longitude"],
            "zip": row["zipcode"]
        })
        
        location_id = result.scalar()
        
        # If no value is returned (it is skipped), manually look for location_id
        if location_id is None:
            location_id = conn.execute(text("""
                SELECT location_id FROM unified_locations
                WHERE latitude = :lat AND longitude = :lon
            """), {
                "lat": row["latitude"],
                "lon": row["longitude"]
            }).scalar()
        
        # Save the mapping: to facilitate subsequent insertion into the house_locations table
        location_id_map[(row["latitude"], row["longitude"])] = location_id

print("data successfully inserted into unified_locations")

data successfully inserted into unified_locations


In [68]:
# Step 2：Insert data into neighborhoods
# Extract unique neighborhoods
neighborhoods_df = property_df[["neighborhood_name", "borough"]].drop_duplicates()
neighborhoods_df = neighborhoods_df.rename(columns={"borough": "borough_id"})

with engine.begin() as conn:
    for _, row in neighborhoods_df.iterrows():
        conn.execute(text("""
            INSERT INTO neighborhoods (neighborhood_name, borough_id)
            VALUES (:name, :borough)
            ON CONFLICT DO NOTHING
        """), {
            "name": row["neighborhood_name"],
            "borough": int(row["borough_id"])
        })

print(f" Data successfully inserted into neighborhoods")

 Data successfully inserted into neighborhoods


In [72]:
# Step 3：Insert data into properties 
# Fill in N/A values
property_df.fillna({
    'tax_class_present': 0,
    'year_built': 0,
    'residential_units': 0,
    'commercial_units': 0,
    'total_units': 0,
    'land_square_feet': 0,
    'gross_square_feet': 0
}, inplace=True)

property_id_map = {}

with engine.begin() as conn:
    for _, row in property_df.iterrows():
        result = conn.execute(text("""
            INSERT INTO properties (
                tax_class_present, building_class_present, year_built,
                residential_units, commercial_units, total_units,
                land_square_feet, gross_square_feet
            ) VALUES (
                :tax, :bld, :year, :res, :com, :total, :land, :gross
            )
            RETURNING property_id
        """), {
            "tax": int(row["tax_class_present"]),
            "bld": row["building_class_present"],
            "year": int(row["year_built"]),
            "res": int(row["residential_units"]),
            "com": int(row["commercial_units"]),
            "total": int(row["total_units"]),
            "land": int(row["land_square_feet"]),
            "gross": int(row["gross_square_feet"])
        })

        # Save property_id, corresponding to address
        property_id_map[row["address"]] = result.scalar()

print(f"✅ Inserted {len(property_id_map)} unique properties into `properties` table.")

✅ Inserted 105790 unique properties into `properties` table.


In [76]:
# Step 4：Insert data into house_locations
with engine.begin() as conn:
    for _, row in property_df.iterrows():
        conn.execute(text("""
            INSERT INTO house_locations (
                property_id, location_id, address, block, neighborhood_id
            ) VALUES (
                :prop_id, :loc_id, :addr, :blk,
                (
                    SELECT neighborhood_id FROM neighborhoods 
                    WHERE neighborhood_name = :nb AND borough_id = :borough
                    LIMIT 1
                )
            )
        """), {
            "prop_id": property_id_map[row["address"]],
            "loc_id": location_id_map[(row["latitude"], row["longitude"])],
            "addr": row["address"],
            "blk": int(row["block"]),
            "nb": row["neighborhood_name"],
            "borough": int(row["borough"])
        })
print(f" Data successfully inserted into house_locations")

 Data successfully inserted into house_locations


In [78]:
# Step 5：Insert data into sale 
with engine.begin() as conn:
    for _, row in property_df.iterrows():
        conn.execute(text("""
            INSERT INTO sale (property_id, sale_price, sale_date)
            VALUES (:prop_id, :price, :date)
        """), {
            "prop_id": property_id_map[row["address"]],
            "price": int(row["sale_price"]) if pd.notnull(row["sale_price"]) else 0,
            "date": row["sale_date"]
        })
print(f"data successfully inserted into sale!")

data successfully inserted into sale!


In [80]:
#Check Tables
#pd.read_sql("SELECT * FROM unified_locations ORDER BY location_id DESC LIMIT 10", engine)
#pd.read_sql("SELECT * FROM sale ORDER BY sale_id DESC LIMIT 10", engine)

Unnamed: 0,sale_id,property_id,sale_price,sale_date
0,130576,130583,672043,2008-07-18
1,130575,130582,610950,2018-07-23
2,130574,130581,793216,2018-05-25
3,130573,130580,680000,2018-11-28
4,130572,130579,0,2018-10-31
5,130571,130578,0,2018-01-25
6,130570,130577,630000,2018-07-24
7,130569,130576,675000,2018-12-28
8,130568,130576,0,2010-09-18
9,130567,130576,0,2009-09-18
