In [2]:
from sqlalchemy import create_engine, text
import pandas as pd

# connect to PostgreSQL 
engine = create_engine("postgresql+psycopg2://postgres:123@localhost:5432/5310Group")
conn = engine.connect()

In [4]:
# Create tables
create_table_sql = """
DROP TABLE IF EXISTS schools,school_Locations,administrations,principals,school_administrations,school_principals CASCADE;


CREATE TABLE schools (
    ats_system_code VARCHAR(20) PRIMARY KEY,
    school_name VARCHAR(255) NOT NULL,
    managed_by_name VARCHAR(100),
    school_type VARCHAR(100),
    school_category_description VARCHAR(100),
    open_date DATE,
    status_description VARCHAR(50)
);

CREATE TABLE school_Locations (
    school_location_id SERIAL PRIMARY KEY,
    ats_system_code VARCHAR(20) REFERENCES schools(ats_system_code) ON DELETE CASCADE,
    location_id INT REFERENCES Unified_Locations(location_id),
    primary_address_line_1 VARCHAR(255),
    x_coordinate FLOAT,
    y_coordinate FLOAT,
    UNIQUE(ats_system_code, location_id)
);

CREATE TABLE administrations (
admin_id SERIAL PRIMARY KEY,
    admin_district_location_code VARCHAR(50),
    administrative_district_name VARCHAR(100),
    school_support_team_name VARCHAR(255),
    community_district VARCHAR(50),
    census_tract VARCHAR(50)
);

CREATE TABLE principals (
    principal_id SERIAL PRIMARY KEY,
    principal_name VARCHAR(100),
    principal_title VARCHAR(50),
    principal_phone_number VARCHAR(20),
    fax_number VARCHAR(20)
);

CREATE TABLE school_administrations (
    ats_system_code VARCHAR(20) REFERENCES schools(ats_system_code) ON DELETE CASCADE,
    admin_id INT REFERENCES administrations(admin_id) ON DELETE CASCADE,
    PRIMARY KEY (ats_system_code, admin_id)
);

CREATE TABLE school_principals (
    ats_system_code VARCHAR(20) REFERENCES schools(ats_system_code) ON DELETE CASCADE,
    principal_id INT REFERENCES principals(principal_id) ON DELETE CASCADE,
    PRIMARY KEY (ats_system_code, principal_id)
);

"""

# to execute
with engine.begin() as conn:
    conn.execute(text(create_table_sql))

In [6]:
import pandas as pd
from sqlalchemy import create_engine

# read data
df = pd.read_excel("/Users/celine/Desktop/School_Locations.xlsx")
df.columns = [col.strip() for col in df.columns]
df['open_date'] = pd.to_datetime(df['open_date'], errors='coerce')
# Standarize precision to 6 digits
df['latitude'] = df['latitude'].round(6)
df['longitude'] = df['longitude'].round(6)

In [8]:
df.head(5)

Unnamed: 0,ats_system_code,school_name,managed_by_name,school_category_description,school_type,open_date,status_description,primary_address_line_1,x_coordinate,y_coordinate,...,principal_name,principal_title,principal_phone_number,fax_number,admin_district_location_code,administrative_district_name,school_support_team_name,latitude,longitude,zipcode
0,01M015,P.S. 015 Roberto Clemente,DOE,General Academic,Elementary,2007-01-04,Open,333 EAST 4 STREET,990141,202349,...,IRENE SANCHEZ,PRINCIPAL,212-228-8730,212-477-0931,M801,COMMUNITY SCHOOL DISTRICT 01,"PHILLIPS, DANIELLA",40.722075,-73.978747,10009
1,01M019,P.S. 019 Asher Levy,DOE,General Academic,Elementary,2056-07-01,Open,185 1 AVENUE,988547,205239,...,JACQUELINE FLANAGAN,PRINCIPAL,212-533-5340,212-673-1477,M801,COMMUNITY SCHOOL DISTRICT 01,"PHILLIPS, DANIELLA",40.730009,-73.984496,10003
2,01M020,P.S. 020 Anna Silver,DOE,General Academic,Elementary,2063-07-01,Open,166 ESSEX STREET,988044,202068,...,Carmen Colon,PRINCIPAL,212-254-9577,212-254-3526,M801,COMMUNITY SCHOOL DISTRICT 01,"PHILLIPS, DANIELLA",40.721305,-73.986312,10002
3,01M034,P.S. 034 Franklin D. Roosevelt,DOE,General Academic,K-8,2055-07-01,Open,730 EAST 12 STREET,991163,203782,...,Rosemarie Gonzalez,PRINCIPAL,212-228-4433,212-353-1973,M801,COMMUNITY SCHOOL DISTRICT 01,"PHILLIPS, DANIELLA",40.726008,-73.975058,10009
4,01M063,The STAR Academy - P.S.63,DOE,General Academic,Elementary,2007-01-05,Open,121 EAST 3 STREET,988071,203210,...,DARLENE CAMERON,PRINCIPAL,212-674-3180,212-420-9018,M801,COMMUNITY SCHOOL DISTRICT 01,"PHILLIPS, DANIELLA",40.72444,-73.986214,10009


In [10]:
# -------- Insert data into schools --------
school_cols = [
    'ats_system_code', 'school_name', 'managed_by_name', 'school_type',
    'school_category_description', 'open_date', 'status_description'
]
schools_df = df[school_cols].drop_duplicates()
schools_df.to_sql("schools", engine, if_exists='append', index=False, method='multi')
print("data successfully inserted into schools")

data successfully inserted into schools


In [12]:
!pip install sqlalchemy
school_data=df



In [14]:
#Insert data into unified_locations 
from decimal import Decimal
from sqlalchemy import text
import pandas as pd

# Ensure latitude and longitude are of Decimal type, and zipcode is of CHAR(5).
school_data["latitude"] = school_data["latitude"].apply(lambda x: Decimal(str(round(float(x), 6))))
school_data["longitude"] = school_data["longitude"].apply(lambda x: Decimal(str(round(float(x), 6))))
school_data["zipcode"] = school_data["zipcode"].astype(str).str.zfill(5)

# Retrieve the current maximum number of location_id
with engine.begin() as conn:
    max_id_result = conn.execute(text("SELECT MAX(location_id) FROM unified_locations")).scalar()
    max_existing_location_id = max_id_result if max_id_result is not None else 0

print(f" the current maximum number of location_id : {max_existing_location_id}")

# Construct a coordinate-to-id mapping
with engine.begin() as conn:
    existing_coords = conn.execute(text("SELECT latitude, longitude, location_id FROM unified_locations")).fetchall()
existing_map = { (r[0], r[1]): r[2] for r in existing_coords }


location_id_map = dict(existing_map)
next_id = max_existing_location_id + 1
new_count = 0

with engine.begin() as conn:
    for _, row in school_data[["latitude", "longitude", "zipcode"]].drop_duplicates().iterrows():
        lat, lon, zipc = row["latitude"], row["longitude"], row["zipcode"]
        key = (lat, lon)

        if key in location_id_map:
            continue  

        # Inserting new location_id
        conn.execute(text("""
            INSERT INTO unified_locations (location_id, latitude, longitude, zipcode)
            VALUES (:id, :lat, :lon, :zip)
        """), {"id": next_id, "lat": lat, "lon": lon, "zip": zipc})

        location_id_map[key] = next_id
        next_id += 1
        new_count += 1

print(f" data successfully inserted into unified_locations.  {new_count} records added. total recoreds of location_id_map：{len(location_id_map)}")

 the current maximum number of location_id : 493
 data successfully inserted into unified_locations.  1245 records added. total recoreds of location_id_map：1738


In [16]:
#Insert data into school_locations
with engine.begin() as conn:
    for _, row in school_data.iterrows():
        lat = Decimal(str(round(float(row["latitude"]), 6)))
        lon = Decimal(str(round(float(row["longitude"]), 6)))
        location_id = location_id_map[(lat, lon)]

        insert_sql = text("""
            INSERT INTO school_locations (
                ats_system_code, location_id, primary_address_line_1,
                x_coordinate, y_coordinate
            ) VALUES (
                :ats, :loc, :addr, :x, :y
            )
            ON CONFLICT (ats_system_code, location_id) DO NOTHING
        """)

        conn.execute(insert_sql, {
            "ats": row["ats_system_code"],
            "loc": location_id,
            "addr": row["primary_address_line_1"],
            "x": row["x_coordinate"],
            "y": row["y_coordinate"]
        })

print("Step 3: data sucecessfully inserted into school_locations ")

Step 3: data sucecessfully inserted into school_locations 


In [18]:
#Insert data into administrations and school_administrations
admin_id_map = {}
admin_records = school_data[[
    "admin_district_location_code",
    "administrative_district_name",
    "school_support_team_name",
    "community_district",
    "census_tract"
]].drop_duplicates()

with engine.begin() as conn:
    for _, row in admin_records.iterrows():
        key = (
            row["admin_district_location_code"],
            row["administrative_district_name"],
            row["school_support_team_name"],
            row["community_district"],
            row["census_tract"]
        )

        insert_sql = text("""
            INSERT INTO administrations (
                admin_district_location_code,
                administrative_district_name,
                school_support_team_name,
                community_district,
                census_tract
            ) VALUES (
                :code, :name, :team, :district, :tract
            )
            RETURNING admin_id
        """)

        result = conn.execute(insert_sql, {
            "code": key[0],
            "name": key[1],
            "team": key[2],
            "district": key[3],
            "tract": key[4]
        })
        admin_id_map[key] = result.scalar()

with engine.begin() as conn:
    for _, row in school_data.iterrows():
        ats = row["ats_system_code"]
        admin_key = (
            row["admin_district_location_code"],
            row["administrative_district_name"],
            row["school_support_team_name"],
            row["community_district"],
            row["census_tract"]
        )
        admin_id = admin_id_map.get(admin_key)
        if admin_id:
            conn.execute(text("""
                INSERT INTO school_administrations (ats_system_code, admin_id)
                VALUES (:ats, :aid)
                ON CONFLICT DO NOTHING
            """), {"ats": ats, "aid": admin_id})

print("✅ Step 4: data sucecessfully inserted into administrations and school_administrations ")

✅ Step 4: data sucecessfully inserted into administrations and school_administrations 


In [20]:
#Step 5：Insert data into principals and school_principals
principal_id_map = {}
principal_records = school_data[[
    "principal_name",
    "principal_title",
    "principal_phone_number",
    "fax_number"
]].drop_duplicates()

with engine.begin() as conn:
    for _, row in principal_records.iterrows():
        key = (
            row["principal_name"],
            row["principal_title"],
            row["principal_phone_number"],
            row["fax_number"]
        )

        insert_sql = text("""
            INSERT INTO principals (
                principal_name, principal_title,
                principal_phone_number, fax_number
            ) VALUES (
                :name, :title, :phone, :fax
            )
            RETURNING principal_id
        """)

        result = conn.execute(insert_sql, {
            "name": key[0],
            "title": key[1],
            "phone": key[2],
            "fax": key[3]
        })
        principal_id_map[key] = result.scalar()

with engine.begin() as conn:
    for _, row in school_data.iterrows():
        ats = row["ats_system_code"]
        principal_key = (
            row["principal_name"],
            row["principal_title"],
            row["principal_phone_number"],
            row["fax_number"]
        )
        principal_id = principal_id_map.get(principal_key)
        if principal_id:
            conn.execute(text("""
                INSERT INTO school_principals (ats_system_code, principal_id)
                VALUES (:ats, :pid)
                ON CONFLICT DO NOTHING
            """), {"ats": ats, "pid": principal_id})

print(" Step 5: data sucecessfully inserted into principals and school_principals")

 Step 5: data sucecessfully inserted into principals and school_principals


In [22]:
#test
with engine.begin() as conn:
    tables = [
        "unified_locations",
        "schools",
        "school_locations",
        "administrations",
        "school_administrations",
        "principals",
        "school_principals"
    ]
    
    for table in tables:
        result = conn.execute(text(f"SELECT COUNT(*) FROM {table}"))
        count = result.scalar()
        print(f"✅ Table {table} inserted {count} records")

✅ Table unified_locations inserted 1738 records
✅ Table schools inserted 1835 records
✅ Table school_locations inserted 1835 records
✅ Table administrations inserted 1354 records
✅ Table school_administrations inserted 1835 records
✅ Table principals inserted 1826 records
✅ Table school_principals inserted 1835 records
