In [44]:
!pip install pulp scikit-learn



In [45]:
import pandas as pd
import numpy as np
import math
from math import ceil
from collections import defaultdict

import pulp  
from sklearn.neighbors import BallTree  


DIST_MIN = 0.06  
R_MI = 3959.0              
RAD_EPS = DIST_MIN / R_MI  

In [46]:
df_pop = pd.read_csv('/Users/fengxiaopei/Desktop/population.csv')
df_inc = pd.read_csv('/Users/fengxiaopei/Desktop/avg_individual_income.csv')
df_emp = pd.read_csv('/Users/fengxiaopei/Desktop/employment_rate.csv')
df_fac = pd.read_csv('/Users/fengxiaopei/Desktop/child_care_regulated.csv')
df_pot = pd.read_csv('/Users/fengxiaopei/Desktop/potential_locations.csv')

df_inc = df_inc.rename(columns={'ZIP code':'zipcode', 'average income':'avg_income'})
df_emp = df_emp.rename(columns={'employment rate':'employment_rate'})
if 'zip_code' in df_fac.columns:
    df_fac = df_fac.rename(columns={'zip_code':'zipcode'})

for df in [df_pop, df_inc, df_emp, df_fac, df_pot]:
    df['zipcode'] = df['zipcode'].astype(str).str[:5]

data = df_pop[['zipcode','-5','5-9','10-14']].copy()
data = data.merge(df_inc[['zipcode','avg_income']], on='zipcode', how='left')
data = data.merge(df_emp[['zipcode','employment_rate']], on='zipcode', how='left')

data['avg_income'].fillna(data['avg_income'].median(), inplace=True)
data['employment_rate'].fillna(data['employment_rate'].median(), inplace=True)

data['children_0_5']   = data['-5']
data['children_5_9']   = data['5-9']
data['children_10_12'] = (3/5) * data['10-14']
data['children_5_12']  = data['children_5_9'] + data['children_10_12']
data['children_0_12']  = data['children_0_5'] + data['children_5_12']

data['high_demand'] = ((data['avg_income'] <= 60000) | (data['employment_rate'] >= 0.60)).astype(int)

data['required_total_slots'] = data.apply(
    lambda r: ceil(0.5 * r['children_0_12']) if r['high_demand'] else ceil(0.3 * r['children_0_12']),
    axis=1
)
data['required_0_5_slots'] = np.ceil((2/3) * data['children_0_5']).astype(int)

for c in ['total_capacity','infant_capacity','toddler_capacity','preschool_capacity']:
    if c not in df_fac.columns: df_fac[c] = 0
    df_fac[c] = df_fac[c].fillna(0)

df_fac['cap_0_5'] = df_fac['infant_capacity'] + df_fac['toddler_capacity'] + df_fac['preschool_capacity']

existing_summary = df_fac.groupby('zipcode').agg(
    existing_total_slots=('total_capacity','sum'),
    existing_0_5_slots =('cap_0_5','sum')
).reset_index()

data = data.merge(existing_summary, on='zipcode', how='left')
data[['existing_total_slots','existing_0_5_slots']] = data[['existing_total_slots','existing_0_5_slots']].fillna(0)


zipcodes = data['zipcode'].tolist()
zip_data = data.set_index('zipcode').T.to_dict()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['avg_income'].fillna(data['avg_income'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['employment_rate'].fillna(data['employment_rate'].median(), inplace=True)


In [47]:
if 'facility_id' not in df_fac.columns:
    df_fac = df_fac.reset_index().rename(columns={'index':'facility_id'})

for c in ['latitude','longitude']:
    if c not in df_fac.columns:
        df_fac[c] = np.nan

facilities = []
for _, r in df_fac.iterrows():
    nf = float(r.get('total_capacity',0) or 0)
    if nf <= 0: 
        continue
    facilities.append({
        'facility_id': int(r['facility_id']),
        'zipcode'    : r['zipcode'],
        'nf'         : nf,
        'n5'         : float(r.get('cap_0_5',0) or 0),
        'latitude'   : float(r.get('latitude', np.nan)),
        'longitude'  : float(r.get('longitude', np.nan)),
    })
fac_df = pd.DataFrame(facilities)


df_pot = df_pot[['zipcode','latitude','longitude']].copy()
df_pot['site_id'] = np.arange(len(df_pot))
pot_df = df_pot[['site_id','zipcode','latitude','longitude']]

In [48]:
sites_in_zip = pot_df.groupby('zipcode')['site_id'].apply(list).to_dict()
facs_in_zip  = fac_df.groupby('zipcode')['facility_id'].apply(list).to_dict()

too_close_new_pairs = []  
forbidden_site = set()

for z, S in sites_in_zip.items():
    if not S:
        continue

    
    sub_sites = pot_df.set_index('site_id').loc[S, ['latitude','longitude']].copy()

    
    mask_valid_sites = np.isfinite(sub_sites['latitude']) & np.isfinite(sub_sites['longitude'])
    invalid_ids = sub_sites.index[~mask_valid_sites].tolist()
    if invalid_ids:
        forbidden_site.update(invalid_ids)

    
    sub_sites = sub_sites[mask_valid_sites]
    if sub_sites.empty:
        continue

    site_ids = sub_sites.index.to_numpy(dtype=int)
    site_coords_rad = np.radians(sub_sites[['latitude','longitude']].to_numpy())

    
    tree_sites = BallTree(site_coords_rad, metric='haversine')
    neigh_idx = tree_sites.query_radius(site_coords_rad, r=RAD_EPS, return_distance=False)

    
    for a, js in enumerate(neigh_idx):
        i = site_ids[a]
        for b in js:
            if b <= a:
                continue
            j = site_ids[b]
            too_close_new_pairs.append((i, j))

    
    F = facs_in_zip.get(z, [])
    if F:
        sub_facs = fac_df.set_index('facility_id').loc[F, ['latitude','longitude']].copy()
        mask_valid_facs = np.isfinite(sub_facs['latitude']) & np.isfinite(sub_facs['longitude'])
        sub_facs = sub_facs[mask_valid_facs]
        if not sub_facs.empty:
            fac_coords_rad = np.radians(sub_facs[['latitude','longitude']].to_numpy())
            tree_facs = BallTree(fac_coords_rad, metric='haversine')
            hits = tree_facs.query_radius(site_coords_rad, r=RAD_EPS, return_distance=False)
            for a, js in enumerate(hits):
                if len(js) > 0:
                    forbidden_site.add(int(site_ids[a]))

len(too_close_new_pairs), len(forbidden_site)

(81375, 4209)

In [49]:
prob = pulp.LpProblem("Q2_Realistic_Capacity_Expansion_and_Location", pulp.LpMinimize)

#Expansion segments (0–10%, 10–15%, 15–20%) per existing facility
exp1, exp2, exp3 = {}, {}, {}
for _, r in fac_df.iterrows():
    fid = int(r['facility_id']); nf = float(r['nf'])
    exp1[fid] = pulp.LpVariable(f"exp1_{fid}", lowBound=0, upBound=0.10*nf)
    exp2[fid] = pulp.LpVariable(f"exp2_{fid}", lowBound=0, upBound=0.05*nf)  # 10–15%
    exp3[fid] = pulp.LpVariable(f"exp3_{fid}", lowBound=0, upBound=0.05*nf)  # 15–20%

#New facility build decisions
yS, yM, yL = {}, {}, {}
for _, r in pot_df.iterrows():
    i = int(r['site_id'])
    yS[i] = pulp.LpVariable(f"yS_{i}", cat='Binary')
    yM[i] = pulp.LpVariable(f"yM_{i}", cat='Binary')
    yL[i] = pulp.LpVariable(f"yL_{i}", cat='Binary')

def y_total(i):  
    return yS[i] + yM[i] + yL[i]

#Costs
expansion_cost = 0
for _, r in fac_df.iterrows():
    fid, nf = int(r['facility_id']), float(r['nf'])
    c1 = 20000.0/nf + 200.0
    c2 = 20000.0/nf + 400.0
    c3 = 20000.0/nf + 1000.0
    expansion_cost += c1*exp1[fid] + c2*exp2[fid] + c3*exp3[fid]

# New builds
# S/M/L
build_cost = (
    pulp.lpSum((65000 +  5000) * yS[i] for i in yS) +
    pulp.lpSum((95000 + 10000) * yM[i] for i in yM) +
    pulp.lpSum((115000 + 20000) * yL[i] for i in yL)
)

prob += expansion_cost + build_cost

In [50]:
# capacity contribution by new sites
def new_total_slots(i): return 100*yS[i] + 200*yM[i] + 400*yL[i]
def new_05_slots(i):    return  50*yS[i] + 100*yM[i] + 200*yL[i]

for i in yS.keys():
    prob += y_total(i) <= 1, f"OneSizePerSite_{i}"

for z in zipcodes:
    exist_total = float(zip_data[z]['existing_total_slots'])
    exist_05    = float(zip_data[z]['existing_0_5_slots'])
    req_total   = int(zip_data[z]['required_total_slots'])
    req_05      = int(zip_data[z]['required_0_5_slots'])

    Fz = fac_df.loc[fac_df['zipcode']==z, 'facility_id'].tolist()
    exp_total_z = pulp.lpSum( (exp1[f] + exp2[f] + exp3[f]) for f in Fz )
    Sz = sites_in_zip.get(z, [])
    new_total_z = pulp.lpSum( new_total_slots(i) for i in Sz )
    new_05_z    = pulp.lpSum( new_05_slots(i)   for i in Sz )

    
    prob += exist_total + exp_total_z + new_total_z >= req_total, f"ReqTotal_{z}"
    prob += exist_05 + new_05_z >= req_05, f"Req05_{z}"

for _, r in fac_df.iterrows():
    fid, nf = int(r['facility_id']), float(r['nf'])
    prob += exp1[fid] + exp2[fid] + exp3[fid] <= 0.20*nf, f"Cap20pct_{fid}"


for (i, j) in too_close_new_pairs:
    prob += y_total(i) + y_total(j) <= 1, f"FacilitySpacing_pair_{i}_{j}"

for i in forbidden_site:
    prob += y_total(i) == 0, f"FacilitySpacing_forbid_{i}"

In [51]:
print("Solving Q2 model (CBC default solver)...")
status = prob.solve(pulp.PULP_CBC_CMD(msg=0))
print("Status:", pulp.LpStatus[status])
print("Min Cost: ${:,.0f}".format(pulp.value(prob.objective)))


sel_sites = []
for _, r in pot_df.iterrows():
    i = int(r['site_id'])
    choice = ('S' if pulp.value(yS[i])>0.5 else
              'M' if pulp.value(yM[i])>0.5 else
              'L' if pulp.value(yL[i])>0.5 else '')
    if choice:
        sel_sites.append({
            'site_id': i,
            'zipcode': r['zipcode'],
            'size'   : choice,
            'latitude': r['latitude'],
            'longitude': r['longitude'],
            'total_slots': 100 if choice=='S' else 200 if choice=='M' else 400,
            'slots_0_5'  :  50 if choice=='S' else 100 if choice=='M' else 200
        })
out_new = pd.DataFrame(sel_sites)

# Expansions
exp_list = []
for _, r in fac_df.iterrows():
    fid = int(r['facility_id'])
    v1 = pulp.value(exp1[fid]) or 0
    v2 = pulp.value(exp2[fid]) or 0
    v3 = pulp.value(exp3[fid]) or 0
    if v1+v2+v3 > 1e-6:
        exp_list.append({
            'facility_id': fid,
            'zipcode'    : r['zipcode'],
            'nf'         : r['nf'],
            'exp_0_10pct': v1,
            'exp_10_15'  : v2,
            'exp_15_20'  : v3,
            'exp_total'  : v1+v2+v3
        })
out_exp = pd.DataFrame(exp_list)

print("\n=== Selected New Sites ===")
if not out_new.empty:
    display(out_new.sort_values(['zipcode','site_id']).reset_index(drop=True))
else:
    print("(none)")

print("\n=== Expansions at Existing Facilities ===")
if not out_exp.empty:
    display(out_exp.sort_values(['zipcode','facility_id']).reset_index(drop=True))
else:
    print("(none)")

# Quick spacing audit (sanity)
sel = {i for i in yS if (pulp.value(yS[i])>0.5 or pulp.value(yM[i])>0.5 or pulp.value(yL[i])>0.5)}
viol_pairs = [(i,j) for (i,j) in too_close_new_pairs if i in sel and j in sel]
viol_forbid = [i for i in forbidden_site if i in sel]
print("\nSpacing audit:")
print("Too-close selected pairs (<0.06 mi):", viol_pairs)
print("Forbidden sites selected:", viol_forbid)

# Demand feasibility audit
def ytot(i):
    return (100*(pulp.value(yS[i])>0.5) + 200*(pulp.value(yM[i])>0.5) + 400*(pulp.value(yL[i])>0.5))
def y05(i):
    return ( 50*(pulp.value(yS[i])>0.5) + 100*(pulp.value(yM[i])>0.5) + 200*(pulp.value(yL[i])>0.5))

ok = True
for z in zipcodes:
    exist_total = float(zip_data[z]['existing_total_slots'])
    exist_05    = float(zip_data[z]['existing_0_5_slots'])
    req_total   = int(zip_data[z]['required_total_slots'])
    req_05      = int(zip_data[z]['required_0_5_slots'])

    Fz = fac_df.loc[fac_df['zipcode']==z, 'facility_id'].tolist()
    exp_total_z = sum((pulp.value(exp1[f]) or 0)+(pulp.value(exp2[f]) or 0)+(pulp.value(exp3[f]) or 0) for f in Fz)
    new_total_z = sum(ytot(i) for i in sites_in_zip.get(z,[]))
    new_05_z    = sum(y05(i)  for i in sites_in_zip.get(z,[]))

    if exist_total + exp_total_z + new_total_z < req_total or exist_05 + new_05_z < req_05:
        print(f"[Demand not met] {z}")
        ok = False

print("Demand check:", "OK" if ok else "Fail")

Solving Q2 model (CBC default solver)...
Status: Optimal
Min Cost: $465,377,896

=== Selected New Sites ===


Unnamed: 0,site_id,zipcode,size,latitude,longitude,total_slots,slots_0_5
0,306,10001,M,40.757810,-74.004031,200,100
1,366,10001,L,40.745698,-74.001482,400,200
2,381,10001,L,40.747292,-73.999978,400,200
3,408,10002,L,40.708861,-73.989639,400,200
4,413,10002,L,40.709508,-73.980533,400,200
...,...,...,...,...,...,...,...
3835,205523,14774,S,42.091144,-78.150359,100,50
3836,206786,14788,S,42.064797,-78.381663,100,50
3837,207281,14805,S,42.359098,-76.731266,100,50
3838,207307,14806,M,42.154522,-77.789252,200,100



=== Expansions at Existing Facilities ===


Unnamed: 0,facility_id,zipcode,nf,exp_0_10pct,exp_10_15,exp_15_20,exp_total
0,232415,10580,144.0,14.4,0.0,0.0,14.4
1,874157,10580,102.0,4.6,0.0,0.0,4.6
2,40584,10583,239.0,23.9,0.0,0.0,23.9
3,43561,10583,190.0,19.0,0.0,0.0,19.0
4,43657,10583,157.0,15.7,0.0,0.0,15.7
...,...,...,...,...,...,...,...
75,318513,14304,72.0,3.1,0.0,0.0,3.1
76,598901,14304,112.0,11.2,0.0,0.0,11.2
77,503506,14510,234.0,23.4,11.7,0.0,35.1
78,904282,14510,56.0,5.6,1.3,0.0,6.9



Spacing audit:
Too-close selected pairs (<0.06 mi): []
Forbidden sites selected: []
Demand check: OK


In [53]:
# Final Summary Output (for report)
print("\n===== FINAL SUMMARY (for Part 2) =====")

# Optimal cost
opt_cost = pulp.value(prob.objective)
print(f"Total minimum funding required: ${opt_cost:,.0f}")

# New facilities
print(f"\nNew facilities built: {len(out_new)}")
print(f"  - Small (S): {(out_new['size']=='S').sum()}")
print(f"  - Medium (M): {(out_new['size']=='M').sum()}")
print(f"  - Large (L): {(out_new['size']=='L').sum()}")
print(f"  → Total new slots: {out_new['total_slots'].sum()} "
      f"(0–5 year old slots: {out_new['slots_0_5'].sum()})")

# Expansions
print(f"\nExisting facilities expanded: {len(out_exp)}")
print(f"  → Total expansion slots added: {out_exp['exp_total'].sum():.1f}")


===== FINAL SUMMARY (for Part 2) =====
Total minimum funding required: $465,377,896

New facilities built: 3840
  - Small (S): 657
  - Medium (M): 357
  - Large (L): 2826
  → Total new slots: 1267500 (0–5 year old slots: 633750)

Existing facilities expanded: 80
  → Total expansion slots added: 943.0
