In [1]:
import numpy as np
import pandas as pd
import sys
import os
import duckdb
sys.path.append('..')
from helper import *
setwd()

Objects = Objects()

In [3]:
select_features = {"housing":"*","schools":"X, Y, OBJECTID, POPULATION, LEVEL_","hospitals":"X, Y, objectid, type, beds, owner",}

con = duckdb.connect(database = "./data/exploitation/exploitation.db", read_only=False)
for datasource in Objects:
    id = datasource['id']
    con_trust = duckdb.connect(database = f"data/trusted/db_{id}.db")
    df = con_trust.execute("SELECT {} FROM {};".format(select_features[id],id)).fetchdf()
    con_trust.close()
    con.execute("CREATE OR REPLACE TABLE {} AS SELECT * FROM df".format(id))

con.execute("SHOW TABLES").fetchall()

[('hospitals',), ('housing',), ('schools',)]

In [9]:
con.execute("SELECT * FROM hospitals;").fetchdf().head()

Unnamed: 0,X,Y,OBJECTID,TYPE,BEDS,OWNER
0,-94.945477,29.74762,8497,GENERAL ACUTE CARE,182.0,NON-PROFIT
1,-101.294085,48.232226,8504,GENERAL ACUTE CARE,251.0,NON-PROFIT
2,-80.231076,36.418068,8505,GENERAL ACUTE CARE,93.0,PROPRIETARY
3,-95.440687,30.014288,8506,GENERAL ACUTE CARE,16.0,PROPRIETARY
4,-98.22573,26.186992,8507,GENERAL ACUTE CARE,441.0,PROPRIETARY


In [3]:
!rm ./temp/*

In [8]:
con.execute("CREATE OR REPLACE TABLE pos_house AS SELECT lat, long FROM housing;")
con.execute("SELECT * FROM pos_house;").fetchdf().head()

Unnamed: 0,lat,long
0,36.9917,-86.423
1,36.9745,-86.4199
2,36.9745,-86.4199
3,36.9745,-86.4199
4,36.9745,-86.4199


In [71]:
con.execute("""SELECT lat, long, COUNT(*) FROM pos_house GROUP BY lat, long""").fetchdf().head()

Unnamed: 0,lat,long,count_star()
0,29.3001,-94.7998,20
1,29.3622,-95.0201,1
2,29.5298,-95.0334,157
3,29.374,-94.9598,1
4,29.2837,-94.8214,1


In [43]:
%%time
con.execute(f"""
create or replace table pos_house_hospital as (
with H as (SELECT * FROM pos_house),
S as (SELECT * from hospitals)
SELECT
    lat, long,
    count(S.objectid) as num_hospitals,
    sum(S.beds) as num_beds,
    sum(if(S.type='GENERAL ACUTE CARE', 1, 0)) as hospital_type_general,
    sum(if(S.type='CRITICAL ACCESS', 1, 0)) as hospital_type_critical,
    sum(if(S.type='PSYCHIATRIC', 1, 0)) as hospital_type_psychiatric,
    sum(if(S.type='LONG TERM CARE', 1, 0)) as hospital_type_longterm,
    sum(if(S.type='CHILDREN', 1, 0)) as hospital_type_children,
    sum(if(S.owner in('GOVERNMENT - DISTRICT/AUTHORITY', 'GOVERNMENT - FEDERAL',
        'GOVERNMENT - LOCAL', 'GOVERNMENT - STATE'), 1, 0)) as government_hospital,
    sum(if(S.owner = 'NON-PROFIT', 1, 0)) as nonprofit_hospital,
    sum(if(S.owner='PROPRIETARY', 1, 0)) as private_hospital,
    
FROM H left join S on (
    (H.long - S.X)*(H.long - S.X) + (H.lat - S.Y)*(H.lat - S.Y) < 0.01
)
GROUP BY lat, long
)
""")
con.execute("SELECT * FROM pos_house_hospital;").fetchdf()

CPU times: user 2min 19s, sys: 79.4 ms, total: 2min 19s
Wall time: 36.1 s


Unnamed: 0,lat,long,num_hospitals,num_beds,hospital_type_general,hospital_type_critical,hospital_type_psychiatric,hospital_type_longterm,hospital_type_children,government_hospital,nonprofit_hospital,private_hospital
0,34.5856,-98.4398,85,9146.0,34.0,17.0,17.0,0.0,0.0,68.0,0.0,17.0
1,34.6333,-98.4397,230,24748.0,92.0,46.0,46.0,0.0,0.0,184.0,0.0,46.0
2,34.6234,-98.4618,285,30666.0,114.0,57.0,57.0,0.0,0.0,228.0,0.0,57.0
3,41.5605,-83.5359,6,860.0,3.0,0.0,1.0,1.0,0.0,2.0,2.0,2.0
4,41.6312,-83.6612,11,2288.0,5.0,0.0,2.0,2.0,0.0,2.0,5.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...
79711,34.7602,-92.2120,5,657.0,2.0,0.0,0.0,0.0,1.0,1.0,2.0,1.0
79712,36.3606,-94.2684,2,128.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
79713,35.3605,-94.3754,8,1153.0,5.0,0.0,1.0,1.0,0.0,0.0,3.0,4.0
79714,33.4962,-94.0897,7,903.0,2.0,0.0,1.0,2.0,0.0,0.0,2.0,4.0


In [17]:
df_schools = con.execute("SELECT * FROM schools;").fetchdf()
df_schools["LEVEL_"].replace("N","unknown", inplace=True)
df_schools["LEVEL_"].replace("1","preschool", inplace=True)
df_schools["LEVEL_"].replace("2","elementary_school", inplace=True)
df_schools["LEVEL_"].replace("3","middle_school", inplace=True)
df_schools["LEVEL_"].replace("4","high_school", inplace=True)
con.execute("CREATE OR REPLACE TABLE schools AS SELECT * FROM df_schools;")
con.execute("SELECT * FROM schools;").fetchdf().head()

Unnamed: 0,X,Y,OBJECTID,POPULATION,LEVEL_
0,-94.754595,32.58544,10588,583.0,elementary_school
1,-99.78315,29.224599,10589,411.0,high_school
2,-78.489226,37.998854,10591,1186.0,middle_school
3,-88.554699,44.031136,10592,336.0,preschool
4,-105.518183,44.270256,10593,1082.0,elementary_school


In [44]:
#%%time
con.execute("""
create or replace table pos_house_schools as (
with H as (SELECT * FROM pos_house),
S as (SELECT * from schools)
SELECT
    lat, long,
    count(S.OBJECTID) as num_schools,
    sum(S.POPULATION) as num_students,
    sum(if(S.LEVEL_='preschool', 1, 0)) as preschool,
    sum(if(S.LEVEL_='elementary_school', 1, 0)) as elementary_school,
    sum(if(S.LEVEL_= 'middle_school', 1, 0)) as middle_school,
    sum(if(S.LEVEL_= 'high_school', 1, 0)) as high_school
    
FROM H left join S on (
    (H.long - S.X)*(H.long - S.X) + (H.lat - S.Y)*(H.lat - S.Y) < 0.01
    )
GROUP BY lat, long
)
""")
con.execute("SELECT * FROM pos_house_schools;").fetchdf().head()

Unnamed: 0,lat,long,num_schools,num_students,preschool,elementary_school,middle_school,high_school
0,41.6312,-83.6612,78,45640.0,50.0,6.0,10.0,8.0
1,41.6608,-83.6745,75,45685.0,46.0,6.0,11.0,8.0
2,41.6257,-83.5788,112,43509.0,59.0,5.0,20.0,19.0
3,35.4056,-97.4962,99,62999.0,63.0,17.0,14.0,1.0
4,35.4661,-97.403,495,248562.0,261.0,117.0,90.0,0.0


In [59]:
#%%time 
con.execute("""
create or replace table houses as (
with H as (SELECT * FROM housing),
T1 as (SELECT * from pos_house_hospital),
T2 as (SELECT * from pos_house_schools)
SELECT *
FROM H, T1, T2
WHERE H.long = T1.long and H.lat == T1.lat
      and T1.long == T2.long and T1.lat == T2.lat
)
""")
con.execute("SELECT * FROM houses;").fetchdf().head()
#H.*, T1.* EXCLUDE (lat, long), T2.* EXCLUDE (lat, long)

Unnamed: 0,id,url,region,region_url,price,type,sqfeet,beds,baths,cats_allowed,...,hospital_type_children,government_hospital,nonprofit_hospital,private_hospital,num_schools,num_students,preschool,elementary_school,middle_school,high_school
0,7028979182,https://bgky.craigslist.org/apa/d/bowling-gree...,bowling green,https://bgky.craigslist.org,679,apartment,660,1,1.0,1,...,0.0,0.0,4.0,12.0,132,58828.0,52.0,20.0,32.0,12.0
1,7025677802,https://bgky.craigslist.org/apa/d/bowling-gree...,bowling green,https://bgky.craigslist.org,779,apartment,900,2,1.0,1,...,0.0,0.0,2.0,6.0,66,29414.0,26.0,10.0,16.0,6.0
2,7046214032,https://batonrouge.craigslist.org/apa/d/baton-...,baton rouge,https://batonrouge.craigslist.org,595,apartment,700,1,1.0,0,...,0.0,0.0,33.0,66.0,726,322311.0,374.0,121.0,121.0,66.0
3,7051131571,https://lexington.craigslist.org/apa/d/lexingt...,eastern kentucky,https://eastky.craigslist.org,679,apartment,662,1,1.0,1,...,128.0,384.0,512.0,128.0,7296,3905024.0,3968.0,1024.0,896.0,640.0
4,7051134389,https://lexington.craigslist.org/apa/d/lexingt...,eastern kentucky,https://eastky.craigslist.org,809,apartment,900,2,2.0,1,...,80.0,400.0,400.0,80.0,5600,3267840.0,2960.0,880.0,720.0,400.0


In [46]:
con.execute("SELECT * FROM houses;").fetchdf()

Unnamed: 0,id,url,region,region_url,price,type,sqfeet,beds,baths,cats_allowed,...,nonprofit_hospital,private_hospital,lat:2,long:2,num_schools,num_students,preschool,elementary_school,middle_school,high_school
0,7028979182,https://bgky.craigslist.org/apa/d/bowling-gree...,bowling green,https://bgky.craigslist.org,679,apartment,660,1,1.0,1,...,4.0,12.0,36.9916,-86.4233,132,58828.0,52.0,20.0,32.0,12.0
1,7025677802,https://bgky.craigslist.org/apa/d/bowling-gree...,bowling green,https://bgky.craigslist.org,779,apartment,900,2,1.0,1,...,2.0,6.0,36.9913,-86.4232,66,29414.0,26.0,10.0,16.0,6.0
2,7046214032,https://batonrouge.craigslist.org/apa/d/baton-...,baton rouge,https://batonrouge.craigslist.org,595,apartment,700,1,1.0,0,...,33.0,66.0,30.4193,-91.1882,726,322311.0,374.0,121.0,121.0,66.0
3,7051131571,https://lexington.craigslist.org/apa/d/lexingt...,eastern kentucky,https://eastky.craigslist.org,679,apartment,662,1,1.0,1,...,512.0,128.0,37.9651,-84.4708,7296,3905024.0,3968.0,1024.0,896.0,640.0
4,7051134389,https://lexington.craigslist.org/apa/d/lexingt...,eastern kentucky,https://eastky.craigslist.org,809,apartment,900,2,2.0,1,...,400.0,80.0,37.9877,-84.4989,5600,3267840.0,2960.0,880.0,720.0,400.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
383054,7024388305,https://siouxfalls.craigslist.org/apa/d/sioux-...,sioux falls / SE SD,https://siouxfalls.craigslist.org,900,townhouse,1040,2,2.0,1,...,165.0,132.0,43.5026,-96.7659,2079,928950.0,1122.0,297.0,231.0,330.0
383055,7023734591,https://siouxfalls.craigslist.org/apa/d/sioux-...,sioux falls / SE SD,https://siouxfalls.craigslist.org,895,apartment,1050,2,2.0,1,...,165.0,132.0,43.5026,-96.7659,2079,928950.0,1122.0,297.0,231.0,330.0
383056,7025177130,https://siouxfalls.craigslist.org/apa/d/sioux-...,sioux falls / SE SD,https://siouxfalls.craigslist.org,790,apartment,840,2,1.0,1,...,12.0,12.0,43.5553,-96.6914,188,72428.0,100.0,16.0,20.0,40.0
383057,7028197152,https://hiltonhead.craigslist.org/apa/d/blufft...,hilton head,https://hiltonhead.craigslist.org,1169,apartment,561,1,1.0,1,...,0.0,0.0,32.2705,-80.9177,8,6699.0,5.0,1.0,2.0,0.0


In [60]:
df = con.execute("SELECT * FROM houses;").fetchdf()
columns_names = list(df.columns)
print(columns_names)

['id', 'url', 'region', 'region_url', 'price', 'type', 'sqfeet', 'beds', 'baths', 'cats_allowed', 'dogs_allowed', 'smoking_allowed', 'wheelchair_access', 'electric_vehicle_charge', 'comes_furnished', 'laundry_options', 'parking_options', 'image_url', 'description', 'lat', 'long', 'state', 'num_hospitals', 'num_beds', 'hospital_type_general', 'hospital_type_critical', 'hospital_type_psychiatric', 'hospital_type_longterm', 'hospital_type_children', 'government_hospital', 'nonprofit_hospital', 'private_hospital', 'num_schools', 'num_students', 'preschool', 'elementary_school', 'middle_school', 'high_school']
