In [1]:
import numpy as np
import pandas as pd
import sys
import os
import duckdb
sys.path.append('..')
from helper import *
setwd()

Objects = Objects()

In [2]:
con = duckdb.connect(database = "./data/exploitation/exploitation.db", read_only=False)

for datasource in Objects:
    id = datasource['id']
    con_trust = duckdb.connect(database = f"data/trusted/db_{id}.db")
    con_trust.execute("EXPORT DATABASE './temp'")
    con_trust.close()
    con.execute(f"IMPORT DATABASE './temp'")

con.execute("SHOW TABLES").fetchall()

[('hospitals',),
 ('hospitals_profiling',),
 ('housing',),
 ('housing_profiling',),
 ('schools',),
 ('schools_profiling',)]

In [3]:
!rm ./temp/*

In [73]:
# lat long
con.execute("CREATE OR REPLACE TABLE pos_house AS SELECT lat, long FROM housing;")
con.execute("SELECT * FROM pos_house;").fetchdf().head

<bound method NDFrame.head of             lat      long
0       28.8090  -96.9993
1       29.5613  -97.9628
2       29.8634  -98.0037
3       29.5613  -97.9628
4       29.8716  -97.9649
...         ...       ...
384972  34.9829 -101.9020
384973  35.0175 -101.4000
384974  35.2703 -101.9430
384975  34.9829 -101.9020
384976  34.9829 -101.9020

[384977 rows x 2 columns]>

In [71]:
con.execute("""SELECT lat, long, COUNT(*) FROM pos_house GROUP BY lat, long""").fetchdf().head()

Unnamed: 0,lat,long,count_star()
0,29.3001,-94.7998,20
1,29.3622,-95.0201,1
2,29.5298,-95.0334,157
3,29.374,-94.9598,1
4,29.2837,-94.8214,1


In [46]:
#%%time
con.execute(f"""
create or replace table pos_house_hospital as (
with H as (SELECT * FROM pos_house limit 1000),
S as (SELECT X, Y, objectid, type, beds, owner from hospitals)
SELECT
    lat, long,
    count(S.objectid) as num_hospitals,
    sum(S.beds) as num_beds,
    sum(if(S.type='GENERAL ACUTE CARE', 1, 0)) as hospital_type_general,
    sum(if(S.type='CRITICAL ACCESS', 1, 0)) as hospital_type_critical,
    sum(if(S.type='PSYCHIATRIC', 1, 0)) as hospital_type_psychiatric,
    sum(if(S.type='LONG TERM CARE', 1, 0)) as hospital_type_longterm,
    sum(if(S.type='CHILDREN', 1, 0)) as hospital_type_children,
    sum(if(S.owner in('GOVERNMENT - DISTRICT/AUTHORITY', 'GOVERNMENT - FEDERAL',
        'GOVERNMENT - LOCAL', 'GOVERNMENT - STATE'), 1, 0)) as government_hospital,
    sum(if(S.owner = 'NON-PROFIT', 1, 0)) as nonprofit_hospital,
    sum(if(S.owner='PROPRIETARY', 1, 0)) as private_hospital,
    
FROM H left join S on (
    (H.long - S.X)*(H.long - S.X) + (H.lat - S.Y)*(H.lat - S.Y) < 0.01
)
GROUP BY lat, long
)
""").fetchall()

[(322,)]

In [47]:
con.execute("SELECT * FROM pos_house_hospital;").fetchdf()

Unnamed: 0,lat,long,num_hospitals,num_beds,hospital_type_general,hospital_type_critical,hospital_type_psychiatric,hospital_type_longterm,hospital_type_children,government_hospital,nonprofit_hospital,private_hospital
0,33.5924,-101.9370,36,6132.0,16.0,0.0,0.0,4.0,4.0,4.0,12.0,20.0
1,33.5865,-101.8610,30,4689.0,12.0,0.0,3.0,3.0,3.0,3.0,12.0,15.0
2,33.4892,-101.8400,3,153.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0
3,33.5196,-101.8620,18,2978.0,8.0,0.0,2.0,2.0,2.0,2.0,8.0,8.0
4,33.5491,-101.7510,2,60.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
317,29.5795,-98.4558,9,1300.0,5.0,0.0,1.0,0.0,0.0,0.0,3.0,6.0
318,26.1972,-98.3499,4,1188.0,4.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
319,37.3646,-79.2057,3,738.0,2.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0
320,26.3189,-98.3701,0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
df_schools = con.execute("SELECT * FROM schools;").fetchdf()
df_schools["LEVEL_"].replace("N","unknown", inplace=True)
df_schools["LEVEL_"].replace("1","preschool", inplace=True)
df_schools["LEVEL_"].replace("2","elementary_school", inplace=True)
df_schools["LEVEL_"].replace("3","middle_school", inplace=True)
df_schools["LEVEL_"].replace("4","high_school", inplace=True)
con.execute("CREATE OR REPLACE TABLE schools AS SELECT * FROM df_schools;")

<duckdb.DuckDBPyConnection at 0x7fb1c0c44eb0>

In [43]:
con.execute("SELECT * FROM schools;").fetchdf().head()

Unnamed: 0,X,Y,OBJECTID,NCESID,NAME,ADDRESS,CITY,STATE,ZIP,ZIP4,...,VAL_METHOD,VAL_DATE,WEBSITE,LEVEL_,ENROLLMENT,ST_GRADE,END_GRADE,DISTRICTID,FT_TEACHER,SHELTER_ID
0,-84.20776,30.161179,43484,120195003673,WAKULLA COAST CHARTER SCHOOL OF ARTS SCIENCE &...,48 SHELL ISLAND ROAD,ST MARKS,FL,32355,0338,...,IMAGERY,2010-06-07,http://nces.ed.gov/GLOBALLOCATOR/sch_info_popu...,preschool,179,PK,8,1201950,17,unknow
1,-83.66469,32.850619,43486,130042003621,PRICE EDUCATIONAL CENTER,304 PIERCE AVE,MACON,GA,31204,unknow,...,GEOCODE,2010-08-23,http://nces.ed.gov/GLOBALLOCATOR/sch_info_popu...,elementary_school,16,04,7,1300420,1,unknow
2,-86.480974,40.544106,43488,180129002103,CARROLL ELEMENTARY,105 S 225 E,FLORA,IN,46929,9697,...,IMAGERY/OTHER,2010-08-03,http://nces.ed.gov/GLOBALLOCATOR/sch_info_popu...,preschool,631,PK,6,1801290,31,unknow
3,-95.633258,38.086786,43489,200867000179,LEROY ELEM,1013 N MAIN ST,LEROY,KS,66857,0188,...,IMAGERY,2010-09-02,http://nces.ed.gov/GLOBALLOCATOR/sch_info_popu...,preschool,45,PK,2,2008670,4,unknow
4,-82.606361,38.093999,43491,210324000872,LOUISA MIDDLE SCHOOL,9 BULLDOG LANE,LOUISA,KY,41230,unknow,...,IMAGERY,2010-08-31,http://nces.ed.gov/GLOBALLOCATOR/sch_info_popu...,elementary_school,425,06,8,2103240,22,10802173


In [44]:
#%%time
con.execute(f"""
create or replace table pos_house_schools as (
with H as (SELECT * FROM pos_house limit 1000),
S as (SELECT X, Y, OBJECTID, POPULATION, LEVEL_ from schools)
SELECT
    lat, long,
    count(S.OBJECTID) as num_schools,
    sum(S.POPULATION) as num_students,
    sum(if(S.LEVEL_='preschool', 1, 0)) as preschool,
    sum(if(S.LEVEL_='elementary_school', 1, 0)) as elementary_school,
    sum(if(S.LEVEL_= 'middle_school', 1, 0)) as middle_school,
    sum(if(S.LEVEL_= 'high_school', 1, 0)) as high_school
    
FROM H left join S on (
    (H.long - S.X)*(H.long - S.X) + (H.lat - S.Y)*(H.lat - S.Y) < 0.01
    )
GROUP BY lat, long
)
""").fetchall()

[(322,)]

In [45]:
con.execute("SELECT * FROM pos_house_schools;").fetchdf()

Unnamed: 0,lat,long,num_schools,num_students,preschool,elementary_school,middle_school,high_school
0,26.2154,-98.2359,1215,819576.0,639.0,180.0,234.0,108.0
1,26.3018,-98.1511,68,54061.0,41.0,10.0,8.0,8.0
2,26.1745,-98.2160,256,166066.0,130.0,38.0,48.0,24.0
3,26.1632,-98.1838,114,71322.0,59.0,16.0,21.0,10.0
4,26.2044,-98.1537,244,161264.0,134.0,34.0,42.0,20.0
...,...,...,...,...,...,...,...,...
317,37.3646,-79.2057,29,14948.0,15.0,4.0,4.0,0.0
318,29.5795,-98.4558,88,75644.0,51.0,17.0,14.0,5.0
319,31.9383,-102.3920,37,28415.0,24.0,5.0,4.0,2.0
320,29.6736,-98.1097,36,19284.0,17.0,4.0,6.0,3.0


In [67]:
%%time 
con.execute(f"""
create or replace table houses as (
with H as (SELECT * FROM housing limit 1000),
T1 as (SELECT * from pos_house_hospital),
T2 as (SELECT * from pos_house_schools)
SELECT *
FROM H, T1, T2
WHERE H.long == T1.long and H.lat == T1.lat
      and T1.long == T2.long and T1.lat == T2.lat
)
""").fetchall()
#pos_house_schools
#T2 as (SELECT * from pos_house_schools)
#FROM H left join T1 on ( H.long == T1.long and H.lat == T1.lat)

CPU times: user 70.7 ms, sys: 11.2 ms, total: 81.9 ms
Wall time: 832 ms


[(1000,)]

In [68]:
df = con.execute("SELECT * FROM houses;").fetchdf()

In [69]:
columns_names = list(df.columns)
print(columns_names)

['id', 'url', 'region', 'region_url', 'price', 'type', 'sqfeet', 'beds', 'baths', 'cats_allowed', 'dogs_allowed', 'smoking_allowed', 'wheelchair_access', 'electric_vehicle_charge', 'comes_furnished', 'laundry_options', 'parking_options', 'image_url', 'description', 'lat', 'long', 'state', 'lat:1', 'long:1', 'num_hospitals', 'num_beds', 'hospital_type_general', 'hospital_type_critical', 'hospital_type_psychiatric', 'hospital_type_longterm', 'hospital_type_children', 'government_hospital', 'nonprofit_hospital', 'private_hospital', 'lat:2', 'long:2', 'num_schools', 'num_students', 'preschool', 'elementary_school', 'middle_school', 'high_school']
