In [21]:
import numpy as np
import pandas as pd
import sys
import os
import duckdb
sys.path.append('..')
from helper import *
setwd()

Objects = Objects()

In [13]:
con.execute("SHOW TABLES").fetchall()

[]

In [23]:
con.close()

In [24]:
con = duckdb.connect(database = "./data/exploitation/exploitation.db", read_only=False)

for datasource in Objects:
    id = datasource['id']
    con_trust = duckdb.connect(database = f"data/trusted/db_{id}.db")
    con_trust.execute("EXPORT DATABASE './temp'")
    con_trust.close()
    con.execute(f"IMPORT DATABASE './temp'")

con.execute("SHOW TABLES").fetchall()

[('hospitals',),
 ('hospitals_profiling',),
 ('housing',),
 ('housing_profiling',),
 ('schools',),
 ('schools_profiling',)]

In [25]:
!rm ./temp/*

In [26]:
# lat long
con.execute("CREATE OR REPLACE TABLE pos_house AS SELECT lat, long FROM housing;")
con.execute("SELECT * FROM pos_house;").fetchdf().head

<bound method NDFrame.head of             lat      long
0       29.5920  -98.4700
1       29.5042  -98.5697
2       29.3595  -98.4499
3       29.4682  -98.5298
4       29.3466  -98.5074
...         ...       ...
384972  35.1898 -101.9340
384973  34.9829 -101.9020
384974  35.1653 -101.8840
384975  35.1653 -101.8840
384976  35.1653 -101.8840

[384977 rows x 2 columns]>

In [27]:
con.execute("""SELECT lat, long, COUNT(*) FROM pos_house GROUP BY lat, long""").fetchdf().head()

Unnamed: 0,lat,long,count_star()
0,38.8518,-94.3944,38
1,39.0754,-94.5738,1
2,38.9583,-94.6332,5
3,39.2168,-94.568,1
4,39.1024,-94.5986,8


In [32]:
con.execute("select pi()").fetchall()

[(3.141592653589793,)]

**distances filter:**

for each place (either house, school, hospital), we have its latitude and longitude (in degrees)

when the points are close enough, we can compute the distances using the formula of the distance on a plane, d^2 = dx^2 + dy^2

in this case, y = R·latitude and x = R·longitude·cos(latitude), where R is the Earth radius and latitude and longitude are in radians

hence, we define a new variable long_cos_lat = long(deg)·cos(latitude). Therefore, we can query the distances using the formula: 

d^2 = R^2 · pi^2/180^2 · ( ( long_cos_lat_1 - long_cos_lat_2 )^2 + (lat_1 - lat_2)^2 )

In [41]:
distance = 100 # distance in km
distance_transformed = str(( distance / (6371*np.pi/180) )**2 )
distance_transformed

'0.8087793508722445'

In [50]:
%%time
con.execute(f"""
create or replace table pos_house_hospital as (
with H as (SELECT *, long*cos(lat*pi()/180) as long_cos_lat FROM pos_house limit 10000),
S as (SELECT X, Y, X*cos(Y*pi()/180) as X_cos_Y, objectid, type, beds, owner from hospitals)
SELECT
    lat, long,
    count(S.objectid) as num_hospitals,
    sum(S.beds) as num_beds,
    sum(if(S.type='GENERAL ACUTE CARE', 1, 0)) as hospital_type_general,
    sum(if(S.type='CRITICAL ACCESS', 1, 0)) as hospital_type_critical,
    sum(if(S.type='PSYCHIATRIC', 1, 0)) as hospital_type_psychiatric,
    sum(if(S.type='LONG TERM CARE', 1, 0)) as hospital_type_longterm,
    sum(if(S.type='CHILDREN', 1, 0)) as hospital_type_children,
    sum(if(S.owner in('GOVERNMENT - DISTRICT/AUTHORITY', 'GOVERNMENT - FEDERAL',
        'GOVERNMENT - LOCAL', 'GOVERNMENT - STATE'), 1, 0)) as government_hospital,
    sum(if(S.owner = 'NON-PROFIT', 1, 0)) as nonprofit_hospital,
    sum(if(S.owner='PROPRIETARY', 1, 0)) as private_hospital,
    
FROM H left join S on (
    (H.long_cos_lat - S.X_cos_Y)*(H.long_cos_lat - S.X_cos_Y) + (H.lat - S.Y)*(H.lat - S.Y) < {distance_transformed}
)
GROUP BY lat, long
)
""").fetchall()

CPU times: total: 3.83 s
Wall time: 4.01 s


[(3856,)]

In [53]:
con.execute("SELECT * FROM pos_house_hospital;").fetchdf()

Unnamed: 0,lat,long,num_hospitals,num_beds,hospital_type_general,hospital_type_critical,hospital_type_psychiatric,hospital_type_longterm,hospital_type_children,government_hospital,nonprofit_hospital,private_hospital
0,45.6721,-122.5470,3,1080.0,3.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0
1,45.5408,-122.5780,6,458.0,5.0,0.0,0.0,1.0,0.0,1.0,4.0,1.0
2,45.6359,-122.5160,22,7920.0,22.0,0.0,0.0,0.0,0.0,0.0,22.0,0.0
3,45.5415,-122.5760,12,916.0,10.0,0.0,0.0,2.0,0.0,2.0,8.0,2.0
4,45.5556,-122.5560,4,1050.0,3.0,0.0,0.0,1.0,0.0,0.0,3.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
79685,36.0480,-95.7835,6,442.0,3.0,0.0,0.0,0.0,0.0,0.0,2.0,3.0
79686,46.7327,-120.7140,0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
79687,37.4580,-77.4700,4,473.0,3.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0
79688,36.2809,-95.8639,2,109.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [48]:
df_schools = con.execute("SELECT * FROM schools;").fetchdf()
df_schools["LEVEL_"].replace("N","unknown", inplace=True)
df_schools["LEVEL_"].replace("1","preschool", inplace=True)
df_schools["LEVEL_"].replace("2","elementary_school", inplace=True)
df_schools["LEVEL_"].replace("3","middle_school", inplace=True)
df_schools["LEVEL_"].replace("4","high_school", inplace=True)
con.execute("CREATE OR REPLACE TABLE schools AS SELECT * FROM df_schools;")

<duckdb.DuckDBPyConnection at 0x1c3633de470>

In [49]:
con.execute("SELECT * FROM schools;").fetchdf().head()

Unnamed: 0,X,Y,OBJECTID,NCESID,NAME,ADDRESS,CITY,STATE,ZIP,ZIP4,...,VAL_METHOD,VAL_DATE,WEBSITE,LEVEL_,ENROLLMENT,ST_GRADE,END_GRADE,DISTRICTID,FT_TEACHER,SHELTER_ID
0,-74.062989,40.898942,19537,340000700081,MAYWOOD CAMPUS,404 MAYWOOD AVENUE,MAYWOOD,NJ,7607,unknow,...,IMAGERY/OTHER,2016-08-05,http://nces.ed.gov/GLOBALLOCATOR/sch_info_popu...,preschool,152,PK,5,3400007,26,unknow
1,-97.073579,36.710031,19543,402469002367,ROOSEVELT ES,815 EAST HIGHLAND AVENUE,PONCA CITY,OK,74601,4601,...,IMAGERY/OTHER,2016-08-31,http://nces.ed.gov/GLOBALLOCATOR/sch_info_popu...,preschool,441,PK,5,4024690,22,unknow
2,-79.754242,40.493811,19544,421935007298,PIVIK EL SCH,151 SCHOOL RD,PITTSBURGH,PA,15239,unknow,...,IMAGERY/OTHER,2016-09-02,http://nces.ed.gov/GLOBALLOCATOR/sch_info_popu...,preschool,582,KG,6,4219350,33,unknow
3,-89.04486,36.418973,19545,470426001730,UNION CITY MIDDLE SCHOOL,1111 HIGH SCHOOL DR,UNION CITY,TN,38261,unknow,...,IMAGERY,2016-09-19,http://nces.ed.gov/GLOBALLOCATOR/sch_info_popu...,elementary_school,353,06,8,4704260,21,unknow
4,-95.521884,29.724352,19546,482364002556,PINEY POINT EL,8921 PAGEWOOD LN,HOUSTON,TX,77063,5543,...,IMAGERY/OTHER,2016-09-30,http://nces.ed.gov/GLOBALLOCATOR/sch_info_popu...,preschool,1244,PK,5,4823640,65,unknow


In [53]:
%%time
con.execute(f"""
create or replace table pos_house_schools as (
with H as (SELECT *, long*cos(lat*pi()/180) as long_cos_lat FROM pos_house limit 10000),
S as (SELECT X, Y, X*cos(Y*pi()/180) as X_cos_Y, OBJECTID, POPULATION, LEVEL_ from schools)
SELECT
    lat, long,
    count(S.OBJECTID) as num_schools,
    sum(S.POPULATION) as num_students,
    sum(if(S.LEVEL_='preschool', 1, 0)) as preschool,
    sum(if(S.LEVEL_='elementary_school', 1, 0)) as elementary_school,
    sum(if(S.LEVEL_= 'middle_school', 1, 0)) as middle_school,
    sum(if(S.LEVEL_= 'high_school', 1, 0)) as high_school
    
FROM H left join S on (
    (H.long_cos_lat - S.X_cos_Y)*(H.long_cos_lat - S.X_cos_Y) + (H.lat - S.Y)*(H.lat - S.Y) < {distance_transformed}
)
GROUP BY lat, long
)
""").fetchall()

CPU times: total: 1min 43s
Wall time: 53.9 s


[(3856,)]

In [57]:
con.execute("SELECT * FROM pos_house_schools;").fetchdf()

Unnamed: 0,lat,long,num_schools,num_students,preschool,elementary_school,middle_school,high_school
0,45.6721,-122.5470,207,130908.0,99.0,33.0,51.0,24.0
1,45.5313,-122.6450,96,49609.0,67.0,10.0,14.0,4.0
2,45.6537,-122.6670,134,68676.0,74.0,16.0,22.0,20.0
3,45.5396,-122.6600,92,48294.0,62.0,10.0,14.0,5.0
4,45.6914,-122.5800,1584,949014.0,748.0,198.0,396.0,242.0
...,...,...,...,...,...,...,...,...
79685,36.0480,-95.7835,45,31682.0,30.0,6.0,2.0,3.0
79686,46.7327,-120.7140,7,2402.0,3.0,2.0,2.0,0.0
79687,37.4580,-77.4700,57,30755.0,31.0,10.0,6.0,5.0
79688,36.2809,-95.8639,17,13002.0,11.0,3.0,2.0,1.0


In [54]:
%%time 
con.execute(f"""
create or replace table houses as (
with H as (SELECT * FROM housing),
T1 as (SELECT * from pos_house_hospital),
T2 as (SELECT * from pos_house_schools)
SELECT *
FROM H, T1, T2
WHERE H.long == T1.long and H.lat == T1.lat
      and T1.long == T2.long and T1.lat == T2.lat
)
""").fetchall()
#pos_house_schools
#T2 as (SELECT * from pos_house_schools)
#FROM H left join T1 on ( H.long == T1.long and H.lat == T1.lat)

CPU times: total: 6.5 s
Wall time: 8.37 s


[(56409,)]

In [62]:
df = con.execute("SELECT * FROM houses;").fetchdf()

In [60]:
columns_names = list(df.columns)
print(columns_names)

['id', 'url', 'region', 'region_url', 'price', 'type', 'sqfeet', 'beds', 'baths', 'cats_allowed', 'dogs_allowed', 'smoking_allowed', 'wheelchair_access', 'electric_vehicle_charge', 'comes_furnished', 'laundry_options', 'parking_options', 'image_url', 'description', 'lat', 'long', 'state', 'lat:1', 'long:1', 'num_hospitals', 'num_beds', 'hospital_type_general', 'hospital_type_critical', 'hospital_type_psychiatric', 'hospital_type_longterm', 'hospital_type_children', 'government_hospital', 'nonprofit_hospital', 'private_hospital', 'lat:2', 'long:2', 'num_schools', 'num_students', 'preschool', 'elementary_school', 'middle_school', 'high_school']


In [66]:
con.close()

In [64]:
con.execute("select count(*) from housing").fetchall()

[(384977,)]

In [65]:
con.execute("select count(*) from houses").fetchall()

[(382919,)]