In [1]:
import numpy as np
import pandas as pd
import sys
import os
import duckdb
sys.path.append('..')
from helper import *
setwd()

Objects = Objects()

In [2]:
con = duckdb.connect(database = "./data/exploitation/exploitation.db", read_only=False)

In [47]:
con.close()

In [None]:
con = duckdb.connect(database = "./data/exploitation/exploitation.db", read_only=False)

for datasource in Objects:
    id = datasource['id']
    con_trust = duckdb.connect(database = f"data/trusted/db_{id}.db")
    con_trust.execute("EXPORT DATABASE './temp'")
    con_trust.close()
    con.execute(f"IMPORT DATABASE './temp'")

con.execute("SHOW TABLES").fetchall()

In [15]:
!rm ./temp/*

In [17]:
# lat long
con.execute("CREATE OR REPLACE TABLE pos_house AS SELECT lat, long FROM housing;")
con.execute("SELECT * FROM pos_house;").fetchdf().head

<bound method NDFrame.head of             lat     long
0       29.5920 -98.4700
1       29.5042 -98.5697
2       29.3595 -98.4499
3       29.4682 -98.5298
4       29.3466 -98.5074
...         ...      ...
384972  34.3879 -77.6628
384973  34.7700 -78.0221
384974  34.0859 -77.8976
384975  34.2325 -77.8708
384976  34.2325 -77.8708

[384977 rows x 2 columns]>

In [18]:
con.execute("""SELECT lat, long, COUNT(*) FROM pos_house GROUP BY lat, long""").fetchdf().head()

Unnamed: 0,lat,long,count_star()
0,35.4856,-97.6174,33
1,35.395,-97.4972,2
2,35.5102,-97.6106,79
3,35.4902,-97.6162,2
4,35.4897,-97.6183,11


In [19]:
con.execute("select pi()").fetchall()

[(3.141592653589793,)]

**distances filter:**

for each place (either house, school, hospital), we have its latitude and longitude (in degrees)

when the points are close enough, we can compute the distances using the formula of the distance on a plane, d^2 = dx^2 + dy^2

in this case, y = R·latitude and x = R·longitude·cos(latitude), where R is the Earth radius and latitude and longitude are in radians

hence, we define a new variable long_cos_lat = long(deg)·cos(latitude). Therefore, we can query the distances using the formula: 

d^2 = R^2 · pi^2/180^2 · ( ( long_cos_lat_1 - long_cos_lat_2 )^2 + (lat_1 - lat_2)^2 )

In [20]:
distance = 100 # distance in km
distance_transformed = str(( distance / (6371*np.pi/180) )**2 )
distance_transformed

'0.8087793508722445'

In [21]:
%%time
con.execute(f"""
create or replace table pos_house_hospital as (
with H as (SELECT *, long*cos(lat*pi()/180) as long_cos_lat FROM pos_house limit 100),
S as (SELECT X, Y, X*cos(Y*pi()/180) as X_cos_Y, objectid, type, beds, owner from hospitals)
SELECT
    lat, long,
    count(S.objectid) as num_hospitals,
    sum(S.beds) as num_beds,
    sum(if(S.type='GENERAL ACUTE CARE', 1, 0)) as hospital_type_general,
    sum(if(S.type='CRITICAL ACCESS', 1, 0)) as hospital_type_critical,
    sum(if(S.type='PSYCHIATRIC', 1, 0)) as hospital_type_psychiatric,
    sum(if(S.type='LONG TERM CARE', 1, 0)) as hospital_type_longterm,
    sum(if(S.type='CHILDREN', 1, 0)) as hospital_type_children,
    sum(if(S.owner in('GOVERNMENT - DISTRICT/AUTHORITY', 'GOVERNMENT - FEDERAL',
        'GOVERNMENT - LOCAL', 'GOVERNMENT - STATE'), 1, 0)) as government_hospital,
    sum(if(S.owner = 'NON-PROFIT', 1, 0)) as nonprofit_hospital,
    sum(if(S.owner='PROPRIETARY', 1, 0)) as private_hospital,
    
FROM H left join S on (
    (H.long_cos_lat - S.X_cos_Y)*(H.long_cos_lat - S.X_cos_Y) + (H.lat - S.Y)*(H.lat - S.Y) < {distance_transformed}
)
GROUP BY lat, long
)
""").fetchall()

CPU times: total: 93.8 ms
Wall time: 275 ms


[(44,)]

In [23]:
con.execute("SELECT * FROM pos_house_hospital;").fetchdf()

Unnamed: 0,lat,long,num_hospitals,num_beds,hospital_type_general,hospital_type_critical,hospital_type_psychiatric,hospital_type_longterm,hospital_type_children,government_hospital,nonprofit_hospital,private_hospital
0,29.592,-98.47,67,9494.0,38.0,2.0,7.0,6.0,1.0,13.0,14.0,40.0
1,29.5042,-98.5697,124,18398.0,72.0,2.0,12.0,10.0,2.0,24.0,24.0,76.0
2,29.3595,-98.4499,427,63791.0,245.0,7.0,42.0,35.0,7.0,84.0,77.0,266.0
3,29.4682,-98.5298,62,9199.0,36.0,1.0,6.0,5.0,1.0,12.0,12.0,38.0
4,29.3466,-98.5074,120,18128.0,68.0,2.0,12.0,10.0,2.0,22.0,22.0,76.0
5,29.5484,-98.5186,198,28380.0,111.0,6.0,21.0,18.0,3.0,36.0,42.0,120.0
6,29.5612,-98.6117,124,18656.0,72.0,2.0,12.0,10.0,2.0,22.0,26.0,76.0
7,29.6428,-98.6226,335,47375.0,190.0,10.0,35.0,30.0,5.0,60.0,70.0,205.0
8,32.8958,-96.9744,1086,131670.0,558.0,6.0,126.0,162.0,36.0,66.0,234.0,786.0
9,29.5578,-98.5167,264,37840.0,148.0,8.0,28.0,24.0,4.0,48.0,56.0,160.0


In [24]:
df_schools = con.execute("SELECT * FROM schools;").fetchdf()
df_schools["LEVEL_"].replace("N","unknown", inplace=True)
df_schools["LEVEL_"].replace("1","preschool", inplace=True)
df_schools["LEVEL_"].replace("2","elementary_school", inplace=True)
df_schools["LEVEL_"].replace("3","middle_school", inplace=True)
df_schools["LEVEL_"].replace("4","high_school", inplace=True)
con.execute("CREATE OR REPLACE TABLE schools AS SELECT * FROM df_schools;")

<duckdb.DuckDBPyConnection at 0x12b65ac4fb0>

In [25]:
con.execute("SELECT * FROM schools;").fetchdf().head()

Unnamed: 0,X,Y,OBJECTID,NCESID,NAME,ADDRESS,CITY,STATE,ZIP,ZIP4,...,VAL_METHOD,VAL_DATE,WEBSITE,LEVEL_,ENROLLMENT,ST_GRADE,END_GRADE,DISTRICTID,FT_TEACHER,SHELTER_ID
0,-111.816961,40.58732,10145,490014200768,QUAIL HOLLOW SCHOOL,2625 E 9070 S,SANDY,UT,84093,unknow,...,GEOCODE,2013-05-28,http://nces.ed.gov/GLOBALLOCATOR/sch_info_popu...,preschool,487,KG,5,4900142,-1,unknow
1,-111.994451,33.480387,10155,40081102855,OMBUDSMAN - CHARTER EAST II,4041 EAST THOMAS ROAD,PHOENIX,AZ,85018,unknow,...,GEOCODE,2013-05-28,http://nces.ed.gov/GLOBALLOCATOR/sch_info_popu...,middle_school,126,09,12,400811,-1,unknow
2,-71.683301,41.969033,10224,440009000022,WILLIAM CALLAHAN SCHOOL,75 CALLAHAN SCHOOL STREET,HARRISVILLE,RI,2830,unknow,...,IMAGERY,2013-05-28,http://nces.ed.gov/GLOBALLOCATOR/sch_info_popu...,preschool,303,02,5,4400090,20,unknow
3,-84.028771,36.033081,10226,470222000754,POWELL ELEMENTARY,1711 SPRING ST,POWELL,TN,37849,unknow,...,IMAGERY,2013-05-28,http://nces.ed.gov/GLOBALLOCATOR/sch_info_popu...,preschool,941,PK,5,4702220,54,unknow
4,-122.015768,39.740086,10239,60133902076,HAMILTON ELEMENTARY,277 CAPAY AVE.,HAMILTON CITY,CA,95951,0277,...,GEOCODE,2013-05-28,http://nces.ed.gov/GLOBALLOCATOR/sch_info_popu...,preschool,416,KG,8,601339,17,unknow


In [26]:
%%time
con.execute(f"""
create or replace table pos_house_schools as (
with H as (SELECT *, long*cos(lat*pi()/180) as long_cos_lat FROM pos_house limit 100),
S as (SELECT X, Y, X*cos(Y*pi()/180) as X_cos_Y, OBJECTID, POPULATION, LEVEL_ from schools)
SELECT
    lat, long,
    count(S.OBJECTID) as num_schools,
    sum(S.POPULATION) as num_students,
    sum(if(S.LEVEL_='preschool', 1, 0)) as preschool,
    sum(if(S.LEVEL_='elementary_school', 1, 0)) as elementary_school,
    sum(if(S.LEVEL_= 'middle_school', 1, 0)) as middle_school,
    sum(if(S.LEVEL_= 'high_school', 1, 0)) as high_school
    
FROM H left join S on (
    (H.long_cos_lat - S.X_cos_Y)*(H.long_cos_lat - S.X_cos_Y) + (H.lat - S.Y)*(H.lat - S.Y) < {distance_transformed}
)
GROUP BY lat, long
)
""").fetchall()

CPU times: total: 1.45 s
Wall time: 905 ms


[(44,)]

In [27]:
con.execute("SELECT * FROM pos_house_schools;").fetchdf()

Unnamed: 0,lat,long,num_schools,num_students,preschool,elementary_school,middle_school,high_school
0,29.592,-98.47,840,502027.0,413.0,158.0,132.0,76.0
1,29.5042,-98.5697,1662,981156.0,800.0,312.0,258.0,166.0
2,29.3595,-98.4499,5747,3390961.0,2786.0,1071.0,875.0,574.0
3,29.4682,-98.5298,831,488356.0,400.0,155.0,130.0,83.0
4,29.3466,-98.5074,1644,967702.0,796.0,304.0,252.0,166.0
5,40.7697,-111.909,698,486589.0,432.0,97.0,104.0,56.0
6,29.5484,-98.5186,2547,1505484.0,1233.0,480.0,396.0,243.0
7,29.5612,-98.6117,1650,986310.0,804.0,308.0,256.0,160.0
8,29.6428,-98.6226,4165,2487400.0,2050.0,775.0,655.0,385.0
9,32.8958,-96.9744,12930,8552376.0,7062.0,2394.0,1896.0,972.0


In [28]:
%%time 
con.execute(f"""
create or replace table houses as (
with H as (SELECT * FROM housing limit 100),
T1 as (SELECT * from pos_house_hospital),
T2 as (SELECT * from pos_house_schools)
SELECT *
FROM H, T1, T2
WHERE H.long == T1.long and H.lat == T1.lat
      and T1.long == T2.long and T1.lat == T2.lat
)
""").fetchall()
#pos_house_schools
#T2 as (SELECT * from pos_house_schools)
#FROM H left join T1 on ( H.long == T1.long and H.lat == T1.lat)

CPU times: total: 31.2 ms
Wall time: 36.9 ms


[(100,)]

In [3]:
df = con.execute("SELECT * FROM houses;").fetchdf()

CatalogException: Catalog Error: Table with name houses does not exist!
Did you mean "pg_am"?
LINE 1: SELECT * FROM houses;
                      ^

In [30]:
columns_names = list(df.columns)
print(columns_names)

['id', 'url', 'region', 'region_url', 'price', 'type', 'sqfeet', 'beds', 'baths', 'cats_allowed', 'dogs_allowed', 'smoking_allowed', 'wheelchair_access', 'electric_vehicle_charge', 'comes_furnished', 'laundry_options', 'parking_options', 'image_url', 'description', 'lat', 'long', 'state', 'lat:1', 'long:1', 'num_hospitals', 'num_beds', 'hospital_type_general', 'hospital_type_critical', 'hospital_type_psychiatric', 'hospital_type_longterm', 'hospital_type_children', 'government_hospital', 'nonprofit_hospital', 'private_hospital', 'lat:2', 'long:2', 'num_schools', 'num_students', 'preschool', 'elementary_school', 'middle_school', 'high_school']


In [42]:
con.close()

In [64]:
con.execute("select count(*) from housing").fetchall()

[(384977,)]

In [65]:
con.execute("select count(*) from houses").fetchall()

[(382919,)]

remove unneeded tables:

In [4]:
tables = con.execute("SHOW TABLES").fetchall()
tables = [t[0] for t in tables]
tables

[]

In [32]:
for table in tables:
    if not table == "houses":
        con.execute(f"DROP TABLE {table};")