In [2]:
import pandas as pd
import numpy as np
import duckdb

In [3]:
# `sys` is not imported by any earlier cell, so import it here; otherwise this
# cell raises NameError on a fresh kernel.
import sys

# Add the parent directory to the import path so that `helper` can be imported.
sys.path.append('..')
# NOTE(review): the star import supplies at least `setwd` and `Objects` used below;
# consider importing those names explicitly once the full list is known.
from helper import *
setwd()  # presumably sets the working directory for the relative data paths — confirm in helper

# Rebinds the name `Objects` from the imported callable to the object it returns;
# a later cell iterates over it and reads datasource['id'] from each item.
Objects = Objects()

### temporarily move the trusted tables into a new database

In [4]:
import os

In [6]:
# Connection for the exploitation zone, opened read-write.
con = duckdb.connect(
    database="./data/exploitation/exploitation.db",
    read_only=False,
)

In [7]:
# Copy every trusted-zone database into the exploitation database: each one is
# exported to ./temp and that export is then imported through `con`.
for datasource in Objects:
    datasource_id = datasource['id']  # not named `id`, to avoid shadowing the built-in
    con_trust = duckdb.connect(database=f"data/trusted/db_{datasource_id}.db")
    try:
        con_trust.execute("EXPORT DATABASE './temp'")
    finally:
        # Always release the trusted database file, even when the export fails.
        con_trust.close()
    con.execute("IMPORT DATABASE './temp'")

In [8]:
from pathlib import Path

# Delete the files left in ./temp by EXPORT DATABASE. Done in pure Python so the
# cell does not depend on a Unix shell providing `rm`. Like `rm` without -r,
# only regular files are removed; sub-directories are left alone.
for exported_file in Path("./temp").glob("*"):
    if exported_file.is_file():
        exported_file.unlink()

In [14]:
# List the tables now present in the exploitation database.
show_tables_sql = "SHOW TABLES"
con.execute(show_tables_sql).fetchall()

[('hospitals',), ('housing',), ('schools',), ('tbl_house_hospital',)]

### now everything is in the exploitation database

In [16]:
# Housing columns carried through the house/hospital queries, in output order.
housing_columns = [
    "id", "lat", "long", "price", "type", "sqfeet", "beds", "baths",
    "cats_allowed", "dogs_allowed", "smoking_allowed", "wheelchair_access",
    "electric_vehicle_charge", "comes_furnished", "laundry_options",
    "parking_options", "state",
]
# Columns that are also selected from the hospitals side (alias S) in the
# queries below, so they have to be qualified with the housing alias H.
ambiguous_columns = ("type", "beds")

# Plain column list, used inside the H subquery.
vars_housing = ", ".join(housing_columns)

# Same list with the ambiguous columns qualified, used in GROUP BY.
vars_housing_reference = ", ".join(
    f"H.{column}" if column in ambiguous_columns else column
    for column in housing_columns
)

# Same list with the ambiguous columns qualified and aliased back to their
# original names, used in the SELECT list.
vars_housing_names = ", ".join(
    f"H.{column} as {column}" if column in ambiguous_columns else column
    for column in housing_columns
)


In [11]:
# Options for the distance formula used as the join condition between a house (H)
# and a nearby site (S). The triple-quoted strings below are notes only: no query
# is run by this cell, and the last string is simply echoed as the cell output.

# option 1: distance in a plane -> more accurate and slower
# (squared straight-line distance on the raw long/lat values; 0.01 == 0.1**2,
#  so this keeps pairs closer than 0.1 coordinate units)
"""
FROM H left join S on (
    (H.long - S.X)**2 + (H.lat - S.Y)**2 < 0.01
)
"""

# option 2: Chebyshev distance -> less accurate and faster to compute
# (a square box of half-width 0.1 coordinate units around the house)
"""
FROM H left join S on (
    H.long - S.X < 0.1
    and H.long - S.X > -0.1
    and H.lat - S.Y < 0.1
    and H.lat - S.Y > -0.1
)
"""

# option 3: distance on a sphere (no query is written for this option in this cell)

'\nFROM H left join S on (\n    H.long - S.X < 0.1\n    and H.long - S.X > -0.1\n    and H.lat - S.Y < 0.1\n    and H.lat - S.Y > -0.1\n)\n'

In [20]:
%%time
# Build (or rebuild) the table tbl_house_hospital in the exploitation database.
#   H: the housing columns listed in vars_housing, restricted to 100 rows.
#   S: the hospital columns needed for the join and the aggregates.
# Each house is left-joined to every hospital whose squared coordinate distance
# is below 0.01 (option 1 of the distance notes), then aggregated per house:
# number of hospitals, total beds, and counts per hospital type and owner class.
# Being a LEFT JOIN, houses with no hospital in range are kept.
# NOTE(review): `limit 100` means only 100 housing rows are persisted, and with
# no ORDER BY which 100 is not specified — confirm this is a development sample.
# NOTE(review): assumes H.long/H.lat and S.X/S.Y are in the same units
# (presumably degrees) — confirm.
con.execute(f"""
create or replace table tbl_house_hospital as (
with H as (SELECT {vars_housing} FROM housing limit 100),
S as (SELECT X, Y, objectid, type, beds, owner from hospitals)
SELECT
    {vars_housing_names} ,
    count(S.objectid) as num_hospitals,
    sum(S.beds) as num_beds,
    sum(if(S.type='GENERAL ACUTE CARE', 1, 0)) as hospital_type_general,
    sum(if(S.type='CRITICAL ACCESS', 1, 0)) as hospital_type_critical,
    sum(if(S.type='PSYCHIATRIC', 1, 0)) as hospital_type_psychiatric,
    sum(if(S.type='LONG TERM CARE', 1, 0)) as hospital_type_longterm,
    sum(if(S.type='CHILDREN', 1, 0)) as hospital_type_children,
    sum(if(S.owner in('GOVERNMENT - DISTRICT/AUTHORITY', 'GOVERNMENT - FEDERAL',
        'GOVERNMENT - LOCAL', 'GOVERNMENT - STATE'), 1, 0)) as government_hospital,
    sum(if(S.owner = 'NON-PROFIT', 1, 0)) as nonprofit_hospital,
    sum(if(S.owner='PROPRIETARY', 1, 0)) as private_hospital,
    
FROM H left join S on (
    (H.long - S.X)**2 + (H.lat - S.Y)**2 < 0.01
)
GROUP BY {vars_housing_reference}
)
""").fetchall()

CPU times: total: 109 ms
Wall time: 871 ms


[(100,)]

In [21]:
%%time
con.execute(f"""
with H as (SELECT {vars_housing} FROM housing limit 1000),
S as (SELECT X, Y, objectid, type, beds, owner from hospitals)
SELECT
    {vars_housing_names} ,
    count(S.objectid) as num_hospitals,
    sum(S.beds) as num_beds,
    sum(if(S.type='GENERAL ACUTE CARE', 1, 0)) as hospital_type_general,
    sum(if(S.type='CRITICAL ACCESS', 1, 0)) as hospital_type_critical,
    sum(if(S.type='PSYCHIATRIC', 1, 0)) as hospital_type_psychiatric,
    sum(if(S.type='LONG TERM CARE', 1, 0)) as hospital_type_longterm,
    sum(if(S.type='CHILDREN', 1, 0)) as hospital_type_children,
    sum(if(S.owner in('GOVERNMENT - DISTRICT/AUTHORITY', 'GOVERNMENT - FEDERAL',
        'GOVERNMENT - LOCAL', 'GOVERNMENT - STATE'), 1, 0)) as government_hospital,
    sum(if(S.owner = 'NON-PROFIT', 1, 0)) as nonprofit_hospital,
    sum(if(S.owner='PROPRIETARY', 1, 0)) as private_hospital,
    
FROM H left join S on (
    (H.long - S.X)**2 + (H.lat - S.Y)**2 < 0.01
)
GROUP BY {vars_housing_reference}
""").fetchall()

CPU times: total: 2.47 s
Wall time: 1.56 s


[(7047341204,
  29.592,
  -98.47,
  933,
  'apartment',
  949,
  2,
  1.0,
  0,
  0,
  1,
  0,
  0,
  0,
  None,
  None,
  'tx',
  11,
  1402.0,
  7,
  0,
  1,
  0,
  0,
  0,
  3,
  8),
 (7050722769,
  29.5042,
  -98.5697,
  709,
  'apartment',
  665,
  1,
  1.0,
  0,
  0,
  1,
  0,
  0,
  0,
  None,
  None,
  'tx',
  19,
  2672.0,
  8,
  0,
  3,
  2,
  0,
  2,
  4,
  13),
 (7047297643,
  29.3595,
  -98.4499,
  874,
  'apartment',
  987,
  2,
  2.0,
  0,
  0,
  1,
  0,
  0,
  0,
  None,
  None,
  'tx',
  13,
  2776.0,
  9,
  0,
  1,
  2,
  1,
  2,
  2,
  9),
 (7040844249,
  29.4682,
  -98.5298,
  906,
  'apartment',
  950,
  2,
  1.0,
  1,
  1,
  0,
  1,
  0,
  0,
  'w/d hookups',
  'street parking',
  'tx',
  24,
  4558.0,
  11,
  0,
  3,
  4,
  1,
  2,
  5,
  17),
 (7050721837,
  29.3466,
  -98.5074,
  700,
  'apartment',
  750,
  2,
  1.0,
  0,
  0,
  1,
  0,
  0,
  0,
  None,
  None,
  'tx',
  13,
  2776.0,
  9,
  0,
  1,
  2,
  1,
  2,
  2,
  9),
 (7041452997,
  29.5484,
  -98.518

In [22]:
# Close the exploitation database connection opened earlier in the notebook.
# NOTE(review): any cell that queries `con` after this point needs an open
# connection again.
con.close()

## tests

In [65]:
# `con` was closed by the con.close() cell above, so the checks in this section
# fail on a top-to-bottom run unless the connection is reopened first. The same
# path and options as the original connection are used.
con = duckdb.connect(database="./data/exploitation/exploitation.db", read_only=False)

# Sanity check: number of schools rows with a non-NULL objectid.
con.execute("""
SELECT count(objectid) from schools
""").fetchall()

[(102370,)]

In [70]:
# Sanity check: number of housing rows with a non-NULL id.
housing_count_sql = """
SELECT count(id) from housing
"""
con.execute(housing_count_sql).fetchall()

[(384977,)]

In [76]:
# Distinct values of schools.level_ ("group by 1" groups by the first selected column).
school_levels_sql = """
SELECT level_ from schools group by 1
"""
con.execute(school_levels_sql).fetchall()


[('1',), ('2',), ('N',), ('3',), ('4',)]

In [77]:
# Distinct (st_grade, end_grade) combinations found in the schools table.
school_grade_ranges_sql = """
SELECT st_grade, end_grade from schools group by 1,2
"""
con.execute(school_grade_ranges_sql).fetchall()

[('PK', '05'),
 ('06', '08'),
 ('PK', 'PK'),
 ('07', '08'),
 ('04', '05'),
 ('05', '06'),
 ('09', '12'),
 ('KG', '05'),
 ('PK', '03'),
 ('08', '12'),
 ('PK', '06'),
 ('07', '07'),
 ('10', '12'),
 ('07', '09'),
 ('03', '05'),
 ('07', '12'),
 ('KG', '06'),
 ('KG', '12'),
 ('05', '08'),
 ('PK', '08'),
 ('04', '08'),
 ('N', 'N'),
 ('KG', '08'),
 ('06', '12'),
 ('PK', '04'),
 ('KG', '09'),
 ('KG', '04'),
 ('06', '09'),
 ('01', '05'),
 ('PK', '01'),
 ('01', '12'),
 ('04', '06'),
 ('KG', '03'),
 ('PK', 'KG'),
 ('09', '13'),
 ('09', '09'),
 ('10', '10'),
 ('AE', 'AE'),
 ('03', '06'),
 ('PK', '12'),
 ('11', '12'),
 ('02', '04'),
 ('08', '09'),
 ('03', '08'),
 ('06', '06'),
 ('05', '12'),
 ('01', '04'),
 ('PK', '07'),
 ('09', '10'),
 ('04', '12'),
 ('01', '02'),
 ('09', '11'),
 ('01', '06'),
 ('KG', '01'),
 ('UG', 'UG'),
 ('02', '06'),
 ('02', '09'),
 ('04', '10'),
 ('KG', '07'),
 ('06', '07'),
 ('08', '08'),
 ('KG', '10'),
 ('01', '01'),
 ('05', '05'),
 ('01', '03'),
 ('PK', '13'),
 ('01', '07'