# Lecture 16: Imputation and Entity Resolution

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl

import importlib
importlib.reload(mpl); importlib.reload(plt); importlib.reload(sns)

sns.reset_orig()
sns.set(font_scale=1.5)
%matplotlib inline
%config SqlMagic.displaylimit = 100

# GNIS data

This is the GNIS dataset from the previous lecture. If you haven't already loaded the database, run the cell below before connecting.

In [2]:
# Rebuild the gnis database from scratch using the psql command-line client.
# Step 1: terminate every *other* session connected to gnis, since a database
# with open connections cannot be dropped.
# NOTE: this step connects to gnis itself, so on a first run (no gnis database
# yet) psql will presumably report a connection error; the later steps still run.
!psql -h localhost -d gnis -c 'SELECT pg_terminate_backend(pg_stat_activity.pid) FROM pg_stat_activity WHERE datname = current_database() AND pid <> pg_backend_pid();'
# Step 2: drop any existing copy. Step 3: create an empty database.
!psql -h localhost -c 'DROP DATABASE IF EXISTS gnis'
!psql -h localhost -c 'CREATE DATABASE gnis' 
# Step 4: load the SQL dump (path is relative to this notebook's directory).
!psql -h localhost -d gnis -f ../lec14/data/gnis.sql

 pg_terminate_backend 
----------------------
(0 rows)

DROP DATABASE
CREATE DATABASE
SET
SET
SET
SET
SET
 set_config 
------------
 
(1 row)

SET
SET
SET
SET
SET
SET
CREATE TABLE
ALTER TABLE
CREATE TABLE
ALTER TABLE
COPY 3195
COPY 11533
CREATE INDEX


In [3]:
# Load (or reload) the SQL magic, then open a connection to the local gnis
# database; later %sql / %%sql cells run against this connection.
%reload_ext sql
%sql postgresql://localhost:5432/gnis

## Make a "holey" dataset

In [4]:
# Seed PostgreSQL's random() so the table built with random() in the next cell
# comes out the same on every run.
# NOTE(review): presumably the seed is per-session, so this must run on the same
# connection as the CREATE TABLE that follows -- confirm.
%sql SELECT setseed(0.12345);

setseed


In [5]:
%%sql
-- Build "holey": a copy of selected national columns in which elev_in_m is
-- blanked out at random, then report the fraction of rows that still have one.
DROP TABLE IF EXISTS holey CASCADE;
CREATE TABLE holey AS
SELECT feature_id,
       feature_name,
       feature_class,
       state_alpha,
       county_name,
       prim_lat_dec,
       prim_long_dec,
       -- keep the elevation unless random() exceeds 0.9; otherwise the CASE
       -- has no matching branch and yields NULL
       CASE WHEN random() <= 0.9 THEN elev_in_m END AS elev_in_m
  FROM national;
SELECT count(elev_in_m)::float / count(*) FROM holey;

?column?
0.8755744385675887


In [6]:
%%sql
-- Peek at a Bernoulli sample of about 0.1% of the rows.
-- TABLESAMPLE draws from its own seed (not the one set by setseed()), so
-- without REPEATABLE every run would show a different sample. Fixing the seed
-- keeps the preview stable as long as the table itself is unchanged.
SELECT * FROM holey TABLESAMPLE BERNOULLI(.1) REPEATABLE (0.12345);

feature_id,feature_name,feature_class,state_alpha,county_name,prim_lat_dec,prim_long_dec,elev_in_m
62218,Hill Chapel School (historical),School,AR,Arkansas,34.2692678,-91.2101177,57.0
798304,Transistor Number 1 Dam,Dam,MT,Blaine,48.6199998,-108.5773692,836.0
883621,Windsor Park,Populated Place,NJ,Ocean,39.9534514,-74.1495839,
1000848,Welchs Chapel,Church,NC,Chowan,36.264045,-76.6552257,11.0
1025544,Miller Chapel (historical),Church,NC,Bladen,34.8029441,-78.8525242,45.0
1093145,Gary Park,Park,OK,Tulsa,36.1450941,-95.9413832,231.0
1370838,Walton Park,Park,TX,Winkler,31.8537377,-103.0876649,872.0
1442597,Lily Lake Creek,Stream,UT,Uintah,40.6549525,-109.9957091,2456.0
1661671,West Saticoy,Populated Place,CA,Ventura,34.2852803,-119.160385,69.0
1774635,Cranberry Lake Dam,Dam,ME,Washington,45.06882,-67.29718,55.0


## Scheme 1. Default Value Imputation with SQL

In [11]:
%%sql
-- Default-value imputation: a missing elevation is replaced by the average of
-- the elevations that are present (avg() skips NULLs). Both branches are cast
-- to integer, so imputed_elev_in_m is an integer column.
WITH elevavg AS (
    SELECT avg(elev_in_m) AS av_elev
      FROM holey
)
SELECT h.*,
       COALESCE(h.elev_in_m::integer, e.av_elev::integer) AS imputed_elev_in_m
  FROM holey AS h
 CROSS JOIN elevavg AS e
 LIMIT 10;

feature_id,feature_name,feature_class,state_alpha,county_name,prim_lat_dec,prim_long_dec,elev_in_m,imputed_elev_in_m
1230,Belmont Mountains,Range,AZ,Maricopa,33.642258,-112.9010129,931.0,931
1839,Bootlegger Saddle,Gap,AZ,Cochise,31.8931474,-109.2831176,2707.0,2707
2336,Cabeza Prieta Game Range,Park,AZ,Yuma,32.250056,-113.45074,275.0,275
2750,Chandler Springs,Spring,AZ,Navajo,35.3766788,-110.4754096,1685.0,1685
3032,Cline Well,Well,AZ,Cochise,31.9000849,-110.3428525,,484
3039,Clover Tank,Reservoir,AZ,Gila,33.8509816,-110.2577249,1563.0,1563
3060,Coat Spring,Spring,AZ,Navajo,36.12678,-110.3330424,1926.0,1926
3143,Comar Spring,Spring,AZ,Navajo,35.5308428,-110.4162424,1732.0,1732
3333,Cottonwood Creek,Stream,AZ,Coconino,36.050817,-111.9865535,800.0,800
3342,Cottonwood Creek,Stream,AZ,Mohave,36.901931,-112.5632547,1389.0,1389


## Scheme 2. Correlation across columns

In [12]:
# Training is an aggregate function
# Here we'll train the model in SQL just for fun
# Fit a least-squares line with elevation as the dependent variable (first
# argument) and longitude as the independent variable (second argument); the
# imputation cell further down evaluates slope * prim_long_dec + intercept.
result = %sql SELECT regr_slope(elev_in_m, prim_long_dec), \
               regr_intercept(elev_in_m, prim_long_dec) FROM holey
# The query has no GROUP BY, so it yields a single row: (slope, intercept).
slope, intercept = result[0]
slope, intercept

(-10.314179001097786, -477.9603219322606)

In [13]:
%%sql
-- Show a small window of (longitude, elevation) pairs, ordered by longitude,
-- starting 40 rows past longitude -122; the blank elevations are the holes the
-- regression will fill. LIMIT bounds the result: OFFSET alone would return
-- every remaining row and rely on the client to truncate the display.
SELECT prim_long_dec, elev_in_m
  FROM holey
 WHERE prim_long_dec >= -122
 ORDER BY prim_long_dec
 LIMIT 10 OFFSET 40;

prim_long_dec,elev_in_m
-121.8142546,469.0
-121.8099602,
-121.8083604,1454.0
-121.8056805,
-121.803556,1614.0
-121.7974945,1264.0
-121.7960906,219.0
-121.784236,1684.0
-121.7832846,2.0
-121.7785678,46.0


In [15]:
%%sql
-- Regression imputation: a missing elevation is predicted from the row's
-- longitude with the fitted line. {{slope}} and {{intercept}} are filled in
-- from the Python variables of the same names.
SELECT holey.*,
       COALESCE(elev_in_m::integer,
                ({{slope}}*prim_long_dec + {{intercept}})::integer) AS imputed_elev_in_m
  FROM holey
 LIMIT 10;

feature_id,feature_name,feature_class,state_alpha,county_name,prim_lat_dec,prim_long_dec,elev_in_m,imputed_elev_in_m
1230,Belmont Mountains,Range,AZ,Maricopa,33.642258,-112.9010129,931.0,931
1839,Bootlegger Saddle,Gap,AZ,Cochise,31.8931474,-109.2831176,2707.0,2707
2336,Cabeza Prieta Game Range,Park,AZ,Yuma,32.250056,-113.45074,275.0,275
2750,Chandler Springs,Spring,AZ,Navajo,35.3766788,-110.4754096,1685.0,1685
3032,Cline Well,Well,AZ,Cochise,31.9000849,-110.3428525,,660
3039,Clover Tank,Reservoir,AZ,Gila,33.8509816,-110.2577249,1563.0,1563
3060,Coat Spring,Spring,AZ,Navajo,36.12678,-110.3330424,1926.0,1926
3143,Comar Spring,Spring,AZ,Navajo,35.5308428,-110.4162424,1732.0,1732
3333,Cottonwood Creek,Stream,AZ,Coconino,36.050817,-111.9865535,800.0,800
3342,Cottonwood Creek,Stream,AZ,Mohave,36.901931,-112.5632547,1389.0,1389


## Scheme 3. General model-based interpolation

We won't show the demo; check slides for the general idea.

## Scheme 4. [simple] Correlation across ordered rows

In [17]:
%%sql
-- The following does not work in PostgreSQL!
-- It tries to fill each NULL elevation with the previous row's value (ordered
-- by feature_id). lag(elev_in_m, 1) looks back exactly one row, so in a run of
-- consecutive NULLs only the first one is filled; the others see a NULL
-- predecessor and stay NULL (feature_id 49578 in the output).
WITH buggy AS (
SELECT *,
       CASE WHEN elev_in_m IS NOT NULL THEN elev_in_m
            ELSE lag(elev_in_m, 1)
                 OVER (ORDER BY feature_id)
        END AS imputed_elev_in_m
  FROM holey
)
SELECT feature_id, elev_in_m, imputed_elev_in_m FROM buggy ORDER BY feature_id ASC LIMIT 12 OFFSET 183;

feature_id,elev_in_m,imputed_elev_in_m
48638,46.0,46.0
49192,401.0,401.0
49214,194.0,194.0
49350,,194.0
49578,,
49802,47.0,47.0
49925,111.0,111.0
50059,71.0,71.0
50309,,71.0
50661,26.0,26.0


In [8]:
%%sql
-- Here's a UDA fix from
-- https://stackoverflow.com/questions/18987791/how-do-i-efficiently-select-the-previous-non-null-value
--
-- coalesce_agg_sfunc is the state-transition function: given the state so far
-- and the next input value, it returns the new value when that is non-NULL and
-- otherwise keeps the existing state.
-- coalesce_agg wraps it as an aggregate over any element type. No initial
-- condition is given, so the state stays NULL until a non-NULL value is seen.
-- The later cells call it as a window aggregate with ORDER BY, where it yields
-- the latest non-NULL value seen up to each row.
CREATE OR REPLACE FUNCTION coalesce_agg_sfunc(state anyelement, value anyelement) RETURNS anyelement AS
$$
    SELECT coalesce(value, state);
$$ LANGUAGE SQL;

CREATE OR REPLACE AGGREGATE coalesce_agg(anyelement) (
    SFUNC = coalesce_agg_sfunc,
    STYPE  = anyelement);

In [19]:
%%sql
-- Forward fill that also handles runs of repeated NULLs: coalesce_agg carries
-- the latest non-NULL elevation along the feature_id ordering, so every NULL
-- in a run picks up the last known value.
WITH filled AS (
    SELECT holey.*,
           coalesce_agg(elev_in_m) OVER by_feature AS imputed_elev_in_m
      FROM holey
    WINDOW by_feature AS (ORDER BY feature_id)
)
SELECT feature_id, elev_in_m, imputed_elev_in_m
  FROM filled
 ORDER BY feature_id ASC
 LIMIT 12 OFFSET 183;

feature_id,elev_in_m,imputed_elev_in_m
48638,46.0,46.0
49192,401.0,401.0
49214,194.0,194.0
49350,,194.0
49578,,194.0
49802,47.0,47.0
49925,111.0,111.0
50059,71.0,71.0
50309,,71.0
50661,26.0,26.0


## Generalized interpolation across rows

Forward pass

In [9]:
%%sql
-- 1. Forward pass over rows ordered by feature_id.
--    run       : running count of non-NULL elevations, so a known value and
--                the NULL rows that follow it share one run number
--    run_start : latest non-NULL elevation at or before the row
--    nextval   : only for NULL rows, the elevation of the immediately
--                following row (NULL again if that row is NULL too)
CREATE OR REPLACE VIEW forward AS
SELECT feature_id,
       elev_in_m,
       SUM(CASE WHEN elev_in_m IS NOT NULL THEN 1 ELSE 0 END) OVER by_feature AS run,
       coalesce_agg(elev_in_m) OVER by_feature AS run_start,
       CASE WHEN elev_in_m IS NULL
            THEN lead(elev_in_m, 1) OVER by_feature
        END AS nextval
  FROM holey
WINDOW by_feature AS (ORDER BY feature_id);
SELECT * FROM forward ORDER BY feature_id ASC LIMIT 12 OFFSET 183;

feature_id,elev_in_m,run,run_start,nextval
48638,46.0,168,46.0,
49192,401.0,169,401.0,
49214,194.0,170,194.0,
49350,,170,194.0,
49578,,170,194.0,47.0
49802,47.0,171,47.0,
49925,111.0,172,111.0,
50059,71.0,173,71.0,
50309,,173,71.0,26.0
50661,26.0,174,26.0,


Backward pass

In [10]:
%%sql
-- 2. Backward pass within each run.
--    run_end  : the row's own elevation when it has one; otherwise the latest
--               non-NULL nextval found walking the run in descending
--               feature_id order
--    run_size : number of rows in the run
--    run_rank : zero-based position of the row inside its run
CREATE OR REPLACE VIEW backward AS
SELECT f.*,
       COALESCE(f.elev_in_m,
                coalesce_agg(f.nextval) OVER (PARTITION BY f.run ORDER BY f.feature_id DESC)) AS run_end,
       count(*) OVER (PARTITION BY f.run) AS run_size,
       RANK() OVER (PARTITION BY f.run ORDER BY f.feature_id) - 1 AS run_rank
  FROM forward AS f;
SELECT * FROM backward ORDER BY feature_id ASC LIMIT 12 OFFSET 183;

feature_id,elev_in_m,run,run_start,nextval,run_end,run_size,run_rank
48638,46.0,168,46.0,,46.0,1,0
49192,401.0,169,401.0,,401.0,1,0
49214,194.0,170,194.0,,194.0,3,0
49350,,170,194.0,,47.0,3,1
49578,,170,194.0,47.0,47.0,3,2
49802,47.0,171,47.0,,47.0,1,0
49925,111.0,172,111.0,,111.0,1,0
50059,71.0,173,71.0,,71.0,2,0
50309,,173,71.0,26.0,26.0,2,1
50661,26.0,174,26.0,,26.0,1,0


Scalar function pass

In [12]:
%%sql
-- 3. Simple scalar pass: linear interpolation inside each run.
--    Each row steps run_rank times from run_start toward run_end, with a step
--    of (run_end - run_start) / run_size. Rows that already have an elevation
--    have run_end = run_start, so they keep their own value.
--    Edge runs: a trailing run of NULLs has no run_end and a leading run of
--    NULLs has no run_start, so the linear formula is NULL there. COALESCE
--    then falls back to the one endpoint that is known (carry the last value
--    forward, or the first value backward) instead of leaving a hole.
CREATE OR REPLACE VIEW final AS
SELECT *,
       COALESCE(run_start + run_rank*((run_end-run_start)/(run_size)),
                run_start,
                run_end)
         AS interpolated
  FROM backward;
SELECT * FROM final  ORDER BY feature_id ASC LIMIT 12 OFFSET 183;

feature_id,elev_in_m,run,run_start,nextval,run_end,run_size,run_rank,interpolated
48638,46.0,168,46.0,,46.0,1,0,46.0
49192,401.0,169,401.0,,401.0,1,0,401.0
49214,194.0,170,194.0,,194.0,3,0,194.0
49350,,170,194.0,,47.0,3,1,145.0
49578,,170,194.0,47.0,47.0,3,2,96.0
49802,47.0,171,47.0,,47.0,1,0,47.0
49925,111.0,172,111.0,,111.0,1,0,111.0
50059,71.0,173,71.0,,71.0,2,0,71.0
50309,,173,71.0,26.0,26.0,2,1,48.5
50661,26.0,174,26.0,,26.0,1,0,26.0


In [13]:
%sql EXPLAIN Analyze SELECT * from final LIMIT 500;

QUERY PLAN
Limit (cost=2196.29..2267.30 rows=500 width=72) (actual time=28.733..30.304 rows=500 loops=1)
-> Subquery Scan on backward (cost=2196.29..3834.15 rows=11533 width=72) (actual time=28.732..30.274 rows=500 loops=1)
-> WindowAgg (cost=2196.29..3545.82 rows=11533 width=64) (actual time=28.728..30.206 rows=500 loops=1)
-> WindowAgg (cost=2196.29..3344.00 rows=11533 width=56) (actual time=28.722..29.917 rows=501 loops=1)
-> Incremental Sort (cost=2196.29..3142.17 rows=11533 width=48) (actual time=28.717..29.670 rows=502 loops=1)
"Sort Key: forward.run, forward.feature_id"
Presorted Key: forward.run
Full-sort Groups: 16 Sort Method: quicksort Average Memory: 27kB Peak Memory: 27kB
-> WindowAgg (cost=2192.36..2423.02 rows=11533 width=48) (actual time=28.618..29.540 rows=514 loops=1)
-> Sort (cost=2192.36..2221.19 rows=11533 width=40) (actual time=28.543..28.574 rows=515 loops=1)


# String distance

In [27]:
# Reload the SQL magic and connect to the postgres database instead of gnis;
# the string-distance cells that follow run against this connection.
%reload_ext sql
%sql postgresql://localhost:5432/postgres

In [28]:
# Enable the fuzzystrmatch extension (a no-op if it is already installed).
# NOTE(review): presumably this is what supplies levenshtein, soundex and the
# dmetaphone functions queried in the string-distance cell -- confirm.
%sql CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;

In [29]:
%%sql
-- Recreate a small table of string pairs to compare.
DROP TABLE IF EXISTS Strings;
CREATE TABLE Strings (
    str1 TEXT,
    str2 TEXT
);
INSERT INTO Strings (str1, str2)
VALUES ('Lisa', 'List'),
       ('Lisa', 'License'),
       ('Joe', 'Noel');

In [30]:
%%sql
-- For every pair: the Levenshtein edit distance between the two strings, plus
-- the Soundex, Double Metaphone and alternate Double Metaphone code of each.
SELECT s.*,
       levenshtein(s.str1, s.str2) AS levenshtein,
       soundex(s.str1)             AS soundex1,
       soundex(s.str2)             AS soundex2,
       dmetaphone(s.str1)          AS dmetaphone1,
       dmetaphone(s.str2)          AS dmetaphone2,
       dmetaphone_alt(s.str1)      AS dmetaphone_alt1,
       dmetaphone_alt(s.str2)      AS dmetaphone_alt2
  FROM Strings AS s;

str1,str2,levenshtein,soundex1,soundex2,dmetaphone1,dmetaphone2,dmetaphone_alt1,dmetaphone_alt2
Lisa,List,1,L200,L230,LS,LST,LS,LST
Lisa,License,4,L200,L252,LS,LSNS,LS,LSNS
Joe,Noel,2,J000,N400,J,NL,A,NL
