# Imputation and Entity Resolution

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl

import importlib
# Reload the plotting libraries in place; presumably this clears state left over
# from earlier use in the same kernel — TODO confirm it is still needed.
importlib.reload(mpl); importlib.reload(plt); importlib.reload(sns)

# Restore matplotlib's original rc settings, then apply the seaborn theme with larger fonts.
sns.reset_orig()
sns.set(font_scale=1.5)
# Render figures inline in the notebook output.
%matplotlib inline

# GNIS data

This is the GNIS dataset from a previous lecture. If you have not loaded the database yet, run the cell below before connecting.

In [3]:
# Terminate every other session connected to gnis so the database can be dropped.
!psql -h localhost -d gnis -c 'SELECT pg_terminate_backend(pg_stat_activity.pid) FROM pg_stat_activity WHERE datname = current_database() AND pid <> pg_backend_pid();'
# Recreate gnis from scratch, then load it from the SQL dump of an earlier lecture.
!psql -h localhost -c 'DROP DATABASE IF EXISTS gnis'
!psql -h localhost -c 'CREATE DATABASE gnis' 
!psql -h localhost -d gnis -f ../lec11/data/gnis.sql

 pg_terminate_backend 
----------------------
(0 rows)

DROP DATABASE
CREATE DATABASE
SET
SET
SET
SET
SET
 set_config 
------------
 
(1 row)

SET
SET
SET
SET
SET
SET
CREATE TABLE
ALTER TABLE
CREATE TABLE
ALTER TABLE
COPY 3195
COPY 11533
CREATE INDEX


In [2]:
# Load (or reload) the SQL magic and connect to the gnis database on localhost.
%reload_ext sql
%sql postgresql://localhost:5432/gnis

In [3]:
# Seed PostgreSQL's random() for this session so the NULLs injected below are repeatable.
%sql SELECT setseed(0.12345);

setseed


We add some missing values to `elev_in_m` (which we will then impute later).

In [4]:
%%sql
-- Build "holey": a copy of the selected national columns in which elev_in_m
-- is replaced by NULL on every row where random() exceeds 0.9.
DROP TABLE IF EXISTS holey CASCADE;
CREATE TABLE holey AS
SELECT feature_id,
       feature_name,
       feature_class,
       state_alpha,
       county_name,
       prim_lat_dec,
       prim_long_dec,
       CASE WHEN random() <= 0.9 THEN elev_in_m
            ELSE NULL
        END AS elev_in_m
  FROM national;
-- Fraction of rows that still carry an elevation.
SELECT count(elev_in_m)::float / count(*) FROM holey;

?column?
0.8755744385675887


In [5]:
%%sql
-- Peek at a few rows; a missing elevation is displayed as an empty cell.
SELECT * FROM holey LIMIT 10;

feature_id,feature_name,feature_class,state_alpha,county_name,prim_lat_dec,prim_long_dec,elev_in_m
1230,Belmont Mountains,Range,AZ,Maricopa,33.642258,-112.9010129,931.0
1839,Bootlegger Saddle,Gap,AZ,Cochise,31.8931474,-109.2831176,2707.0
2336,Cabeza Prieta Game Range,Park,AZ,Yuma,32.250056,-113.45074,275.0
2750,Chandler Springs,Spring,AZ,Navajo,35.3766788,-110.4754096,1685.0
3032,Cline Well,Well,AZ,Cochise,31.9000849,-110.3428525,
3039,Clover Tank,Reservoir,AZ,Gila,33.8509816,-110.2577249,1563.0
3060,Coat Spring,Spring,AZ,Navajo,36.12678,-110.3330424,1926.0
3143,Comar Spring,Spring,AZ,Navajo,35.5308428,-110.4162424,1732.0
3333,Cottonwood Creek,Stream,AZ,Coconino,36.050817,-111.9865535,800.0
3342,Cottonwood Creek,Stream,AZ,Mohave,36.901931,-112.5632547,1389.0


## [At home] Method 1. Default Value Imputation with SQL

In [6]:
%%sql
-- Default-value imputation: every missing elevation gets the mean of the known ones.
WITH elevavg AS (SELECT avg(elev_in_m) AS mean_elev FROM holey)
SELECT h.*,
       COALESCE(h.elev_in_m, e.mean_elev) AS imputed_elev_in_m
  FROM holey AS h
 CROSS JOIN elevavg AS e  -- elevavg has exactly one row, so this only attaches the mean
LIMIT 10;

feature_id,feature_name,feature_class,state_alpha,county_name,prim_lat_dec,prim_long_dec,elev_in_m,imputed_elev_in_m
1230,Belmont Mountains,Range,AZ,Maricopa,33.642258,-112.9010129,931.0,931.0
1839,Bootlegger Saddle,Gap,AZ,Cochise,31.8931474,-109.2831176,2707.0,2707.0
2336,Cabeza Prieta Game Range,Park,AZ,Yuma,32.250056,-113.45074,275.0,275.0
2750,Chandler Springs,Spring,AZ,Navajo,35.3766788,-110.4754096,1685.0,1685.0
3032,Cline Well,Well,AZ,Cochise,31.9000849,-110.3428525,,483.962467815409
3039,Clover Tank,Reservoir,AZ,Gila,33.8509816,-110.2577249,1563.0,1563.0
3060,Coat Spring,Spring,AZ,Navajo,36.12678,-110.3330424,1926.0,1926.0
3143,Comar Spring,Spring,AZ,Navajo,35.5308428,-110.4162424,1732.0,1732.0
3333,Cottonwood Creek,Stream,AZ,Coconino,36.050817,-111.9865535,800.0,800.0
3342,Cottonwood Creek,Stream,AZ,Mohave,36.901931,-112.5632547,1389.0,1389.0


## [At home] Method 2. Correlation across columns

In [7]:
# Training is an aggregate function
# Here we'll train the model in SQL just for fun
# regr_slope/regr_intercept take (Y, X): elevation is fit as a linear function of longitude.
result = %sql SELECT regr_slope(elev_in_m, prim_long_dec), \
               regr_intercept(elev_in_m, prim_long_dec) FROM holey
# Unpack the first result row into the two model parameters.
slope, intercept = result[0]
slope, intercept

(-10.314179001097786, -477.9603219322606)

In [8]:
%%sql
-- Model-based imputation: a missing elevation is predicted from longitude using
-- the line fitted above (slope and intercept are substituted from Python).
SELECT *,
       COALESCE(elev_in_m,
                {{slope}}*prim_long_dec + {{intercept}}) AS imputed_elev_in_m
  FROM holey
LIMIT 10;

feature_id,feature_name,feature_class,state_alpha,county_name,prim_lat_dec,prim_long_dec,elev_in_m,imputed_elev_in_m
1230,Belmont Mountains,Range,AZ,Maricopa,33.642258,-112.9010129,931.0,931.0
1839,Bootlegger Saddle,Gap,AZ,Cochise,31.8931474,-109.2831176,2707.0,2707.0
2336,Cabeza Prieta Game Range,Park,AZ,Yuma,32.250056,-113.45074,275.0,275.0
2750,Chandler Springs,Spring,AZ,Navajo,35.3766788,-110.4754096,1685.0,1685.0
3032,Cline Well,Well,AZ,Cochise,31.9000849,-110.3428525,,660.1356102444698
3039,Clover Tank,Reservoir,AZ,Gila,33.8509816,-110.2577249,1563.0,1563.0
3060,Coat Spring,Spring,AZ,Navajo,36.12678,-110.3330424,1926.0,1926.0
3143,Comar Spring,Spring,AZ,Navajo,35.5308428,-110.4162424,1732.0,1732.0
3333,Cottonwood Creek,Stream,AZ,Coconino,36.050817,-111.9865535,800.0,800.0
3342,Cottonwood Creek,Stream,AZ,Mohave,36.901931,-112.5632547,1389.0,1389.0


## [At home] Method 3. General model-based interpolation

We won't show the demo; check slides for the general idea.

In [9]:

# Set the SQL magic's display limit to 100 rows for the result tables that follow.
%config SqlMagic.displaylimit = 100

## [At home] Method 4. [simple] Correlation across ordered rows

In [10]:
%%sql
-- The following doesn't work in PostgreSQL!
-- lag(elev_in_m, 1) returns the immediately preceding row's value even when that
-- value is itself NULL, so a run of consecutive NULLs is not completely filled.
WITH buggy AS (
SELECT *,
       CASE WHEN elev_in_m IS NOT NULL THEN elev_in_m
            ELSE lag(elev_in_m, 1)
                 OVER (ORDER BY feature_id)
        END AS imputed_elev_in_m
  FROM holey
)
-- NULLs sort first under DESC, so the rows with a missing elevation come out on top.
SELECT * FROM buggy ORDER BY elev_in_m DESC LIMIT 100;

feature_id,feature_name,feature_class,state_alpha,county_name,prim_lat_dec,prim_long_dec,elev_in_m,imputed_elev_in_m
214923,Lewes Junior High School,School,DE,Sussex,38.7687232,-75.1460168,,89.0
209568,Ox Hill,Populated Place,CT,Fairfield,41.2284297,-73.2190027,,31.0
204786,San Acacio,Populated Place,CO,Costilla,37.2139025,-105.5644584,,1712.0
205451,Berkshire,Populated Place,CT,Fairfield,41.4087069,-73.2595601,,1945.0
205915,Calebs Peak,Summit,CT,Litchfield,41.7559276,-73.4648456,,116.0
194300,Bull Pasture Well,Well,CO,Las Animas,37.6594568,-104.3541397,,1692.0
198468,Gurley Dam,Dam,CO,San Miguel,38.035547,-108.2459128,,
189182,Lienhart Mine,Mine,CO,Chaffee,38.9377726,-106.2547437,,3173.0
197727,Pine Brook Hill,Populated Place,CO,Boulder,40.0499857,-105.3147137,,2601.0
198347,King Reservoir Number 1,Reservoir,CO,Mesa,38.8657673,-108.6665656,,1568.0


In [19]:
%%sql
-- Here's a UDA fix from
-- https://stackoverflow.com/questions/18987791/how-do-i-efficiently-select-the-previous-non-null-value
-- State-transition function: take the new input when it is non-NULL, otherwise
-- keep carrying the previous state.
CREATE OR REPLACE FUNCTION coalesce_agg_sfunc(state anyelement, value anyelement) RETURNS anyelement AS
$$
    SELECT coalesce(value, state);
$$ LANGUAGE SQL;

-- Aggregate built on that transition function; used with OVER (ORDER BY ...)
-- it yields the most recent non-NULL value seen so far.
CREATE OR REPLACE AGGREGATE coalesce_agg(anyelement) (
    SFUNC = coalesce_agg_sfunc,
    STYPE  = anyelement);

In [12]:
%%sql
-- Fixed to handle repeated NULLs: carry the last known elevation forward
-- in feature_id order, however long the gap is.
WITH filled AS (
    SELECT holey.*,
           coalesce_agg(elev_in_m) OVER by_id AS imputed_elev_in_m
      FROM holey
    WINDOW by_id AS (ORDER BY feature_id)
)
SELECT * FROM filled ORDER BY imputed_elev_in_m DESC LIMIT 100;

feature_id,feature_name,feature_class,state_alpha,county_name,prim_lat_dec,prim_long_dec,elev_in_m,imputed_elev_in_m
1416580,Browne Tower,Summit,AK,Denali,63.1000583,-150.9314441,4628.0,4628.0
188010,Grizzly Peak,Summit,CO,La Plata,37.6076296,-107.5818648,4171.0,4171.0
191939,Spread Eagle Peak,Summit,CO,Custer,38.1252369,-105.6435197,4090.0,4090.0
187393,Emery Peak,Summit,CO,San Juan,37.8882556,-107.6210535,4054.0,4054.0
181832,Baker Mountain,Summit,CO,Clear Creek,39.6607654,-105.8402538,3832.0,3832.0
1703800,Lucky Jim One Mine,Mine,CO,Park,39.1519347,-105.742509,,3728.0
1703775,Park County Claims Mine,Mine,CO,Park,39.3510997,-105.9869614,3728.0,3728.0
393894,Donaldson Peak,Summit,ID,Custer,44.0639981,-113.7003853,3661.0,3661.0
1599419,Dundee Mountain,Summit,WY,Park,43.8771622,-109.2682693,3640.0,3640.0
939450,Master Man Mine,Mine,NM,Colfax,36.6297542,-105.2088936,3633.0,3633.0


## Method 4: Generalized interpolation across rows, with Linear Interpolation as an example

Forward pass

In [13]:
%sql DROP VIEW IF EXISTS forward;

We first need a function that finds the previous non-NULL value. In PostgreSQL we do this with a user-defined aggregate, `coalesce_agg`.


In [21]:
%%sql
-- Run this if you didn't already define coalesce_agg in the simple Method 4 section above.
-- State-transition function: take the new input when it is non-NULL, otherwise
-- keep carrying the previous state.
CREATE OR REPLACE FUNCTION coalesce_agg_sfunc(state anyelement, value anyelement) RETURNS anyelement AS
$$
    SELECT coalesce(value, state);
$$ LANGUAGE SQL;

-- Aggregate built on that transition function.
CREATE OR REPLACE AGGREGATE coalesce_agg(anyelement) (
    SFUNC = coalesce_agg_sfunc,
    STYPE  = anyelement);

In [23]:
%%sql
-- 1. Forward pass: number the runs, propagate the last known value, look one row ahead.
--    A "run" is a non-NULL row together with the NULL rows that follow it.
CREATE OR REPLACE VIEW forward AS
SELECT feature_id,
       elev_in_m,
       -- run: number of non-NULL elevations seen so far (count() skips NULLs)
       count(elev_in_m) OVER by_id AS run,
       -- run_start: closest non-NULL value at or before this row
       coalesce_agg(elev_in_m) OVER by_id AS run_start,
       -- nextval: on a NULL row, the elevation of the following row (may itself be NULL)
       CASE WHEN elev_in_m IS NULL
            THEN lead(elev_in_m, 1) OVER by_id
        END AS nextval
  FROM holey
WINDOW by_id AS (ORDER BY feature_id);
SELECT * FROM forward ORDER BY feature_id LIMIT 12 OFFSET 183;

feature_id,elev_in_m,run,run_start,nextval
48638,46.0,168,46.0,
49192,401.0,169,401.0,
49214,194.0,170,194.0,
49350,,170,194.0,
49578,,170,194.0,47.0
49802,47.0,171,47.0,
49925,111.0,172,111.0,
50059,71.0,173,71.0,
50309,,173,71.0,26.0
50661,26.0,174,26.0,


Backward pass

In [24]:
%sql DROP VIEW IF EXISTS backward;

In [26]:
%%sql
-- 2. Backward pass: assign run_end, run_size, run_rank
CREATE OR REPLACE VIEW backward AS
SELECT feature_id,
       elev_in_m,
       run_start,
       nextval,
       -- run_end: the row's own value if present; otherwise the non-NULL nextval
       -- found by walking the run in reverse feature_id order
       COALESCE(elev_in_m,
                coalesce_agg(nextval) OVER (PARTITION BY run ORDER BY feature_id DESC)
               ) AS run_end,
       -- run_size: number of rows in the run
       count(*) OVER (PARTITION BY run) AS run_size,
       -- run_rank: zero-based position of the row inside its run
       RANK() OVER (PARTITION BY run ORDER BY feature_id) - 1 AS run_rank
  FROM forward;
SELECT * FROM backward ORDER BY feature_id LIMIT 12 OFFSET 183;

feature_id,elev_in_m,run_start,nextval,run_end,run_size,run_rank
48638,46.0,46.0,,46.0,1,0
49192,401.0,401.0,,401.0,1,0
49214,194.0,194.0,,194.0,3,0
49350,,194.0,,47.0,3,1
49578,,194.0,47.0,47.0,3,2
49802,47.0,47.0,,47.0,1,0
49925,111.0,111.0,,111.0,1,0
50059,71.0,71.0,,71.0,2,0
50309,,71.0,26.0,26.0,2,1
50661,26.0,26.0,,26.0,1,0


Scalar function pass

In [28]:
%%sql
-- 3. Scalar pass: with all run statistics on each row, linear interpolation is
--    plain per-row arithmetic: start + position * (per-row step).
CREATE OR REPLACE VIEW final AS
SELECT *,
       run_start + run_rank * ((run_end - run_start) / run_size) AS interpolated
  FROM backward;
SELECT * FROM final ORDER BY feature_id LIMIT 12 OFFSET 183;

feature_id,elev_in_m,run_start,nextval,run_end,run_size,run_rank,interpolated
48638,46.0,46.0,,46.0,1,0,46.0
49192,401.0,401.0,,401.0,1,0,401.0
49214,194.0,194.0,,194.0,3,0,194.0
49350,,194.0,,47.0,3,1,145.0
49578,,194.0,47.0,47.0,3,2,96.0
49802,47.0,47.0,,47.0,1,0,47.0
49925,111.0,111.0,,111.0,1,0,111.0
50059,71.0,71.0,,71.0,2,0,71.0
50309,,71.0,26.0,26.0,2,1,48.5
50661,26.0,26.0,,26.0,1,0,26.0


In [29]:
%sql EXPLAIN Analyze SELECT * from final LIMIT 500;

QUERY PLAN
Limit (cost=2196.29..2267.30 rows=500 width=64) (actual time=28.649..30.215 rows=500 loops=1)
-> Subquery Scan on backward (cost=2196.29..3834.15 rows=11533 width=64) (actual time=28.648..30.183 rows=500 loops=1)
-> WindowAgg (cost=2196.29..3545.82 rows=11533 width=64) (actual time=28.645..30.120 rows=500 loops=1)
-> WindowAgg (cost=2196.29..3344.00 rows=11533 width=56) (actual time=28.639..29.832 rows=501 loops=1)
-> Incremental Sort (cost=2196.29..3142.17 rows=11533 width=48) (actual time=28.635..29.587 rows=502 loops=1)
"Sort Key: forward.run, forward.feature_id"
Presorted Key: forward.run
Full-sort Groups: 16 Sort Method: quicksort Average Memory: 27kB Peak Memory: 27kB
-> WindowAgg (cost=2192.36..2423.02 rows=11533 width=48) (actual time=28.563..29.452 rows=514 loops=1)
-> Sort (cost=2192.36..2221.19 rows=11533 width=40) (actual time=28.489..28.521 rows=515 loops=1)


---

# String distance

In [30]:
# Switch the SQL magic to the postgres database for the string-distance examples.
%reload_ext sql
%sql postgresql://localhost:5432/postgres

In [31]:
# fuzzystrmatch supplies levenshtein, soundex and the dmetaphone functions used below.
%sql CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;

In [32]:
%%sql
DROP TABLE IF EXISTS Strings;
CREATE TABLE Strings (str1 TEXT, str2 TEXT);
INSERT INTO Strings VALUES
 ('Lisa', 'List'),
 ('Lisa', 'License'),
 ('Joe', 'Noel');

In [33]:
%%sql
SELECT *,
       levenshtein(str1, str2),
       soundex(str1) as soundex1,
       soundex(str2) as soundex2,
       dmetaphone(str1) AS dmetaphone1,
       dmetaphone(str2) AS dmetaphone2,
       dmetaphone_alt(str1) AS dmetaphone_alt1,
       dmetaphone_alt(str2) AS dmetaphone_alt2
FROM Strings;

str1,str2,levenshtein,soundex1,soundex2,dmetaphone1,dmetaphone2,dmetaphone_alt1,dmetaphone_alt2
Lisa,List,1,L200,L230,LS,LST,LS,LST
Lisa,License,4,L200,L252,LS,LSNS,LS,LSNS
Joe,Noel,2,J000,N400,J,NL,A,NL
