# Lecture 16: Imputation and Entity Resolution

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl

import importlib
importlib.reload(mpl); importlib.reload(plt); importlib.reload(sns)

sns.reset_orig()
sns.set(font_scale=1.5)
%matplotlib inline

# GNIS data

This is the GNIS dataset from the previous lecture. If you have not already loaded the database, run the cell below before connecting.

In [None]:
!psql -h localhost -d gnis -c 'SELECT pg_terminate_backend(pg_stat_activity.pid) FROM pg_stat_activity WHERE datname = current_database() AND pid <> pg_backend_pid();'
!psql -h localhost -c 'DROP DATABASE IF EXISTS gnis'
!psql -h localhost -c 'CREATE DATABASE gnis' 
!psql -h localhost -d gnis -f ../lec13/data/gnis.sql

In [None]:
# Load (or reload) the SQL magic extension, then connect to the local gnis database.
%reload_ext sql
%sql postgresql://localhost:5432/gnis

## Make a "holey" dataset

In [None]:
# Seed PostgreSQL's random() for this session; the next cell uses random() to decide
# which elevations to blank out.
%sql SELECT setseed(0.12345);

In [None]:
%%sql
-- Build "holey": selected columns of national, with the elevation replaced by NULL
-- in roughly one row out of ten (whenever the random draw exceeds 0.9).
DROP TABLE IF EXISTS holey CASCADE;
CREATE TABLE holey AS
SELECT feature_id,
       feature_name,
       feature_class,
       state_alpha,
       county_name,
       prim_lat_dec,
       prim_long_dec,
       -- keep the elevation when the draw is <= 0.9; with no ELSE branch the CASE yields NULL
       CASE WHEN random() <= 0.9 THEN elev_in_m END AS elev_in_m
  FROM national;
-- Fraction of rows that still have a non-NULL elevation
SELECT count(elev_in_m)::float / count(*) FROM holey;

In [None]:
%%sql
-- Peek at a random sample: BERNOULLI takes a percentage, so each row is kept with probability 0.1%.
SELECT *
  FROM holey TABLESAMPLE BERNOULLI (0.1);

## Scheme 1. Default Value Imputation with SQL

In [None]:
%%sql
-- Fill every missing elevation with a single default: the mean of the known elevations
-- (avg ignores NULL inputs).
WITH mean_elev (avg_elev) AS (
    SELECT avg(elev_in_m) FROM holey
)
SELECT h.*,
       COALESCE(h.elev_in_m, m.avg_elev) AS imputed_elev_in_m
  FROM holey AS h
 CROSS JOIN mean_elev AS m
 LIMIT 10;

## Scheme 2. Correlation across columns

In [None]:
# Training is an aggregate function
# Here we'll train the model in SQL just for fun
# regr_slope / regr_intercept take (Y, X): this fits elev_in_m as a linear function of
# prim_long_dec. Rows where either input is NULL are skipped by these aggregates.
result = %sql SELECT regr_slope(elev_in_m, prim_long_dec), \
               regr_intercept(elev_in_m, prim_long_dec) FROM holey
# The aggregate query yields a single row; unpack it into the two coefficients.
slope, intercept = result[0]
# Display the fitted coefficients as the cell output.
slope, intercept

In [None]:
%%sql
-- Rows with prim_long_dec >= -122, in ascending prim_long_dec order, skipping the first 40.
-- There is no LIMIT, so how many rows are shown is governed by the SQL magic's display limit.
SELECT prim_long_dec,
       elev_in_m
  FROM holey
 WHERE prim_long_dec >= -122
 ORDER BY prim_long_dec
OFFSET 40;

In [None]:
%%sql
-- Impute missing elevations from the fitted line. The slope and intercept placeholders
-- are substituted from the Python variables of the same names before the query runs.
SELECT *,
       COALESCE(elev_in_m,
                {{slope}}*prim_long_dec + {{intercept}}) AS imputed_elev_in_m
  FROM holey
 LIMIT 10;

## Scheme 3. General model-based interpolation

We won't show the demo; check slides for the general idea.

In [None]:
# Reload the SQL magic, reconnect to gnis, and allow result tables to display up to 100 rows.
%reload_ext sql
%sql postgresql://localhost:5432/gnis
%config SqlMagic.displaylimit = 100

## Scheme 4. [simple] Correlation across ordered rows

In [None]:
%%sql
-- The following does not work in PostgreSQL: the query runs, but it does not do what we want.
-- lag(elev_in_m, 1) only looks at the immediately preceding row (in feature_id order),
-- so when that row is also NULL the imputed value is still NULL.
WITH buggy AS (
SELECT *,
       CASE WHEN elev_in_m IS NOT NULL THEN elev_in_m
            ELSE lag(elev_in_m, 1)
                 OVER (ORDER BY feature_id)
        END AS imputed_elev_in_m
  FROM holey
)
-- Show a 12-row window of the result, starting after the first 183 rows.
SELECT feature_id, elev_in_m, imputed_elev_in_m FROM buggy ORDER BY feature_id ASC LIMIT 12 OFFSET 183;

In [None]:
%%sql
-- Here's a UDA fix from
-- https://stackoverflow.com/questions/18987791/how-do-i-efficiently-select-the-previous-non-null-value
-- State-transition function: keep the incoming value when it is non-NULL,
-- otherwise carry the previous state forward.
CREATE OR REPLACE FUNCTION coalesce_agg_sfunc(state anyelement, value anyelement) RETURNS anyelement AS
$$
    SELECT coalesce(value, state);
$$ LANGUAGE SQL;

-- Aggregate built on that function. No initial condition is declared, so the state starts
-- out NULL; the result is the most recent non-NULL input seen so far (NULL if there is none).
CREATE OR REPLACE AGGREGATE coalesce_agg(anyelement) (
    SFUNC = coalesce_agg_sfunc,
    STYPE  = anyelement);

In [None]:
%%sql
-- Fixed to handle repeated NULLs: as a window aggregate ordered by feature_id,
-- coalesce_agg returns the latest non-NULL elevation at or before each row.
WITH filled AS (
    SELECT *,
           coalesce_agg(elev_in_m) OVER (ORDER BY feature_id) AS imputed_elev_in_m
      FROM holey
)
SELECT feature_id, elev_in_m, imputed_elev_in_m
  FROM filled
 ORDER BY feature_id ASC
 LIMIT 12 OFFSET 183;

## Generalized interpolation across rows

Forward pass

In [None]:
%%sql
-- 1. Forward assign run numbers to rows, propagate val, get nextval
--    All window functions here scan rows in ascending feature_id order.
CREATE OR REPLACE VIEW forward AS
SELECT *,
       -- run: running count of non-NULL elevations so far. It increases by one at every
       -- non-NULL row, so a non-NULL row and the NULL rows that follow it share one run
       -- number (NULL rows before the first non-NULL elevation get run 0).
       SUM(CASE WHEN elev_in_m IS NULL THEN 0 ELSE 1 END) 
         OVER (ORDER BY feature_id) AS run,
       -- run_start: the most recent non-NULL elevation at or before this row.
       coalesce_agg(elev_in_m) OVER (ORDER BY feature_id) AS run_start,
       -- nextval: only computed for NULL rows; the elevation of the very next row,
       -- which is itself NULL unless this is the last NULL row of its gap.
       CASE WHEN elev_in_m IS NULL 
              THEN lead(elev_in_m, 1) OVER (ORDER BY feature_id)
            ELSE NULL
             END AS nextval
  FROM holey;
SELECT * FROM forward ORDER BY feature_id ASC LIMIT 12 OFFSET 183;

Backward pass

In [None]:
%%sql
-- 2. Backward: assign run_end, run_size, run_rank
CREATE OR REPLACE VIEW backward AS
SELECT *,
       -- run_end: a non-NULL row keeps its own value; a NULL row takes the nearest non-NULL
       -- nextval found while scanning its run in descending feature_id order, i.e. the
       -- first known elevation after the gap (NULL if the gap has none).
       CASE WHEN elev_in_m IS NOT NULL THEN elev_in_m
            ELSE coalesce_agg(nextval) OVER (PARTITION BY run ORDER BY feature_id DESC)
        END AS run_end,
       -- run_size: number of rows sharing this run number.
       count(*) OVER (PARTITION BY run) AS run_size,
       -- run_rank: 0-based position of the row within its run (0 for the run's first row).
       -1 + (RANK() OVER (PARTITION BY run ORDER BY feature_id)) AS run_rank
  FROM forward;
SELECT * FROM backward ORDER BY feature_id ASC LIMIT 12 OFFSET 183;

Scalar function pass

In [None]:
%%sql
-- 3. Simple scalar pass
-- Linear interpolation across each run: step from run_start towards run_end in
-- run_size equal increments, one increment per row position.
-- run_rank is already 0-based in the backward view (0 on the run's first row), so it is
-- used directly as the multiplier; subtracting 1 again would give the first NULL row of a
-- gap exactly run_start and shift every later row one step too low.
-- For non-NULL rows run_end equals run_start, so they keep their own value.
-- Rows with a NULL run_start or run_end (no known value on one side) stay NULL.
-- NOTE(review): if elev_in_m is an integer type, the division below truncates -- confirm
-- the column type and cast to a floating-point type if fractional steps are wanted.
CREATE OR REPLACE VIEW final AS
SELECT *, 
       run_start + run_rank*((run_end-run_start)/(run_size))
         AS interpolated
  FROM backward;
SELECT * FROM final  ORDER BY feature_id ASC LIMIT 12 OFFSET 183;

In [None]:
%sql EXPLAIN Analyze SELECT * from final LIMIT 500;

# String distance

In [None]:
# Reload the SQL magic and switch the connection to the local postgres database.
%reload_ext sql
%sql postgresql://localhost:5432/postgres

In [None]:
# fuzzystrmatch supplies the levenshtein, soundex and dmetaphone functions; install it once per database.
%sql CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;

In [None]:
%%sql
-- Small table of string pairs to compare
DROP TABLE IF EXISTS Strings;
CREATE TABLE Strings (
    str1 TEXT,
    str2 TEXT
);
INSERT INTO Strings (str1, str2)
VALUES ('Lisa', 'List'),
       ('Lisa', 'License'),
       ('Joe', 'Noel');

In [None]:
%%sql
-- For each pair: the Levenshtein edit distance between the two strings,
-- followed by the Soundex and Double Metaphone (primary and alternate) codes of each string.
SELECT *,
       levenshtein(str1, str2),
       soundex(str1)        AS soundex1,
       soundex(str2)        AS soundex2,
       dmetaphone(str1)     AS dmetaphone1,
       dmetaphone(str2)     AS dmetaphone2,
       dmetaphone_alt(str1) AS dmetaphone_alt1,
       dmetaphone_alt(str2) AS dmetaphone_alt2
  FROM Strings;