<a href="https://colab.research.google.com/github/andrew66882011/qss20_slides_activities/blob/main/activities/05_merging_session2_activitysolutions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports 

In [3]:
!pip install recordlinkage

Collecting recordlinkage
[?25l  Downloading https://files.pythonhosted.org/packages/db/26/babbca39d74824e8bc17428a8eb04951a1d63318af7d02beeb2106a1ec26/recordlinkage-0.14-py3-none-any.whl (944kB)
[K     |████████████████████████████████| 952kB 14.0MB/s 
[?25hCollecting jellyfish>=0.5.4
[?25l  Downloading https://files.pythonhosted.org/packages/30/a6/4d039bc827a102f62ce7a7910713e38fdfd7c7a40aa39c72fb14938a1473/jellyfish-0.8.2-cp37-cp37m-manylinux2014_x86_64.whl (90kB)
[K     |████████████████████████████████| 92kB 9.1MB/s 
Installing collected packages: jellyfish, recordlinkage
Successfully installed jellyfish-0.8.2 recordlinkage-0.14


In [8]:
import pandas as pd
import re 
import numpy as np
import datetime
from datetime import datetime

## a couple recordlinkage packages
import recordlinkage

## repeated printouts
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [9]:
!pip install pyjarowinkler



In [6]:
## modules still being installed on jhub
## nltk for string distance
import nltk

## jarowinkler
from pyjarowinkler import distance

# Load raw data 

In [10]:
## general link: https://data.sandiego.gov/datasets/business-listings/

## active tax certificates
sd = pd.read_csv("https://seshat.datasd.org/ttcs/sd_businesses_active_datasd.csv")



In [11]:
## PPP loans subsetted to CA and sd zip
ppp = pd.read_csv("https://data.sba.gov/dataset/8aa276e2-6cab-4f86-aca4-a7dde42adf24/resource/6b62a44b-69ec-436a-9b95-0ea550475543/download/public_150k_plus.csv")
ppp.BorrowerState.value_counts(dropna = False)
ppp = ppp[(ppp.BorrowerState == "CA") &
                  (ppp.BorrowerZip.isin(sd.address_zip))].copy()
ppp.shape

CA     131005
TX      76303
NY      74218
FL      60418
IL      40877
PA      38584
OH      32687
NJ      32552
MI      29846
MA      27483
GA      25984
WA      23779
VA      23313
NC      22876
CO      19760
MN      19736
MD      19021
WI      17361
IN      16623
MO      16358
TN      15893
AZ      15812
LA      13692
OR      13471
CT      12900
AL      10927
SC      10766
OK       9999
UT       9306
KY       9183
IA       8305
NV       8286
KS       8204
AR       5860
NE       5807
MS       5515
NH       5080
HI       4981
NM       4491
ID       4426
DC       4412
ME       4187
RI       3794
WV       3368
ND       3221
MT       3146
DE       2996
PR       2948
SD       2697
AK       2665
VT       2269
WY       2145
GU        444
VI        290
MP         84
AS         20
NaN        13
Name: BorrowerState, dtype: int64

(17381, 51)

# Activity

- Clean the address fields in the respective data sets 
- Clean the naics code fields in the data to extract the first two digits
- Take a random sample of ~200 rows of each of the datasets (sd and ppp). Write code to fuzzy match using different variables to block on between zip code, city name, 2-digit naics sector, etc. As a warning, you may not get any matches with a random sample so may want to construct a targeted sample using a few ones you know have matches
- If you haven't already, put the steps in the recordlinkage process into a function
- **Challenge exercise**: make the function general enough so that it can take in multiple string arguments to potentially fuzzy match on



## Step 1 and 2- preprocess join fields

### Cleaning ppp address cols

In [12]:
## view random sample of ppp rows
ppp_addcols = ["BorrowerAddress", "BorrowerCity", "BorrowerState", "BorrowerZip"]
ppp[ppp_addcols].sample(n = 10, random_state = 666)

## see that (1) 5-digit versus 9 dig zip code
## and (2) variable capitalization

## first, strip zip code to be first 5 dig
ppp['BorrowerZip'] = ppp.BorrowerZip.str.replace("\-.*", "", regex = True)

## first, convert those cols to upper
## using apply to do less manually
ppp[ppp_addcols] = ppp[ppp_addcols].apply(lambda x: x.astype(str).str.upper())
ppp[ppp_addcols].sample(n = 10, random_state = 666)


Unnamed: 0,BorrowerAddress,BorrowerCity,BorrowerState,BorrowerZip
69950,9346 Abraham Way,Santee,CA,92071-2861
43424,14251 Danielson St,Poway,CA,92064-8818
119758,950 eastlake parkway,CHULA VISTA,CA,91914-3558
50954,10201 wateridge circle,san diego,CA,92121-5800
127737,2128 Missouri St,San Diego,CA,92109-3627
141502,7389 Jackson Dr,San Diego,CA,92119-2316
132173,3506 Breakwater Court,HAYWARD,CA,94545
64407,28300 Constellation Rd,Santa Clarita,CA,91355
98925,6701 Koll Center Pkwy Ste 250,Pleasanton,CA,94566-8062
59291,"199 Fremont St., Ste 2100",San Francisco,CA,94105


Unnamed: 0,BorrowerAddress,BorrowerCity,BorrowerState,BorrowerZip
69950,9346 ABRAHAM WAY,SANTEE,CA,92071
43424,14251 DANIELSON ST,POWAY,CA,92064
119758,950 EASTLAKE PARKWAY,CHULA VISTA,CA,91914
50954,10201 WATERIDGE CIRCLE,SAN DIEGO,CA,92121
127737,2128 MISSOURI ST,SAN DIEGO,CA,92109
141502,7389 JACKSON DR,SAN DIEGO,CA,92119
132173,3506 BREAKWATER COURT,HAYWARD,CA,94545
64407,28300 CONSTELLATION RD,SANTA CLARITA,CA,91355
98925,6701 KOLL CENTER PKWY STE 250,PLEASANTON,CA,94566
59291,"199 FREMONT ST., STE 2100",SAN FRANCISCO,CA,94105


### Cleaning sd address cols

In [13]:
sd_address = ['address_no', 'address_pd', 'address_road', 'address_sfx', 'address_city', 
             'address_state', 'address_zip']

sd[sd_address].head()

## truncate zip
sd['BorrowerZip'] = sd.address_zip.str.replace("\-.*", "", regex = True)

## replace NA for those with ""
sd[sd_address] = sd[sd_address].fillna("")

## do more manual truncation since no, pd, road, and sfx are
## separated by whitespace; others are comma
sd['address_merged'] = sd[['address_no', 'address_pd', 'address_road', 
                               'address_sfx']].agg(' '.join, axis=1) 

sd[sd_address + ['address_merged']].head()
                            
## leaving city, state, and zip separate

Unnamed: 0,address_no,address_pd,address_road,address_sfx,address_city,address_state,address_zip
0,9655,,GRANITE RIDGE,DR,SAN DIEGO,CA,92123-2697
1,1168,,FESLER,ST,EL CAJON,CA,92020-1812
2,8666,,COMMERCE,AVE,SAN DIEGO,CA,92121-2613
3,10101,,OLD GROVE,RD,SAN DIEGO,CA,92131-1650
4,1,,VISION,WAY,BLOOMFIELD,CT,06002-5321


Unnamed: 0,address_no,address_pd,address_road,address_sfx,address_city,address_state,address_zip,address_merged
0,9655,,GRANITE RIDGE,DR,SAN DIEGO,CA,92123-2697,9655 GRANITE RIDGE DR
1,1168,,FESLER,ST,EL CAJON,CA,92020-1812,1168 FESLER ST
2,8666,,COMMERCE,AVE,SAN DIEGO,CA,92121-2613,8666 COMMERCE AVE
3,10101,,OLD GROVE,RD,SAN DIEGO,CA,92131-1650,10101 OLD GROVE RD
4,1,,VISION,WAY,BLOOMFIELD,CT,06002-5321,1 VISION WAY


### Create 2-dig naics codes

In [14]:
## already exists for sd
sd[[col for col in sd.columns if "naics" in col]]
print('NAICS 2-dig in SD are:-----------------')
sd.naics_sector.value_counts(dropna = False)

## for ppp- truncate to first 2dig
ppp['naics_sector'] = ppp.NAICSCode.astype(str).str[0:2]
print('NAICS 2-dig in PPP are:-----------------')
ppp.naics_sector.value_counts(dropna = False)

Unnamed: 0,naics_sector,naics_code,naics_description
0,44,442,FURNITURE & HOME FURNISHINGS STORES
1,23,23511,"PLUMBING, HEATING & AC CONTRACTOR"
2,42,42199,OTHER MISCELLANEOUS DURABLE GOODS WHSLE
3,42,42199,OTHER MISCELLANEOUS DURABLE GOODS WHSLE
4,45,4539,OTHER MISCELLANEOUS STORE RETAILERS
...,...,...,...
63588,45,454,NONSTORE RETAILERS
63589,48,4841,GENERAL FREIGHT TRUCKING
63590,54,54161,MANAGEMENT CONSULTING SERVICES
63591,52,52421,INSURANCE AGENCIES & BROKERAGES


NAICS 2-dig in SD are:-----------------


54    13175
81     9403
56     4905
45     4868
62     4840
23     4741
44     3617
72     3576
53     3172
42     2001
48     1908
61     1766
71     1424
52     1202
33      835
51      801
32      323
49      309
31      305
55      229
11      114
22       65
92       12
21        2
Name: naics_sector, dtype: int64

NAICS 2-dig in PPP are:-----------------


23    2710
54    2680
72    2042
62    1329
33    1300
42    1009
56     976
81     816
44     597
53     469
51     415
45     414
48     392
32     383
na     361
52     336
61     268
99     208
31     206
71     202
11      94
49      43
22      42
92      41
55      28
21      20
Name: naics_sector, dtype: int64

## Clean business name by converting to upper and removing some puncutation

Here, i'm stripping out the , and . but keeping & since it may be relevant

In [15]:
print("Before cleaning names:-----------------")
sd.dba_name.sample(10, random_state = 4)
ppp.BorrowerName.sample(10, random_state = 4)

sd['clean_name'] = [re.sub(r"\,|\.", "", name) for name in sd.dba_name]
ppp['clean_name'] = [re.sub(r"\,|\.", "", name) for name in ppp.BorrowerName]

print("After cleaning names:-----------------")
sd.clean_name.sample(10, random_state = 4)
ppp.clean_name.sample(10, random_state = 4)

Before cleaning names:-----------------


10219    COASTAL PACIFIC FUNDING CORPORATION
1457                           LSW ENGINEERS
22544            SMITH  RAFTERY  CONSULTANTS
8471             SAN DIEGO ASSOC OF REALTORS
1660                          SCHMIDT DESIGN
26545                             LIGHTHOUSE
12157             MITSUBISHI ELECTRIC US INC
1575        LAW OFFICES OF CHARLES WOLFINGER
35862                   METAL METHOD COMPANY
13446        ATS ADVANCED TRUSTEE STRATEGIES
Name: dba_name, dtype: object

145519                     THE RECESSROOM INCORPORATED
84738                   FINAL CLEANING SOLUTIONS, INC.
79834           DOREL GABORAS DBA TODAY'S CONSTRUCTION
38354                        INVESTMENT CONCEPTS, INC.
57542                    PACIFIC 9 TRANSPORTATION, INC
104945    PACIFIC ENVIRNMENTAL &AMP; ABATEMENT SOLUTIO
103451                           INFOCAP NETWORKS, LLC
114808                TOTAL CARE FAMILY MEDICAL CENTER
50150                                      AFFLUENT HR
94652                        INDUSTRIAL PLASTIC SUPPLY
Name: BorrowerName, dtype: object

After cleaning names:-----------------


10219    COASTAL PACIFIC FUNDING CORPORATION
1457                           LSW ENGINEERS
22544            SMITH  RAFTERY  CONSULTANTS
8471             SAN DIEGO ASSOC OF REALTORS
1660                          SCHMIDT DESIGN
26545                             LIGHTHOUSE
12157             MITSUBISHI ELECTRIC US INC
1575        LAW OFFICES OF CHARLES WOLFINGER
35862                   METAL METHOD COMPANY
13446        ATS ADVANCED TRUSTEE STRATEGIES
Name: clean_name, dtype: object

145519                     THE RECESSROOM INCORPORATED
84738                     FINAL CLEANING SOLUTIONS INC
79834           DOREL GABORAS DBA TODAY'S CONSTRUCTION
38354                          INVESTMENT CONCEPTS INC
57542                     PACIFIC 9 TRANSPORTATION INC
104945    PACIFIC ENVIRNMENTAL &AMP; ABATEMENT SOLUTIO
103451                            INFOCAP NETWORKS LLC
114808                TOTAL CARE FAMILY MEDICAL CENTER
50150                                      AFFLUENT HR
94652                        INDUSTRIAL PLASTIC SUPPLY
Name: clean_name, dtype: object

## Step 2 - randomly sample rows and try fuzzy matching

In [16]:
## shift to larger sample out of SD since that's the pool
## we're searching for PPP in
ppp_samp = ppp.sample(n = 200, random_state = 566)
sd_samp = sd.sample(n = 20000, random_state = 566)

## here, i'm blocking on zip code and fuzzily matching on 
## business name and street name  

### step 1- init recordlinker
link_ppp_sd = recordlinkage.Index()

### step 2 - tell it what to block on 
link_ppp_sd.block("BorrowerZip")

### step 3- compute candidate links
candidate_links_zipcode = link_ppp_sd.index(sd_samp, ppp_samp)
candidate_links_zipcode

## step 3 under the hood - pull out example
## see that they have the same zip but unlikely to be matches
sd_samp.loc[sd_samp.index == 7201,
           ["clean_name", "BorrowerZip"]]
ppp_samp.loc[ppp_samp.index.isin([137429, 84332]),
           ["clean_name", "BorrowerZip"]]

### step 4- create compare class and add fuzzy strings
compare = recordlinkage.Compare()
compare.string('clean_name', 'clean_name', method='jarowinkler', threshold=0.7)
compare.string('address_merged', 'BorrowerAddress', method = "jarowinkler", threshold = 0.9)



<Index>

MultiIndex([(10601,  60327),
            (10601,  53058),
            (10601,  46158),
            (10601,  45518),
            (10601,  52265),
            (48595,  60327),
            (48595,  53058),
            (48595,  46158),
            (48595,  45518),
            (48595,  52265),
            ...
            (19897,  37101),
            (18710, 151546),
            (22534, 162330),
            (23028,  53287),
            (40265,  53287),
            (52357,  41790),
            (31892,  41790),
            (11635, 125712),
            ( 5840, 125712),
            (58497,  39770)],
           length=44621)

Unnamed: 0,clean_name,BorrowerZip
7201,MILTONS DELICATESSEN GRILL & BAKERY,92014


Unnamed: 0,clean_name,BorrowerZip


<Compare>

<Compare>

In [17]:
## step 5- feed the candidate links to the compare class and compute
compare_vectors = compare.compute(candidate_links_zipcode, sd_samp, ppp_samp)
compare_vectors

## step 5 under the hood - pull out example
## of biz with matching (non-zero) address col
## and name col
compare_vectors.columns = ["name", "address"]
compare_vectors[(compare_vectors.name == 1) &
               (compare_vectors.address == 1)]


Unnamed: 0,Unnamed: 1,0,1
10601,60327,0.0,0.0
10601,53058,0.0,0.0
10601,46158,0.0,0.0
10601,45518,0.0,0.0
10601,52265,0.0,0.0
...,...,...,...
52357,41790,1.0,1.0
31892,41790,0.0,0.0
11635,125712,0.0,0.0
5840,125712,0.0,0.0


Unnamed: 0,Unnamed: 1,name,address
5106,145118,1.0,1.0
48857,145118,1.0,1.0
25503,149344,1.0,1.0
28646,88146,1.0,1.0
1699,97872,1.0,1.0
24478,41407,1.0,1.0
52357,41790,1.0,1.0


In [18]:
### step 5 under the hood - example of similar-enough name
example_id = 14769
sd_samp.loc[sd_samp.index == example_id,
           ["clean_name", "BorrowerZip", "address_merged"]]
ppp_samp.loc[ppp_samp.index.isin([68600]),
           ["clean_name", "BorrowerZip", "BorrowerAddress"]]

## yay looks like a true match!

## step 6 -- algorithm to aggregate across fields
## example code has k-means
## here, we're using e-m alg
## and transforming it to a df
## sd_index is on left 
ecm = recordlinkage.ECMClassifier()
predicted_matches_ecm = pd.DataFrame(list(ecm.fit_predict(compare_vectors)),
                                     columns = ['sd_index', 'ppp_index'])
print("Matches returned are:------------")
predicted_matches_ecm


Unnamed: 0,clean_name,BorrowerZip,address_merged
14769,AQUA CLEAR WATER TREATMENT SPECIALISTS INC,92117,3952D CLMNT MSA BLVD


Unnamed: 0,clean_name,BorrowerZip,BorrowerAddress


Matches returned are:------------


Unnamed: 0,sd_index,ppp_index
0,5106,145118
1,48857,145118
2,25503,149344
3,28646,88146
4,1699,97872
5,24478,41407
6,52357,41790


In [19]:
## step 7 - merge the two to compare
### merge to sd data
sd_samp['sd_index'] = sd_samp.index
m1_addsd = pd.merge(predicted_matches_ecm,
                   sd_samp[['clean_name', 'sd_index', 'address_merged', "naics_sector"]],
                   on = "sd_index",
                   how = "inner")
ppp_samp['ppp_index'] = ppp_samp.index
m2_addp = pd.merge(m1_addsd, 
                  ppp_samp[['clean_name', 'ppp_index', 'BorrowerAddress', 'naics_sector']],
                  on = "ppp_index",
                  how = "inner",
                  suffixes = ["_SDtaxdata", "_PPPloandata"])

## see some true matches and some false matches (e.g., adaptive launch solutions
## is a false match; habitat restoration sciences is a false match)
## would then want to adjust string threshold
## or possibly add naics code as field
m2_addp

Unnamed: 0,sd_index,ppp_index,clean_name_SDtaxdata,address_merged,naics_sector_SDtaxdata,clean_name_PPPloandata,BorrowerAddress,naics_sector_PPPloandata
0,5106,145118,ROBERT HOWES,1285 UNIVERSITY AVE,81,SCOOTER BREW INC,1458 UNIVERSITY AVE,72
1,48857,145118,SACRED WOMB MEDICINE,1286 UNIVERSITY AVE,45,SCOOTER BREW INC,1458 UNIVERSITY AVE,72
2,25503,149344,CHALLENGE ELECTRIC CORP,285 VERNON WAY,23,CHALLENGE ELECTRIC CORP,285 VERNON WAY,23
3,28646,88146,GEARY FLOORS INC,349 S MARSHALL AVE,23,GEARY FLOORS INC,349 S MARSHALL AVE,23
4,1699,97872,C & M MOTORS INC,904 ROOSEVELT AVE,81,C&M MOTORS INC,904 ROOSEVELT AVE,44
5,24478,41407,SPECTRA COMPANY INC,2510 SUPPLY ST,23,SPECTRA COMPANY,2510 SUPPLY ST,23
6,52357,41790,HAL HAYS CONSTRUCTION INC,4181 LATHAM ST,23,HAL HAYS CONSTRUCTION INC,4181 LATHAM ST,23


## Example of more manual versus more automatic way of concatenating address columns


In [21]:

## more manual way
ppp['address_merged_manual'] = ppp.BorrowerAddress + ", " + ppp.BorrowerCity + ", " + ppp.BorrowerState + ", " + ppp.BorrowerZip
ppp[ppp_addcols + ['address_merged_manual']].head()

### more automatic using agg and join
ppp['address_merged_auto'] = ppp[ppp_addcols].agg(', '.join, axis=1)
ppp[["address_merged_manual", "address_merged_auto"]].head()

Unnamed: 0,BorrowerAddress,BorrowerCity,BorrowerState,BorrowerZip,address_merged_manual
35298,431 ISIS AVE,INGLEWOOD,CA,90301,"431 ISIS AVE, INGLEWOOD, CA, 90301"
35307,9242 LIGHTWAVE AVE STE 100,SAN DIEGO,CA,92123,"9242 LIGHTWAVE AVE STE 100, SAN DIEGO, CA, 92123"
35320,26460 CORPORATE AVE SUITE 250,HAYWARD,CA,94545,"26460 CORPORATE AVE SUITE 250, HAYWARD, CA, 94545"
35321,"6400 OAK CANYON DRIVE, #200",IRVINE,CA,92618,"6400 OAK CANYON DRIVE, #200, IRVINE, CA, 92618"
35323,3708 RUFFIN RD,SAN DIEGO,CA,92123,"3708 RUFFIN RD, SAN DIEGO, CA, 92123"


Unnamed: 0,address_merged_manual,address_merged_auto
35298,"431 ISIS AVE, INGLEWOOD, CA, 90301","431 ISIS AVE, INGLEWOOD, CA, 90301"
35307,"9242 LIGHTWAVE AVE STE 100, SAN DIEGO, CA, 92123","9242 LIGHTWAVE AVE STE 100, SAN DIEGO, CA, 92123"
35320,"26460 CORPORATE AVE SUITE 250, HAYWARD, CA, 94545","26460 CORPORATE AVE SUITE 250, HAYWARD, CA, 94545"
35321,"6400 OAK CANYON DRIVE, #200, IRVINE, CA, 92618","6400 OAK CANYON DRIVE, #200, IRVINE, CA, 92618"
35323,"3708 RUFFIN RD, SAN DIEGO, CA, 92123","3708 RUFFIN RD, SAN DIEGO, CA, 92123"
