In [2]:
import sys
import os
# Put the parent of the current working directory on the import path so the
# project-local `src` package can be imported from this notebook.
project_root = os.path.abspath(os.pardir)
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

from src.mapping import CompanyMap, fuzzy_join
import pandas as pd


### Example 1: Fuzzy matching individual contractors across datasets

This will be the most common use case for the contract profile.

* 1. Analyst selects a contractor from the list.
* 2. One CompanyMap object per target dataset is used to return potential matches for that contractor from each dataset.
* 3. Profile site is populated with data from the matches, with clear sourcing highlighted to the analyst so they can make the final determination of whether the contractors match.

#### Import contractor registry data (dataset 1)

In [3]:
# Load the contractor registry certificate export (dataset 1) and preview it.
REGISTRY_CSV = '../Contractor_Registry_certificate_20250215.csv'
reg = pd.read_csv(REGISTRY_CSV)
reg

Unnamed: 0,Certificate Number,Business Name,DBA Name,Business Type,Business is MWBE Owned,Business is Publicly Traded,Business Officers,Address,Address 2,City,...,Debarment Start Date,Debarment End Date,Business has final determination for violation of Labor or Tax Law,Business has final determination safety standard violations,Business is associated with an apprenticeship program,Business is sponsor of a program,Business is signatory to a group program,Business has Workers Compensation Insurance,Business is exempt from Workers Compensation Insurance,Georeference
0,24-639SQ-CR,10100 Inc.,"10100, Inc.",Corporation,No,No,,955 West River Road,,Grand Island,...,,,No,No,No,No,No,Yes,No,POINT (-78.99049 42.97805)
1,24-63ZE1-CR,"1068 Curry Road, Inc.",city glass company,Corporation,No,No,,1068 curry rd,,"schenectady, ny 12306",...,,,No,No,No,No,No,Yes,No,POINT (-73.969 42.78614)
2,25-64TYL-CR,11400 LLC,CLARK FOOD SERVICE EQUIPMENT,LLC,No,No,,2551 HORSESHOE RD,,LANCASTER,...,,,No,No,No,No,No,Yes,No,POINT (-76.206 40.07246)
3,25-6414J-CR,"1349 Main, Gorenflo's Buffalo Wholesale Lock C...",Gorenflo's Buffalo Wholesale Lock,Corporation,No,No,,1349 Main Street,,Buffalo,...,,,No,No,No,No,No,Yes,No,POINT (-78.86594 42.91033)
4,25-64V75-CR,1895 ELECTRIC LLC,,LLC,No,No,,60 SCHOL ST#114,,ORCHARD PARK,...,,,No,No,No,No,No,Yes,No,POINT (-78.74632 42.76559)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7012,24-64EIY-CR,Zoria Housing LLC,Zoria Housing LLC,LLC,Yes,No,,87-28 130th street,,JAMAICA,...,,,No,No,Yes,No,Yes,Yes,No,POINT (-73.82193 40.70079)
7013,24-64FPI-CR,"Zorn Industries, Inc.",,Corporation,No,No,,11 SETTLERS WAY,,SETAUKET,...,,,No,No,No,No,No,Yes,No,POINT (-73.12543 40.93814)
7014,24-64RJL-CR,"Zuech's Environmental Services, Inc.",,Corporation,No,No,,PO Box 108,,Franklinville,...,,,No,No,No,No,No,Yes,No,
7015,24-6358H-CR,Zuke's Excavating LLC,,LLC,No,No,,65 Sanitaria Springs Road,,Binghamton,...,,,No,No,No,No,No,Yes,No,POINT (-75.74882 42.14368)


Rename the name columns for consistency and build a single `ADDRESS` column, replacing missing address components with empty strings

In [4]:
# Standardise the registry's name columns to the NAME / DBA labels.
reg = reg.rename(columns={'Business Name': 'NAME', 'DBA Name': 'DBA'})

# Combine the address components into one space-separated ADDRESS string.
# Missing components are replaced with "" first so the concatenation never
# produces NaN for a row.
reg_address_parts = reg[['Address', 'Address 2', 'City', 'State', 'Zip Code']].fillna("")
reg['ADDRESS'] = (
    reg_address_parts['Address']
    + " " + reg_address_parts['Address 2']
    + " " + reg_address_parts['City']
    + " " + reg_address_parts['State']
    + " " + reg_address_parts['Zip Code']
)

#### Import apprentice signatories (dataset 2)

In [5]:
# Load the apprentice signatories table (dataset 2).
SIGNATORIES_CSV = '../data/raw/apprentice_signatories.csv'
sig = pd.read_csv(SIGNATORIES_CSV)

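Rename the name column, build the `ADDRESS` column (replacing missing address components with empty strings), and add an empty `DBA` column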

In [6]:
# Reshape the signatories frame so it exposes NAME, ADDRESS and DBA columns.
sig = (
    sig
    .rename(columns={'Signatory Name': 'NAME'})
    .assign(
        # One space-separated address string; missing components become ""
        # before concatenation so no row ends up as NaN.
        ADDRESS=lambda frame: (
            frame['Signatory Address'].fillna("")
            + " " + frame['City'].fillna("")
            + " " + frame['State'].fillna("")
            + " " + frame['Zip Code'].fillna("")
        ),
        # Every row gets an empty-string DBA.
        DBA='',
    )
)

Instantiate CompanyMap with Registry target data

In [12]:
# Build the matcher with the registry as its target data, telling CompanyMap
# which registry columns hold the business names and which holds the address.
registry_matcher_kwargs = dict(
    target_data=reg,
    name_cols=['NAME', 'DBA'],
    addr_col='ADDRESS',
)
c = CompanyMap(**registry_matcher_kwargs)

Pick an example from the signatories data

In [29]:
# Draw one signatory to use as the worked example. The seed is fixed so that
# "Restart Kernel -> Run All" always selects the same row and the match shown
# below is reproducible; change or remove random_state to try other rows.
ex = sig.sample(1, random_state=42)
ex

Unnamed: 0,Sponsor,Trade,NAME,Signatory Address,City,State,Zip Code,Date Signatory Added,Source,ADDRESS,DBA
34243,Operating Engineers LU #137 Appr. Skills,Optg. Engineer (Universal Equipment),CONSTRUCTION RESOURCES CORP OF NEW YORK,779 GORNIK DRIVE,PERTH AMBOY,NJ,8861,1/1/2022,Group Signatory FOIL NYSDOL Oct 2024,779 GORNIK DRIVE PERTH AMBOY NJ 08861,


In [30]:
# Pull the name fields and the address out of the one-row example frame, then
# ask the registry matcher for candidate matches.
example_names = ex[['NAME', 'DBA']].to_numpy()[0]
example_address = ex['ADDRESS'].to_numpy()[0]
c.get_match_df(names=example_names, address=example_address)

Unnamed: 0,Certificate Number,NAME,DBA,Business Type,Business is MWBE Owned,Business is Publicly Traded,Business Officers,Address,Address 2,City,...,Business is associated with an apprenticeship program,Business is sponsor of a program,Business is signatory to a group program,Business has Workers Compensation Insurance,Business is exempt from Workers Compensation Insurance,Georeference,ADDRESS,norm_NAME,norm_DBA,norm_ADDRESS
1549,25-64OZV-CR,Construction Resources Corp. of New York,,Corporation,Yes,No,,779 Gornik Drive,,Perth Amboy,...,Yes,No,Yes,Yes,No,POINT (-74.28631 40.53765),779 Gornik Drive Perth Amboy NJ 08861,construction resources corp of new york,,779 gornik drive perth amboy nj 08861


### Example 2: Fuzzy joining two full datasets

Finding matches across two full datasets can be time-consuming. In this example, we join the NYSDOL debarment list to the contractor registry (construction companies only); the join is fast because the debarment list is only a few hundred records.

#### Import debarred contractor data (dataset 2)

In [31]:
# Load the processed NYSDOL debarment list (dataset 2 for this example) and preview it.
DEBARMENT_CSV = '../data/processed/NYSDOL_debarment_02_19_2025.csv'
debar = pd.read_csv(DEBARMENT_CSV)
debar

Unnamed: 0,ID,AGENCY,FISCAL_OFFICER,FEIN,EMPLOYER_NAME,EMPLOYER_DBA,ADDRESS,DEBAR_START,DEBAR_END
0,1.0,DOL,DOL,*****5754,"0369 CONTRACTORS, LLC",,515 WEST AVE UNIT PH 13NORWALK CT 06850,05/12/2021,05/12/2026
1,2.0,DOL,DOL,*****5784,"A.J.M. TRUCKING, INC.",,PO BOX 2064 MONROE NY 10950,02/12/2024,02/12/2029
2,3.0,DOL,DOL,,AKHLAQ OULAKH,,4307 28TH AVE ASTORIA NY 11103,10/11/2024,10/11/2029
3,4.0,DOL,NYC,,"ALL COUNTY SEWER & DRAIN, INC.",,7 GREENFIELD DR WARWICK NY 10990,03/25/2022,03/25/2027
4,5.0,DOL,DOL,*****8387,"AMERICAN PAVING & MASONRY, CORP.",,8 FOREST AVE GLEN COVE NY 11542,05/24/2024,05/24/2029
...,...,...,...,...,...,...,...,...,...
202,204.0,DOL,DOL,,PAULINE CHAHALES,,935 S LAKE BLVD MAHOPAC NY 10541,05/17/2021,05/17/2026
203,205.0,DOL,DOL,*****9060,"PEC GROUP OF N.Y., INC.",,935 S LAKE BLVD SUITE 7MAHOPAC NY 10541,05/17/2021,05/17/2026
204,206.0,DOL,DOL,*****9060,"PEC GROUP OF N.Y., INC.",,935 S LAKE BLVD SUITE 7MAHOPAC NY 10541,03/02/2021,03/02/2026
205,207.0,DOL,DOL,,RUSSELL NEEDHAM,,532 NEPTUNE AVENUE BROOKLYN NY 11224,01/12/2022,01/12/2027


Rename columns for matching

In [32]:
# Relabel the employer name columns as NAME / DBA.
debar_name_map = {
    'EMPLOYER_NAME': 'NAME',
    'EMPLOYER_DBA': 'DBA',
}
debar = debar.rename(columns=debar_name_map)

#### Join Registry data to debarment data

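Theoretically, there should NOT be debarred contractors on the registry, but we found one potential match: the two records share the same address, although the business names differ, so an analyst should confirm whether they are the same contractor.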

In [33]:
# Fuzzy-join the debarment list (left) to the registry (right) on the shared
# NAME / DBA name columns and the ADDRESS column.
# NOTE(review): how='inner' presumably keeps only rows with a match - confirm
# against src.mapping.fuzzy_join.
join_name_cols = ['NAME', 'DBA']
df = fuzzy_join(debar, reg, name_cols=join_name_cols, addr_col='ADDRESS', how='inner')
df

Unnamed: 0_level_0,ID,AGENCY,FISCAL_OFFICER,FEIN,NAME_l,DBA_l,ADDRESS_l,DEBAR_START,DEBAR_END,variable,...,Business is associated with an apprenticeship program,Business is sponsor of a program,Business is signatory to a group program,Business has Workers Compensation Insurance,Business is exempt from Workers Compensation Insurance,Georeference,ADDRESS_r,norm_NAME,norm_DBA,norm_ADDRESS
value,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1921.0,35.0,DOL,DOL,*****7619,DANCO CONSTRUCTION UNLIMITED INC.,,485 RAFT AVENUE HOLBROOK NY 11741,10/19/2021,10/19/2026,0,...,No,No,No,Yes,No,POINT (-73.0722 40.77477),485 RAFT AVENUE HOLBROOK NY 11741,d kalogeras construction inc,,485 raft avenue holbrook ny 11741
