In [1]:
import polars as pl
import pandas as pd
import numpy as np
from pathlib import Path
import csv
import tqdm
import os

from IPython.display import display

In [2]:
# Define paths for the original, corrected, and cache directories
data_dir = Path("data")
dataact_dir = data_dir / '5MM MASTER FILE FROM SBA WITHOUT CONTACT DETAILS'
dataact_dir_corrected = data_dir / '5MM MASTER FILE FROM SBA WITHOUT CONTACT DETAILS Corrected'
cache_dir = data_dir / 'cache'

# Create the directories if they don't exist
dataact_dir_corrected.mkdir(parents=True, exist_ok=True)
cache_dir.mkdir(parents=True, exist_ok=True)

def preprocess_csv_file(input_file, output_file):
    with open(input_file, 'r', newline='', encoding='utf-8') as infile, \
         open(output_file, 'w', newline='', encoding='utf-8') as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)
        headers = next(reader)
        writer.writerow(headers)
        num_columns = len(headers)
        incorrect_rows_count = 0
        for row in reader:
            if len(row) == num_columns:
                writer.writerow(row)
            else:
                incorrect_rows_count += 1
        print(f"File: {input_file.name} - Incorrect rows omitted: {incorrect_rows_count}")

def load_or_preprocess():
    all_dataframes = []
    for csv_file in dataact_dir.glob('*.csv'):
        cache_file = cache_dir / f"{csv_file.stem}_corrected.parquet"

        if cache_file.exists():
            df = pl.read_parquet(cache_file)
            print(f"Loaded cached data from {cache_file}")
        else:
            output_file = dataact_dir_corrected / csv_file.name
            preprocess_csv_file(csv_file, output_file)
            df = pl.read_csv(output_file, ignore_errors=True)
            df.write_parquet(cache_file)
            print(f"Processed and cached data for {csv_file.name}")

        all_dataframes.append(df)

    return pl.concat(all_dataframes)

# Execute the function to load or process and cache the data
loan_info_df = load_or_preprocess()


Loaded cached data from data/cache/DATAACT_EIDL_LOANS_DMCS2.0_corrected.parquet
Loaded cached data from data/cache/DATAACT_EIDL_LOANS_20200401-20200609_corrected.parquet
Loaded cached data from data/cache/DATAACT_EIDL_LOANS_20200626-20200723_corrected.parquet
Loaded cached data from data/cache/DATAACT_EIDL_LOANS_20200610-20200625_corrected.parquet
Loaded cached data from data/cache/DATAACT_EIDL_LOANS_20200724-20201115_corrected.parquet


In [3]:
print(f"Bad rows {loan_info_df['LEGALENTITYCONGRESSIONALDISTRICT'].is_null().sum()}")

# Drop rows with missing 'LEGALENTITYCONGRESSIONALDISTRICT' values
loan_info_df = loan_info_df.filter(loan_info_df['LEGALENTITYCONGRESSIONALDISTRICT'].is_not_null())

print(f"Number of rows after dropping missing 'LEGALENTITYCONGRESSIONALDISTRICT' values: {len(loan_info_df)}")

Bad rows 850
Number of rows after dropping missing 'LEGALENTITYCONGRESSIONALDISTRICT' values: 3765391


In [4]:
loan_info_df.head()

ACTIONTYPE,ACTIONDATE,ASSISTANCETYPE,RECORDTYPE,FAIN,AWARDMODIFICATIONAMENDMENTNUM,URI,CORRECTIONLATEDELETEIND,FISCALYEARANDQTRCORRECTION,SAI_NUM,AWARDEEORRECIPIENTLEGALENTITYNAME,AWARDEEORRECIPIENTUNIQUEIDENTIFIER,LEGALENTITYADDRLINE1,LEGALENTITYADDRLINE2,LEGALENTITYADDRLINE3,LEGALENTITYCITYNAME,LEGALENTITYSTATECD,LEGALENTITYZIP5,LEGALENTITYZIPLAST4,LEGALENTITYCOUNTRYCD,LEGALENTITYFOREIGNCITYNAME,LEGALENTITYFOREIGNPROVINCENAME,LEGALENTITYFOREIGNPOSTALCD,LEGALENTITYCONGRESSIONALDISTRICT,BUSINESSTYPES,FUNDINGAGENCYCD,FUNDINGSUBTIERAGENCYCD,FUNDINGOFFICECD,AWARDINGAGENCYCD,AWARDINGSUBTIERAGENCYCD,AWARDINGOFFICECD,CFDA_NUM,PRIMPLACEOFPERFORMANCECD,PRIMPLACEOFPERFORMANCECOUNTRYCD,PRIMPLACEOFPERFORMANCEZIP+4,PRIMPLACEOFPERFORMANCEFOREIGNLOCATIONDESC,PRIMPLACEOFPERFORMANCECONGRESSIONALDISTRICT,AWARDDESC,PERIODOFPERFORMANCESTARTDATE,PERIODOFPERFORMANCECURRENTENDDATE,FEDERALACTIONOBLIGATION,NONFEDERALFUNDINGAMOUNT,FACEVALUEOFDIRECTLOANORLOANGUARANTEE,ORIGINALLOANSUBSIDYCOST,BUSINESSFUNDSINDICATOR
str,i64,i64,i64,i64,i64,str,str,str,str,str,str,str,str,str,str,str,i64,i64,str,str,str,str,i64,str,i64,i64,i64,i64,i64,i64,f64,str,str,str,str,i64,str,str,str,str,str,f64,f64,str
"""A""",20200803,7,2,2443918208,1,,,,"""SAI Exempt""","""AKIKO'S SUSHI BAR INC.""",,"""726 Noriega St""",,,"""SAN FRANCISCO""","""CA""",94122,4542,"""USA""",,,,12,"""R""",73,7300,732990,73,7300,732990,59.008,"""CA**075""","""USA""",,,12,"""TO PROVIDE LOANS TO RESTORE AS…",,,,,144000.0,19612.8,"""NON"""
"""A""",20200803,7,2,2446098206,1,,,,"""SAI Exempt""","""DITZLER GENERAL CONTRACTING IN…",,"""212 WILE AVE""",,,"""SOUDERTON""","""PA""",18964,1624,"""USA""",,,,1,"""R""",73,7300,732990,73,7300,732990,59.008,"""PA**091""","""USA""",,,1,"""TO PROVIDE LOANS TO RESTORE AS…",,,,,25300.0,3445.86,"""NON"""
"""A""",20200803,7,2,2448018208,1,,,,"""SAI Exempt""","""HERMAN CLEANERS, LLC""",,"""5590 KEYSTONE PINE WAY""",,,"""DUBLIN""","""OH""",43016,9472,"""USA""",,,,12,"""R""",73,7300,732990,73,7300,732990,59.008,"""OH**049""","""USA""",,,12,"""TO PROVIDE LOANS TO RESTORE AS…",,,,,100000.0,13620.0,"""NON"""
"""A""",20200803,7,2,2449338201,1,,,,"""SAI Exempt""","""RSM REALTY, LLC""",,"""706 OXFORD RD""",,,"""YPSILANTI""","""MI""",48197,2146,"""USA""",,,,12,"""R""",73,7300,732990,73,7300,732990,59.008,"""MI**161""","""USA""",,,12,"""TO PROVIDE LOANS TO RESTORE AS…",,,,,28600.0,3895.32,"""NON"""
"""A""",20200803,7,2,2455458209,1,,,,"""SAI Exempt""","""PINK PALETTE ARTISTS LLC""",,"""7738 HERON LAKES DR""",,,"""HOUSTON""","""TX""",77064,1739,"""USA""",,,,2,"""R""",73,7300,732990,73,7300,732990,59.008,"""TX**201""","""USA""",,,2,"""TO PROVIDE LOANS TO RESTORE AS…",,,,,98400.0,13402.08,"""NON"""


In [5]:
interesting_cols = ['AWARDEEORRECIPIENTLEGALENTITYNAME', 'LEGALENTITYADDRLINE1', 'LEGALENTITYCITYNAME', 'LEGALENTITYSTATECD', 'LEGALENTITYZIP5',
                    'FACEVALUEOFDIRECTLOANORLOANGUARANTEE', 'ORIGINALLOANSUBSIDYCOST']

In [6]:
# Drop everything except the interesting columns
loan_info_df = loan_info_df.select(interesting_cols)

In [7]:
alt_data_dir = data_dir / 'db_alt'

def load_or_cache_xlsx(file, group):
    cache_file = cache_dir / f"{file.stem}_{group}.parquet"

    if cache_file.exists():
        try:
            df = pl.read_parquet(cache_file)
        except:
            os.remove(cache_file)
            return load_or_cache_xlsx(file, group)
        print(f"Loaded from cache: {cache_file}")
    else:
        df = pl.read_excel(file)
        df = df.with_columns([pl.lit(file.name).alias('filename')])
        df.write_parquet(cache_file)
        print(f"Processed and cached: {cache_file}")
    
    return df

# Load all Excel files, process them if needed, and cache the result
list_alt_dfs = []
for file in alt_data_dir.rglob('*.xlsx'):
    list_alt_dfs.append(load_or_cache_xlsx(file, 'all'))

# Concatenate and split into owners and contacts based on indices
owners_indices = [1, 5]
contacts_indices = [i for i in range(len(list_alt_dfs)) if i not in owners_indices]

owners_df = pl.concat([list_alt_dfs[i] for i in owners_indices])
contacts_df = pl.concat([list_alt_dfs[i] for i in contacts_indices])

# Cache the concatenated owners and contacts DataFrames
owners_cache = cache_dir / 'owners_df.parquet'
contacts_cache = cache_dir / 'contacts_df.parquet'

if not owners_cache.exists():
    owners_df.write_parquet(owners_cache)
    print(f"Cached owners DataFrame: {owners_cache}")

if not contacts_cache.exists():
    contacts_df.write_parquet(contacts_cache)
    print(f"Cached contacts DataFrame: {contacts_cache}")

Loaded from cache: data/cache/List-4 - 1,004,842 Contacts For 5 Million List - CEO Database With Mobile Number_all.parquet
Loaded from cache: data/cache/1 Millions Owners Database Part-1_all.parquet
Loaded from cache: data/cache/List-5 - 1,007,897 Contacts For 5 Million List - CEO Database With Mobile Number_all.parquet
Loaded from cache: data/cache/List-2 - 1,008,186 Contacts For 5 Million List - CEO Database With Mobile Number_all.parquet
Loaded from cache: data/cache/List-3 - 1,006,298 Contacts For 5 Million List - CEO Database With Mobile Number_all.parquet
Loaded from cache: data/cache/1 Millions Owners Database-Part-2_all.parquet
Loaded from cache: data/cache/List-1 - 1,006,603 Contacts For 5 Million List - CEO Database With Mobile Number_all.parquet


In [8]:
apollo_82_dir = data_dir / 'apollo/Apollo 82 Million Data'

list_apollo_82_dfs = []
for file in apollo_82_dir.rglob('*.xlsx'):
    list_apollo_82_dfs.append(load_or_cache_xlsx(file, 'all'))


Loaded from cache: data/cache/List # 08_1,038,035 Contacts New Project 82 Million Part-1_all.parquet
Loaded from cache: data/cache/List # 17_1,037,663 Contacts New Project 82 Million Part-1_all.parquet
Loaded from cache: data/cache/List # 23_1,036,165 Contacts New Project 82 Million Part-2_all.parquet
Loaded from cache: data/cache/List # 32_1,036,167 Contacts New Project 82 Million Part-1_all.parquet
Loaded from cache: data/cache/List # 21_1,035,777 Contacts New Project 82 Million Part-2_all.parquet
Loaded from cache: data/cache/List # 50_1,036,786 Contacts New Project 82 Million Part-1_all.parquet
Loaded from cache: data/cache/List # 11_1,031,855 Contacts New Project 82 Million Part-1_all.parquet
Loaded from cache: data/cache/List # 24_1,046,530 Contacts New Project 82 Million Part-2_all.parquet
Loaded from cache: data/cache/List # 03_1,032,046 Contacts New Project 82 Million Part-1_all.parquet
Loaded from cache: data/cache/List # 44_1,034,952 Contacts New Project 82 Million Part-1_al

In [9]:
# Convert all columns of each DataFrame to strings before concatenation
list_apollo_82_dfs_str = [df.with_columns([pl.col(column).cast(pl.Utf8) for column in df.columns]) for df in list_apollo_82_dfs]

# Updated list to store the modified DataFrames
updated_list_apollo_82_dfs = []

for df in list_apollo_82_dfs_str:
    # Check if 'Reveneu' is a column in the DataFrame
    if 'Reveneu' in df.columns:
        # Rename 'Reveneu' to 'Revenue' and add to the updated list
        df = df.rename({'Reveneu': 'Revenue'})
        
    if '__UNNAMED__3' in df.columns:
        df = df.drop('__UNNAMED__3')
    # Add the (potentially modified) DataFrame to the new list
    updated_list_apollo_82_dfs.append(df)

# Now concatenate the updated DataFrames, ensuring all columns are treated as strings
apollo_82_df = pl.concat(updated_list_apollo_82_dfs)

# clean up memory
del list_apollo_82_dfs
del list_apollo_82_dfs_str
del updated_list_apollo_82_dfs

In [10]:
apollo_zoom_dir = data_dir / 'apollo/Zoom Info 70 Million'

list_zoom_dfs = []
for file in apollo_zoom_dir.glob('*.xlsx'):
    list_zoom_dfs.append(load_or_cache_xlsx(file, 'all'))

Loaded from cache: data/cache/List # 04_1,034,387 Contacts New Project 143 Million Part-2_all.parquet
Loaded from cache: data/cache/List # 24_1,032,524 Contacts New Project 143 Million Part-2_all.parquet
Loaded from cache: data/cache/List # 35_1,045,980 Contacts New Project 143 Million Part-2_all.parquet
Loaded from cache: data/cache/List # 12_1,031,401 Contacts New Project 143 Million Part-2_all.parquet
Loaded from cache: data/cache/List # 17_1,043,959 Contacts New Project 143 Million Part-2_all.parquet
Loaded from cache: data/cache/List # 15_1.033.871 Contacts New Project 143 Million Part-2_all.parquet
Loaded from cache: data/cache/List # 51_1,046,432 Contacts New Project 143 Million Part-2_all.parquet
Loaded from cache: data/cache/List # 65_1,046,087 Contacts New Project 143 Million Part-2_all.parquet
Loaded from cache: data/cache/List # 41_1,046,246 Contacts New Project 143 Million Part-2_all.parquet
Loaded from cache: data/cache/List # 18_1,044,132 Contacts New Project 143 Million

In [11]:
list_zoom_dfs_str = [df.with_columns([pl.col(column).cast(pl.Utf8) for column in df.columns]) for df in list_zoom_dfs]

updated_list_apollo_82_dfs = []

for df in list_zoom_dfs_str:
    if 'Domain' in df.columns:
        df = df.drop('Domain')
    if 'Pattern' in df.columns:
        df = df.drop('Pattern')
        
    updated_list_apollo_82_dfs.append(df)
    
zoom_df = pl.concat(updated_list_apollo_82_dfs)

In [12]:
zoom_df

First Name,Middle Name,Last Name,Title,Company Name,Mailing Address,Primary City,Primary State,ZIP Code,Country,Phone,Web Address,Email,Reveneu,Employee,Industry,Sub Industry,filename
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""Monica""",,"""Dougherty""","""Manager of Information Technol…","""Sole Technology""","""20161 Windrow Dr""","""Lake Forest""","""CA""","""92630""","""USA""","""949-460-2020""","""www.soletechnology.com""","""dmonica@soletechnology.com""","""$100 - $250M""","""250 - 1000""","""Retail""","""Retail Other""","""List # 04_1,034,387 Contacts N…"
"""Janice""",,"""Cole""","""Branch Manager""","""Bmo Financial Group Ltd""","""Po Box 3 Stn 1St Can Place""","""Toronto""","""ON""","""M5X 1A3""","""Canada""","""416-867-5000""","""www.bmo.com""","""jcole@bmo.com""","""> $1B""","""> 100K""","""Financial Services""","""Banks""","""List # 04_1,034,387 Contacts N…"
"""Paul""",,"""Kossler""","""Software Engineer""","""United States Navy""","""33055 Nixie Way""","""San Diego""","""CA""","""92147""","""USA""","""619-524-6734""","""www.eodpoe2.navsea.navy.mil""","""kpaul@navy.mil""","""> $1B""","""> 100K""","""Government""","""National Government""","""List # 04_1,034,387 Contacts N…"
"""Aravind""",,"""Sen""","""Consultant""","""Cognizant Technology Solutions…","""500 Frank W Burr Blvd""","""Teaneck""","""NJ""","""7666""","""USA""","""201-801-0233""","""www.cognizant.com""","""aravind@cognizant.com""","""> $1B""","""10K - 50K""","""Computers & Electronics""","""IT and Network Services and Su…","""List # 04_1,034,387 Contacts N…"
"""Michael""",,"""Heaney""","""Vice President, Sales""","""Infogroup / Infousa""","""5711 S 86Th Cir""","""Omaha""","""NE""","""68127""","""USA""","""402-593-4500""","""www.infogroup.com""","""mheaney@infogroup.com""","""$250 - 500M""","""1K - 10K""","""Business Services""","""Data and Records Management""","""List # 04_1,034,387 Contacts N…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Ran""",,"""Libeskind-Hadas""","""Chair-Computer Science""","""Harvey Mudd College""","""301 Platt Blvd""","""Claremont""","""CA""","""91711""","""USA""","""909-621-8000""","""www.hmc.edu""","""libeskind-hadas_ran@hmc.edu""","""$1 - $10M""","""100 - 250""","""Education""","""Colleges and Universities""","""List # 08_1,039,704 Contacts N…"
"""Theodore""",,"""Darlak""","""Technical Services Manager-Inf…","""Niagara University""","""Po Box 2008""","""Niagara University""","""NY""","""14109""","""USA""","""716-286-8200""","""www.niagara.edu""","""darlak_theodore@niagara.edu""","""$1 - $10M""","""250 - 1000""","""Education""","""Colleges and Universities""","""List # 08_1,039,704 Contacts N…"
"""Arleen""",,"""Thompson""","""Office of Instructional and In…","""Boston Public Schools""","""26 Court St""","""Boston""","""MA""","""2108""","""USA""","""617-635-9000""","""www.boston.k12.ma.us""","""thompson_arleen@boston.k12.ma.…","""$100 - $250M""","""250 - 1000""","""Education""","""Elementary and Secondary Schoo…","""List # 08_1,039,704 Contacts N…"
"""Darrell""",,"""Walker""","""Vice President, Education""","""CA Inc""","""1 Computer Associates Plz""","""Islandia""","""NY""","""11749""","""USA""","""800-225-5224""","""www.ca.com""","""darrell.walker@ca.com""","""> $1B""","""10K - 50K""","""Software & Internet""","""Software & Internet Other""","""List # 08_1,039,704 Contacts N…"


In [13]:
# def check_headers(dfs):
#     first_schema = dfs[0].schema
#     for i, df in enumerate(dfs[1:], start=1):
#         if df.schema != first_schema:
#             print(f"Header mismatch found in DataFrame at index {i} compared to the first DataFrame.")
#             return False
#     print("All headers match.")
#     return True

# # Check if all DataFrames have the same schema
# check_headers(list_zoom_dfs_str)


In [14]:
apollo_82_df


First Name,Middle Name,Last Name,Title,Company Name,Mailing Address,Primary City,Primary State,ZIP Code,Country,Phone,Web Address,Email,Revenue,Employee,Industry,Sub Industry,filename
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""Andrew""",,"""Ku""","""Radiology, M.D.""","""Ct.Gov Portal""","""Capitol Place 21 Oak St""","""Hartford""","""CT""","""6106""","""USA""","""860-240-0000""","""www.ct.gov""","""andrew.ku@ct.gov""","""$0 - $1M""","""10K - 50K""","""Government""","""State/Provincial Government""","""List # 08_1,038,035 Contacts N…"
"""Leslie""",,"""Thomson""","""General Counsel""","""Montgomery Psychiatry & Associ…","""1040 Longfield Ct""","""Montgomery""","""AL""","""36117""","""USA""","""334-288-9009""","""www.mpa1040.com""","""lthomson@mpa1040.com""","""$0 - $1M""","""0 - 25""","""Healthcare""","""Doctors and Health Care Practi…","""List # 08_1,038,035 Contacts N…"
"""Vince""",,"""Grafft""","""Sales Representative""","""New Horizons Computer Learning…","""6377 E Tanque Verde Rd Ste 200""","""Tucson""","""AZ""","""85715""","""USA""","""520-290-5600""","""www.nhtucson.com""","""vgrafft@nhtucson.com""","""$10 - 50M""","""250 - 1000""","""Education""","""Education Other""","""List # 08_1,038,035 Contacts N…"
"""Rob""",,"""Huner""","""Purchasing Manager""","""Kraft Foods Group, Inc.""","""Three Lakes Dr""","""Northfield""","""IL""","""60093""","""USA""","""847-646-2000""","""www.kraft.com""","""rhuner@kraft.com""","""> $1B""","""> 100K""","""Manufacturing""","""Food & Dairy Product Manufactu…","""List # 08_1,038,035 Contacts N…"
"""John""","""S.""","""Phillip""","""Sales Manager, Supply Chain Ma…","""Eclipse""","""6625 Reflections Dr""","""Dublin""","""OH""","""43017""","""USA""","""614-652-9006""","""www.eclipse-studio.com""","""jsphillip@eclipse-studio.com""","""$0 - 1M""","""0 - 25""","""Business Services""","""Other""","""List # 08_1,038,035 Contacts N…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Anne""",,"""Reischick""","""Deputy Editor""","""Vancouver School District""","""Po Box 8937""","""Vancouver""","""WA""","""98668""","""USA""","""360-313-1100""","""www.vansd.org""","""anne.reischick@vansd.org""","""$10 - $50M""","""250 - 1000""","""Education""","""Elementary and Secondary Schoo…","""List # 09_1,035,664 Contacts N…"
"""Joel""",,"""Vale""","""Project Manager Analytic Devel…","""State of Vermont""","""133 State Street 5Th Floor""","""Montpelier""","""VT""","""5633""","""USA""","""802-828-4141""","""www.vermont.gov""","""jvale@vermont.gov""","""$1 - $10M""","""1K - 10K""","""Government""","""Government Other""","""List # 09_1,035,664 Contacts N…"
"""James""",,"""Chou""","""Chief Marketing Officer""","""Harvard-Faculty of Arts and Sc…","""54 Dunster St""","""Cambridge""","""MA""","""2138""","""USA""","""617-496-0738""","""www.fas.harvard.edu""","""jchou@fas.harvard.edu""","""$100 - $250M""","""1K - 10K""","""Education""","""Colleges and Universities""","""List # 09_1,035,664 Contacts N…"
"""Clarence""",,"""Conner""","""Media Director""","""Bridgestreet Worldwide Inc.""","""485 Springpark Pl Ste 200""","""Herndon""","""VA""","""20170""","""USA""","""571-481-2700""","""www.bridgestreet.com""","""cconner@bridgestreet.com""","""$10 - $50M""","""250 - 1000""","""Real Estate & Construction""","""Property Leasing and Managemen…","""List # 09_1,035,664 Contacts N…"


In [15]:
# Mapping of current names in apollo_82_df to target names
column_mapping = {
    'Mailing Address': 'Company Address',
    'Primary City': 'Company City',
    'Primary State': 'Company State',
    'Email': 'Business Email',
    'Phone': 'Mobile Phone'
}

# Rename the columns based on the mapping
apollo_82_df = apollo_82_df.rename(column_mapping)

# After renaming, ensure the necessary columns are all present
apollo_82_df = apollo_82_df.select(
    [
        'First Name',
        'Last Name',
        'Business Email',
        'Mobile Phone',
        'Company Name',
        'Company State',
        'Company City',
        'Company Address',
        'filename'  # Include other fields as necessary
    ]
)

zoom_df = zoom_df.rename(column_mapping)

# After renaming, ensure the necessary columns are all present
zoom_df = zoom_df.select(
    [
        'First Name',
        'Last Name',
        'Business Email',
        'Mobile Phone',
        'Company Name',
        'Company State',
        'Company City',
        'Company Address',
        'filename'  # Include other fields as necessary
    ]
)


In [16]:
apollo_82_df.head()

First Name,Last Name,Business Email,Mobile Phone,Company Name,Company State,Company City,Company Address,filename
str,str,str,str,str,str,str,str,str
"""Andrew""","""Ku""","""andrew.ku@ct.gov""","""860-240-0000""","""Ct.Gov Portal""","""CT""","""Hartford""","""Capitol Place 21 Oak St""","""List # 08_1,038,035 Contacts N…"
"""Leslie""","""Thomson""","""lthomson@mpa1040.com""","""334-288-9009""","""Montgomery Psychiatry & Associ…","""AL""","""Montgomery""","""1040 Longfield Ct""","""List # 08_1,038,035 Contacts N…"
"""Vince""","""Grafft""","""vgrafft@nhtucson.com""","""520-290-5600""","""New Horizons Computer Learning…","""AZ""","""Tucson""","""6377 E Tanque Verde Rd Ste 200""","""List # 08_1,038,035 Contacts N…"
"""Rob""","""Huner""","""rhuner@kraft.com""","""847-646-2000""","""Kraft Foods Group, Inc.""","""IL""","""Northfield""","""Three Lakes Dr""","""List # 08_1,038,035 Contacts N…"
"""John""","""Phillip""","""jsphillip@eclipse-studio.com""","""614-652-9006""","""Eclipse""","""OH""","""Dublin""","""6625 Reflections Dr""","""List # 08_1,038,035 Contacts N…"


In [17]:
zoom_df.head()

First Name,Last Name,Business Email,Mobile Phone,Company Name,Company State,Company City,Company Address,filename
str,str,str,str,str,str,str,str,str
"""Monica""","""Dougherty""","""dmonica@soletechnology.com""","""949-460-2020""","""Sole Technology""","""CA""","""Lake Forest""","""20161 Windrow Dr""","""List # 04_1,034,387 Contacts N…"
"""Janice""","""Cole""","""jcole@bmo.com""","""416-867-5000""","""Bmo Financial Group Ltd""","""ON""","""Toronto""","""Po Box 3 Stn 1St Can Place""","""List # 04_1,034,387 Contacts N…"
"""Paul""","""Kossler""","""kpaul@navy.mil""","""619-524-6734""","""United States Navy""","""CA""","""San Diego""","""33055 Nixie Way""","""List # 04_1,034,387 Contacts N…"
"""Aravind""","""Sen""","""aravind@cognizant.com""","""201-801-0233""","""Cognizant Technology Solutions…","""NJ""","""Teaneck""","""500 Frank W Burr Blvd""","""List # 04_1,034,387 Contacts N…"
"""Michael""","""Heaney""","""mheaney@infogroup.com""","""402-593-4500""","""Infogroup / Infousa""","""NE""","""Omaha""","""5711 S 86Th Cir""","""List # 04_1,034,387 Contacts N…"


In [18]:
company_fields = ['Company Name', 'Company State', 'Company City', 'Company Address']
owner_fields = ['First Name', 'Last Name', 'Business Email', 'Mobile Phone']


In [19]:
# Usage

def preprocess(file_path, output_file):
    # Determine total file size for progress tracking
    file_size = os.path.getsize(file_path)
    # max_lines = 100000
    line_index = 0  
    bad_rows = 0
    with open(file_path, 'rb') as infile, open(output_file, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile)
        progress = tqdm.tqdm(total=file_size, desc="Processing File", unit="B", unit_scale=True)

        # Initialize control variables
        current_row = []
        processed_bytes = 0

        # Read and process the file line by line in binary mode
        while True:
            line = infile.readline()
            if not line:
                break

            processed_bytes += len(line)
            progress.update(len(line))
            
            # Decode line, handle escaped tabs and internal newlines
            decoded_line = line.decode('utf-8')
            decoded_line = decoded_line.replace('\\\\t', '<ESC_TAB>').replace('\r\n', '<ESC_NEWLINE>').replace('\r', '<ESC_CR>')

            # Split on unescaped tabs, process each part
            parts = decoded_line.split('\t')
            for part in parts:
                normalized_part = part.replace('<ESC_TAB>', '\t').replace('<ESC_NEWLINE>', '\r\n').replace('<ESC_CR>', '\r').strip()
                current_row.append(normalized_part)

            # If the line ends with a newline (split condition), it means end of a row
            if decoded_line.endswith('\n'):
                # Write the completed row to the file
                if len(current_row) != 51:
                    bad_rows += 1
                writer.writerow(current_row)
                current_row = []
            else:
                # print(f"Line {line_index} does not end with a newline, continuing to next line")
                pass

            line_index += 1
            # if line_index > max_lines:
            #     break

        # Write any remaining data in current_row to the file (in case the last line does not end with a newline)
        if current_row:
            writer.writerow(current_row)

        progress.close()
    print(f"Bad rows: {bad_rows}/{line_index} ({bad_rows/line_index*100:.2f}%)")

apollo_file_path = Path('data/apollo/Apollo 200 Million 3_3-008.csv')
apollo_output_file = Path('data/apollo/Apollo 200 Million 3_3-008_corrected_2.csv')

if not apollo_output_file.exists():
    preprocess(apollo_file_path, apollo_output_file)
    
    
apollo2_file_path = Path('data/apollo/Apollo 200 Million 2_3-002.csv')
apollo2_output_file = Path('data/apollo/Apollo 200 Million 2_3-002_corrected.csv')

if not apollo2_output_file.exists():
    preprocess(apollo2_file_path, apollo2_output_file)


In [20]:
# apollo_df = pl.read_csv('data/apollo/Apollo 200 Million 3_3-008_corrected_2.csv', truncate_ragged_lines=True, infer_schema_length=0)

In [21]:
# apollo2_df = pl.read_csv('data/apollo/Apollo 200 Million 2_3-002.csv', truncate_ragged_lines=True, infer_schema_length=0)

In [22]:
# for i, file in enumerate(alt_data_dir.rglob('*.xlsx')):
#     print(f"File {i}: {file}")
#     print(list_alt_dfs[i].shape)
#     print(list_alt_dfs[i].schema)
    
# # Group them together by schema


In [23]:
def capitalize(s):
    parts = s.lower().split()
    return ' '.join(part.capitalize() for part in parts)

# Convert column names to capitalized form
owners_df.columns = [capitalize(col.replace('_', ' ')) for col in owners_df.columns]

# Set Filename back to filename
owners_df = owners_df.with_columns([owners_df['Filename'].alias('filename')]).drop('Filename')


In [24]:
company_fields = ['Company Name', 'Company State', 'Company City', 'Company Address']
owner_fields = ['First Name', 'Last Name', 'Business Email', 'Mobile Phone']


### Now load the B2B Email list

In [25]:
b2b_data_dir = data_dir / 'B2b EMAIL LIST 60MM usbizdata'

b2b_csv_files = []

for csv_file in b2b_data_dir.rglob('*.csv'):
    b2b_csv_files.append(csv_file)
    

print(f"Number of B2B CSV files: {len(b2b_csv_files)}")

# Function to read only the first row (header) of a CSV to check consistency
def read_header(file):
    try:
        df = pl.read_csv(file, has_header=True, new_columns=None, n_rows=0, infer_schema_length=0)
        return df.schema
    except Exception as e:
        print(f"Error reading {file}: {e}")
        return None


# # Function to read only the first row (header) of a CSV to check consistency
# def read_header(file):
#     return pl.read_csv(file, has_header=True, skip_rows_after_header=-1)

# Check if all headers are the same
headers = [read_header(file) for file in b2b_csv_files]
first_header = headers[0]

for i, header in enumerate(headers):
    if header != first_header:
        print(f"Header {i} do not match across files.")
        break

if all(header == first_header for header in headers):
    print("All headers match.")
else:
    raise ValueError("Headers do not match across files.")

# Function to read a CSV file with all data as string
def load_csv_with_filename(file):
    try:
        df = pl.read_csv(file, has_header=True, infer_schema_length=0)
        df = df.with_columns([pl.lit(file.name).alias('filename')])
        return df
    except Exception as e:
        print(f"Failed to load {file}: {e}")
        return None

# Load all data treating everything as strings and concatenate
# b2b_df = pl.concat([pl.read_csv(file, has_header=True, infer_schema_length=0) for file in b2b_csv_files])#, how='diagonal')

b2b_df = pl.concat([load_csv_with_filename(file) for file in b2b_csv_files if file.exists()], how='diagonal')


print(f"Total rows in concatenated dataframe: {len(b2b_df)}")

Number of B2B CSV files: 83
All headers match.
Total rows in concatenated dataframe: 54877230


In [26]:
b2b_df['filename']

filename
str
"""WA_B2B_23_1.csv"""
"""WA_B2B_23_1.csv"""
"""WA_B2B_23_1.csv"""
"""WA_B2B_23_1.csv"""
"""WA_B2B_23_1.csv"""
…
"""NH_B2B_23.csv"""
"""NH_B2B_23.csv"""
"""NH_B2B_23.csv"""
"""NH_B2B_23.csv"""


In [27]:
company_fields, owner_fields

(['Company Name', 'Company State', 'Company City', 'Company Address'],
 ['First Name', 'Last Name', 'Business Email', 'Mobile Phone'])

In [28]:
b2b_df.columns

['Company Name',
 'Address',
 'City',
 'State',
 'Zip',
 'County',
 'Phone',
 'Contact First',
 'Contact Last',
 'Title',
 'Direct Phone',
 'Email',
 'Website',
 'Employee Count',
 'Annual Sales',
 'SIC Code',
 'Industry',
 'filename']

In [29]:
# Now rename columns in b2b_df to match the other dataframes
# Define the current names and the new names based on provided matching fields
company_fields_mapping = {
    'Address': 'Company Address',
    'City': 'Company City',
    'State': 'Company State',
    'Zip': 'Company Zip',
    'Phone': 'Company Phone',
    'Email': 'Business Email'
}

owner_fields_mapping = {
    'Contact First': 'First Name',
    'Contact Last': 'Last Name',
    'Direct Phone': 'Mobile Phone'
}

# Merge the two mappings
full_mapping = {**company_fields_mapping, **owner_fields_mapping}

# Rename the columns in b2b_df according to the mapping
b2b_df = b2b_df.rename({k: v for k, v in full_mapping.items() if k in b2b_df.columns})

# Print the new columns to confirm changes
print(b2b_df.columns)


['Company Name', 'Company Address', 'Company City', 'Company State', 'Company Zip', 'County', 'Company Phone', 'First Name', 'Last Name', 'Title', 'Mobile Phone', 'Business Email', 'Website', 'Employee Count', 'Annual Sales', 'SIC Code', 'Industry', 'filename']


In [30]:
b2b_df

Company Name,Company Address,Company City,Company State,Company Zip,County,Company Phone,First Name,Last Name,Title,Mobile Phone,Business Email,Website,Employee Count,Annual Sales,SIC Code,Industry,filename
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""Comtech""","""9215 151st Ave Ne""","""Redmond""","""WA""","""98052-3511""","""King""","""4254518138""","""Eleizer""","""Scharf""","""President""",,"""escharf@comtech-group.com""","""comtechphones.com""","""10 To 19""","""$20 To 50 Million""","""506563""","""Telecommunication Equip/Syst-W…","""WA_B2B_23_1.csv"""
"""Tupperware""","""916 S 30th Ave""","""Yakima""","""WA""","""98902-4074""","""Yakima""","""5099659224""","""Fred""","""Lubarsky""","""Owner""",,"""fsl31@aol.com""","""aol.com""","""1 To 4""","""Less Than $500,000""","""596304""","""Home Demonstration-Merchandise""","""WA_B2B_23_1.csv"""
"""Bion Diagnostic Sciences Inc""","""12277 134th Ct Ne Ste 100""","""Redmond""","""WA""","""98052-2431""","""King""","""4258211010""","""Kelly""","""Walsh""","""Manager""",,"""kwalsh@polymedco.com""","""polymedco.com""",,,"""289900""","""Chemical Preparations, Nec""","""WA_B2B_23_1.csv"""
"""Garden Cafe""","""18923 Peter Johnson Rd""","""Mount Vernon""","""WA""","""98273-9302""","""Skagit""","""3608489189""","""Martin""","""Tynan""","""Manager""",,"""mtynan@gardencafe.com""","""gardencafe.com""","""5 To 9""","""Less Than $500,000""","""581208""","""Restaurants""","""WA_B2B_23_1.csv"""
"""Garden Cafe""","""18923 Peter Johnson Rd""","""Mount Vernon""","""WA""","""98273-9302""","""Skagit""","""3608489189""","""Dao""","""Choi""","""Owner""",,"""dchoi@gardencafe.com""","""gardencafe.com""","""5 To 9""","""Less Than $500,000""","""581208""","""Restaurants""","""WA_B2B_23_1.csv"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""County Of Hillsborough""","""400 Mast Rd""","""Goffstown""","""NH""","""03045-2427""","""Hillsborough""","""6036275540""","""Richard""","""Kerby""","""Manager""","""8132725625""","""richard@hillsboroughcounty.org""","""hillsboroughcountynh.org""","""1 To 4""","""Unknown""","""912103""","""Government Offices-County""","""NH_B2B_23.csv"""
"""Crabtree & Evelyn""","""2 Common Ct Unit B8""","""North Conway""","""NH""","""03860-5440""","""Carroll""","""6033564100""","""Romona""","""Ropper""","""Manager""",,"""rropper@crabtree-evelyn.com""","""crabtree-evelyn.com""","""5 To 9""","""$1 To 2.5 Million""","""599992""","""Cosmetics & Perfumes-Retail""","""NH_B2B_23.csv"""
"""Alternative Communication""","""3 Bud Way Ste 20""","""Nashua""","""NH""","""03063-1700""","""Hillsborough""","""6038823100""","""Warren""","""Kane""","""Manager""",,"""warren@urnet.com""","""acstelcom.com""","""1 To 4""","""$1 To 2.5 Million""","""489903""","""Communications""","""NH_B2B_23.csv"""
"""Alternative Communication""","""3 Bud Way Ste 20""","""Nashua""","""NH""","""03063-1700""","""Hillsborough""","""6038823100""","""Terri""","""Richards""","""Owner""",,"""terri@urnet.com""","""acstelcom.com""","""1 To 4""","""$1 To 2.5 Million""","""489903""","""Communications""","""NH_B2B_23.csv"""


In [31]:
owners_df

First Name,Last Name,Business Email,Job Title,Mobile Phone,Company Name,Company Domain,Company Phone,Primary Industry,Primary Sic,Company Industry,Company Sic Code,Company Address,Company City,Company State,Company Zip,Company Linkedin Url,Company Revenue,Company Employee Count,filename
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""Wendy""","""Bodin""","""wendy.bodin@luminaryhealth.com""","""Owner""","""5055037276""","""Luminary Holistics Llc""","""luminaryhealth.com""","""\N""","""Offices Of Health Practitioner…","""8049""","""Offices Of Health Practitioner…","""8049""","""11811 Menaul Blvd Ne""","""Albuquerque""","""NM""","""87112""","""\N""","""Under 1 Million""","""1 to 10""","""1 Millions Owners Database Par…"
"""Keith""","""Frantz""","""keith@cardsandcocktails.com""","""Owner""","""8585606499""","""Academy Of Intl Bartending""","""cardsandcocktails.com""","""\N""","""\N""","""\N""","""\N""","""\N""","""\N""","""San Diego""","""CA""","""92111""","""\N""","""Under 1 Million""","""1 to 10""","""1 Millions Owners Database Par…"
"""Kristy""","""Goggio""","""kristy@thecorkedcanvas.com""","""Co Owner""","""2623788338""","""The Corked Canvas""","""thecorkedcanvas.com""","""\N""","""\N""","""\N""","""\N""","""\N""","""\N""","""\N""","""\N""","""\N""","""\N""","""Under 1 Million""","""1 to 10""","""1 Millions Owners Database Par…"
"""M""","""Tedburke""","""tedburke@insightbb.com""","""Owner""","""2708263593""","""Lonnies Best Taste Of Chicago""","""insightbb.com""","""8006345574""","""Real Estate Agents & Managers …","""6531""","""Real Estate Agents & Managers …","""6531""","""121 Saint Matthews Avenue""","""Louisville""","""KY""","""40207""","""linkedin.com/company/lonnies-b…","""1 Billion and Over""","""10000+""","""1 Millions Owners Database Par…"
"""Jodi""","""Bass""","""jodi@thepleasurechest.com""","""Owner""","""7758832225""","""The Pleasure Chest""","""thepleasurechest.com""","""8007534536""","""Gift, Novelty, And Souvenir Sh…","""5947""","""Gift, Novelty, And Souvenir Sh…","""5947; 7929; 7299""","""1150 2nd Ave Frnt 1""","""New York""","""CA""","""10065""","""linkedin.com/company/the-pleas…","""25 Million to 50 Miliion""","""101 to 250""","""1 Millions Owners Database Par…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Daniel""","""Huckans""","""dan.huckans@derdanenterprises.…","""Owner At Derdan Enterprises""","""8568161577""","""Derdan Enterprises""","""derdanenterprises.net""","""\N""","""Electrical Work""","""1731""","""Electrical Work; Telephone Com…","""1731; 4810""","""7255 W Sunset Rd""","""Las Vegas""","""NV""","""89113""","""linkedin.com/company/derdan-en…","""Under 1 Million""","""1 to 10""","""1 Millions Owners Database-Par…"
"""Tim""","""Charter""","""charter@cabainc.com""","""Owner""","""4058587361, 4058400231""","""Cher A Bumps Assoc Inc""","""cabainc.com""","""\N""","""Insurance Agents, Brokers & Se…","""6411""","""Insurance Agents, Brokers & Se…","""6411""","""2601 Nw Expressway""","""Oklahoma City""","""OK""","""73112""","""\N""","""25 Million to 50 Miliion""","""101 to 250""","""1 Millions Owners Database-Par…"
"""Jennifer""","""Resnick""","""jresnick@polishmenailspa.com""","""Owner""","""2016531100""","""Polish Me""","""polishmenailspa.com""","""\N""","""\N""","""\N""","""\N""","""\N""","""\N""","""Jersey City""","""NJ""","""7311""","""\N""","""Under 1 Million""","""1 to 10""","""1 Millions Owners Database-Par…"
"""Mark""","""Southworth""","""m.southworth@shreddirect.com""","""Owner""","""6147333873""","""Shred Direct Llc""","""shreddirect.com""","""\N""","""Services-Business Services, Ne…","""7389""","""Electrical Work; Services-Busi…","""1731; 7389""","""\N""","""Plain City""","""OH""","""43064""","""\N""","""Under 1 Million""","""1 to 10""","""1 Millions Owners Database-Par…"


In [32]:
contacts_df

First Name,Last Name,Business Email,Job Title,Mobile Phone,Linkedin Url,Contact Metro City,Contact State,Company Name,Company Domain,Company Phone,Primary Industry,Primary Sic,Company Industry,Company Sic Code,Company Address,Company City,Company State,Company Zip,Company Linkedin Url,Company Revenue,Company Employee Count,filename
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""Donna""","""Chernisky""","""donna@customdesignedsolutions.…","""President And Chief Executive …","""3012360450""","""linkedin.com/in/donna-chernisk…","""\N""","""MD""","""Custom Designed Solutions""","""customdesignedsolutions.com""","""\N""","""Services-Computer Programming …","""7371""","""Services-Computer Programming …","""7371""","""3401 Forsythia Ln""","""Burtonsville""","""MD""","""20866""","""linkedin.com/company/custom-de…","""Under 1 Million""","""1 to 10""","""List-4 - 1,004,842 Contacts Fo…"
"""David""","""Callahan""","""d.callahan@putt-putt.com""","""Ceo""","""9194019759""","""\N""","""\N""","""\N""","""Putt-Putt""","""putt-putt.com""","""\N""","""Services-Management Consulting…","""8742""","""Amusement And Recreation, N.E.…","""7999; 8742""","""\N""","""Winston Salem""","""NC""","""27199""","""\N""","""25 Million to 50 Miliion""","""101 to 250""","""List-4 - 1,004,842 Contacts Fo…"
"""Brandon""","""Allison""","""b.allison@aatac.com""","""Chief Executive""","""7043924624""","""\N""","""\N""","""\N""","""Aatac Inc""","""aatac.com""","""\N""","""Retail-Eating Places""","""5812""","""Services-Automotive Repair, Se…","""7500; 5013; 5812; 3713""","""4000 Sam Wilson Road""","""Charlotte""","""NC""","""28214""","""linkedin.com/company/aatac-inc""","""5 Million to 10 Million""","""26 to 50""","""List-4 - 1,004,842 Contacts Fo…"
"""Georgia""","""Reynolds""","""g.reynolds@reynoldsbenefits.co…","""Ceo""","""7328420808""","""\N""","""\N""","""\N""","""Reynolds Benefits""","""reynoldsbenefits.com""","""\N""","""Insurance Agents, Brokers & Se…","""6411""","""Insurance Agents, Brokers & Se…","""6411""","""655 Shrewsbury Ave""","""Shrewsbury""","""NJ""","""7702""","""linkedin.com/company/reynolds-…","""Under 1 Million""","""1 to 10""","""List-4 - 1,004,842 Contacts Fo…"
"""Dennis""","""Huang""","""huang@folica.com""","""Chief Executive Officergeneral…","""6098608430""","""\N""","""\N""","""\N""","""Folica Inc.""","""folica.com""","""8889194247""","""Miscellaneous Retail Stores, N…","""5999""","""Wholesale-Durable Goods, Nec; …","""5099; 5999; 8742""","""315 Madison Avenue""","""New York""","""NY""","""10017""","""linkedin.com/company/folica-in…","""25 Million to 50 Miliion""","""101 to 250""","""List-4 - 1,004,842 Contacts Fo…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Michael""","""Zasloff""","""mzasloff@captivatepharma.com""","""Chief Executive Officer""","""2678936625""","""\N""","""\N""","""\N""","""Captivate Pharmaceuticals""","""captivatepharma.com""","""\N""","""Services-Commercial Physical &…","""8731""","""Services-Commercial Physical &…","""8731""","""3805 Old Easton Rd""","""Doylestown""","""PA""","""18902""","""\N""","""Under 1 Million""","""1 to 10""","""List-1 - 1,006,603 Contacts Fo…"
"""Larry""","""Ryback""","""larry.j-ryback@tijuanaflats.co…","""Ceo""","""4078596190""","""\N""","""\N""","""\N""","""Tijuana Flats Tex-Mex""","""tijuanaflats.com""","""\N""","""Retail-Eating Places""","""5812""","""Retail-Eating Places""","""5812""","""4224 Virginia Beach Blvd""","""Maitland""","""FL""","""32751""","""linkedin.com/company/tijuana-f…","""250 Million to 500 Million""","""1001 to 5000""","""List-1 - 1,006,603 Contacts Fo…"
"""Rohit""","""Shukla""","""rshukla@old.larta.org""","""Ceo""","""2136942826""","""\N""","""\N""","""\N""","""Larta Institute""","""old.larta.org""","""\N""","""Services-Management Consulting…","""8742""","""Services-Management Consulting…","""8742""","""\N""","""Los Angeles""","""CA""","""90014""","""\N""","""1 Million to 5 Million""","""11 to 25""","""List-1 - 1,006,603 Contacts Fo…"
"""Thomas""","""Farr""","""thomas@outdoorexpressions.net""","""Chief Executive Officer""","""7702710851""","""\N""","""\N""","""\N""","""Outdoor Expressions""","""outdoorexpressions.net""","""\N""","""Services-Allied To Motion Pict…","""7829""","""Services-Allied To Motion Pict…","""7829""","""981 Little Rd""","""Canton""","""GA""","""30115""","""\N""","""1 Million to 5 Million""","""11 to 25""","""List-1 - 1,006,603 Contacts Fo…"


In [33]:
loan_info_df

AWARDEEORRECIPIENTLEGALENTITYNAME,LEGALENTITYADDRLINE1,LEGALENTITYCITYNAME,LEGALENTITYSTATECD,LEGALENTITYZIP5,FACEVALUEOFDIRECTLOANORLOANGUARANTEE,ORIGINALLOANSUBSIDYCOST
str,str,str,str,i64,f64,f64
"""AKIKO'S SUSHI BAR INC.""","""726 Noriega St""","""SAN FRANCISCO""","""CA""",94122,144000.0,19612.8
"""DITZLER GENERAL CONTRACTING IN…","""212 WILE AVE""","""SOUDERTON""","""PA""",18964,25300.0,3445.86
"""HERMAN CLEANERS, LLC""","""5590 KEYSTONE PINE WAY""","""DUBLIN""","""OH""",43016,100000.0,13620.0
"""RSM REALTY, LLC""","""706 OXFORD RD""","""YPSILANTI""","""MI""",48197,28600.0,3895.32
"""PINK PALETTE ARTISTS LLC""","""7738 HERON LAKES DR""","""HOUSTON""","""TX""",77064,98400.0,13402.08
…,…,…,…,…,…,…
"""Saiko Excavation LLC""","""322 S 400 W""","""Logan""","""UT""",84321,25000.0,3405.0
"""Baryonnoh Vision""","""5982 BROOKSTONE DR NW""","""CONCORD""","""NC""",28027,15000.0,2043.0
"""EW Real Estate LLC""","""3630 concord dr""","""beachwood""","""OH""",44122,25000.0,3405.0
"""MAZA ASSOCIATES LLC""","""18A VAN WAGENEN AVE""","""JERSEY CITY""","""NJ""",7306,1700.0,231.54


In [34]:
loan_info_df['AWARDEEORRECIPIENTLEGALENTITYNAME']

AWARDEEORRECIPIENTLEGALENTITYNAME
str
"""AKIKO'S SUSHI BAR INC."""
"""DITZLER GENERAL CONTRACTING IN…"
"""HERMAN CLEANERS, LLC"""
"""RSM REALTY, LLC"""
"""PINK PALETTE ARTISTS LLC"""
…
"""Saiko Excavation LLC"""
"""Baryonnoh Vision"""
"""EW Real Estate LLC"""
"""MAZA ASSOCIATES LLC"""


In [35]:
contacts_df[company_fields + owner_fields]

Company Name,Company State,Company City,Company Address,First Name,Last Name,Business Email,Mobile Phone
str,str,str,str,str,str,str,str
"""Custom Designed Solutions""","""MD""","""Burtonsville""","""3401 Forsythia Ln""","""Donna""","""Chernisky""","""donna@customdesignedsolutions.…","""3012360450"""
"""Putt-Putt""","""NC""","""Winston Salem""","""\N""","""David""","""Callahan""","""d.callahan@putt-putt.com""","""9194019759"""
"""Aatac Inc""","""NC""","""Charlotte""","""4000 Sam Wilson Road""","""Brandon""","""Allison""","""b.allison@aatac.com""","""7043924624"""
"""Reynolds Benefits""","""NJ""","""Shrewsbury""","""655 Shrewsbury Ave""","""Georgia""","""Reynolds""","""g.reynolds@reynoldsbenefits.co…","""7328420808"""
"""Folica Inc.""","""NY""","""New York""","""315 Madison Avenue""","""Dennis""","""Huang""","""huang@folica.com""","""6098608430"""
…,…,…,…,…,…,…,…
"""Captivate Pharmaceuticals""","""PA""","""Doylestown""","""3805 Old Easton Rd""","""Michael""","""Zasloff""","""mzasloff@captivatepharma.com""","""2678936625"""
"""Tijuana Flats Tex-Mex""","""FL""","""Maitland""","""4224 Virginia Beach Blvd""","""Larry""","""Ryback""","""larry.j-ryback@tijuanaflats.co…","""4078596190"""
"""Larta Institute""","""CA""","""Los Angeles""","""\N""","""Rohit""","""Shukla""","""rshukla@old.larta.org""","""2136942826"""
"""Outdoor Expressions""","""GA""","""Canton""","""981 Little Rd""","""Thomas""","""Farr""","""thomas@outdoorexpressions.net""","""7702710851"""


In [36]:
owners_df[company_fields + owner_fields]

Company Name,Company State,Company City,Company Address,First Name,Last Name,Business Email,Mobile Phone
str,str,str,str,str,str,str,str
"""Luminary Holistics Llc""","""NM""","""Albuquerque""","""11811 Menaul Blvd Ne""","""Wendy""","""Bodin""","""wendy.bodin@luminaryhealth.com""","""5055037276"""
"""Academy Of Intl Bartending""","""CA""","""San Diego""","""\N""","""Keith""","""Frantz""","""keith@cardsandcocktails.com""","""8585606499"""
"""The Corked Canvas""","""\N""","""\N""","""\N""","""Kristy""","""Goggio""","""kristy@thecorkedcanvas.com""","""2623788338"""
"""Lonnies Best Taste Of Chicago""","""KY""","""Louisville""","""121 Saint Matthews Avenue""","""M""","""Tedburke""","""tedburke@insightbb.com""","""2708263593"""
"""The Pleasure Chest""","""CA""","""New York""","""1150 2nd Ave Frnt 1""","""Jodi""","""Bass""","""jodi@thepleasurechest.com""","""7758832225"""
…,…,…,…,…,…,…,…
"""Derdan Enterprises""","""NV""","""Las Vegas""","""7255 W Sunset Rd""","""Daniel""","""Huckans""","""dan.huckans@derdanenterprises.…","""8568161577"""
"""Cher A Bumps Assoc Inc""","""OK""","""Oklahoma City""","""2601 Nw Expressway""","""Tim""","""Charter""","""charter@cabainc.com""","""4058587361, 4058400231"""
"""Polish Me""","""NJ""","""Jersey City""","""\N""","""Jennifer""","""Resnick""","""jresnick@polishmenailspa.com""","""2016531100"""
"""Shred Direct Llc""","""OH""","""Plain City""","""\N""","""Mark""","""Southworth""","""m.southworth@shreddirect.com""","""6147333873"""


In [37]:
contact_info_df = pl.concat([contacts_df[company_fields + owner_fields + ['filename']],
                             owners_df[company_fields + owner_fields + ['filename']],
                             b2b_df[company_fields + owner_fields + ['filename']],
                             apollo_82_df[company_fields + owner_fields + ['filename']],
                             zoom_df[company_fields + owner_fields + ['filename']]])
                            

In [38]:
# Now clear the original dataframes to free up memory
del owners_df
del contacts_df
del b2b_df
del apollo_82_df
del zoom_df

In [39]:
contact_info_df

Company Name,Company State,Company City,Company Address,First Name,Last Name,Business Email,Mobile Phone,filename
str,str,str,str,str,str,str,str,str
"""Custom Designed Solutions""","""MD""","""Burtonsville""","""3401 Forsythia Ln""","""Donna""","""Chernisky""","""donna@customdesignedsolutions.…","""3012360450""","""List-4 - 1,004,842 Contacts Fo…"
"""Putt-Putt""","""NC""","""Winston Salem""","""\N""","""David""","""Callahan""","""d.callahan@putt-putt.com""","""9194019759""","""List-4 - 1,004,842 Contacts Fo…"
"""Aatac Inc""","""NC""","""Charlotte""","""4000 Sam Wilson Road""","""Brandon""","""Allison""","""b.allison@aatac.com""","""7043924624""","""List-4 - 1,004,842 Contacts Fo…"
"""Reynolds Benefits""","""NJ""","""Shrewsbury""","""655 Shrewsbury Ave""","""Georgia""","""Reynolds""","""g.reynolds@reynoldsbenefits.co…","""7328420808""","""List-4 - 1,004,842 Contacts Fo…"
"""Folica Inc.""","""NY""","""New York""","""315 Madison Avenue""","""Dennis""","""Huang""","""huang@folica.com""","""6098608430""","""List-4 - 1,004,842 Contacts Fo…"
…,…,…,…,…,…,…,…,…
"""Harvey Mudd College""","""CA""","""Claremont""","""301 Platt Blvd""","""Ran""","""Libeskind-Hadas""","""libeskind-hadas_ran@hmc.edu""","""909-621-8000""","""List # 08_1,039,704 Contacts N…"
"""Niagara University""","""NY""","""Niagara University""","""Po Box 2008""","""Theodore""","""Darlak""","""darlak_theodore@niagara.edu""","""716-286-8200""","""List # 08_1,039,704 Contacts N…"
"""Boston Public Schools""","""MA""","""Boston""","""26 Court St""","""Arleen""","""Thompson""","""thompson_arleen@boston.k12.ma.…","""617-635-9000""","""List # 08_1,039,704 Contacts N…"
"""CA Inc""","""NY""","""Islandia""","""1 Computer Associates Plz""","""Darrell""","""Walker""","""darrell.walker@ca.com""","""800-225-5224""","""List # 08_1,039,704 Contacts N…"


In [40]:
def normalize_names_with_pandas(series):
    # Convert Polars Series to Pandas Series
    pandas_series = series.to_pandas()
    
    # Lowercase
    pandas_series = pandas_series.str.lower()
    # Remove all punctuation including periods and other non-alphanumeric characters, and replace with a space
    pandas_series = pandas_series.str.replace(r"[^\w\s]", " ", regex=True)
    # Normalize spaces: replace one or more whitespace characters with a single space
    pandas_series = pandas_series.str.replace(r"\s+", " ", regex=True)
    # Strip any leading or trailing whitespace
    pandas_series = pandas_series.str.strip()
    
    # Convert back to Polars Series
    return pl.Series(pandas_series.name, pandas_series.tolist())

# Example usage with your data
loan_info_names = loan_info_df['AWARDEEORRECIPIENTLEGALENTITYNAME']
contacts_names = contact_info_df['Company Name']

normalized_loan_names = normalize_names_with_pandas(loan_info_names)
normalized_contacts_names = normalize_names_with_pandas(contacts_names)

# Use with_columns to add the normalized names to the DataFrames
loan_info_df = loan_info_df.with_columns([normalized_loan_names.alias('Normalized Name')])
contact_info_df = contact_info_df.with_columns([normalized_contacts_names.alias('Normalized Name')])

### Remove large companies from dataset

In [None]:
# Now that we have normalized names filter out big companies from the both dataframes
companies_to_exclude = ['uber', 'lyft', 'doordash', 'instacart', 'world financial group', 'door dash', 'airbnb', 'postmates',
                        'amazon flex', 'grubhub', 'northwestern mutual', 'amazon flex', 'yellow cab', 'coldwell banker']

# Create a regular expression pattern that matches any of the company names
pattern = '|'.join(companies_to_exclude)

# Filter out rows with company names to exclude
filtered_df = loan_info_df.filter(~loan_info_df['Normalized Name'].str.contains(pattern))

with pl.Config(tbl_rows=400):
    print(filtered_df['Normalized Name'].value_counts().sort("count", descending=True).head(50))


shape: (50, 2)
┌────────────────────────┬───────┐
│ Normalized Name        ┆ count │
│ ---                    ┆ ---   │
│ str                    ┆ u32   │
╞════════════════════════╪═══════╡
│ n a                    ┆ 650   │
│ self employed          ┆ 552   │
│ none                   ┆ 330   │
│ independent contractor ┆ 299   │
│ na                     ┆ 196   │
│ phuong nguyen          ┆ 84    │
│ taxi driver            ┆ 82    │
│ thuy nguyen            ┆ 78    │
│ self                   ┆ 72    │
│ pro nails              ┆ 72    │
│ jose rodriguez         ┆ 67    │
│ vipkid                 ┆ 61    │
│ driver                 ┆ 61    │
│ top nails              ┆ 59    │
│ transportation         ┆ 59    │
│ thanh nguyen           ┆ 57    │
│ michael                ┆ 55    │
│ thao nguyen            ┆ 52    │
│ trang nguyen           ┆ 52    │
│ star nails             ┆ 50    │
│ real estate            ┆ 47    │
│ happy nails            ┆ 47    │
│ thu nguyen             ┆ 47    │
│ tua

In [None]:
loan_info_df = loan_info_df.filter(~loan_info_df['Normalized Name'].str.contains(pattern))
contact_info_df = contact_info_df.filter(~contact_info_df['Normalized Name'].str.contains(pattern))

### Combine the loan dataset with contacts dataset

In [None]:
import polars as pl
from fuzzywuzzy import fuzz

# Join the on the 'Normalized Name' column. This giv
matching_rows = loan_info_df.with_row_index().join(
    contact_info_df,
    on="Normalized Name",
    how="inner",
    suffix="_contacts"
)

### Read existing Zapp dataset to remove duplicates

In [None]:
import pandas as pd
zapp_df = pd.read_csv('data/db_existing/Cell Phone Numbers - Business Owners MAIN/ZappDataEIDLholders210k/210KDataZappEIDLholders.csv', encoding='ISO-8859-1')
zapp_df = zapp_df.astype(str)

zapp_df = pl.DataFrame(zapp_df)

normalized_zapp_names = normalize_names_with_pandas(zapp_df['AWARDEEORRECIPIENTLEGALENTITYNAME'])
zapp_df = zapp_df.with_columns([normalized_zapp_names.alias('Normalized Name')])

  zapp_df = pd.read_csv('data/db_existing/Cell Phone Numbers - Business Owners MAIN/ZappDataEIDLholders210k/210KDataZappEIDLholders.csv', encoding='ISO-8859-1')


In [None]:
multi_col_matches = zapp_df.join(
    matching_rows,
    on=pl.col("Normalized Name"),
    how="inner"
).filter(
    (pl.col("Dzemail") != "nan") & (pl.col("Dzemail") == pl.col("Business Email")) |
    (pl.col("Dzcell") != "nan") & (pl.col("Dzcell") == pl.col("Mobile Phone"))
)

# Exclude the matched rows based on index from the original dataframe
matching_rows_filtered = matching_rows.filter(~matching_rows['index'].is_in(multi_col_matches['index']))

# Output the results
print(f"Number of multi-column matches: {len(multi_col_matches)}")
print(f"Number of good matches after excluding zapp: {len(matching_rows_filtered)}")


Number of multi-column matches: 7850
Number of good matches after excluding zapp: 30390813


In [None]:
matching_rows.shape

(30642613, 18)

## Now we try to figure out which matched pairs are actually the same company. We start by considering companies with a very similar address (above 70 seems to be a good cutoff)

In [None]:
# Define a function for fuzzy matching addresses
def fuzzy_match_addresses(row: dict) -> int:
    # This function assumes the DataFrame has the necessary address columns named appropriately
    return fuzz.ratio(row['LEGALENTITYADDRLINE1'], row['Company Address'])

# Apply fuzzy matching to the address columns and create a new column 'AddrSimilarity'
matching_rows_filtered = matching_rows_filtered.with_columns(
    pl.struct(pl.all()).map_elements(fuzzy_match_addresses, return_dtype=pl.Int64).alias('AddrSimilarity')
)

good_matches_df = matching_rows_filtered.filter((pl.col('AddrSimilarity') > 70))

In [None]:
# Ok, now remove the rows that have already been matched in good matches from matching_rows
matching_rows_1 = matching_rows_filtered.filter(~matching_rows_filtered['index'].is_in(good_matches_df['index']))


In [None]:
len(good_matches_df), len(matching_rows_1), len(matching_rows_filtered)

(372882, 27643461, 30390813)

### Now, add in entries in the same city

In [None]:
# Now, filter out rows where LEGALENTITYSTATECD and Company State are not equal
matching_rows_2 = matching_rows_1.filter(matching_rows_1['LEGALENTITYSTATECD'].str.to_uppercase() == matching_rows_1['Company State'].str.to_uppercase())

# Filter by city
matching_rows_2 = matching_rows_2.filter(matching_rows_2['LEGALENTITYCITYNAME'].str.to_uppercase() == matching_rows_2['Company City'].str.to_uppercase())

print(f"Number of probable unique matches: {len(matching_rows_2)} / {len(matching_rows_1)}")

Number of probable unique matches: 555559 / 27643461


In [None]:
# add matching rows to good matches and remove from matching rows
good_matches_df_2 = pl.concat([good_matches_df, matching_rows_2])

matching_rows_3 = matching_rows_1.filter(~matching_rows_1['index'].is_in(matching_rows_2['index']))

print(f"Current good matches: {len(good_matches_df_2)} with {len(matching_rows_3)} remaining")

Current good matches: 928441 with 22373986 remaining


In [None]:
good_matches_df_2[interesting_cols]

AWARDEEORRECIPIENTLEGALENTITYNAME,LEGALENTITYADDRLINE1,LEGALENTITYCITYNAME,LEGALENTITYSTATECD,LEGALENTITYZIP5,FACEVALUEOFDIRECTLOANORLOANGUARANTEE,ORIGINALLOANSUBSIDYCOST
str,str,str,str,i64,f64,f64
"""TubeMaster, Inc.""","""8008 Vinecrest Ave.""","""Louisville""","""KY""",40222,150000.0,20430.0
"""IntelliFusion Technologies""","""6 Fielek Ter""","""Parlin""","""NJ""",8859,12300.0,1675.26
"""Logical Net Corporation""","""2345 Maxon Rd Ext""","""Schenectady""","""NY""",12308,150000.0,20430.0
"""Nashville Party Authority""","""704 Sandburg Pl""","""Nashville""","""TN""",37214,25000.0,3405.0
"""DESERT SKY PROPERTIES""","""7373 E. Doubletree Ranch Road,…","""SCOTTSDALE""","""AZ""",85258,10000.0,1362.0
…,…,…,…,…,…,…
"""CITRIX SYSTEMS INC""","""851 W CYPRESS CREEK""","""FORT LAUDERDALE""","""FL""",33309,49900.0,6796.38
"""Phillips Foods, Inc.""","""3761 COMMERCE DR STE 413""","""BALTIMORE""","""MD""",21227,150000.0,20430.0
"""Revlon Inc""","""1 New York Plaza, New York, NY…","""New York""","""NY""",10004,47500.0,6469.5
"""Revlon Inc""","""1 New York Plaza, New York, NY…","""New York""","""NY""",10004,-47500.0,-6469.5


In [None]:
good_matches_df_2.filter(good_matches_df_2['First Name'].str.contains('Ananth'))

index,AWARDEEORRECIPIENTLEGALENTITYNAME,LEGALENTITYADDRLINE1,LEGALENTITYCITYNAME,LEGALENTITYSTATECD,LEGALENTITYZIP5,FACEVALUEOFDIRECTLOANORLOANGUARANTEE,ORIGINALLOANSUBSIDYCOST,Normalized Name,Company Name,Company State,Company City,Company Address,First Name,Last Name,Business Email,Mobile Phone,filename,AddrSimilarity
u32,str,str,str,str,i64,f64,f64,str,str,str,str,str,str,str,str,str,str,i64
277308,"""IntelliFusion Technologies""","""6 Fielek Ter""","""Parlin""","""NJ""",8859,12300.0,1675.26,"""intellifusion technologies""","""Intellifusion Technologies""","""NJ""","""Parlin""","""6 Fielek Terrace""","""Ananth""","""Godavari""","""ananth@intellifusion.com""","""4087541690""","""List-4 - 1,004,842 Contacts Fo…",86
3403719,"""Lam Research Corp""","""4650 Cushing Blvd""","""fremont""","""CA""",94538,16400.0,2233.68,"""lam research corp""","""Lam Research Corp""","""CA""","""Fremont""","""4650 Cushing Pkwy""","""Ananth""","""Indrakanti""","""ananth.indrakanti@lamrc.com""",,"""CA_B2B_23_1.csv""",76
3403719,"""Lam Research Corp""","""4650 Cushing Blvd""","""fremont""","""CA""",94538,16400.0,2233.68,"""lam research corp""","""Lam Research Corp""","""CA""","""Fremont""","""4650 Cushing Pkwy""","""Ananth""","""Indrakanti""","""ananth.indrakanti@lamrc.com""",,"""CA_B2B_23_1.csv""",76
1910084,"""BAISYS CONSULTING LLC""","""50 MAIN ST STE 1000""","""WHITE PLAINS""","""NY""",10606,46800.0,6374.16,"""baisys consulting llc""","""Baisys Consulting Llc""","""NY""","""White Plains""","""50 M St # 1000""","""Ananth""","""Bobbili""","""a.bobbili@baisys.com""",,"""NY_B2B_23_3.csv""",73
1910084,"""BAISYS CONSULTING LLC""","""50 MAIN ST STE 1000""","""WHITE PLAINS""","""NY""",10606,46800.0,6374.16,"""baisys consulting llc""","""Baisys Consulting Llc""","""NY""","""White Plains""","""50 M St # 1000""","""Ananth""","""Bobbili""","""bobbili@baisys.com""",,"""NY_B2B_23_3.csv""",73
2277286,"""AgReliant Genetics, LLC""","""1122 E 169TH ST""","""WESTFIELD""","""IN""",46074,148700.0,20252.94,"""agreliant genetics llc""","""Agreliant Genetics Llc""","""IN""","""Westfield""","""1122 E 169Th St""","""Anantharaman""","""Velayudham""","""anantharaman.velayudham@agreli…","""317-896-5551""","""List # 21_1,035,777 Contacts N…",87
3229695,"""Kindred healthcare inc""","""680 South Fourth Street""","""Louisville""","""KY""",40202,27300.0,3718.26,"""kindred healthcare inc""","""Kindred Healthcare, Inc.""","""KY""","""Louisville""","""680 S. 4th Street""","""Ananth""","""o'brien""","""ananth_obrien@kindredhealthcar…","""502-596-7300""","""List # 44_1,034,952 Contacts N…",75


In [None]:
export_col_set = interesting_cols + owner_fields + ['filename'] 
export_col_set

['AWARDEEORRECIPIENTLEGALENTITYNAME',
 'LEGALENTITYADDRLINE1',
 'LEGALENTITYCITYNAME',
 'LEGALENTITYSTATECD',
 'LEGALENTITYZIP5',
 'FACEVALUEOFDIRECTLOANORLOANGUARANTEE',
 'ORIGINALLOANSUBSIDYCOST',
 'First Name',
 'Last Name',
 'Business Email',
 'Mobile Phone',
 'filename']

In [None]:
good_matches_df_2 = good_matches_df_2.with_columns(
    (pl.col('LEGALENTITYADDRLINE1').is_not_null().cast(int) +
     pl.col('LEGALENTITYCITYNAME').is_not_null().cast(int) +
     pl.col('LEGALENTITYSTATECD').is_not_null().cast(int) +
     pl.col('LEGALENTITYZIP5').is_not_null().cast(int) +
     pl.col('First Name').is_not_null().cast(int) +
     pl.col('Last Name').is_not_null().cast(int) +
     pl.col('Business Email').is_not_null().cast(int) +
     pl.col('Mobile Phone').is_not_null().cast(int) +
     pl.col('FACEVALUEOFDIRECTLOANORLOANGUARANTEE').is_not_null().cast(int) +
     pl.col('ORIGINALLOANSUBSIDYCOST').is_not_null().cast(int)
    ).alias('info_count')
)

In [None]:
good_matches_df_3 = good_matches_df_2.clone()

for column_sets in (('Business Email',), ('Mobile Phone',), ('First Name', 'Last Name')):
    dup_columns = ['index'] + list(column_sets)
    initial_count = len(good_matches_df_3)
    good_matches_df_3 = good_matches_df_3.unique(subset=dup_columns, keep='first')
    
    print(f"Removed {initial_count - len(good_matches_df_3)} duplicates based on {column_sets}")
print(f"Final count: {len(good_matches_df_3)}")

Removed 76206 duplicates based on ('Business Email',)
Removed 711624 duplicates based on ('Mobile Phone',)
Removed 3565 duplicates based on ('First Name', 'Last Name')
Final count: 137046


In [None]:
good_matches_df_3[export_col_set].write_csv('good_matches_6.csv')

In [None]:
good_matches_df_3['Business Email'].value_counts().sort(descending=True, by='count')

Business Email,count
str,u32
"""vincent.scarpinato@usaa.com""",23
"""mooremj@aafes.com""",23
"""joanw@microsoft.com""",23
"""schonhol@musc.edu""",23
"""donwi@microsoft.com""",23
…,…
"""al@litegear.com""",1
"""d.ammerman@ammermanexperience.…",1
"""michael@gconspiracy.com""",1
"""l.speaker@speakerlaw.com""",1


In [None]:
# Filter rows where 'AddrSimilarity' is greater than 80
filtered_rows = matching_rows.filter((pl.col('AddrSimilarity') > 70) & (pl.col('AddrSimilarity') < 80))

# Print the filtered rows to verify
display(filtered_rows[['Company Name', 'LEGALENTITYADDRLINE1', 'Company Address', 'AddrSimilarity']])

# Count occurrences of each index in the result
index_counts = filtered_rows['index'].value_counts()

print(f"Number of unique index entries: {(index_counts['count'] == 1).sum()} out of {len(index_counts)}")


Company Name,LEGALENTITYADDRLINE1,Company Address,AddrSimilarity
str,str,str,i64
"""Desert Sky Properties""","""7373 E. Doubletree Ranch Road,…","""7373 E Doubletree Ranch Rd""",78
"""Comprehensive Resources, Inc.""","""1663 East 17th Street 2nd Flo…","""1663 East 17 Street""",75
"""Revitalize Charging Solutions,…","""1120 South Freeway, Suite 209""","""1120 South Freeway""",77
"""Synergy World, Inc.""","""12625 HIgh Bluff Drive 3208""","""12625 High Bluff Dr""",78
"""Klickpicks Llc""","""136 East 57th Street, 10th Flo…","""136 East 57th Street""",77
…,…,…,…
"""Ginza Usa""","""342 1/4 E FIRST ST""","""342 1/4 E 1st St""",71
"""Barbara Bourne Photography""","""14 Healdsburg Ave. Suite D""","""14 Healdsburg Ave""",79
"""Barbara Bourne Photography""","""14 Healdsburg Ave. Suite D""","""14 Healdsburg Ave""",79
"""Ark Solutions, Inc.""","""1939 Roland Clarke Pl. Suite…","""1939 Roland Clarke Place""",75


Number of unique index entries: 2274 out of 3896
