In [36]:
!pip install fpdf


Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py): started
  Building wheel for fpdf (setup.py): finished with status 'done'
  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40713 sha256=4d83177fc004d777ebd201785f7855482b1a3f833e22e690e82c1f62111bc6f9
  Stored in directory: c:\users\maria\appdata\local\pip\cache\wheels\65\4f\66\bbda9866da446a72e206d6484cd97381cbc7859a7068541c36
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2


In [4]:
import pandas as pd

df_contracts = pd.read_csv("CLEANED_NYC_AWARDED_CONTRACTS.csv")
df_registry = pd.read_csv("cleaned_contractors.csv") 


In [23]:
# Find duplicate business names
duplicates = df_registry[df_registry.duplicated(subset=["Business Name"], keep=False)]


print("Duplicate Business Names Found:")
print(duplicates)


Duplicate Business Names Found:
     Certificate Number                       Business Name DBA Name  \
745         24-64G7T-CR                        AVISIGHT LLC   NO DBA   
746         25-64XZA-CR                        AVISIGHT LLC   NO DBA   
1023        24-635FX-CR                 BOVE INDUSTRIES INC   NO DBA   
1024        24-64NDK-CR                 BOVE INDUSTRIES INC   NO DBA   
2725        24-64N79-CR  GRACE CONTRACTING  DEVELOPMENT LLC   NO DBA   
2726        25-64VDG-CR  GRACE CONTRACTING  DEVELOPMENT LLC   NO DBA   
2770        24-639H6-CR            GREENWOOD INDUSTRIES INC   NO DBA   
2771        24-64D4A-CR            GREENWOOD INDUSTRIES INC   NO DBA   
5149        24-63HTP-CR               PRO ARCHITECTURAL LLC   NO DBA   
5150        24-63H1N-CR               PRO ARCHITECTURAL LLC   NO DBA   

     Business Type Business is MWBE Owned Business is Publicly Traded  \
745            LLC                     No                          No   
746            LLC           

In [11]:
import pandas as pd
from rapidfuzz import process, fuzz


df_contracts = pd.read_csv("CLEANED_NYC_AWARDED_CONTRACTS.csv")
df_registry = pd.read_csv("cleaned_contractors.csv")

# Clean and Standardize Business Names
df_contracts["Prime Vendor"] = df_contracts["Prime Vendor"].str.upper().str.strip().str.replace(r"[^A-Z0-9 ]", "", regex=True)
df_contracts["Sub Vendor"] = df_contracts["Sub Vendor"].fillna("").str.upper().str.strip().str.replace(r"[^A-Z0-9 ]", "", regex=True)
df_registry["Business Name"] = df_registry["Business Name"].str.upper().str.strip().str.replace(r"[^A-Z0-9 ]", "", regex=True)
df_registry["DBA Name"] = df_registry["DBA Name"].fillna("").str.upper().str.strip().str.replace(r"[^A-Z0-9 ]", "", regex=True)

# Create Lookup Dictionary (Business Name + DBA Name)
vendor_lookup = {}
for _, row in df_registry.iterrows():
    names = {row["Business Name"]}
    if row["DBA Name"]:
        names.add(row["DBA Name"])
    
    for name in names:
        vendor_lookup.setdefault(name, []).append(row.to_dict())


# Sample Check
sample_vendor = df_contracts["Prime Vendor"].iloc[0]
print(f"Sample Prime Vendor: {sample_vendor}")
print(f"Exact Match in Lookup?: {'YES' if sample_vendor in vendor_lookup else 'NO'}")

# Apply Exact Matching
def get_exact_match(vendor_name):
    if vendor_name == "NOT APPLICABLE":
        return "Not Applicable"
    return "Exact Match" if vendor_name in vendor_lookup else "No Match"

df_contracts["Prime Vendor Match"] = df_contracts["Prime Vendor"].map(get_exact_match)
df_contracts["Sub Vendor Match"] = df_contracts["Sub Vendor"].map(get_exact_match)


# Apply Fuzzy Matching for Unmatched Vendors
def rapid_fuzzy_match(vendor_name):
    if vendor_name in ["No Match", "NOT APPLICABLE"]:
        return vendor_name
    match = process.extractOne(vendor_name, list(vendor_lookup.keys()), scorer=fuzz.ratio, score_cutoff=90)
    return f"Fuzzy Match: {match[0]}" if match else "No Match"

df_contracts.loc[df_contracts["Prime Vendor Match"] == "No Match", "Prime Vendor Match"] = df_contracts["Prime Vendor"].apply(rapid_fuzzy_match)
df_contracts.loc[df_contracts["Sub Vendor Match"] == "No Match", "Sub Vendor Match"] = df_contracts["Sub Vendor"].apply(rapid_fuzzy_match)

# Ensure Properly Matched Contracts
df_matched = df_contracts[
    (
        (df_contracts["Prime Vendor Match"] != "No Match") & 
        (df_contracts["Sub Vendor Match"] != "No Match")
    ) |  
    (
        (df_contracts["Prime Vendor Match"] != "No Match") & 
        (df_contracts["Sub Vendor Match"] == "Not Applicable")
    ) |  
    (
        (df_contracts["Prime Vendor Match"] == "Not Applicable") & 
        (df_contracts["Sub Vendor Match"] != "No Match")
    )    
]

#  Matched Contracts File
df_matched.to_csv("Matched_Contracts.csv", index=False)

# number of Matches & Unmatched Vendors
exact_matches_prime = df_contracts["Prime Vendor Match"].str.contains("Exact Match", na=False).sum()
exact_matches_sub = df_contracts["Sub Vendor Match"].str.contains("Exact Match", na=False).sum()
fuzzy_matches_prime = df_contracts["Prime Vendor Match"].str.contains("Fuzzy Match", na=False).sum()
fuzzy_matches_sub = df_contracts["Sub Vendor Match"].str.contains("Fuzzy Match", na=False).sum()
unmatched_prime = (df_contracts["Prime Vendor Match"] == "No Match").sum()
unmatched_sub = (df_contracts["Sub Vendor Match"] == "No Match").sum()

# Print Matching Summary
print("**Matching Summary**")
print(f"Prime Vendor - Exact Matches: {exact_matches_prime}")
print(f"Prime Vendor - Fuzzy Matches: {fuzzy_matches_prime}")
print(f"Prime Vendor - No Match: {unmatched_prime}")
print("-" * 40)
print(f"Sub Vendor - Exact Matches: {exact_matches_sub}")
print(f"Sub Vendor - Fuzzy Matches: {fuzzy_matches_sub}")
print(f"Sub Vendor - No Match: {unmatched_sub}")
print("=" * 50)


print("Corrected matched contracts saved as 'Matched_Contracts.csv'")


Sample Prime Vendor: WILDLIFE CONSERVATION SOCIETY
Exact Match in Lookup?: NO
**Matching Summary**
Prime Vendor - Exact Matches: 1691
Prime Vendor - Fuzzy Matches: 189
Prime Vendor - No Match: 387
----------------------------------------
Sub Vendor - Exact Matches: 883
Sub Vendor - Fuzzy Matches: 193
Sub Vendor - No Match: 633
Corrected matched contracts saved as 'Matched_Contracts_Corrected.csv'


In [13]:

# DBA Names that appear in Prime Vendor or Sub Vendor
dba_matches_prime = df_registry[df_registry["DBA Name"].isin(df_contracts["Prime Vendor"])]
dba_matches_sub = df_registry[df_registry["DBA Name"].isin(df_contracts["Sub Vendor"])]

# Combine results into one DataFrame
dba_matches = pd.concat([dba_matches_prime, dba_matches_sub]).drop_duplicates()

# Save results
dba_matches.to_csv("DBA_Matches.csv", index=False)

# Print results
print("DBA Name Matches Found!")
print(f"DBA matches saved to DBA_Matches.csv")
print(f"{len(dba_matches_prime)} matches found in Prime Vendor column")
print(f"{len(dba_matches_sub)} matches found in Sub Vendor column")




DBA Name Matches Found!
DBA matches saved to DBA_Matches.csv
20 matches found in Prime Vendor column
37 matches found in Sub Vendor column


In [22]:
#block to build lookup profile
import tkinter as tk
from tkinter import messagebox, scrolledtext


# Load Data
df_registry = pd.read_csv("cleaned_contractors.csv")  
df_contracts = pd.read_csv("Matched_Contracts.csv") 

# Standardize Business Names
df_registry["Business Name"] = df_registry["Business Name"].str.upper().str.strip()
df_registry["DBA Name"] = df_registry["DBA Name"].fillna("").str.upper().str.strip()
df_contracts["Prime Vendor"] = df_contracts["Prime Vendor"].str.upper().str.strip()
df_contracts["Sub Vendor"] = df_contracts["Sub Vendor"].fillna("").str.upper().str.strip()

# Lookup Dictionary for Business
vendor_lookup = df_registry.groupby("Business Name").apply(lambda x: x.to_dict(orient="records")).to_dict()

# Business Search Function
def search_business():
    search_term = entry_search.get().strip().upper()
    if not search_term:
        messagebox.showerror("Error", "Please enter a Business Name!")
        return

    # Remove "Fuzzy Match: " if present
    search_term_cleaned = search_term.replace("Fuzzy Match: ", "").strip()

    # Exact Match Check
    business_data = vendor_lookup.get(search_term_cleaned)

    if not business_data:
        # Perform fuzzy matching
        match_result = process.extractOne(search_term_cleaned, df_registry["Business Name"].unique(), scorer=fuzz.ratio, score_cutoff=75)
        
        if match_result:
            best_match = match_result[0]
            business_data = vendor_lookup.get(best_match)
            search_term_cleaned = best_match  # Use actual vendor name
        else:
            messagebox.showinfo("No Match", f"No business found for '{search_term}'.")
            return

    # Ensure "Exact Match" Retrieves Correct Contracts
    if search_term_cleaned == "Exact Match":
        contracts = df_contracts[
            (df_contracts["Prime Vendor"] == df_contracts["Prime Vendor Match"]) |
            (df_contracts["Sub Vendor"] == df_contracts["Sub Vendor Match"])
        ]
    else:
        contracts = df_contracts[
            (df_contracts["Prime Vendor Match"].str.contains(search_term_cleaned, na=False, case=False)) | 
            (df_contracts["Sub Vendor Match"].str.contains(search_term_cleaned, na=False, case=False))
        ]

    # Build Business Profile Display
    profile_text = f"\n📌 Business Profile for {search_term_cleaned}\n" + "-" * 40
    
    for record in business_data:
        profile_text += f"""
🏢 Address: {record.get('Address', 'N/A')}
📍 City: {record.get('City', 'N/A')}, {record.get('State', 'N/A')} {record.get('Zip Code', 'N/A')}
☎️ Phone: {record.get('Phone', 'N/A')}
🏢 MWBE Owned: {record.get('Business is MWBE Owned', 'N/A')}
🚫 Debarment Status: {record.get('Business has been debarred', 'N/A')}
🏗️ Apprenticeship Program: {record.get('Business is associated with an apprenticeship program', 'N/A')}
--------------------------------------
"""

    # Build Contract Details
    contract_text = "\n📑 Contracts Associated with This Business:\n" if not contracts.empty else "❌ No contracts found for this business."

    for _, row in contracts.iterrows():
        contract_text += f"\n🔹 Contract ID: {row['Prime Contract ID']} | ${row['Prime Contract Current Amount']}\n"
        contract_text += f"   🔹 Contracting Agency: {row['Prime Contracting Agency']}\n"
        contract_text += f"   🔹 Start: {row['Prime Contract Start Date']} - End: {row['Prime Contract End Date']}\n"
        contract_text += "-" * 50

    # Display Results in GUI
    profile_output.delete(1.0, tk.END)
    profile_output.insert(tk.END, profile_text + contract_text)

# GUI Setup
root = tk.Tk()
root.title("Contractor Profile Lookup")
root.geometry("750x700")

# Search Box
tk.Label(root, text="Enter Business Name:", font=("Arial", 12)).pack(pady=5)
entry_search = tk.Entry(root, font=("Arial", 12), width=40)
entry_search.pack(pady=5)

# Search Button
tk.Button(root, text="Search", font=("Arial", 12), command=search_business).pack(pady=10)

# Profile Output (Scrollable)
profile_output = scrolledtext.ScrolledText(root, width=80, height=25, font=("Arial", 10))
profile_output.pack(pady=10, padx=10)

# Run the App
root.mainloop()


In [21]:
# sample rows where Prime Vendor and Sub Vendor have exact matches
exact_matches_sample = df_contracts[
    (df_contracts["Prime Vendor Match"] == "Exact Match") |
    (df_contracts["Sub Vendor Match"] == "Exact Match")
][["Prime Contract ID", "Prime Vendor", "Prime Vendor Match", "Sub Vendor", "Sub Vendor Match"]].head(10)

# Show the sample exact matches
print(exact_matches_sample)


     Prime Contract ID                          Prime Vendor  \
16  MMA107120258802791  RICHARDS PLUMBING AND HEATING CO INC   
17  MMA105720258803823    HIGH POINT CONSTRUCTION GROUP CORP   
18  MMA105720258802802                         ZHL GROUP INC   
19  MMA105720248805474                         ZHL GROUP INC   
20  MMA105720248805471                         ZHL GROUP INC   
21  MMA105720248805470                         ZHL GROUP INC   
22  MMA105720248804543             CRESCENT CONTRACTING CORP   
23  MMA105720248804449             CRESCENT CONTRACTING CORP   
24  MMA105720248804351             CRESCENT CONTRACTING CORP   
25  MMA105720248804325             CRESCENT CONTRACTING CORP   

   Prime Vendor Match      Sub Vendor Sub Vendor Match  
16        Exact Match  NOT APPLICABLE   NOT APPLICABLE  
17        Exact Match  NOT APPLICABLE   NOT APPLICABLE  
18        Exact Match  NOT APPLICABLE   NOT APPLICABLE  
19        Exact Match  NOT APPLICABLE   NOT APPLICABLE  
20        

In [10]:
# num of unique Prime Vendors
unique_prime_vendors = df_contracts["Prime Vendor"].nunique()

# Count unique Sub Vendors
unique_sub_vendors = df_contracts["Sub Vendor"].nunique()

# Count total unique vendors across both Prime and Sub Vendor columns
unique_total_vendors = pd.concat([df_contracts["Prime Vendor"], df_contracts["Sub Vendor"]]).nunique()

# Print results
print(f"Unique Prime Vendors: {unique_prime_vendors}")
print(f"Unique Sub Vendors: {unique_sub_vendors}")
print(f"Total Unique Vendors (Prime + Sub): {unique_total_vendors}")


Unique Prime Vendors: 270
Unique Sub Vendors: 732
Total Unique Vendors (Prime + Sub): 972
