In [10]:
from __future__ import annotations

import sys
from pathlib import Path
import os

def find_project_root(start: Path | None = None) -> Path:
    """
    Walk upward from `start` until a directory holding a project marker is found.

    A directory is treated as the project root when it contains any of:
    `pyproject.toml`, `.git`, or a top-level `src` entry.

    Args:
        start: Directory to begin the search from (a `Path` or a path string).
            It may be relative; it is resolved to an absolute path first so
            that every real ancestor directory is visited. Defaults to the
            current working directory.

    Returns:
        The absolute path of the first directory, starting at `start` and
        moving toward the filesystem root, that contains a marker.

    Raises:
        RuntimeError: If no directory on the way up contains a marker.
    """
    markers = ("pyproject.toml", ".git", "src")

    # `Path("sub").parents` only yields lexical parents (".") rather than the
    # real ancestors, so a relative start must be resolved before walking up.
    origin = (Path.cwd() if start is None else Path(start)).resolve()

    for candidate in (origin, *origin.parents):
        if any((candidate / marker).exists() for marker in markers):
            return candidate

    raise RuntimeError(f"Could not find project root starting from {origin}")

# Locate the repository root once; later cells build their paths from it.
PROJECT_ROOT = find_project_root()

# ensure imports work: put the project root at the front of sys.path so that
# `from src....` resolves regardless of where the notebook was launched
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# make sure the notebook’s working directory is the project root
os.chdir(PROJECT_ROOT)

# Only the last expression of a cell is rendered, so gather everything worth
# checking into a single value instead of leaving bare expressions mid-cell
# (where they are evaluated and silently discarded).
{
    "project_root": PROJECT_ROOT,
    "cwd": Path.cwd(),
    "sys_path_head": sys.path[:3],
}

In [29]:
from src.match_warning_letters_to_stores import (
    build_warning_letter_panel,
    prep_store_info_df,
    merge_warning_letters_to_stores_by_zip,
    apply_address_cleaning,
    similarity_score,
)

In [26]:
# -----------------------------------------------------------------------------
# Shared path config (single source of truth)
# -----------------------------------------------------------------------------
def get_paths(project_root: Path | None = None) -> dict[str, Path]:
    """
    Centralize all file/folder paths used by both dev cells and prod main().

    Args:
        project_root: Directory that contains the `data/` tree. When omitted,
            the module-level `PROJECT_ROOT` is used, which keeps the original
            zero-argument call working. Passing it explicitly lets callers
            that have no `PROJECT_ROOT` global reuse the same layout.

    Returns:
        dict[str, Path]: Dictionary of relevant paths with the keys
        `project_root`, `letter_path`, `warning_letters_panel_path`,
        `store_info_file_path`, `output_excel_path` and `debug_dir`.

    Side effects:
        Creates the debug directory (including missing parents) if it does
        not exist yet. No other path is created or checked for existence.
    """
    root = PROJECT_ROOT if project_root is None else Path(project_root)

    # Every path hangs off one of these two folders.
    raw_dir = root / "data" / "raw"
    processed_dir = root / "data" / "processed"

    debug_dir = processed_dir / "debug"
    debug_dir.mkdir(parents=True, exist_ok=True)

    return {
        "project_root": root,
        "letter_path": raw_dir / "warning_letters",
        "warning_letters_panel_path": (
            processed_dir / "warning_letters" / "warning_letters_panel.feather"
        ),
        "store_info_file_path": raw_dir / "pdi" / "STORE_STATUS_NEW-0.csv",
        "output_excel_path": processed_dir / "warning_letters_matched.xlsx",
        "debug_dir": debug_dir,
    }


def peek(df, name: str, n: int = 5) -> None:
    """Lightweight inspection helper for both dev and prod.

    Prints a header with `name`, the frame's shape and its column names, then
    shows the first `n` rows.

    Args:
        df: Object exposing `.shape`, `.columns` and `.head(n)`.
        name: Label written in the header line.
        n: Number of leading rows to show.

    Returns:
        None. The function only writes output.
    """
    # `display` is only available inside IPython/Jupyter. Fall back to `print`
    # so the helper also works when run as a plain script.
    try:
        render = display
    except NameError:
        render = print

    # Text goes through print: display() on a str renders its quoted repr
    # (e.g. '\n=== name ===') and emits each argument as a separate output.
    print(f"\n=== {name} ===")
    print("shape:", df.shape)
    print("cols:", list(df.columns))

    # The frame itself keeps the rich notebook rendering when available.
    render(df.head(n))

In [None]:
# create dictionary of relevant paths
# (get_paths() also creates the debug output directory if it is missing)
paths = get_paths()
# bare last expression so the notebook renders the mapping for a quick check
paths

{'project_root': WindowsPath('c:/Users/cahase/Documents/e_cigs_clean'),
 'store_info_file_path': WindowsPath('c:/Users/cahase/Documents/e_cigs_clean/data/raw/pdi/STORE_STATUS_NEW-0.csv'),
 'debug_dir': WindowsPath('c:/Users/cahase/Documents/e_cigs_clean/data/processed/debug')}

In [None]:
# Build the warning letters panel. Both locations come from the shared path
# mapping and are handed over as plain strings.
letters_dir = str(paths["letter_path"])
panel_file = str(paths["warning_letters_panel_path"])

warning_letters_df = build_warning_letter_panel(
    letter_path=letters_dir,
    letter_panel_output_file_path=panel_file,
)
peek(warning_letters_df, "warning_letters_df")


shape: (610, 9)
cols: ['store_type', 'insp_date', 'issue_date', 'store_name', 'address', 'city', 'state', 'zip_code', 'products']


Unnamed: 0,store_type,insp_date,issue_date,store_name,address,city,state,zip_code,products
0,Brick and Mortar,2023-02-25,2023-05-25,660 Vape and Smoke,660 East Blue Ridge Boulevard,Kansas City,MO,64145,Puff Plus Loquat Aloe ENDS; Puff Plus Pumpkin ...
1,Brick and Mortar,2023-03-14,2023-05-25,7-Eleven / Conoco,1499 South Federal Boulevard,Denver,CO,80219,Puff XXL Lush Ice ENDS; Puff XXL Banana Ice ENDS
2,Brick and Mortar,2023-02-05,2023-05-25,Bath General Store and Market,502 Carteret Street,Bath,NC,27808,Puff Plus Watermelon Kiwifruit Ice; Puff Plus ...
3,Brick and Mortar,2023-02-16,2023-05-25,Clark's Food Mart,26605 US Highway 264 East,Pantego,NC,27860,Puff Plus Lemon Agave ENDS; Puff Plus Tangerin...
4,Brick and Mortar,2023-02-26,2023-05-25,D and H Candy and Tobacco,127 East 23rd Street,New York,NY,10010,Puff Plus Grape ENDS


In [24]:
# Load and prepare the store info table from the raw store-status CSV.
store_info_csv = str(paths["store_info_file_path"])
store_info_df = prep_store_info_df(store_info_file_path=store_info_csv)
peek(store_info_df, "store_info_df")


=== store_info_df ===
shape: (32192, 17)
cols: ['store_id', 'store_name', 'store_chain_id', 'store_chain_name_pdi', 'store_flag', 'active_status', 'address', 'city', 'state', 'zip_code', 'latitude', 'longitude', 'chain_size', 'start_date', 'start_week_continuous_data', 'created_at', 'updated_at']


Unnamed: 0,store_id,store_name,store_chain_id,store_chain_name_pdi,store_flag,active_status,address,city,state,zip_code,latitude,longitude,chain_size,start_date,start_week_continuous_data,created_at,updated_at
0,43585,az express 1,25914,az express 4,False,A,3860 Front St,Winnsboro,LA,71295,32.163,-91.7201,2,2025-03-31,2025-03-31,2025-04-08 02:10:59.598,2025-04-08 02:10:59.598
1,43575,Friendship Travel Plaza,16770,Starlight Fuel LLC,False,A,9013 US Highway 412 N,Friendship,TN,38034,35.9009,-89.2502,4,2025-04-04,2025-03-31,2025-04-06 02:10:29.967,2025-04-06 02:10:29.967
2,43478,Corner Market Burke Rd,9009,Breaktime Corner Market,False,A,3510 Burke Rd,Pasadena,TX,77504,29.6582,-95.187,238,2024-12-26,2024-12-23,2025-03-27 02:10:13.395,2025-03-27 02:10:13.395
3,43451,Ralport Chevron,25834,Ralport enterprises,True,A,2750 US Highway 98 N,Lakeland,FL,33805,28.0774,-81.9628,1,2025-03-03,2025-03-03,2025-03-25 02:10:11.999,2025-03-25 02:10:11.999
4,43461,plainfield mart,25842,plainfield mart,False,A,50 E Main St,Carlisle,PA,17015,40.2016,-77.2809,1,2024-12-24,2024-12-23,2025-03-30 02:09:34.725,2025-03-30 02:09:34.725


In [27]:
# Match warning letters to store info by zip code
# (argument order: store table first, then the warning letters table).
merged_df = merge_warning_letters_to_stores_by_zip(store_info_df, warning_letters_df)
peek(merged_df, "merged_df")

'\n=== merged_df ==='

'shape:'

(32648, 9)

'cols:'

['address_pdi',
 'address_fda',
 'store_id',
 'store_name_pdi',
 'store_chain_name_pdi',
 'store_name_fda',
 'insp_date',
 'issue_date',
 'zip_code']

Unnamed: 0,address_pdi,address_fda,store_id,store_name_pdi,store_chain_name_pdi,store_name_fda,insp_date,issue_date,zip_code
0,3860 Front St,,43585,az express 1,az express 4,,NaT,NaT,71295
1,9013 US Highway 412 N,,43575,Friendship Travel Plaza,Starlight Fuel LLC,,NaT,NaT,38034
2,3510 Burke Rd,,43478,Corner Market Burke Rd,Breaktime Corner Market,,NaT,NaT,77504
3,2750 US Highway 98 N,,43451,Ralport Chevron,Ralport enterprises,,NaT,NaT,33805
4,50 E Main St,,43461,plainfield mart,plainfield mart,,NaT,NaT,17015


In [28]:
# Clean both address columns (store side and warning-letter side).
address_columns_to_clean = ["address_pdi", "address_fda"]
merged_df_clean = apply_address_cleaning(merged_df, address_columns=address_columns_to_clean)
peek(merged_df_clean, "merged_df_clean")



'\n=== merged_df_clean ==='

'shape:'

(32648, 9)

'cols:'

['address_pdi',
 'address_fda',
 'store_id',
 'store_name_pdi',
 'store_chain_name_pdi',
 'store_name_fda',
 'insp_date',
 'issue_date',
 'zip_code']

Unnamed: 0,address_pdi,address_fda,store_id,store_name_pdi,store_chain_name_pdi,store_name_fda,insp_date,issue_date,zip_code
0,3860 Front St,,43585,az express 1,az express 4,,NaT,NaT,71295
1,9013 US Hwy 412 N,,43575,Friendship Travel Plaza,Starlight Fuel LLC,,NaT,NaT,38034
2,3510 Burke Rd,,43478,Corner Market Burke Rd,Breaktime Corner Market,,NaT,NaT,77504
3,2750 US Hwy 98 N,,43451,Ralport Chevron,Ralport enterprises,,NaT,NaT,33805
4,50 E Main St,,43461,plainfield mart,plainfield mart,,NaT,NaT,17015


In [31]:
# Similarity scoring: one score per row, comparing the two address columns.
# This runs row by row and can be slow; try it on a small slice first if needed.
def _score_address_pair(row):
    """Return the similarity score of a row's two address columns."""
    return similarity_score(row["address_pdi"], row["address_fda"])


merged_df_clean["similarity_score"] = merged_df_clean.apply(_score_address_pair, axis=1)

# Keep the 20 highest-scoring rows for inspection.
top_matches = merged_df_clean.sort_values(by="similarity_score", ascending=False).iloc[:20]
peek(top_matches, "top_matches")

'\n=== top_matches ==='

'shape:'

(20, 10)

'cols:'

['address_pdi',
 'address_fda',
 'store_id',
 'store_name_pdi',
 'store_chain_name_pdi',
 'store_name_fda',
 'insp_date',
 'issue_date',
 'zip_code',
 'similarity_score']

Unnamed: 0,address_pdi,address_fda,store_id,store_name_pdi,store_chain_name_pdi,store_name_fda,insp_date,issue_date,zip_code,similarity_score
24330,801 Valley College Dr,801 Valley College Dr,9279,Rhythm food mart,Prem & Rhythm inc,Rhythms Food Mart,2023-08-08,2023-09-22,40272,100.0
24066,13592 River Rd,13592 River Rd,21938,SMOKER'S HAVEN # 3,SMOKER'S HAVEN # 3 LLC,Smoker’s Haven,2023-05-25,2023-06-08,70047,100.0
23357,3119 Hubbard Rd,3119 HUBBARD Rd,26016,convenient foodmart,Balisana Inc,SUNOCO / CONVENIENT FOOD MART,2024-05-03,2024-07-16,44057,100.0
23200,281 W Virginia St,281 W VIRGINIA St,35059,AP Petro Inc.,AP PETRO INC,BP / TO GO,2024-05-06,2024-07-16,60014,100.0
4903,6000 W 1st Ave,6000 W 1ST Ave,12431,The First & Harlan Conoco,Praum Inc.,VALERO / FOOD STORE,2024-05-11,2024-07-16,80226,100.0
