# In [1]:  (Jupyter notebook cell prompt; the code below is the content of that cell)
import pandas as pd
import re
import json
import os
from datetime import datetime

# Ensures that extract_street and extract_city_from_address are defined
def extract_street(address, suffix_regex):
    """Return the street part of an address, through its first street suffix.

    Args:
        address: Raw address value. Null values (as judged by ``pd.isnull``)
            yield ``None``; any other value is converted with ``str`` before
            matching.
        suffix_regex: Regular-expression pattern that matches a street
            suffix. It is applied case-insensitively.

    Returns:
        The text from the start of the address up to and including the first
        suffix match, with commas removed and surrounding whitespace
        stripped, or ``None`` if the address is null or has no suffix match.
    """
    if pd.isnull(address):
        return None
    address = str(address)
    match = re.search(suffix_regex, address, re.IGNORECASE)
    if match is None:
        return None
    # Cut at the position of the regex match, not at the first textual
    # occurrence of the matched letters: the same letters can appear earlier
    # inside another word (e.g. "St" in "Stave"), which would truncate the
    # street name in the wrong place.
    return address[:match.end()].replace(',', '').strip()

def extract_city_from_address(address, city_list):
    """Return the first city from ``city_list`` that appears in ``address``.

    Matching is case-insensitive, ignores commas in the address, and only
    accepts whole-word occurrences: the characters immediately before and
    after the city name must not be letters or digits. Cities are tried in
    list order, so earlier entries take priority over later ones.

    Args:
        address: Raw address value. Null values (as judged by ``pd.isnull``)
            yield ``None``; any other value is converted with ``str``.
        city_list: Iterable of city-name strings to look for.

    Returns:
        The matching entry exactly as it appears in ``city_list``, or
        ``None`` if the address is null or no city is found.
    """
    if pd.isnull(address):
        return None
    # Normalise once; this does not depend on the city being tested.
    haystack = str(address).lower().replace(',', '')
    for city in city_list:
        needle = city.lower()
        if not needle.strip():
            # An empty or blank entry would "match" every address.
            continue
        start = haystack.find(needle)
        while start != -1:
            end = start + len(needle)
            # Require non-alphanumeric neighbours so that e.g. "Troy" is not
            # reported for an address that only contains "Troyer".
            starts_word = start == 0 or not haystack[start - 1].isalnum()
            ends_word = end == len(haystack) or not haystack[end].isalnum()
            if starts_word and ends_word:
                return city
            start = haystack.find(needle, start + 1)
    return None

def load_city_list_from_json(file_path):
    """Load city names from the ``"cleanedText"`` field of a JSON file.

    The field is split into lines and each line is stripped of surrounding
    whitespace (so Windows ``\\r\\n`` line endings do not leave a trailing
    carriage return on every name). A line is discarded when it is empty,
    entirely upper-case, or starts with ``//`` or ``[``.
    NOTE(review): the upper-case lines are presumably section headings such
    as state names -- confirm against the data file.

    Args:
        file_path: Path to a UTF-8 JSON file containing an object with a
            ``"cleanedText"`` string.

    Returns:
        A list of the remaining lines, in file order.

    Raises:
        KeyError: If the JSON object has no ``"cleanedText"`` key.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    stripped_lines = (line.strip() for line in data["cleanedText"].splitlines())
    return [
        line
        for line in stripped_lines
        if line and not line.isupper() and not line.startswith(('//', '['))
    ]

def parse_address_data(file_path, suffix_file_path, city_list_path):
    """Read a CSV of records and derive street, city, county and state columns.

    Steps:
      1. Load the street-suffix JSON and build one case-insensitive,
         word-bounded regex from every abbreviation under each entry's
         ``"abbrs"`` key.
      2. Load the city list via ``load_city_list_from_json``.
      3. Read the CSV and add ``Street_Address`` and ``City`` (both derived
         from ``principal_address``) and ``County`` and ``State`` (both
         derived from ``last_county``, split on ``-``: the last piece,
         upper-cased, becomes the state; the preceding pieces, joined with
         spaces, become the county).
      4. Drop the ``profile_url`` column if it is present.

    Args:
        file_path: Path to the input CSV. It must contain the
            ``principal_address`` and ``last_county`` columns.
        suffix_file_path: Path to a UTF-8 JSON file holding a list of objects
            that each have an ``"abbrs"`` entry.
            NOTE(review): assumes ``"abbrs"`` is a list of strings -- confirm
            against the suffix file.
        city_list_path: Path to the city-list JSON file.

    Returns:
        The resulting ``pandas.DataFrame``.

    Raises:
        ValueError: If the suffix file yields no non-empty abbreviation.
    """
    with open(suffix_file_path, 'r', encoding='utf-8') as file:
        street_suffixes = json.load(file)

    all_suffixes = set()
    for suffix in street_suffixes:
        # Skip empty abbreviations: an empty alternative would let the regex
        # match the empty string at any word boundary.
        all_suffixes.update(abbr for abbr in suffix["abbrs"] if abbr)
    if not all_suffixes:
        raise ValueError(
            f"No street suffix abbreviations found in {suffix_file_path!r}"
        )

    # Sort (longest first, then alphabetically) so the generated pattern is
    # the same on every run instead of following set iteration order.
    ordered_suffixes = sorted(all_suffixes, key=lambda sfx: (-len(sfx), sfx))
    suffix_pattern = '|'.join(re.escape(sfx) for sfx in ordered_suffixes)
    suffix_regex = rf'\b(?:{suffix_pattern})\b'

    city_list = load_city_list_from_json(city_list_path)

    df = pd.read_csv(file_path)
    df['Street_Address'] = df['principal_address'].apply(lambda x: extract_street(x, suffix_regex))
    df['City'] = df['principal_address'].apply(lambda x: extract_city_from_address(x, city_list))
    df['County'] = df['last_county'].apply(lambda x: ' '.join(x.split('-')[:-1]) if pd.notnull(x) else x)
    df['State'] = df['last_county'].apply(lambda x: x.split('-')[-1].upper() if pd.notnull(x) else x)

    # errors='ignore': a CSV without profile_url should not fail the whole
    # run after all the parsing work has been done.
    df = df.drop(columns=['profile_url'], errors='ignore')

    return df

def export_parsed_data(df, directory_path):
    """Write ``df`` to a date-stamped CSV file inside ``directory_path``.

    The file is named ``<YYYY-MM-DD>_parsed.csv``, where the date comes from
    ``datetime.now()``. The DataFrame index is not written.

    Args:
        df: DataFrame to export.
        directory_path: Directory in which to create the CSV file.

    Returns:
        The path of the CSV file that was written.
    """
    date_stamp = f"{datetime.now():%Y-%m-%d}"
    destination = os.path.join(directory_path, date_stamp + "_parsed.csv")
    df.to_csv(destination, index=False)
    return destination

# Input and output locations -- edit these to match the local machine.
file_path = '/Users/bryanevan/Downloads/raw.csv'
suffix_file_path = '/Users/bryanevan/CF/ANACONDA/python_cleaning_scripts/street_suffix.json'
city_list_path = '/Users/bryanevan/CF/ANACONDA/python_cleaning_scripts/cleaned_cities_list.json'
directory_path = '/Users/bryanevan/Downloads'

# Parse the raw CSV, write the result as a dated CSV, and report where it went.
parsed_df = parse_address_data(
    file_path=file_path,
    suffix_file_path=suffix_file_path,
    city_list_path=city_list_path,
)
new_file_path = export_parsed_data(df=parsed_df, directory_path=directory_path)
print(f"Exported parsed data to: {new_file_path}")


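# Pasted cell output. The path in this message is a placeholder that does not
# match suffix_file_path above, so it appears to come from a different run:
# FileNotFoundError: [Errno 2] No such file or directory: 'path_to_street_suffix.json'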