In [1]:
import pandas as pd
import re
import json
import os
from datetime import datetime
import usaddress

def load_json_data(json_path):
    """Load and return the decoded contents of a JSON file.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` (dict, list, ...).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Pin the encoding: without it, open() falls back to the platform locale,
    # so the same file could decode differently (or fail) on another machine.
    with open(json_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def compile_street_suffix_regex(suffix_file_path):
    """Compile a case-insensitive regex that matches any listed street suffix.

    Args:
        suffix_file_path: Path to a JSON file containing a list of objects,
            each with an "abbrs" list of suffix spellings.

    Returns:
        A compiled pattern matching any of the spellings as a whole word.
        When the file yields no usable spelling, the returned pattern never
        matches, so callers simply find no street.
    """
    street_suffixes = load_json_data(suffix_file_path)
    all_suffixes = set()
    for suffix in street_suffixes:
        # Drop empty spellings: an empty alternative would let the pattern
        # match the empty string at every word boundary.
        all_suffixes.update(sfx for sfx in suffix["abbrs"] if sfx)

    if not all_suffixes:
        # r'\b(?:)\b' would match almost anywhere; use a pattern that cannot match.
        return re.compile(r'(?!)')

    # Sets have no stable order; sort (longest first, then alphabetically) so
    # the compiled pattern is identical from run to run.
    ordered_suffixes = sorted(all_suffixes, key=lambda sfx: (-len(sfx), sfx))
    pattern = '|'.join(re.escape(sfx) for sfx in ordered_suffixes)
    return re.compile(r'\b(?:' + pattern + r')\b', re.IGNORECASE)
    

def extract_state_from_address(address, states_data):
    """Identify the state mentioned in an address.

    Keys and values of ``states_data`` are matched case-insensitively as
    whole words only, so an abbreviation such as "AL" no longer matches
    inside "California". When several candidates match, the one ending
    furthest to the right wins; ties go to the longest match, so
    "West Virginia" beats "Virginia".

    Args:
        address: Raw address value; null/NaN yields ``None``.
        states_data: Mapping whose keys are the values to return and whose
            values are alternative spellings (expected: abbreviation ->
            full state name).

    Returns:
        The matching key of ``states_data``, or ``None`` when nothing matches.
    """
    if pd.isnull(address):
        return None
    text = str(address)

    best_rank = None   # (end offset, match length) of the best hit so far
    best_state = None
    for state_abbr, state_name in states_data.items():
        for candidate in (state_abbr, state_name):
            if not candidate:
                continue
            # Lookarounds instead of \b so candidates are always delimited by
            # non-word characters, whatever characters they contain.
            pattern = r'(?<!\w)' + re.escape(str(candidate)) + r'(?!\w)'
            for hit in re.finditer(pattern, text, re.IGNORECASE):
                rank = (hit.end(), hit.end() - hit.start())
                if best_rank is None or rank > best_rank:
                    best_rank, best_state = rank, state_abbr
    return best_state
    

def extract_city_from_address(address):
    """Extract the city (usaddress ``PlaceName``) from an address.

    Args:
        address: Raw address value; null/NaN yields ``None``.

    Returns:
        The city as tagged by ``usaddress.tag``. If tagging fails with
        ``RepeatedLabelError``, the first contiguous run of ``PlaceName``
        tokens from the partial parse is joined and returned instead.
        ``None`` when no city can be determined; in that case a diagnostic
        message is printed.
    """
    if pd.isnull(address):
        return None
    try:
        parsed_address, _ = usaddress.tag(address)
        return parsed_address.get('PlaceName')
    except usaddress.RepeatedLabelError as e:
        # Fallback: rebuild the city from the token-level parse. Tokens keep
        # their trailing punctuation and multi-word cities span several
        # tokens, so collect the first contiguous PlaceName run and clean it
        # the way usaddress.tag does (join with spaces, strip separators).
        city_tokens = []
        for token, label in e.parsed_string:
            if label == 'PlaceName':
                city_tokens.append(token)
            elif city_tokens:
                break  # the first run has ended; ignore later repeats
        city = ' '.join(city_tokens).strip(' ,;')
        if city:
            return city
        print(f"Error parsing address '{address}': {e}")
    except Exception as e:
        # Best effort: one unparsable address must not abort the whole run.
        print(f"An error occurred during address parsing: {e}")
    return None
    

def extract_street(address, suffix_regex):
    """Extract the street part of an address.

    The street is everything from the start of the address up to and
    including the first street-suffix match, with commas removed and
    surrounding whitespace stripped.

    Args:
        address: Raw address value; null/NaN yields ``None``.
        suffix_regex: Compiled pattern matching street suffixes.

    Returns:
        The street string, or ``None`` when no suffix is found.
    """
    if pd.isnull(address):
        return None
    # Search and slice the same string: match offsets refer to str(address),
    # and slicing the raw value fails for non-string inputs.
    text = str(address)
    match = suffix_regex.search(text)
    if match is None:
        return None
    return text[:match.end()].replace(',', '').strip()
    

def extract_zip_code(address):
    """Extract the 5-digit ZIP code from an address.

    Args:
        address: Raw address value; null/NaN yields ``None``.

    Returns:
        The last standalone 5-digit number in the address as a string, or
        ``None`` when there is none. For a ZIP+4 such as "85004-1234" the
        5-digit part is returned.
    """
    if pd.isnull(address):
        return None
    # The ZIP code sits near the end of an address, while a 5-digit house
    # number sits at the start; taking the last match avoids returning the
    # house number for inputs like "12345 N Central Ave, Phoenix, AZ 85004".
    candidates = re.findall(r'\b\d{5}\b', str(address))
    if candidates:
        return candidates[-1]
    return None


def filter_columns(df, base_columns, new_columns):
    """Return ``df`` restricted to the base columns followed by the new columns.

    Args:
        df: Source DataFrame.
        base_columns: List of column names to keep, placed first.
        new_columns: List of further column names to keep, placed after.

    Returns:
        A DataFrame with exactly the requested columns, in that order.
    """
    selected = base_columns + new_columns
    subset = df[selected]
    return subset


def parse_address_data(file_path, suffix_regex, states_data):
    """Read the input CSV and derive structured address fields.

    Each value of the ``principal_address`` column is split into
    ``Street_Address``, ``City``, ``ZIP_Code`` and ``State``.

    Args:
        file_path: Path of the CSV file to read.
        suffix_regex: Compiled street-suffix pattern passed to ``extract_street``.
        states_data: State lookup passed to ``extract_state_from_address``.

    Returns:
        A DataFrame with the ``name`` and ``principal_name`` columns followed
        by the four derived columns.
    """
    records = pd.read_csv(file_path)
    addresses = records['principal_address']

    # Keyword arguments are evaluated in order, so the extractors run in the
    # sequence street -> city (via usaddress) -> ZIP -> state.
    enriched = records.assign(
        Street_Address=addresses.apply(extract_street, args=(suffix_regex,)),
        City=addresses.apply(extract_city_from_address),
        ZIP_Code=addresses.apply(extract_zip_code),
        State=addresses.apply(extract_state_from_address, args=(states_data,)),
    )

    # Keep only the identifying columns plus the derived ones.
    return filter_columns(
        enriched,
        ['name', 'principal_name'],
        ['Street_Address', 'City', 'ZIP_Code', 'State'],
    )


def export_parsed_data(df, input_file_path, directory_path):
    """Write the parsed DataFrame to a CSV file named after the input file.

    Args:
        df: DataFrame to export (written without the index).
        input_file_path: Path of the original input file; only its base name
            and extension are used to build the output name.
        directory_path: Directory in which the output file is created.

    Returns:
        The full path of the written CSV file.
    """
    name, ext = os.path.splitext(os.path.basename(input_file_path))
    # NOTE(review): the extension is embedded twice, e.g.
    # "data.csv" -> "data_parsed.csv_test.csv" — confirm this naming is intended.
    destination = os.path.join(directory_path, f"{name}_parsed{ext}_test{ext}")
    df.to_csv(destination, index=False)
    return destination
    
# Input/output locations. These are absolute paths on the author's machine and
# must be adjusted before running elsewhere.
file_path = '/Users/bryanevan/Downloads/Phoenix_forecasa.csv'
suffix_file_path = '/Users/bryanevan/CF/ANACONDA/python_cleaning_scripts/json/street_suffix.json'
# Not consumed by the pipeline calls in this cell.
cities_aliases_path = '/Users/bryanevan/CF/ANACONDA/python_cleaning_scripts/json/cities_and_aliases.json'
states_json_path = '/Users/bryanevan/CF/ANACONDA/python_cleaning_scripts/json/states.json'
directory_path = '/Users/bryanevan/Downloads'

# The states file holds a list of small dicts; merge them into one lookup
# (later entries overwrite earlier ones on duplicate keys).
states_data = dict(
    pair
    for mapping in load_json_data(states_json_path)
    for pair in mapping.items()
)

# Build the suffix pattern, parse the CSV, then write the result next to the input.
suffix_regex_pattern = compile_street_suffix_regex(suffix_file_path)
parsed_df = parse_address_data(file_path, suffix_regex_pattern, states_data)
new_file_path = export_parsed_data(parsed_df, file_path, directory_path)

print(f"Exported parsed data to: {new_file_path}")

Error parsing address 'Westwood College - Los Angeles, 3250, Wilshire Boulevard, Koreatown, Los Angeles, Los Angeles County, California, 90010, United States': 
ERROR: Unable to tag this string because more than one area of the string has the same label

ORIGINAL STRING:  Westwood College - Los Angeles, 3250, Wilshire Boulevard, Koreatown, Los Angeles, Los Angeles County, California, 90010, United States
PARSED TOKENS:    [('Westwood', 'Recipient'), ('College', 'Recipient'), ('Los', 'Recipient'), ('Angeles,', 'Recipient'), ('3250,', 'AddressNumber'), ('Wilshire', 'StreetName'), ('Boulevard,', 'StreetNamePostType'), ('Koreatown,', 'Recipient'), ('Los', 'Recipient'), ('Angeles,', 'Recipient'), ('Los', 'Recipient'), ('Angeles', 'Recipient'), ('County,', 'Recipient'), ('California,', 'Recipient'), ('90010,', 'Recipient'), ('United', 'Recipient'), ('States', 'Recipient')]
UNCERTAIN LABEL:  Recipient

When this error is raised, it's likely that either (1) the string is not a valid person/cor

In [None]:
def main():
    """Run the full pipeline: compile the suffix regex, parse the CSV, export it.

    Reads the module-level settings ``suffix_file_path``, ``states_json_path``,
    ``file_path`` and ``directory_path``, and prints the path of the exported
    file.
    """
    suffix_regex_pattern = compile_street_suffix_regex(suffix_file_path)
    # The states file holds a list of dicts; merge them into a single lookup,
    # which is the form parse_address_data hands to the state extractor.
    states_data = {
        key: value
        for mapping in load_json_data(states_json_path)
        for key, value in mapping.items()
    }
    # parse_address_data takes exactly (file_path, suffix_regex, states_data).
    parsed_df = parse_address_data(file_path, suffix_regex_pattern, states_data)
    new_file_path = export_parsed_data(parsed_df, file_path, directory_path)
    print(f"Exported parsed data to: {new_file_path}")

In [None]:
# Benchmark main(): 3 repeats of 10 loops each, i.e. 30 calls in total.
%timeit -r 3 -n 10 main()

In [None]:
# Profile a single call to main(), with the report sorted by cumulative time.
%prun -s cumtime main()