In [5]:
import pandas as pd
import re
import json
import os
from datetime import datetime


def raw_file(file_path):
    """Return the top-level keys of the JSON object stored at *file_path*.

    The file is parsed with ``json.load``; the parsed value must expose
    ``.keys()`` (i.e. be a JSON object). Keys are returned as a list in the
    order the parser produced them.
    """
    with open(file_path, 'r') as handle:
        parsed = json.load(handle)
    return [*parsed.keys()]


def load_states(file_path):
    """Load a two-way state lookup table from a JSON file.

    The file must contain a JSON array whose first element is an object
    mapping one form of each state to the other. The returned dict holds
    every original key -> value pair plus every value -> key pair, so either
    form can be looked up directly.
    """
    with open(file_path, 'r') as handle:
        forward = json.load(handle)[0]

    lookup = dict(forward)
    # Add the reverse direction; on a collision the reversed entry wins.
    for key, value in forward.items():
        lookup[value] = key
    return lookup


def extract_state_from_address(address, states):
    """Find the state mentioned in *address* and return its key from *states*.

    Args:
        address: Free-form address text; null/NaN values yield ``None``.
        states: Mapping whose keys and values are both searched for as whole
            words (case-insensitively). The key of the winning entry is
            returned.

    Returns:
        The key of the entry whose match ends furthest to the right in the
        address, or ``None`` when nothing matches.

    The rightmost match wins because a state normally sits near the end of an
    address, while earlier words may merely look like a state (the city
    "Delaware" in "Delaware, OH"). When two matches end at the same position
    the longer one wins, so "West Virginia" beats "Virginia". Remaining ties
    keep the entry that appears first in *states*.
    """
    if pd.isnull(address):
        return None
    text = str(address).lower()

    best_key = None
    best_rank = None  # (match end, -match start): later end, then longer match
    for abbr, name in states.items():
        for token in (abbr, name):
            pattern = r'\b' + re.escape(token.lower()) + r'\b'
            for match in re.finditer(pattern, text):
                rank = (match.end(), -match.start())
                if best_rank is None or rank > best_rank:
                    best_rank = rank
                    best_key = abbr
    return best_key


def extract_street(address, suffix_regex):
    """Return the street portion of *address*, up to its street suffix.

    Args:
        address: Free-form address text; null/NaN values yield ``None``.
        suffix_regex: Regex pattern string matching a street suffix. It is
            applied case-insensitively.

    Returns:
        Everything from the start of the address through the first regex
        match, with commas removed and surrounding whitespace stripped, or
        ``None`` when no suffix is found.
    """
    if pd.isnull(address):
        return None
    address = str(address)
    match = re.search(suffix_regex, address, re.IGNORECASE)
    if not match:
        return None
    # Cut at the match's own end offset. Splitting on the matched text instead
    # would stop at its first textual occurrence, e.g. the "St" inside
    # "Stanton" rather than the standalone suffix the regex actually matched.
    return address[:match.end()].replace(',', '').strip()


def extract_city_from_address(address, city_list):
    """Return the first city from *city_list* that appears in *address*.

    Args:
        address: Free-form address text; null/NaN values yield ``None``.
        city_list: Iterable of city names, checked in order.

    Returns:
        The first city (as spelled in *city_list*) that occurs in the address
        as a whole word or phrase, compared case-insensitively, or ``None``.

    Matching is on word boundaries so that a short city name such as "Ada" is
    not found inside an unrelated word such as "Adams". Empty names are
    skipped.
    """
    if pd.isnull(address):
        return None
    # Normalise once, outside the loop. Commas become spaces (not nothing) so
    # "Columbus,OH" still separates into words; whitespace runs are collapsed.
    haystack = ' '.join(str(address).lower().replace(',', ' ').split())
    for city in city_list:
        needle = city.lower()
        # Cheap substring test first; only confirm boundaries on a hit.
        if not needle or needle not in haystack:
            continue
        if re.search(r'(?<!\w)' + re.escape(needle) + r'(?!\w)', haystack):
            return city
    return None


def extract_zip_code(address):
    """Return the 5-digit ZIP code found in *address*, or ``None``.

    Args:
        address: Free-form address text; null/NaN values yield ``None``.

    Returns:
        The last standalone run of exactly five digits, as a string, or
        ``None`` if there is none.

    The last run is used because the ZIP normally closes the address, whereas
    an earlier five-digit run is typically a house or PO box number
    ("10500 Reading Rd, Cincinnati, OH 45241").
    """
    if pd.isnull(address):
        return None
    candidates = re.findall(r'\b\d{5}\b', str(address))
    return candidates[-1] if candidates else None


def parse_address_data(file_path, suffix_file_path, city_list_path, states_path=None):
    """Split the ``principal_address`` column of a CSV into address parts.

    Args:
        file_path: CSV to read. It must contain the columns
            ``principal_name``, ``name`` and ``principal_address``.
        suffix_file_path: JSON file holding a list of objects, each with an
            ``"abbrs"`` list of street-suffix spellings.
        city_list_path: JSON object file whose keys are the known city names.
        states_path: JSON file with the state lookup table. When omitted, the
            module-level ``states_path`` variable is used, which is how this
            function originally obtained it.

    Returns:
        A DataFrame with the columns ``principal_name``, ``name``,
        ``principal_address``, ``Street_Address``, ``City``, ``State`` and
        ``ZIP_Code``. Rows without a ``principal_name`` or without a
        recognisable state are dropped.

    Raises:
        ValueError: If no states file is given and no module-level
            ``states_path`` exists.
    """
    if states_path is None:
        # Backward compatibility: existing callers pass three arguments and
        # rely on a global named `states_path`.
        states_path = globals().get('states_path')
        if states_path is None:
            raise ValueError(
                "states_path was not provided and no module-level "
                "'states_path' is defined"
            )

    with open(suffix_file_path, 'r') as file:
        street_suffixes = json.load(file)
    all_suffixes = {abbr for entry in street_suffixes for abbr in entry["abbrs"]}
    # Sort (longest first, then alphabetically) so the alternation is the same
    # on every run instead of following arbitrary set iteration order.
    ordered_suffixes = sorted(all_suffixes, key=lambda sfx: (-len(sfx), sfx))
    suffix_pattern = '|'.join(re.escape(sfx) for sfx in ordered_suffixes)
    suffix_regex = rf'\b(?:{suffix_pattern})\b'

    city_list = raw_file(city_list_path)
    states = load_states(states_path)

    df = pd.read_csv(file_path)
    df.dropna(subset=['principal_name'], inplace=True)
    df['Street_Address'] = df['principal_address'].apply(lambda x: extract_street(x, suffix_regex))
    df['City'] = df['principal_address'].apply(lambda x: extract_city_from_address(x, city_list))
    df['ZIP_Code'] = df['principal_address'].apply(extract_zip_code)
    df['State'] = df['principal_address'].apply(lambda x: extract_state_from_address(x, states))
    # A row whose state could not be identified is treated as unparseable.
    df.dropna(subset=['State'], inplace=True)

    desired_columns = ['principal_name', 'name', 'principal_address', 'Street_Address', 'City', 'State', 'ZIP_Code']
    df_filtered = df[desired_columns]

    return df_filtered


def export_parsed_data(df, directory_path):
    """Write *df* to ``<YYYY-MM-DD>_parsed.csv`` inside *directory_path*.

    The date prefix is today's local date. The DataFrame index is not
    written. Returns the full path of the file that was created.
    """
    date_stamp = datetime.now().strftime('%Y-%m-%d')
    destination = os.path.join(directory_path, f"{date_stamp}_parsed.csv")
    df.to_csv(destination, index=False)
    return destination


# --- Run configuration -------------------------------------------------------
# Output directory and input CSV.
directory_path = '/Users/bryanevan/Downloads'
file_path = '/Users/bryanevan/Downloads/ohio.csv'

# Lookup tables used by the parser. `states_path` is not passed in the call
# below; parse_address_data picks it up from this module-level name, so it
# must be defined before the call.
city_list_path = '/Users/bryanevan/CF/ANACONDA/python_cleaning_scripts/cleaned_cities_list.json'
suffix_file_path = '/Users/bryanevan/CF/ANACONDA/python_cleaning_scripts/street_suffix.json'
states_path = '/Users/bryanevan/CF/ANACONDA/python_cleaning_scripts/json/states.json'

# --- Parse, export, report ---------------------------------------------------
parsed_df = parse_address_data(
    file_path,
    suffix_file_path,
    city_list_path,
)
new_file_path = export_parsed_data(parsed_df, directory_path)

print(f"Exported parsed data to: {new_file_path}")

Exported parsed data to: /Users/bryanevan/Downloads/2024-04-16_parsed.csv
