# In [None]:
import pandas as pd
import re
import json
import os
from datetime import datetime

def load_city_list_from_json(file_path):
    """Load city names from the ``cleanedText`` field of a JSON file.

    The file must hold a JSON object whose ``"cleanedText"`` value is a
    newline-separated string. A line is kept unless it is empty, is entirely
    upper-case (per ``str.isupper``), or starts with ``//`` or ``[``.

    Args:
        file_path: Path to the JSON file.

    Returns:
        list[str]: The retained lines, in their original order.

    Raises:
        KeyError: If the JSON object has no ``"cleanedText"`` key.
    """
    # JSON text is UTF-8 by specification; do not depend on the platform's
    # default encoding, which can mis-decode non-ASCII city names.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    cleaned_text = data["cleanedText"]
    return [
        line
        for line in cleaned_text.split('\n')
        if line and not line.isupper() and not line.startswith(('//', '['))
    ]

def extract_street(address, suffix_regex):
    """Return the leading part of *address* through its first street suffix.

    Args:
        address: Raw address value; may be missing (``None``/NaN).
        suffix_regex: Regular-expression source that matches a street
            suffix. It is searched case-insensitively.

    Returns:
        The text from the start of the address through the end of the first
        suffix match, with commas removed and surrounding whitespace
        stripped; ``None`` when the address is missing or nothing matches.
    """
    if pd.isnull(address):
        return None
    # Coerce to text so a non-string cell (e.g. a purely numeric value) does
    # not raise inside re.search; extract_city_from_address does the same.
    address = str(address)
    match = re.search(suffix_regex, address, re.IGNORECASE)
    if match is None:
        return None
    # Keep everything up to and including the matched suffix.
    return address[:match.end()].replace(',', '').strip()

def extract_city_from_address(address, city_list):
    """Return the first entry of *city_list* that occurs in *address*.

    Matching is a case-insensitive substring test against the address with
    its commas removed. Cities are tried in list order, so when several
    entries occur in the address the earliest one in *city_list* is returned.

    Args:
        address: Raw address value; may be missing (``None``/NaN). Non-string
            values are converted with ``str()``.
        city_list: Iterable of city-name strings.

    Returns:
        The matching city exactly as spelled in *city_list*, or ``None`` when
        the address is missing or no city occurs in it.
    """
    if pd.isnull(address):
        return None
    # Normalise the address once instead of once per candidate city.
    haystack = str(address).lower().replace(',', '')
    for city in city_list:
        if city.lower() in haystack:
            return city
    return None

def extract_zip_code(address):
    """Return the last stand-alone 5-digit number in *address*.

    The last match is used rather than the first because an address may
    start with a 5-digit house or PO-box number, while the ZIP code sits
    near the end. For a ZIP+4 value such as ``85001-1234`` only the 5-digit
    part is returned.

    Args:
        address: Raw address value; may be missing (``None``/NaN). Non-string
            values are converted with ``str()``.

    Returns:
        The 5-digit string, or ``None`` when the address is missing or
        contains no stand-alone 5-digit number.
    """
    if pd.isnull(address):
        return None
    candidates = re.findall(r'\b\d{5}\b', str(address))
    return candidates[-1] if candidates else None

def filter_columns(df):
    """Return *df* restricted to the output columns it actually contains.

    The kept columns are ``name``, ``principal_name``, ``Street_Address``,
    ``City``, ``ZIP_Code`` and ``State``. Columns from that set that are
    absent from *df* are skipped, and the surviving columns keep the order
    they have in *df*.
    """
    wanted = frozenset((
        'name', 'principal_name',
        'Street_Address', 'City', 'ZIP_Code', 'State',
    ))
    present = [label for label in df.columns if label in wanted]
    return df[present]

def parse_address_data(file_path, suffix_file_path, city_list_path):
    """Parse address parts out of a CSV and return the filtered result.

    Args:
        file_path: CSV to read. It must contain ``principal_address`` and
            ``last_county`` columns.
        suffix_file_path: JSON file holding a sequence of objects, each with
            an ``"abbrs"`` collection of street-suffix strings.
        city_list_path: JSON file passed to ``load_city_list_from_json``.

    Returns:
        The DataFrame produced by ``filter_columns`` after the
        ``Street_Address``, ``City``, ``ZIP_Code``, ``County`` and ``State``
        columns have been added.

    Raises:
        KeyError: If a suffix entry lacks ``"abbrs"`` or the CSV lacks one of
            the required columns.
    """
    # JSON text is UTF-8 by specification; do not rely on the locale default.
    with open(suffix_file_path, 'r', encoding='utf-8') as file:
        street_suffixes = json.load(file)
    all_suffixes = set()
    for suffix in street_suffixes:
        all_suffixes.update(suffix["abbrs"])
    # Set iteration order varies between runs; sort so the pattern is
    # reproducible, longest first so a longer suffix is preferred over a
    # shorter one that it starts with.
    ordered_suffixes = sorted(all_suffixes, key=lambda sfx: (-len(sfx), sfx))
    suffix_pattern = '|'.join(re.escape(sfx) for sfx in ordered_suffixes)
    suffix_regex = rf'\b(?:{suffix_pattern})\b'

    city_list = load_city_list_from_json(city_list_path)

    df = pd.read_csv(file_path)
    addresses = df['principal_address']
    df['Street_Address'] = addresses.apply(extract_street, args=(suffix_regex,))
    df['City'] = addresses.apply(extract_city_from_address, args=(city_list,))
    df['ZIP_Code'] = addresses.apply(extract_zip_code)
    # last_county is split on '-': the final piece becomes the upper-cased
    # State, the pieces before it (space-joined) become the County.
    # NOTE(review): filter_columns does not list 'County', so this column
    # appears to be dropped from the returned frame — confirm that is intended.
    df['County'] = df['last_county'].apply(lambda x: ' '.join(x.split('-')[:-1]) if pd.notnull(x) else x)
    df['State'] = df['last_county'].apply(lambda x: x.split('-')[-1].upper() if pd.notnull(x) else x)

    # Filter columns after parsing
    return filter_columns(df)

def export_parsed_data(df, input_file_path, directory_path):
    """Write *df* as CSV into *directory_path* and return the output path.

    The output file name is derived from the input file's base name:
    ``<name>_parsed_version1.0<ext>``, where ``<ext>`` is the input file's
    extension (for example ``data.csv`` -> ``data_parsed_version1.0.csv``).

    Args:
        df: DataFrame to export; written without the index.
        input_file_path: Path of the source file; only its base name and
            extension are used.
        directory_path: Existing directory to write the output file into.

    Returns:
        str: Full path of the file that was written.
    """
    base_name = os.path.basename(input_file_path)
    name, ext = os.path.splitext(base_name)
    # The extension belongs only at the end; repeating it mid-name produced
    # names such as "data_parsed.csv_version1.0.csv".
    output_file_name = f"{name}_parsed_version1.0{ext}"
    output_file_path = os.path.join(directory_path, output_file_name)
    df.to_csv(output_file_path, index=False)
    return output_file_path


# Locations used by this run (absolute paths on the author's machine).
directory_path = '/Users/bryanevan/Downloads'
file_path = '/Users/bryanevan/Downloads/Phoenix_forecasa.csv'
city_list_path = '/Users/bryanevan/CF/ANACONDA/python_cleaning_scripts/cleaned_cities_list.json'
suffix_file_path = '/Users/bryanevan/CF/ANACONDA/python_cleaning_scripts/street_suffix.json'

# Parse the input CSV, then write the filtered result into directory_path.
parsed_df = parse_address_data(
    file_path=file_path,
    suffix_file_path=suffix_file_path,
    city_list_path=city_list_path,
)
new_file_path = export_parsed_data(
    df=parsed_df,
    input_file_path=file_path,
    directory_path=directory_path,
)

print(f"Exported parsed data to: {new_file_path}")