In [1]:
import os
import pandas as pd
import usaddress
from uszipcode import SearchEngine
import csv



In [2]:
# Single module-level ZIP code search engine; get_city_by_zip (defined below)
# calls its by_zipcode method for each lookup.
search = SearchEngine()

In [3]:
def extract_address_components(address):
    """Split a free-form US address into its main components.

    Parameters
    ----------
    address : str
        The address text to tag with ``usaddress.tag``.

    Returns
    -------
    pandas.Series
        Five positional values: street number, street name, city, state and
        ZIP code. A component missing from the tagged result is returned as
        ``''``. If the address is not text, or tagging raises, all five
        values are ``None``.
    """
    labels = ('AddressNumber', 'StreetName', 'PlaceName', 'StateName', 'ZipCode')

    # Non-text cells (e.g. NaN for a blank spreadsheet cell) cannot be tagged;
    # report them as unparsed without calling the tagger.
    if not isinstance(address, (str, bytes)):
        return pd.Series((None,) * len(labels))

    try:
        # The first element of the tag() result is the label -> text mapping.
        parsed_address_dict = usaddress.tag(address)[0]
    except Exception:
        # Best effort: an address that cannot be tagged yields an empty row.
        # Catching Exception (rather than a bare except) keeps
        # KeyboardInterrupt/SystemExit working during a long .apply().
        return pd.Series((None,) * len(labels))

    return pd.Series(tuple(parsed_address_dict.get(label, '') for label in labels))

In [4]:
def get_city_by_zip(zip_code):
    """Look up the major city for a ZIP code using the shared ``search`` engine.

    Parameters
    ----------
    zip_code : str or int or None
        A 5-digit ZIP, a ZIP+4 such as ``"35405-1234"`` (only the part before
        the hyphen is looked up), or a ZIP that lost its leading zeros
        (e.g. ``2351``), which is left-padded to five characters.

    Returns
    -------
    The ``major_city`` attribute of the lookup result, or ``None`` when the
    input is empty or the lookup fails for any reason.
    """
    if zip_code is None:
        return None

    zip_text = str(zip_code).strip()
    if not zip_text:
        return None

    # Keep only the 5-digit part of a ZIP+4 and restore leading zeros that a
    # numeric spreadsheet column drops (e.g. 2351 -> 02351).
    zip5 = zip_text.split('-', 1)[0].zfill(5)

    try:
        zipcode_obj = search.by_zipcode(zip5)
        return zipcode_obj.major_city
    except Exception:
        # Best effort: an unknown ZIP or a missing result object yields None.
        # Not a bare except, so KeyboardInterrupt/SystemExit still propagate.
        return None

In [5]:
# Folder that holds the input files to process
directory = r"PLACEHOLDER - Please Insert Path to Directory"

# Accumulators: one list for the cleaned dataframes and one for the
# dataframes where nothing was found
clean_dfs, not_found_dfs = [], []

In [6]:
# Iterate over the Excel workbooks in the directory.
# Start from empty accumulators so that re-running this cell does not append
# the same files a second time.
clean_dfs = []
not_found_dfs = []

# Sorted so the row order of the combined output is reproducible.
for filename in sorted(os.listdir(directory)):
    # Match the extension case-insensitively and skip the temporary "~$..."
    # lock files Excel keeps next to an open workbook.
    if filename.startswith("~$") or not filename.lower().endswith((".xlsx", ".xls")):
        continue

    file_path = os.path.join(directory, filename)

    # Read Excel file into a dataframe
    df = pd.read_excel(file_path)

    # Column labels are not guaranteed to be strings, hence str().
    address_columns = [
        column for column in df.columns if 'address' in str(column).lower()
    ]

    if not address_columns:
        # Record the file exactly once, and only when it has no address
        # column at all (previously it was appended once per non-address
        # column, even for files that did have an address column).
        not_found_dfs.append(df)
        continue

    for column in address_columns:
        # Work on a copy so the dataframe read from disk stays untouched.
        df_copy = df.copy()
        df_copy[['StreetNumber', 'Street', 'City', 'State', 'Zip']
                ] = df_copy[column].apply(extract_address_components)

        # Prefer the city derived from the ZIP code, but keep the city parsed
        # from the address text when the ZIP lookup returns nothing.
        city_from_zip = df_copy['Zip'].apply(get_city_by_zip)
        df_copy['City'] = city_from_zip.where(city_from_zip.notna(), df_copy['City'])

        # Tag every row with the name of the file it came from.
        df_copy['Polygon'] = filename
        clean_dfs.append(df_copy)


                      street            city state    zip  \
0        777 Brockton Avenue        Abington    MA   2351   
1          30 Memorial Drive            Avon    MA   2322   
2        250 Hartford Avenue      Bellingham    MA   2019   
3             700 Oak Street        Brockton    MA   2301   
4          66-4 Parkhurst Rd      Chelmsford    MA   1824   
..                       ...             ...   ...    ...   
229      1501 Skyland Blvd E      Tuscaloosa    AL  35405   
230             3501 20th Av          Valley    AL  36854   
231  1300 Montgomery Highway  Vestavia Hills    AL  35216   
232          4538 Us Hwy 231        Wetumpka    AL  36092   
233           2575 Us Hwy 43        Winfield    AL  35594   

                                               address  
0             777, Brockton Avenue, Abington, MA, 2351  
1                    30, Memorial Drive, Avon, MA 2322  
2            250, Hartford Avenue, Bellingham, MA 2019  
3                   700, Oak Street, Br

In [7]:
# Merge each list of DataFrames into a single frame; an empty list yields an
# empty DataFrame instead of being passed to pd.concat.
clean_df = pd.concat(clean_dfs, ignore_index=True) if clean_dfs else pd.DataFrame()
not_found_df = (
    pd.concat(not_found_dfs, ignore_index=True) if not_found_dfs else pd.DataFrame()
)

                      street            city state    zip  \
0        777 Brockton Avenue        Abington    MA   2351   
1          30 Memorial Drive            Avon    MA   2322   
2        250 Hartford Avenue      Bellingham    MA   2019   
3             700 Oak Street        Brockton    MA   2301   
4          66-4 Parkhurst Rd      Chelmsford    MA   1824   
..                       ...             ...   ...    ...   
229      1501 Skyland Blvd E      Tuscaloosa    AL  35405   
230             3501 20th Av          Valley    AL  36854   
231  1300 Montgomery Highway  Vestavia Hills    AL  35216   
232          4538 Us Hwy 231        Wetumpka    AL  36092   
233           2575 Us Hwy 43        Winfield    AL  35594   

                                               address StreetNumber  \
0             777, Brockton Avenue, Abington, MA, 2351          777   
1                    30, Memorial Drive, Avon, MA 2322           30   
2            250, Hartford Avenue, Bellingham, MA 2019

In [8]:
# Write both result frames to CSV (without the index column), then report completion
for frame, csv_name in ((clean_df, 'clean.csv'), (not_found_df, 'not_found.csv')):
    frame.to_csv(csv_name, index=False)

print("Done. CSV files saved.")

Done. CSV files saved.
