In [4]:
import pandas as pd
from fuzzywuzzy import process
from unidecode import unidecode

# Load the data
data_address = pd.read_csv('data_address.csv', encoding='utf-8')
streets = pd.read_csv('upstreets.csv', encoding='latin1')

# Drop NaN values or fill with a default string
data_address = data_address.dropna(subset=['full'])
streets = streets.dropna()

# Apply unidecode to handle special characters
data_address['full'] = data_address['full'].apply(unidecode)
streets = streets.applymap(unidecode)

# Create a dictionary to store the best match for each address
best_matches = {}

# Initialize a counter
counter = 0

# Go through each address in data_address and find the closest match in streets
for address in data_address['full']:
    best_match = process.extractOne(address, streets.iloc[:, 0])
    
    # Check if a match was found before assigning it to the dictionary
    if best_match:
        best_matches[address] = [best_match[0], best_match[1]]
        counter += 1  # increment the counter
        print(f'Matches found so far: {counter}')  # print the counter in real-time
        
    else:
        best_matches[address] = ['No Match Found', 'N/A']

# Print the total number of matches
print(f'Total matches: {counter}')

# Convert the matches dictionary to a DataFrame
best_matches_df = pd.DataFrame.from_dict(best_matches, orient='index', columns=['Best Match', 'Score'])

print(best_matches_df)


Matches found so far: 1
Matches found so far: 2
Matches found so far: 3
Matches found so far: 4
Matches found so far: 5
Matches found so far: 6
Matches found so far: 7
Matches found so far: 8
Matches found so far: 9
Matches found so far: 10
Matches found so far: 11
Matches found so far: 12
Matches found so far: 13
Matches found so far: 14
Matches found so far: 15
Matches found so far: 16
Matches found so far: 17
Matches found so far: 18
Matches found so far: 19
Matches found so far: 20
Matches found so far: 21
Matches found so far: 22
Matches found so far: 23
Matches found so far: 24
Matches found so far: 25
Matches found so far: 26
Matches found so far: 27
Matches found so far: 28
Matches found so far: 29
Matches found so far: 30
Matches found so far: 31
Matches found so far: 32
Matches found so far: 33
Matches found so far: 34
Matches found so far: 35
Matches found so far: 36
Matches found so far: 37
Matches found so far: 38
Matches found so far: 39
Matches found so far: 40
Matches f

KeyboardInterrupt: 

In [5]:

# Convert the matches dictionary to a DataFrame
best_matches_df = pd.DataFrame.from_dict(best_matches, orient='index', columns=['Best Match', 'Score'])

# Reset the index
best_matches_df.reset_index(inplace=True)

# Rename the index column to 'Original Address'
best_matches_df.rename(columns = {'index':'Original Address'}, inplace = True)

# Write the DataFrame to a CSV file
best_matches_df.to_csv('best_matches.csv', index=False)