In [1]:
import pandas as pd
import numpy as np
import Levenshtein
import math

In [2]:
pd.options.display.max_columns = 22
pd.options.display.max_rows = 1000

## Functions

In [3]:
def lev_list(address_list, address, i): # will help us find near matches
    temp_list = []
    temp_list_u = []
    for u_address in address_list:
        if Levenshtein.distance(address, u_address.upper()) <= i:
            temp_list.append(address)
            temp_list_u.append(u_address.upper())
    temp_list_series = pd.Series(temp_list)
    temp_list_u_series = pd.Series(temp_list_u)
    return temp_list_u_series

In [4]:
def gen_list_none(address_list, mismatched, i): # just to find how many addresses are so far off from anything in full
    tmp_list = []
    for address in mismatched:
        if address == "UNKNOWN ADDRESS" or address == "SUPPRESSED ADDRESS":
            continue
        if lev_list(address_list, address, i).shape[0] == 0:
            tmp_list.append(address)
    return pd.Series(tmp_list)

In [5]:
def hasNumbers(inputString): # from stackoverflow
    return any(char.isdigit() for char in inputString)

In [6]:
def combine(row):
    word = ""
    for r in row:
        if r != None:
            word += r + " "
    return word.strip()

In [7]:
def find_numbers(addresses):
    test_addr = addresses.str.split(expand=True)
    test_addr = test_addr.drop(0, axis=1)
    test_addr = test_addr.apply(combine, axis=1)
    uniq = test_addr[test_addr.apply(hasNumbers)].unique()
    return pd.Series(uniq)

In [8]:
def construct_number_dictionary():
    string = ['FIRST', 'SECOND', 'THIRD', 'FOURTH', 'FIFTH', 'SIXTH', 'SEVENTH',
             'EIGHTH', 'NINTH', 'TENTH', 'ELEVENTH', 'TWELFTH', 'THIRTEENTH',
             'FOURTEENTH', 'FIFTEENTH', 'SIXTEENTH', 'SEVENTEENTH', 'EIGHTEENTH',
             'NINETEENTH', 'TWENTIETH']
    numeric = ['1ST', '2ND', '3RD', '4TH', '5TH', '6TH', '7TH', '8TH', '9TH',
              '10TH', '11TH', '12TH', '13TH', '14TH', '15TH', '16T', '17TH',
              '18TH', '19TH', '20TH']
    return dict(zip(string, numeric))

In [9]:
def contains_cardinal(addr): # returns None if no cardinal direction, and direction otherwise
    directions = [' E ', ' W ', ' N ', ' S ', ' NE ', ' NW ', ' SW ', ' SE ']
    count = 0
    for car in directions:
        if car in addr:
            if "JOHNNY W WILLIAMS" in addr:
                return None
            return car # tested earlier, only one instance of each in every single address (no double directions)
    return None

In [10]:
def construct_st_name(address): # returns the street name of the address w/o number, direction, or ending
    split = address.split()
    end = split[-1]
    first = "NA"
    start_index = 0
    end_index = len(split) - 1
    if split[0].isnumeric():
        start_index += 1
    if contains_cardinal(address) != None:
        start_index += 1
    first = split[start_index:end_index]
    word = ""
    for a in first:
        word += a + " "
    word = word.strip()
    return word

In [11]:
def construct_dictionary(): # constructs dictionary from full addresses to create master lookup for endings
    endings = {'AVE', 'DR', 'RD', 'ST', 'LN', 'CT', 'BLVD', 'CIR', 
               'WAY', 'PL', 'EXPY', 'ALY', 'TRL', 'PKWY', 'TER'}
    dictionary = {}
    counter = 0
    for i in range(len(f_addresses)):
        split = f_addresses[i].split()
        end = split[-1]
        if end in endings:
            st_name = construct_st_name(f_addresses[i])
            if st_name in dictionary.keys():
                existing_endings = dictionary[st_name]
                if end not in existing_endings:
                    counter += 1
                    existing_endings.append(end)
            else:
                dictionary[st_name] = [end]
    return dictionary

In [12]:
def test(): # takes in the dictionary and tests to see if a new ending for a given address can be found in full data
    f_addresses_set = set(f_addresses)
    t_addresses_set = set(t_addresses)
    lis = []
    newly_found = []
    replace_in_tax = {}
    words = []
    dictionary = construct_dictionary()
    counter = 0
    for i in range(len(no_tax)):
        split = no_tax.iloc[i].split()
        word = split[:-1]
        wordo = ""
        for a in word:
            wordo += a + " "
        wordo = wordo.strip()
        st_name = construct_st_name(no_tax.iloc[i])
        found = False
        if st_name in dictionary.keys():
            for ending in dictionary[st_name]: #test a full address
                test = wordo + " " + ending
                words.append(test)
                if test in t_addresses_set:
                    newly_found.append(test)
                    replace_in_tax[test] = no_tax.iloc[i]
                    found = True
                    continue
            if not found:
                newly_found.append(no_tax.iloc[i])
        else:
            newly_found.append(no_tax.iloc[i])
            
    return pd.Series(newly_found), replace_in_tax

## Read in data

In [13]:
PATH = '/Users/william/Dropbox (Amherst College)/CDS-2019-AlbanyHub/'
df_sales = pd.read_csv(PATH + 'Raw-Data/DoughertyCountyGASales_2007_2019.csv')
df_addr = pd.read_csv(PATH + 'ToDatabase/addr_junct_table.csv')

In [14]:
df_sales.head()

Unnamed: 0,Parcel ID,Address,Sale Date,Sale Price,Qualified Sales,Reason,Acres,Parcel Class,Year Built,Square Ft,Price Per Square Ft,Neighborhood
0,0000L/00009/009,203 BARFIELD AVE,8/16/2017,$0.00,Unqualified,17,0.3,Residential,1959.0,1030.0,$0.00,MAP L
1,0000L/00009/009,203 BARFIELD AVE,8/16/2017,"$18,000.00",Unqualified,17,0.3,Residential,1959.0,1030.0,$17.48,MAP L
2,00059/00018/016,1912 RIDGEMONT DR,10/4/2011,$0.00,Unqualified,5,0.42,Residential,1969.0,1933.0,$0.00,
3,00059/00018/016,1912 RIDGEMONT DR,10/4/2011,$0.00,Unqualified,3,0.42,Residential,1969.0,1933.0,$0.00,
4,00059/00018/016,1912 RIDGEMONT DR,5/10/2019,$0.00,Unqualified,35,0.42,Residential,1969.0,1933.0,$0.00,


In [15]:
df_addr.head()

Unnamed: 0,Id,Address,Xcoord,Ycoord,Tract,BlockGroup,Block
0,0,501 S MADISON ST,31.570488,-84.162164,1500.0,1.0,13.0
1,1,525 LINCOLN AVE,31.569466,-84.161175,1500.0,1.0,12.0
2,2,609 S JACKSON ST,31.568513,-84.153922,1403.0,1.0,27.0
3,3,602 S MONROE ST,31.568661,-84.158641,1403.0,1.0,25.0
4,4,509 W GORDON AVE,31.568554,-84.159772,1500.0,1.0,17.0


In [16]:
s_addresses = df_sales['Address']
f_addresses = df_addr['Address']

In [17]:
s_addresses.shape

(2508,)

In [18]:
f_addresses.shape

(30721,)

In [19]:
s_addresses[~(s_addresses.isin(f_addresses))].shape # makes not much sense -- needs cleaning!

(1020,)

In [20]:
f_addresses[~(f_addresses.isin(s_addresses))].shape # makes somewhat sense

(30147,)

## Cleaning addresses

In [21]:
not_found = s_addresses[~(s_addresses.isin(f_addresses))]

#### Exploratory using levenshtein

In [22]:
not_found.iloc[0:10]

2        1912 RIDGEMONT DR
3        1912 RIDGEMONT DR
4        1912 RIDGEMONT DR
5          2714 FLEMING RD
6             618 PINE AVE
11    1131 E ROOSEVELT AVE
13    1419 E RESIDENCE AVE
14         113 COLLINS AVE
16         1418 GORDON AVE
17         1418 GORDON AVE
Name: Address, dtype: object

In [23]:
lev_list(f_addresses, "1912 RIDGEMONT DR", 3)

0    1902 RIDGEMONT RD
1    1910 RIDGEMONT RD
2    1912 RIDGEMONT RD
3    1915 RIDGEMONT RD
4    1913 RIDGEMONT RD
5    1911 RIDGEMONT RD
dtype: object

In [24]:
lev_list(f_addresses, "2714 FLEMING RD", 3)

Series([], dtype: float64)

In [25]:
lev_list(f_addresses, "618 PINE AVE", 2).head() # too many entries

0     112 PINE AVE
1     603 PINE AVE
2     128 PINE AVE
3    1018 PINE AVE
4     119 PINE AVE
dtype: object

In [26]:
lev_list(f_addresses, "1131 E ROOSEVELT AVE", 2).head() # too many entries

0    1001 E ROOSEVELT AVE
1    1121 E ROOSEVELT AVE
2    1118 E ROOSEVELT AVE
3    1104 E ROOSEVELT AVE
4    1128 E ROOSEVELT AVE
dtype: object

In [27]:
lev_list(f_addresses, "1419 E RESIDENCE AVE", 2).head() # too many entries

0    1509 E RESIDENCE AVE
1    1420 E RESIDENCE AVE
2    1409 E RESIDENCE AVE
3    1519 E RESIDENCE AVE
4    1210 E RESIDENCE AVE
dtype: object

In [28]:
lev_list(f_addresses, "113 COLLINS AVE", 3)

0    113 COLLINS ST
dtype: object

In [29]:
lev_list(f_addresses, "1418 GORDON AVE", 3) # should be w gordon 

0     1401 E GORDON AVE
1     1411 E GORDON AVE
2     1408 E GORDON AVE
3     1118 W GORDON AVE
4      514 W GORDON AVE
5      614 W GORDON AVE
6        1 W GORDON AVE
7     1218 W GORDON AVE
8     1401 W GORDON AVE
9     1411 W GORDON AVE
10    1413 W GORDON AVE
11    1415 W GORDON AVE
12    1417 W GORDON AVE
13    1419 W GORDON AVE
14    1421 W GORDON AVE
15    1418 W GORDON AVE
16    1416 W GORDON AVE
17    1414 W GORDON AVE
18    1412 W GORDON AVE
19    1410 W GORDON AVE
20    1408 W GORDON AVE
21     814 W GORDON AVE
dtype: object

#### Exploratory using gen list none

In [30]:
no_close = gen_list_none(f_addresses, not_found, 3)

In [31]:
no_close.shape # 282 entries that are no where near any addresses in our dataset

(282,)

In [32]:
no_close.head(20) # some of these could be based upon numbering. eleventh to 11th

0          2714 FLEMING RD
1     3624 SPRING FLATS RD
2     3624 SPRING FLATS RD
3        1008 ELEVENTH AVE
4        1008 ELEVENTH AVE
5       4525 SYLVESTER HWY
6       4525 SYLVESTER HWY
7       4525 SYLVESTER HWY
8       4525 SYLVESTER HWY
9         1200 GAISSERT RD
10        1200 GAISSERT RD
11        1200 GAISSERT RD
12        1200 GAISSERT RD
13        1200 GAISSERT RD
14        1601 TWELFTH AVE
15        1601 TWELFTH AVE
16        1601 TWELFTH AVE
17        1601 TWELFTH AVE
18            WYANDOTTE DR
19            WYANDOTTE DR
dtype: object

#### find numbers that we need to replace in the streets

In [33]:
find_numbers(f_addresses).iloc[18:25] # sample relevant output

18    E 2ND AVE
19    E 3RD AVE
20      5TH AVE
21      6TH AVE
22    W 2ND AVE
23      9TH AVE
24     12TH AVE
dtype: object

In [34]:
find_numbers(s_addresses)

Series([], dtype: object)

In [35]:
str_to_num = construct_number_dictionary()
s_addresses = s_addresses.replace(str_to_num, regex=True)

In [36]:
s_addresses[s_addresses.str.contains("FIRST")]

Series([], Name: Address, dtype: object)

In [37]:
not_found = s_addresses[~(s_addresses.isin(f_addresses))] # check again

In [38]:
not_found.shape

(891,)

In [39]:
not_found.head(5)

2    1912 RIDGEMONT DR
3    1912 RIDGEMONT DR
4    1912 RIDGEMONT DR
5      2714 FLEMING RD
6         618 PINE AVE
Name: Address, dtype: object

In [40]:
no_close = gen_list_none(f_addresses, not_found, 3) # check this again also

In [41]:
no_close.shape

(205,)

#### Addresses that we still don't have answers for. Are they in albany? or from dougherty? Useful to keep them all?

In [42]:
no_close.value_counts() # may not focus as much here

1027 KREG AVE             7
MOULTRIE RD               7
3221 JACQUELINE DR        6
1200 GAISSERT RD          5
GIBSON RD                 5
3219 JACQUELINE DR        5
908 TALLAHASSEE RD        5
4529 PALM AVE             5
1708 SCHILLING D C AVE    4
4914 VAN CISE LN          4
3522 GRAVEL HILL RD       4
3200 NELMS RD             4
ANTIOCH RD                4
4525 SYLVESTER HWY        4
2501 WILDFAIR RD          4
600 MUD CREEK RD          4
3356 FLEMING RD           4
FLEMING RD                4
3220 JACQUELINE DR        4
4922 VAN CISE LN          3
5001 VAN CISE LN          3
224 THE CIRCLE            3
222 THE CIRCLE            3
715 GABLE RD              3
226 THE CIRCLE            3
910 ROLAND RD             3
728 MUD CREEK RD          3
4912 HOLLY HILL RD        3
2615 TARVA RD             3
2724 QUAIL RUN DR         3
2711 HOMESTEAD AVE        3
3110 SPRING FLATS RD      3
1608 DOROUGH AVE          3
5202 DOLES RD             2
1050 SPRING HILL DR       2
824 PATTERSON AVE   

## Fixing endings as before

In [43]:
not_found.head()

2    1912 RIDGEMONT DR
3    1912 RIDGEMONT DR
4    1912 RIDGEMONT DR
5      2714 FLEMING RD
6         618 PINE AVE
Name: Address, dtype: object

In [44]:
df_sales.head()

Unnamed: 0,Parcel ID,Address,Sale Date,Sale Price,Qualified Sales,Reason,Acres,Parcel Class,Year Built,Square Ft,Price Per Square Ft,Neighborhood
0,0000L/00009/009,203 BARFIELD AVE,8/16/2017,$0.00,Unqualified,17,0.3,Residential,1959.0,1030.0,$0.00,MAP L
1,0000L/00009/009,203 BARFIELD AVE,8/16/2017,"$18,000.00",Unqualified,17,0.3,Residential,1959.0,1030.0,$17.48,MAP L
2,00059/00018/016,1912 RIDGEMONT DR,10/4/2011,$0.00,Unqualified,5,0.42,Residential,1969.0,1933.0,$0.00,
3,00059/00018/016,1912 RIDGEMONT DR,10/4/2011,$0.00,Unqualified,3,0.42,Residential,1969.0,1933.0,$0.00,
4,00059/00018/016,1912 RIDGEMONT DR,5/10/2019,$0.00,Unqualified,35,0.42,Residential,1969.0,1933.0,$0.00,
