In [1]:
import pandas as pd
import numpy as np
import Levenshtein
import math

In [2]:
pd.options.display.max_columns = 22
pd.options.display.max_rows = 1000

## Read in all data

In [5]:
df_taxes_list = []
path = "~/Dropbox (GaTech)/CDS-2019-AlbanyHub/Raw-Data/Tax/"
for i in range(2009, 2020):
    df_tmp = pd.read_csv(path + "AY" + str(i) + " Real Property_Personal Property.csv")
    df_tmp['YEAR'] = i # adding a year column to maintiain information 
    df_taxes_list.append(df_tmp)
df_full_taxes = pd.concat(df_taxes_list, ignore_index=True) # concat them all into one
df_full_taxes.head();

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


KeyboardInterrupt: 

In [6]:
df_home = pd.read_csv('~/Dropbox (GaTech)/CDS-2019-AlbanyHub/Processed-Data/fixed_home_v02.csv')
df_cdbg = pd.read_csv('~/Dropbox (GaTech)/CDS-2019-AlbanyHub/Processed-Data/fixed_cdbg.csv')

In [7]:
df_addr = pd.read_csv('~/Dropbox (GaTech)/CDS-2019-AlbanyHub/ToDatabase/addr_junct_table.csv')

## Functions

In [8]:
def genStrip(): # strip off extra white space
    return lambda addr: addr.strip()

In [9]:
def genSpacing(): # fix spacing--not really a need to do this nested function
    def test(word):
        if word == "  ":
            return ""
        elif type(word) == float:
            if math.isnan(word):
                return ""
        else:
            return word
    return test

In [10]:
# address_list is the list of addresses in which you want to test if your 'address' is within 'i' edits of any word
# ex. lev_list(['aaa', 'aa', 'a'], 'a', 2) returns [AA, A]
def lev_list(address_list, address, i): # will help us find near matches
    temp_list = []
    temp_list_u = []
    for u_address in address_list:
        if Levenshtein.distance(address, u_address.upper()) <= i:
            temp_list.append(address)
            temp_list_u.append(u_address.upper())
    temp_list_series = pd.Series(temp_list)
    temp_list_u_series = pd.Series(temp_list_u)
    return temp_list_u_series

In [11]:
# gives a heuristic as to whether a given address is so far off from anything found in our address_list
def gen_list_none(address_list, mismatched, i): # just to find how many addresses are so far off from anything in full
    tmp_list = []
    for address in mismatched:
        if address == "UNKNOWN ADDRESS" or address == "SUPPRESSED ADDRESS":
            continue
        if lev_list(address_list, address, i).shape[0] == 0:
            tmp_list.append(address)
    return pd.Series(tmp_list)

In [12]:
def contains_cardinal(addr): # returns None if no cardinal direction in the address, and direction otherwise
    directions = [' E ', ' W ', ' N ', ' S ', ' NE ', ' NW ', ' SW ', ' SE ']
    count = 0
    for car in directions:
        if car in addr:
            if "JOHNNY W WILLIAMS" in addr: # corner case where the w does not stand for west
                return None
            return car # tested earlier, only one instance of each in every single address (no double directions)
    return None

In [13]:
def construct_st_name(address): # returns the street name of the address w/o number, direction, or ending
    split = address.split()
    end = split[-1]
    first = "NA"
    start_index = 0
    end_index = len(split) - 1
    if split[0].isnumeric():
        start_index += 1
    if contains_cardinal(address) != None:
        start_index += 1
    first = split[start_index:end_index]
    word = ""
    for a in first:
        word += a + " "
    word = word.strip()
    return word

In [14]:
# f_addresses contains list of 'trusted' addresses
# constructs dictionary from all trusted addresses to create master lookup for endings
def construct_dictionary(f_addresses): 
    endings = {'AVE', 'DR', 'RD', 'ST', 'LN', 'CT', 'BLVD', 'CIR', 
               'WAY', 'PL', 'EXPY', 'ALY', 'TRL', 'PKWY', 'TER'}
    dictionary = {}
    counter = 0
    for i in range(len(f_addresses)):
        split = f_addresses[i].split()
        end = split[-1]
        if end in endings:
            st_name = construct_st_name(f_addresses[i])
            if st_name in dictionary.keys():
                existing_endings = dictionary[st_name] # should be a list of endings, as there could be multiple
                if end not in existing_endings:
                    counter += 1
                    existing_endings.append(end)
            else:
                dictionary[st_name] = [end]
    return dictionary

In [15]:
def construct_word_spacing(word_list): # takes in a list of words and returns in correct format
    #ex: construct_word_spacing(['123', 'JACKSON', 'ST']) returns '123 JACKSON ST'
    word = ""
    for part in word_list:
        word += part + " "
    return word.strip()

In [16]:
def reverse_ending_dictionary(f_addresses, t_addresses, not_found):
    # constructs a dictionary given the first part of the street and it's ending
    # example dictionary: {'JACKSON': 'ST'}
    f_addresses_set = set(f_addresses)
    t_addresses_set = set(t_addresses)
    ending_dictionary = construct_dictionary(f_addresses)
    replace_ending = {}
    for i in range(len(not_found)):
        current_address = not_found.iloc[i]
        split = current_address.split()
        no_ending_split = split[:-1]
        word = construct_word_spacing(no_ending_split)
        st_name = construct_st_name(not_found.iloc[i]) #removes number, cardinal direction, ending if exists
        if st_name in ending_dictionary.keys():
            for ending in ending_dictionary[st_name]:
                test_address = word + " " + ending
                if test_address in t_addresses_set:
                    if test_address in replace_ending.keys(): # this structure just uses first option
                        print("multiple addresses found for: " + test_address)
                    else:
                        replace_ending[test_address] = current_address
                # else just ignore
    return replace_ending

In [17]:
def reverse_direction_dictionary(f_addresses, t_addresses, not_found):
    # Creates a dictionary with st name and st direction
    # ex dictionary: {'MOCK': 'S'}
    f_addresses_set = set(f_addresses)
    t_addresses_set = set(t_addresses)
    replace_direction = {}
    for i in range(len(not_found)):
        current_address = not_found.iloc[i]
        car = contains_cardinal(current_address)
        if car != None:
            ind = current_address.find(car)
            addr = current_address[:ind] + " " + current_address[ind+3:]
            if addr in t_addresses_set:
                if addr in replace_direction.keys(): # this structure just uses first option
                    print("multiple directions found for: " + current_address)
                else:
                    replace_direction[addr] = current_address
    return replace_direction

In [18]:
def hasNumbers(inputString): # from stackoverflow
    return any(char.isdigit() for char in inputString)

In [19]:
def combine(row): # sorry for the repetition but does the same thing as genspacing above essentially
    word = ""
    for r in row:
        if r != None:
            word += r + " "
    return word.strip()

In [20]:
def number_dictionary(): # quick replacement for words to numbers
    string = ['FIRST', 'SECOND', 'THIRD', 'FOURTH', 'FIFTH', 'SIXTH', 'SEVENTH',
             'EIGHTH', 'NINTH', 'TENTH', 'ELEVENTH', 'TWELFTH', 'THIRTEENTH',
             'FOURTEENTH', 'FIFTEENTH', 'SIXTEENTH', 'SEVENTEENTH', 'EIGHTEENTH',
             'NINETEENTH', 'TWENTIETH']
    numeric = ['1ST', '2ND', '3RD', '4TH', '5TH', '6TH', '7TH', '8TH', '9TH',
              '10TH', '11TH', '12TH', '13TH', '14TH', '15TH', '16T', '17TH',
              '18TH', '19TH', '20TH']
    return dict(zip(string, numeric))

In [21]:
def find_numbers(addresses): # find all the numbers that would show in the middle of a street name
    # '3 jackson st' should not be flagged but '3 3rd st' should be
    test_addr = addresses.str.split(expand=True)
    test_addr = test_addr.drop(0, axis=1)
    test_addr = test_addr.apply(combine, axis=1)
    uniq = test_addr[test_addr.apply(hasNumbers)].unique()
    return pd.Series(uniq)

In [22]:
def return_not_found(f_addresses, t_addresses):
    return f_addresses[~f_addresses.isin(t_addresses)]

## Cleaning

In [23]:
# just some spacing issues with the address
df_full_taxes['FULL_ADDRESS'] = df_full_taxes['HOUSE_NO'].astype(str).apply(genStrip()) + \
' ' + df_full_taxes['STDIRECT'].apply(genSpacing()) + df_full_taxes["STREET_NAM"].apply(genStrip()) + ' ' + df_full_taxes["STREET_TYPE"].astype(str).apply(genStrip())

NameError: name 'df_full_taxes' is not defined

In [27]:
f_addresses = df_addr['Address'] # full set of addresses that we have
t_addresses = pd.Series(df_full_taxes['FULL_ADDRESS'].unique()) # unique set of tax addresses

In [28]:
print(t_addresses[~t_addresses.isin(f_addresses)].shape)
not_found = return_not_found(f_addresses, t_addresses)
print(not_found.shape) # see how many of all addresses are not found in the tax address set

(16456,)
(9130,)


In [30]:
find_numbers(not_found).head() # see how many of the ones we can't find are because of a number issue

0             W 4TH AVE
1              15TH AVE
2    1/2 S SLAPPEY BLVD
3                US 19S
4       US HIGHWAY 19 S
dtype: object

In [31]:
string_to_number = number_dictionary() # create the number dictionary
t_addresses = t_addresses.replace(string_to_number, regex=True) # and replace with number

In [32]:
return_not_found(f_addresses, t_addresses).shape # resolved around 2k records with just that!

(7759,)

In [33]:
ending_dictionary = reverse_ending_dictionary(f_addresses, t_addresses, not_found) # create dictionary for endings
direction_dictionary = reverse_direction_dictionary(f_addresses, t_addresses, not_found) # dictionary for direction

multiple directions found for: 1310 W HIGHLAND AVE
multiple directions found for: 1106 W 4TH AVE
multiple directions found for: 1111 W 4TH AVE
multiple directions found for: 1301 W 4TH AVE
multiple directions found for: 1202 W 3RD AVE
multiple directions found for: 1205 W 2ND AVE
multiple directions found for: 1209 W 2ND AVE
multiple directions found for: 1218 W 2ND AVE
multiple directions found for: 1403 W 2ND AVE
multiple directions found for: 1413 W LINCOLN AVE
multiple directions found for: 815 S WESTOVER BLVD
multiple directions found for: 1306 E 2ND AVE
multiple directions found for: 1302 E 2ND AVE
multiple directions found for: 1300 E 2ND AVE
multiple directions found for: 1216 E 2ND AVE
multiple directions found for: 1212 E 2ND AVE
multiple directions found for: 1210 E 2ND AVE
multiple directions found for: 1208 E 2ND AVE
multiple directions found for: 1204 E 2ND AVE
multiple directions found for: 1203 E 2ND AVE
multiple directions found for: 1211 E 2ND AVE
multiple directions 

In [35]:
ending_dictionary; #take a peak

In [29]:
direction_dictionary;

In [36]:
t_addresses = t_addresses.replace(ending_dictionary) # replace values
t_addresses = t_addresses.replace(direction_dictionary) # replace values

In [37]:
not_found = return_not_found(f_addresses, t_addresses)

In [38]:
not_found.shape # resolved another 2k! That's good enough for me

(5473,)

In [39]:
not_found.head(10) # a taste of some of our addresses found in the full not found in tax. could be something to look into.

4           509 W GORDON AVE
23             2632 ERICA CT
30        235 BONNY VIEW AVE
33              1200 AUGUSTA
37            205 COLLINS ST
46          1705 E BROAD AVE
50    1001 RADIUM SPRINGS RD
56           109 W BROAD AVE
57             110 SHELBY LN
58             112 SHELBY LN
Name: Address, dtype: object

## Pick out relevant columns and ship for primary id merging

In [40]:
tmp = list(df_full_taxes)[:15]
tmp.append("TOTAL_ACRES")
tmp.extend(list(df_full_taxes)[16:])

In [41]:
df_full_taxes.columns = tmp

In [42]:
to_ship = df_full_taxes[['FULL_ADDRESS', 'ZONE_CODE', 'PARCEL_NO', 'PREV_VAL', 'CURR_VAL', 'VALCHGDATE', 'PROPERTY_CLASS', 'TOTAL_ACRES', 'YEAR']]

In [43]:
to_ship.head()

Unnamed: 0,FULL_ADDRESS,ZONE_CODE,PARCEL_NO,PREV_VAL,CURR_VAL,VALCHGDATE,PROPERTY_CLASS,TOTAL_ACRES,YEAR
0,0 A C L RAILROAD nan,C3,00104/00001/014,13100,13100,11/12/2008,RESIDENTIAL,0.0,2009
1,0 A C L RAILROAD nan,C3,00104/00001/015,26300,26300,11/12/2008,COMMERCIAL,0.39,2009
2,0 A C L RAILROAD nan,C3,00104/00001/016,990,990,11/12/2008,COMMERCIAL,0.25,2009
3,0 A C L RAILROAD nan,AG,00155/00004/013,21400,21400,4/16/2007,RESIDENTIAL,17.19,2009
4,0 A C L RAILROAD nan,AG,00186/00001/006,28600,28600,6/20/2007,RESIDENTIAL,14.3,2009


In [39]:
#to_ship.to_csv(path + 'TotalTax.csv', index=False)

## Here you run the map_addr_key_tax file -- there are instructions there

## Check that the csv + primaryid merge worked alright

In [45]:
df_total_tax = pd.read_csv('/Users/william/Dropbox (Amherst College)/CDS-2019-AlbanyHub/Old Files dump/TotalTax.csv')

In [46]:
df_total_tax.head()

Unnamed: 0.1,Unnamed: 0,FULL_ADDRESS,ZONE_CODE,PARCEL_NO,PREV_VAL,CURR_VAL,VALCHGDATE,PROPERTY_CLASS,TOTAL_ACRES,YEAR,PrimaryID
0,0,0 A C L RAILROAD nan,C3,00104/00001/014,13100,13100,11/12/2008,RESIDENTIAL,0.0,2009,30589
1,1,0 A C L RAILROAD nan,C3,00104/00001/015,26300,26300,11/12/2008,COMMERCIAL,0.39,2009,30589
2,2,0 A C L RAILROAD nan,C3,00104/00001/016,990,990,11/12/2008,COMMERCIAL,0.25,2009,30589
3,3,0 A C L RAILROAD nan,AG,00155/00004/013,21400,21400,4/16/2007,RESIDENTIAL,17.19,2009,30589
4,4,0 A C L RAILROAD nan,AG,00186/00001/006,28600,28600,6/20/2007,RESIDENTIAL,14.3,2009,30589


## solving an issue with the dates

In [52]:
df_total_tax.drop('Unnamed: 0', axis=1, inplace=True) # unnecessary column

In [53]:
df_total_tax.head(200)

Unnamed: 0,FULL_ADDRESS,ZONE_CODE,PARCEL_NO,PREV_VAL,CURR_VAL,VALCHGDATE,PROPERTY_CLASS,TOTAL_ACRES,YEAR,PrimaryID
0,0 A C L RAILROAD nan,C3,00104/00001/014,13100,13100,11/12/2008,RESIDENTIAL,0.0,2009,30589
1,0 A C L RAILROAD nan,C3,00104/00001/015,26300,26300,11/12/2008,COMMERCIAL,0.39,2009,30589
2,0 A C L RAILROAD nan,C3,00104/00001/016,990,990,11/12/2008,COMMERCIAL,0.25,2009,30589
3,0 A C L RAILROAD nan,AG,00155/00004/013,21400,21400,4/16/2007,RESIDENTIAL,17.19,2009,30589
4,0 A C L RAILROAD nan,AG,00186/00001/006,28600,28600,6/20/2007,RESIDENTIAL,14.3,2009,30589
5,0 A C L RAILROAD nan,R4,00104/00002/031,3200,3200,,RESIDENTIAL,0.21,2009,30589
6,0 A C L RAILROAD nan,R4,00104/00002/030,4300,4300,,RESIDENTIAL,0.21,2009,30589
7,0 A C L RAILROAD nan,R4,00104/00002/004,3000,3000,3/6/2009,RESIDENTIAL,0.17,2009,30589
8,0 A C L RAILROAD nan,R6,00104/00001/013,4000,4000,11/12/2008,RESIDENTIAL,1.05,2009,30589
9,0 A C L RAILROAD nan,R4,00104/00002/003,9400,9400,10/31/2008,RESIDENTIAL,0.22,2009,30589


In [54]:
df_total_tax['VALCHGDATE'][0] #see how they are encoded

'11/12/2008'

In [55]:
df_total_tax[df_total_tax['FULL_ADDRESS'].str.contains("2700 ABBEY LN")] #test example

Unnamed: 0,FULL_ADDRESS,ZONE_CODE,PARCEL_NO,PREV_VAL,CURR_VAL,VALCHGDATE,PROPERTY_CLASS,TOTAL_ACRES,YEAR,PrimaryID
19,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,,RESIDENTIAL,1.03,2009,13777
37848,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,10/20/2009,RESIDENTIAL,1.03,2010,13777
75738,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,10/20/2009,RESIDENTIAL,1.03,2011,13777
113843,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,,RESIDENTIAL,1.03,2012,13777
151904,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,,RESIDENTIAL,1.03,2013,13777
189989,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,,RESIDENTIAL,1.03,2014,13777
228128,2700 ABBEY LN,R1A,00333/00009/001,159060,157560,4/20/2015,RESIDENTIAL,1.03,2015,13777
266160,2700 ABBEY LN,R1A,00333/00009/001,157560,157560,4/20/2015,RESIDENTIAL,1.03,2016,13777
304229,2700 ABBEY LN,R1A,00333/00009/001,157560,157560,4/20/2015,RESIDENTIAL,1.03,2017,13777
342756,2700 ABBEY LN,R1A,00333/00009/001,157560,157560,,COMMERCIAL,1.03,2018,13777


In [56]:
df_tax_sorted = df_total_tax.sort_values(by=['FULL_ADDRESS', 'YEAR', 'VALCHGDATE']) # sort df by addr, year, chg date

In [57]:
df_tax_sorted.index = range(len(df_tax_sorted))

In [58]:
df_tax_sorted[df_tax_sorted['FULL_ADDRESS'].str.contains("2700 ABBEY LN")] # example

Unnamed: 0,FULL_ADDRESS,ZONE_CODE,PARCEL_NO,PREV_VAL,CURR_VAL,VALCHGDATE,PROPERTY_CLASS,TOTAL_ACRES,YEAR,PrimaryID
234850,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,,RESIDENTIAL,1.03,2009,13777
234851,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,10/20/2009,RESIDENTIAL,1.03,2010,13777
234852,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,10/20/2009,RESIDENTIAL,1.03,2011,13777
234853,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,,RESIDENTIAL,1.03,2012,13777
234854,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,,RESIDENTIAL,1.03,2013,13777
234855,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,,RESIDENTIAL,1.03,2014,13777
234856,2700 ABBEY LN,R1A,00333/00009/001,159060,157560,4/20/2015,RESIDENTIAL,1.03,2015,13777
234857,2700 ABBEY LN,R1A,00333/00009/001,157560,157560,4/20/2015,RESIDENTIAL,1.03,2016,13777
234858,2700 ABBEY LN,R1A,00333/00009/001,157560,157560,4/20/2015,RESIDENTIAL,1.03,2017,13777
234859,2700 ABBEY LN,R1A,00333/00009/001,157560,157560,,COMMERCIAL,1.03,2018,13777


In [59]:
df_tax_sorted['VALCHGDATE'] = df_tax_sorted['VALCHGDATE'].astype(str)

In [61]:
def fix_dates(): # this will create the tsdates column that uses a rule based system for updating the dates
    # currently valchg date has a lot of nans as you can see in the above example. so if we want to use this data
    # as time series we should add dates for the nans, with the previous date.
    i = 0
    TS_DATES = []
    while i < len(df_tax_sorted):
        if i % 50000 == 0:
            print(str(i))
        try:
            existing_entry = df_tax_sorted.iloc[i]
            existing_address = existing_entry['FULL_ADDRESS']
            existing_csv_year = existing_entry['YEAR']
            existing_date = existing_entry['VALCHGDATE']
            if existing_date == 'nan':
                TS_DATES.append("NA")
                while df_tax_sorted.iloc[i+1]['VALCHGDATE'] == 'nan':
                    TS_DATES.append("NA")
                    i = i + 1
            else:
                TS_DATES.append(existing_date)
                while df_tax_sorted.iloc[i+1]['FULL_ADDRESS'] == existing_address:
                    if df_tax_sorted.iloc[i+1]['VALCHGDATE'] == 'nan':
                        TS_DATES.append(existing_date)
                    else:
                        TS_DATES.append(df_tax_sorted.iloc[i+1]['VALCHGDATE'])
                        existing_date = df_tax_sorted.iloc[i+1]['VALCHGDATE']
                    i = i + 1
        except IndexError:
            print(str(i))
        i = i + 1
    return pd.Series(TS_DATES)

In [62]:
TS_DATES = fix_dates()

0
50000
100000
250000
300000
421914


In [63]:
df_tax_sorted['TS_DATES'] = TS_DATES

In [64]:
df_tax_sorted.head(20)

Unnamed: 0,FULL_ADDRESS,ZONE_CODE,PARCEL_NO,PREV_VAL,CURR_VAL,VALCHGDATE,PROPERTY_CLASS,TOTAL_ACRES,YEAR,PrimaryID,TS_DATES
0,0 A C L RAILROAD,R4,00104/00002/031,5000,5000,,RESIDENTIAL,0.21,2017,30589,
1,0 A C L RAILROAD,R4,00104/00002/006,3000,3000,,RESIDENTIAL,0.24,2017,30589,
2,0 A C L RAILROAD,R4,00104/00002/031,5000,5000,,RESIDENTIAL,0.21,2018,30589,
3,0 A C L RAILROAD,R4,00104/00002/006,3000,3000,,RESIDENTIAL,0.24,2018,30589,
4,0 A C L RAILROAD,R4,00104/00002/031,5000,5000,,RESIDENTIAL,0.21,2019,30589,
5,0 A C L RAILROAD,R4,00104/00002/006,3000,3000,,RESIDENTIAL,0.24,2019,30589,
6,0 A C L RAILROAD,R1B,00122/00001/020,314700,314700,,RESIDENTIAL,152.66,2019,30589,
7,0 A C L RAILROAD nan,C5,00140/00001/007,175000,175000,1/13/2009,,43.75,2009,30589,1/13/2009
8,0 A C L RAILROAD nan,R4,00104/00002/003,9400,9400,10/31/2008,RESIDENTIAL,0.22,2009,30589,10/31/2008
9,0 A C L RAILROAD nan,C3,00104/00001/014,13100,13100,11/12/2008,RESIDENTIAL,0.0,2009,30589,11/12/2008


In [65]:
df_tax_sorted.iloc[7]['TS_DATES']

'1/13/2009'

In [66]:
def construct_ymd(): # takes ts dates into a more workable format
    year = []
    month = []
    day = []
    for i in range(len(df_tax_sorted)):
        if i % 50000 == 0:
            print(str(i))
        current_date = df_tax_sorted.iloc[i]['TS_DATES']
        if current_date == "NA":
            year.append("NA")
            month.append("NA")
            day.append("NA")
        else:
            split = current_date.split("/")
            year.append(split[2])
            month.append(split[0])
            day.append(split[1])
    return pd.DataFrame({'TS_YEAR': year, 'TS_MONTH': month, 'TS_DAY': day}) #change to TS

In [67]:
ymd = construct_ymd()

0
50000
100000
150000
200000
250000
300000
350000
400000


In [68]:
ymd.head(20)

Unnamed: 0,TS_YEAR,TS_MONTH,TS_DAY
0,,,
1,,,
2,,,
3,,,
4,,,
5,,,
6,,,
7,2009.0,1.0,13.0
8,2008.0,10.0,31.0
9,2008.0,11.0,12.0


In [70]:
df_final = pd.concat([df_tax_sorted, ymd], axis=1) # this is tax data

In [80]:
#df_final.to_csv('~/Dropbox (Amherst College)/CDS-2019-AlbanyHub/ToDatabase/TotalTax_v03.csv', index=False)

#### some last minute stats

In [76]:
test_t_addr = df_final['FULL_ADDRESS']
test_f_addr = df_addr['Address']
not_found = test_f_addr[~test_f_addr.isin(test_t_addr)]

In [77]:
not_found.shape #discrepancy from before seems to be based off of the fix addresses master for some reason check out later

(8795,)

In [78]:
test_t_addr.shape

(421915,)

In [79]:
test_f_addr.shape

(30590,)

In [121]:
(30721-5868) / 30721

0.8089905927541421