In [1]:
import pandas as pd
import numpy as np
import Levenshtein
import math

In [2]:
pd.options.display.max_columns = 22
pd.options.display.max_rows = 1000

## Read in all data

In [3]:
df_taxes_list = []
path = "~/Dropbox (Amherst College)/CDS-2019-AlbanyHub/Raw-Data/"
for i in range(2009, 2020):
    df_tmp = pd.read_csv(path + "AY" + str(i) + " Real Property_Personal Property.csv")
    df_tmp['YEAR'] = i
    df_taxes_list.append(df_tmp)
df_full_taxes = pd.concat(df_taxes_list, ignore_index=True)
df_full_taxes.head();

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
df_home = pd.read_csv('~/Dropbox (Amherst College)/CDS-2019-AlbanyHub/Raw-Data/fixed_home_v02.csv')
df_cdbg = pd.read_csv('~/Dropbox (Amherst College)/CDS-2019-AlbanyHub/Raw-Data/fixed_cdbg.csv')

In [5]:
df_addr = pd.read_csv('~/Dropbox (Amherst College)/CDS-2019-AlbanyHub/ToDatabase/addr_junct_table.csv')

## Functions

In [6]:
def genStrip(): # strip off extra white space
    return lambda addr: addr.strip()

In [7]:
def genSpacing(): # fix spacing
    def test(word):
        if word == "  ":
            return ""
        elif type(word) == float:
            if math.isnan(word):
                return ""
        else:
            return word
    return test

In [8]:
def lev_list(address_list, address, i): # will help us find near matches
    temp_list = []
    temp_list_u = []
    for u_address in address_list:
        if Levenshtein.distance(address, u_address.upper()) <= i:
            temp_list.append(address)
            temp_list_u.append(u_address.upper())
    temp_list_series = pd.Series(temp_list)
    temp_list_u_series = pd.Series(temp_list_u)
    return temp_list_u_series

In [9]:
def gen_list_none(address_list, mismatched, i): # just to find how many addresses are so far off from anything in full
    tmp_list = []
    for address in mismatched:
        if address == "UNKNOWN ADDRESS" or address == "SUPPRESSED ADDRESS":
            continue
        if lev_list(address_list, address, i).shape[0] == 0:
            tmp_list.append(address)
    return pd.Series(tmp_list)

In [10]:
def contains_cardinal(addr): # returns None if no cardinal direction, and direction otherwise
    directions = [' E ', ' W ', ' N ', ' S ', ' NE ', ' NW ', ' SW ', ' SE ']
    count = 0
    for car in directions:
        if car in addr:
            if "JOHNNY W WILLIAMS" in addr:
                return None
            return car # tested earlier, only one instance of each in every single address (no double directions)
    return None

In [11]:
def construct_st_name(address): # returns the street name of the address w/o number, direction, or ending
    split = address.split()
    end = split[-1]
    first = "NA"
    start_index = 0
    end_index = len(split) - 1
    if split[0].isnumeric():
        start_index += 1
    if contains_cardinal(address) != None:
        start_index += 1
    first = split[start_index:end_index]
    word = ""
    for a in first:
        word += a + " "
    word = word.strip()
    return word

In [12]:
def construct_dictionary(): # constructs dictionary from full addresses to create master lookup for endings
    endings = {'AVE', 'DR', 'RD', 'ST', 'LN', 'CT', 'BLVD', 'CIR', 
               'WAY', 'PL', 'EXPY', 'ALY', 'TRL', 'PKWY', 'TER'}
    dictionary = {}
    counter = 0
    for i in range(len(f_addresses)):
        split = f_addresses[i].split()
        end = split[-1]
        if end in endings:
            st_name = construct_st_name(f_addresses[i])
            if st_name in dictionary.keys():
                existing_endings = dictionary[st_name]
                if end not in existing_endings:
                    counter += 1
                    existing_endings.append(end)
            else:
                dictionary[st_name] = [end]
    return dictionary

In [13]:
def test(): # takes in the dictionary and tests to see if a new ending for a given address can be found in full data
    f_addresses_set = set(f_addresses)
    t_addresses_set = set(t_addresses)
    lis = []
    newly_found = []
    replace_in_tax = {}
    words = []
    dictionary = construct_dictionary()
    counter = 0
    for i in range(len(no_tax)):
        split = no_tax.iloc[i].split()
        word = split[:-1]
        wordo = ""
        for a in word:
            wordo += a + " "
        wordo = wordo.strip()
        st_name = construct_st_name(no_tax.iloc[i])
        found = False
        if st_name in dictionary.keys():
            for ending in dictionary[st_name]: #test a full address
                test = wordo + " " + ending
                words.append(test)
                if test in t_addresses_set:
                    newly_found.append(test)
                    replace_in_tax[test] = no_tax.iloc[i]
                    found = True
                    continue
            if not found:
                newly_found.append(no_tax.iloc[i])
        else:
            newly_found.append(no_tax.iloc[i])
            
    return pd.Series(newly_found), replace_in_tax

In [14]:
df_full_taxes['FULL_ADDRESS'] = df_full_taxes['HOUSE_NO'].astype(str).apply(genStrip()) + \
' ' + df_full_taxes['STDIRECT'].apply(genSpacing()) + df_full_taxes["STREET_NAM"].apply(genStrip()) + ' ' + df_full_taxes["STREET_TYPE"].astype(str).apply(genStrip())

In [15]:
f_addresses = df_addr['Address']
t_addresses = pd.Series(df_full_taxes['FULL_ADDRESS'].unique()) # to compare

In [16]:
t_addresses[~t_addresses.isin(f_addresses)].shape

(16224,)

In [17]:
no_tax = f_addresses[~f_addresses.isin(t_addresses)] # addresses that we "don't" have tax data for

In [18]:
no_tax.shape

(9029,)

In [19]:
def hasNumbers(inputString): # from stackoverflow
    return any(char.isdigit() for char in inputString)

In [20]:
def combine(row):
    word = ""
    for r in row:
        if r != None:
            word += r + " "
    return word.strip()

In [21]:
test_addr = df_addr['Address'].str.split(expand=True)

In [22]:
test_addr = test_addr.drop(0, axis=1)

In [23]:
test_addr = test_addr.apply(combine, axis=1)

In [24]:
test_addr[test_addr.apply(hasNumbers)].unique()

array(['W 4TH AVE', '15TH AVE', '1/2 S SLAPPEY BLVD', 'US 19S',
       'US HIGHWAY 19 S', 'US HIGHWAY 280 W', 'W 3RD AVE', '3RD AVE',
       'US HWY 82', 'GILLIONVILLE (SUBSTATION 21) RD', '21ST AVE',
       'W 1ST AVE', '16TH', '1/2 DAWSON RD', 'E 4TH AVE',
       'US HIGHWAY 19S', 'US 19 S', '1/2 KEYSTONE AVE', 'E 2ND AVE',
       'E 3RD AVE', '5TH AVE', '6TH AVE', 'W 2ND AVE', '9TH AVE',
       '12TH AVE', '8TH AVE', 'E 1ST AVE', '10TH AVE', '2ND AVE',
       '16TH AVE', '7TH AVE', '17TH AVE', '11TH AVE', '18TH AVE',
       '1/2 GROVES LN', '1ST AVE', '4TH AVE', 'W US HWY 82', '14TH AVE',
       '1/2 FLINT AVE', '1/2 W TIFT AVE', 'W 2ND W AVE', 'W 3RD W AVE',
       '13TH AVE', '20TH AVE', '4TH AVENUE ALY', '6TH AVENUE ALY',
       'E 4TH E AVE', '1/2 WINGATE AVE'], dtype=object)

In [25]:
def construct_number_dictionary():
    string = ['FIRST', 'SECOND', 'THIRD', 'FOURTH', 'FIFTH', 'SIXTH', 'SEVENTH',
             'EIGHTH', 'NINTH', 'TENTH', 'ELEVENTH', 'TWELFTH', 'THIRTEENTH',
             'FOURTEENTH', 'FIFTEENTH', 'SIXTEENTH', 'SEVENTEENTH', 'EIGHTEENTH',
             'NINETEENTH', 'TWENTIETH']
    numeric = ['1ST', '2ND', '3RD', '4TH', '5TH', '6TH', '7TH', '8TH', '9TH',
              '10TH', '11TH', '12TH', '13TH', '14TH', '15TH', '16T', '17TH',
              '18TH', '19TH', '20TH']
    return dict(zip(string, numeric))

In [26]:
def fix_addresses(): # fixes directions after getting endings
    f_addresses_set = set(f_addresses)
    t_addresses_set = set(t_addresses)
    newly_found_2 = []
    newly_found, replace_in_tax = test()
    number_dict = construct_number_dictionary()
    for i in range(len(newly_found)): # loop through values in main that do not have tax data
        addr = newly_found.iloc[i]
        car = contains_cardinal(addr)
        ind = "NA"
        if car != None:
            ind = addr.find(car)
            addr = addr[:ind] + " " + addr[ind+3:]
            if addr in t_addresses_set: # check if we can find the addr if we take out the cardinal direction
                newly_found_2.append(addr) # if we can, we ultimately want to update the tax data with the correct direction
                replace_in_tax[addr] = newly_found.iloc[i]
                continue
            else:
                newly_found_2.append(addr)
        else:
            newly_found_2.append(addr)
    newly_found_2 = pd.Series(newly_found_2)
    return newly_found_2, replace_in_tax

In [27]:
newly_found, replace_in_tax = fix_addresses()

In [28]:
newly_found.shape

(9029,)

## only testing on t_addresses, not actual dataset (CLEAN SECTION UP)

In [29]:
replaced = t_addresses.replace(replace_in_tax)

In [30]:
replaced.shape

(37916,)

In [31]:
no_tax_after_replace = f_addresses[~f_addresses.isin(replaced)]

In [32]:
no_tax_after_replace.shape

(7239,)

In [33]:
no_tax_after_replace.head(25)

4           509 W GORDON AVE
23             2632 ERICA CT
30        235 BONNY VIEW AVE
33              1200 AUGUSTA
37          205 S COLLINS ST
46          1705 E BROAD AVE
50    1001 RADIUM SPRINGS RD
56           109 W BROAD AVE
57             110 SHELBY LN
58             112 SHELBY LN
65        252 BONNY VIEW AVE
66        243 BONNY VIEW AVE
67        247 BONNY VIEW AVE
68        248 BONNY VIEW AVE
69        251 BONNY VIEW AVE
71           2530 STUART AVE
73            2726 BETTYS DR
76          2448 BROADWAY CT
77           248 BROADWAY CT
78           713 SHAMROCK DR
79           715 SHAMROCK DR
82             604 W 4TH AVE
93        311 S SLAPPEY BLVD
94        3353 TWINFLOWER RD
96           405 THORNTON DR
Name: Address, dtype: object

In [34]:
lev_list(t_addresses, "110 SHELBY LN", 3)

0       0 SHELBY LN
1     110 SHELBY DR
2     111 SHELBY DR
3     112 SHELBY DR
4     113 SHELBY DR
5     114 SHELBY LN
6     115 SHELBY LN
7     117 SHELBY LN
8     119 SHELBY LN
9     121 SHELBY LN
10    122 SHELBY LN
11    123 SHELBY LN
12    125 SHELBY LN
13    127 SHELBY LN
14    204 SHELBY LN
15    205 SHELBY LN
16    206 SHELBY LN
17    207 SHELBY LN
18    208 SHELBY LN
19    209 SHELBY LN
20    210 SHELBY LN
21    211 SHELBY LN
22    212 SHELBY LN
23    213 SHELBY LN
24    214 SHELBY LN
25    215 SHELBY LN
26    216 SHELBY LN
27    217 SHELBY LN
28    218 SHELBY LN
29    219 SHELBY LN
30    220 SHELBY LN
31    221 SHELBY LN
32    222 SHELBY LN
33    223 SHELBY LN
34    224 SHELBY LN
dtype: object

In [35]:
f_addresses.shape

(30721,)

In [36]:
(30721-7239) / 30721

0.7643631392207285

In [37]:
df_full_taxes.head(100)

Unnamed: 0,OWNER_NAME,HOUSE_NO,STDIRECT,STREET_NAM,STREET_TYPE,UNIT,PROP_ZIP,ZONE_CODE,SUBDIVISION,PARCEL_NO,PREV_VAL,...,VALCHGDATE,TAXDISTRIC,HOMEEXEMPT,TOTALACRES,PROPERTY_CLASS,DIG_STRAT,BUSI_ID,NAICS,PERSKEY,YEAR,FULL_ADDRESS
0,NICHOLS DORRIS WILDER,0,,A C L RAILROAD,,,,C3,COUNTY LINE - ACREE AREA,00104/00001/014,13100,...,11/12/2008,2,S0,0.0,RESIDENTIAL,3,,,,2009,0 A C L RAILROAD nan
1,WESTON MONTY,0,,A C L RAILROAD,,,,C3,COUNTY LINE - ACREE AREA,00104/00001/015,26300,...,11/12/2008,2,S0,0.39,COMMERCIAL,3,,,,2009,0 A C L RAILROAD nan
2,NICHOLS JOE,0,,A C L RAILROAD,,,,C3,COUNTY LINE - ACREE AREA,00104/00001/016,990,...,11/12/2008,2,S0,0.25,COMMERCIAL,3,,,,2009,0 A C L RAILROAD nan
3,ISRAEL CAROLYN R ETAL,0,,A C L RAILROAD,,,,AG,MASSEY,00155/00004/013,21400,...,4/16/2007,1,S0,17.19,RESIDENTIAL,4,,,,2009,0 A C L RAILROAD nan
4,GEER RET,0,,A C L RAILROAD,,,,AG,LIBERTY EXPRESSWAY INDUSTRIAL AREA,00186/00001/006,28600,...,6/20/2007,1,S0,14.3,RESIDENTIAL,4,,,,2009,0 A C L RAILROAD nan
5,WILDER SARAH P & DORRIS,0,,A C L RAILROAD,,,,R4,M. L. WILDER,00104/00002/031,3200,...,,2,S0,0.21,RESIDENTIAL,3,,,,2009,0 A C L RAILROAD nan
6,WILDER SARAH P & DORRIS,0,,A C L RAILROAD,,,,R4,M. L. WILDER,00104/00002/030,4300,...,,2,S0,0.21,RESIDENTIAL,3,,,,2009,0 A C L RAILROAD nan
7,HENDERSON PEARL WILDER,0,,A C L RAILROAD,,,,R4,M. L. WILDER,00104/00002/004,3000,...,3/6/2009,2,S0,0.17,RESIDENTIAL,3,,,,2009,0 A C L RAILROAD nan
8,WILDER SARAH P & DORRIS,0,,A C L RAILROAD,,,,R6,COUNTY LINE - ACREE AREA,00104/00001/013,4000,...,11/12/2008,2,S0,1.05,RESIDENTIAL,3,,,,2009,0 A C L RAILROAD nan
9,HENDERSON PEARL WILDER,0,,A C L RAILROAD,,,,R4,M. L. WILDER,00104/00002/003,9400,...,10/31/2008,2,S0,0.22,RESIDENTIAL,3,,,,2009,0 A C L RAILROAD nan


In [38]:
t_all_addresses = df_full_taxes['FULL_ADDRESS']

In [39]:
t_all_addresses.shape

(421915,)

In [40]:
replaced_t_all_addresses = t_all_addresses.replace(replace_in_tax)

In [56]:
dictionary = construct_number_dictionary()
replaced_t_all_addresses = replaced_t_all_addresses.replace(dictionary, regex=True)

In [57]:
no_tax_after_replace = f_addresses[~f_addresses.isin(replaced_t_all_addresses)]

In [58]:
no_tax_after_replace.shape

(5868,)

In [59]:
df_full_taxes['FULL_ADDRESS'] = replaced_t_all_addresses

In [60]:
tmp = list(df_full_taxes)[:15]

In [61]:
tmp.append("TOTAL_ACRES")

In [62]:
tmp.extend(list(df_full_taxes)[16:])

In [63]:
df_full_taxes.columns = tmp

In [64]:
to_ship = df_full_taxes[['FULL_ADDRESS', 'ZONE_CODE', 'PARCEL_NO', 'PREV_VAL', 'CURR_VAL', 'VALCHGDATE', 'PROPERTY_CLASS', 'TOTAL_ACRES', 'YEAR']]

In [65]:
to_ship.head()

Unnamed: 0,FULL_ADDRESS,ZONE_CODE,PARCEL_NO,PREV_VAL,CURR_VAL,VALCHGDATE,PROPERTY_CLASS,TOTAL_ACRES,YEAR
0,0 A C L RAILROAD nan,C3,00104/00001/014,13100,13100,11/12/2008,RESIDENTIAL,0.0,2009
1,0 A C L RAILROAD nan,C3,00104/00001/015,26300,26300,11/12/2008,COMMERCIAL,0.39,2009
2,0 A C L RAILROAD nan,C3,00104/00001/016,990,990,11/12/2008,COMMERCIAL,0.25,2009
3,0 A C L RAILROAD nan,AG,00155/00004/013,21400,21400,4/16/2007,RESIDENTIAL,17.19,2009
4,0 A C L RAILROAD nan,AG,00186/00001/006,28600,28600,6/20/2007,RESIDENTIAL,14.3,2009


In [66]:
#to_ship.to_csv(path + 'TotalTax.csv', index=False)

## Check that the csv + primaryid merge worked alright

In [67]:
df_total_tax = pd.read_csv(path + 'TotalTax.csv')

In [68]:
na_date = df_full_taxes[df_full_taxes['VALCHGDATE'].isna()]

In [69]:
na_date.shape

(205574, 23)

In [70]:
zone_codes = df_total_tax.groupby('FULL_ADDRESS').count()['ZONE_CODE']

In [71]:
zone_codes[zone_codes % 11 != 0].value_counts()

3       686
4       482
2       472
8       465
7       452
6       430
5       358
9       320
1       281
10      154
14      127
13      111
12       90
15       53
17       46
16       45
19       35
20       29
18       23
21       16
26       14
23       14
25       12
24       10
34        8
28        8
27        7
30        6
31        6
32        6
46        5
37        5
29        5
47        5
42        5
60        4
35        4
45        4
38        4
43        4
48        3
102       2
62        2
41        2
125       2
49        2
53        2
39        2
56        2
40        2
36        2
80        2
57        1
359       1
356       1
332       1
184       1
69        1
140       1
124       1
120       1
112       1
64        1
65        1
461       1
73        1
162       1
174       1
367       1
51        1
59        1
67        1
71        1
75        1
87        1
123       1
135       1
163       1
171       1
211       1
279       1
295       1
166       1
118 

In [72]:
df_full_taxes['FULL_ADDRESS'][df_full_taxes['PERSKEY'].isna()]

0             0 A C L RAILROAD nan
1             0 A C L RAILROAD nan
2             0 A C L RAILROAD nan
3             0 A C L RAILROAD nan
4             0 A C L RAILROAD nan
5             0 A C L RAILROAD nan
6             0 A C L RAILROAD nan
7             0 A C L RAILROAD nan
8             0 A C L RAILROAD nan
9             0 A C L RAILROAD nan
10            0 A C L RAILROAD nan
11            0 A C L RAILROAD nan
12            0 A C L RAILROAD nan
13            0 A C L RAILROAD nan
14            0 A C L RAILROAD nan
15            0 A C L RAILROAD nan
16            0 A C L RAILROAD nan
17            0 A C L RAILROAD nan
18         4408 A C L RAILROAD nan
19                   2700 ABBEY LN
20                   2701 ABBEY LN
21                   2702 ABBEY LN
22                   2703 ABBEY LN
23                   2704 ABBEY LN
24                   2705 ABBEY LN
25                   2706 ABBEY LN
26                   2707 ABBEY LN
27                   2708 ABBEY LN
28                  

## solving an issue with the dates

In [73]:
df_total_tax.drop('Unnamed: 0', axis=1, inplace=True)

In [74]:
df_total_tax.head(200)

Unnamed: 0,FULL_ADDRESS,ZONE_CODE,PARCEL_NO,PREV_VAL,CURR_VAL,VALCHGDATE,PROPERTY_CLASS,TOTAL_ACRES,YEAR,PrimaryID
0,0 A C L RAILROAD nan,C3,00104/00001/014,13100,13100,11/12/2008,RESIDENTIAL,0.0,2009,30720
1,0 A C L RAILROAD nan,C3,00104/00001/015,26300,26300,11/12/2008,COMMERCIAL,0.39,2009,30720
2,0 A C L RAILROAD nan,C3,00104/00001/016,990,990,11/12/2008,COMMERCIAL,0.25,2009,30720
3,0 A C L RAILROAD nan,AG,00155/00004/013,21400,21400,4/16/2007,RESIDENTIAL,17.19,2009,30720
4,0 A C L RAILROAD nan,AG,00186/00001/006,28600,28600,6/20/2007,RESIDENTIAL,14.3,2009,30720
5,0 A C L RAILROAD nan,R4,00104/00002/031,3200,3200,,RESIDENTIAL,0.21,2009,30720
6,0 A C L RAILROAD nan,R4,00104/00002/030,4300,4300,,RESIDENTIAL,0.21,2009,30720
7,0 A C L RAILROAD nan,R4,00104/00002/004,3000,3000,3/6/2009,RESIDENTIAL,0.17,2009,30720
8,0 A C L RAILROAD nan,R6,00104/00001/013,4000,4000,11/12/2008,RESIDENTIAL,1.05,2009,30720
9,0 A C L RAILROAD nan,R4,00104/00002/003,9400,9400,10/31/2008,RESIDENTIAL,0.22,2009,30720


In [75]:
df_total_tax['VALCHGDATE'][0]

'11/12/2008'

In [76]:
df_total_tax[df_total_tax['FULL_ADDRESS'].str.contains("2700 ABBEY LN")]

Unnamed: 0,FULL_ADDRESS,ZONE_CODE,PARCEL_NO,PREV_VAL,CURR_VAL,VALCHGDATE,PROPERTY_CLASS,TOTAL_ACRES,YEAR,PrimaryID
19,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,,RESIDENTIAL,1.03,2009,13892
37848,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,10/20/2009,RESIDENTIAL,1.03,2010,13892
75738,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,10/20/2009,RESIDENTIAL,1.03,2011,13892
113843,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,,RESIDENTIAL,1.03,2012,13892
151904,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,,RESIDENTIAL,1.03,2013,13892
189989,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,,RESIDENTIAL,1.03,2014,13892
228128,2700 ABBEY LN,R1A,00333/00009/001,159060,157560,4/20/2015,RESIDENTIAL,1.03,2015,13892
266160,2700 ABBEY LN,R1A,00333/00009/001,157560,157560,4/20/2015,RESIDENTIAL,1.03,2016,13892
304229,2700 ABBEY LN,R1A,00333/00009/001,157560,157560,4/20/2015,RESIDENTIAL,1.03,2017,13892
342756,2700 ABBEY LN,R1A,00333/00009/001,157560,157560,,COMMERCIAL,1.03,2018,13892


In [77]:
df_total_tax[df_total_tax['FULL_ADDRESS'].str.contains("2701 ABBEY LN")]

Unnamed: 0,FULL_ADDRESS,ZONE_CODE,PARCEL_NO,PREV_VAL,CURR_VAL,VALCHGDATE,PROPERTY_CLASS,TOTAL_ACRES,YEAR,PrimaryID
20,2701 ABBEY LN,R1A,00333/00006/017,175500,175500,10/31/2008,RESIDENTIAL,1.33,2009,13878
37849,2701 ABBEY LN,R1A,00333/00006/017,175500,175500,10/20/2009,RESIDENTIAL,1.33,2010,13878
75739,2701 ABBEY LN,R1A,00333/00006/017,175500,175500,10/20/2009,RESIDENTIAL,1.33,2011,13878
113844,2701 ABBEY LN,R1A,00333/00006/017,175500,175500,10/20/2009,RESIDENTIAL,1.33,2012,13878
151905,2701 ABBEY LN,R1A,00333/00006/017,175500,175500,,RESIDENTIAL,1.33,2013,13878
189990,2701 ABBEY LN,R1A,00333/00006/017,175500,175500,,RESIDENTIAL,1.33,2014,13878
228129,2701 ABBEY LN,R1A,00333/00006/017,175500,174000,4/20/2015,RESIDENTIAL,1.33,2015,13878
266161,2701 ABBEY LN,R1A,00333/00006/017,167000,167000,12/8/2015,RESIDENTIAL,1.33,2016,13878
304230,2701 ABBEY LN,R1A,00333/00006/017,167000,167000,,RESIDENTIAL,1.33,2017,13878
342757,2701 ABBEY LN,R1A,00333/00006/017,167000,167000,,RESIDENTIAL,1.33,2018,13878


In [78]:
df_full_taxes['PERSKEY'].value_counts()

447994.0    124
941408.0     55
967470.0     51
951814.0     51
975244.0     51
954978.0     51
954929.0     51
957476.0     51
962020.0     51
954565.0     51
969620.0     51
719297.0     51
962230.0     51
975382.0     51
957193.0     51
965058.0     51
962833.0     51
976375.0     34
959359.0     34
969695.0     34
699192.0     32
950698.0     30
954371.0     30
646858.0     30
967044.0     30
950050.0     30
956953.0     30
699187.0     26
951781.0     22
650513.0     22
890832.0     22
892227.0     22
300645.0     22
506986.0     22
595437.0     22
869947.0     22
940849.0     21
699208.0     20
965548.0     19
628949.0     19
718157.0     18
954061.0     18
960040.0     18
563054.0     18
301193.0     18
959993.0     18
867012.0     17
268146.0     17
454053.0     16
966446.0     16
966327.0     16
454823.0     16
959211.0     16
606794.0     16
975423.0     16
573701.0     16
958636.0     16
550286.0     16
975833.0     16
959332.0     16
318076.0     16
976019.0     16
891417.0

In [79]:
df_full_taxes[df_full_taxes['FULL_ADDRESS'].str.contains("948 N MAPLE ST")]

Unnamed: 0,OWNER_NAME,HOUSE_NO,STDIRECT,STREET_NAM,STREET_TYPE,UNIT,PROP_ZIP,ZONE_CODE,SUBDIVISION,PARCEL_NO,PREV_VAL,...,VALCHGDATE,TAXDISTRIC,HOMEEXEMPT,TOTAL_ACRES,PROPERTY_CLASS,DIG_STRAT,BUSI_ID,NAICS,PERSKEY,YEAR,FULL_ADDRESS
20161,ALLTEL COMMUNICATIONS INC,948,N,MAPLE,ST,,,M1,GIRADEAU PECAN GROVE,0000R/00001/02A,133200,...,10/31/2008,1,S0,4.65,COMMERCIAL,3,A T & T MOBILITY LLC,513322.0,870266.0,2009,948 N MAPLE ST
20162,ALLTEL COMMUNICATIONS INC,948,N,MAPLE,ST,,,M1,GIRADEAU PECAN GROVE,0000R/00001/02A,133200,...,10/31/2008,1,S0,4.65,COMMERCIAL,3,T MOBILE SOUTH LLC,513322.0,886103.0,2009,948 N MAPLE ST
20163,ALLTEL COMMUNICATIONS INC,948,N,MAPLE,ST,,,M1,GIRADEAU PECAN GROVE,0000R/00001/02A,133200,...,10/31/2008,1,S0,4.65,COMMERCIAL,3,NEXTEL PARTNERS EQUIPMENT CORP,513322.0,931100.0,2009,948 N MAPLE ST
58013,ALLTEL COMMUNICATIONS INC,948,N,MAPLE,ST,,,M1,GIRADEAU PECAN GROVE,0000R/00001/02A,133200,...,10/31/2008,1,S0,4.65,COMMERCIAL,3,A T & T MOBILITY LLC,513322.0,870266.0,2010,948 N MAPLE ST
58014,ALLTEL COMMUNICATIONS INC,948,N,MAPLE,ST,,,M1,GIRADEAU PECAN GROVE,0000R/00001/02A,133200,...,10/31/2008,1,S0,4.65,COMMERCIAL,3,T MOBILE SOUTH LLC,513322.0,886103.0,2010,948 N MAPLE ST
58015,ALLTEL COMMUNICATIONS INC,948,N,MAPLE,ST,,,M1,GIRADEAU PECAN GROVE,0000R/00001/02A,133200,...,10/31/2008,1,S0,4.65,COMMERCIAL,3,NEXTEL PARTNERS EQUIPMENT CORP,513322.0,931100.0,2010,948 N MAPLE ST
96045,ALLIED WIRELESS COMMUNICATIONS GA LLC,948,N,MAPLE,ST,,,M1,GIRADEAU PECAN GROVE,0000R/00001/02A,133200,...,,1,S0,4.65,COMMERCIAL,3,A T & T MOBILITY LLC,513322.0,870266.0,2011,948 N MAPLE ST
96046,ALLIED WIRELESS COMMUNICATIONS GA LLC,948,N,MAPLE,ST,,,M1,GIRADEAU PECAN GROVE,0000R/00001/02A,133200,...,,1,S0,4.65,COMMERCIAL,3,T MOBILE SOUTH LLC,513322.0,886103.0,2011,948 N MAPLE ST
96047,ALLIED WIRELESS COMMUNICATIONS GA LLC,948,N,MAPLE,ST,,,M1,GIRADEAU PECAN GROVE,0000R/00001/02A,133200,...,,1,S0,4.65,COMMERCIAL,3,NEXTEL PARTNERS EQUIPMENT CORP,513322.0,931100.0,2011,948 N MAPLE ST
134147,ALLIED WIRELESS COMMUNICATIONS GA LLC,948,N,MAPLE,ST,,,M1,GIRADEAU PECAN GROVE,0000R/00001/02A,133200,...,,1,S0,4.65,COMMERCIAL,3,A T & T MOBILITY LLC,513322.0,870266.0,2012,948 N MAPLE ST


In [80]:
df_total_tax[df_total_tax['FULL_ADDRESS'].str.contains("948 N MAPLE ST")]

Unnamed: 0,FULL_ADDRESS,ZONE_CODE,PARCEL_NO,PREV_VAL,CURR_VAL,VALCHGDATE,PROPERTY_CLASS,TOTAL_ACRES,YEAR,PrimaryID
20161,948 N MAPLE ST,M1,0000R/00001/02A,133200,133200,10/31/2008,COMMERCIAL,4.65,2009,30720
20162,948 N MAPLE ST,M1,0000R/00001/02A,133200,133200,10/31/2008,COMMERCIAL,4.65,2009,30720
20163,948 N MAPLE ST,M1,0000R/00001/02A,133200,133200,10/31/2008,COMMERCIAL,4.65,2009,30720
58013,948 N MAPLE ST,M1,0000R/00001/02A,133200,133200,10/31/2008,COMMERCIAL,4.65,2010,30720
58014,948 N MAPLE ST,M1,0000R/00001/02A,133200,133200,10/31/2008,COMMERCIAL,4.65,2010,30720
58015,948 N MAPLE ST,M1,0000R/00001/02A,133200,133200,10/31/2008,COMMERCIAL,4.65,2010,30720
96045,948 N MAPLE ST,M1,0000R/00001/02A,133200,133200,,COMMERCIAL,4.65,2011,30720
96046,948 N MAPLE ST,M1,0000R/00001/02A,133200,133200,,COMMERCIAL,4.65,2011,30720
96047,948 N MAPLE ST,M1,0000R/00001/02A,133200,133200,,COMMERCIAL,4.65,2011,30720
134147,948 N MAPLE ST,M1,0000R/00001/02A,133200,133200,,COMMERCIAL,4.65,2012,30720


In [81]:
df_total_tax[['FULL_ADDRESS', 'VALCHGDATE', 'YEAR']].groupby('FULL_ADDRESS')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x11dd40f60>

In [82]:
df_tax_sorted = df_total_tax.sort_values(by=['FULL_ADDRESS', 'YEAR', 'VALCHGDATE'])

In [85]:
df_full_taxes.iloc[9235]

OWNER_NAME        3F PROPERTIES LLC                       
HOUSE_NO                                                 0
STDIRECT                                                  
STREET_NAM                       ELEVENTH                 
STREET_TYPE                                           AVE 
UNIT                                                   NaN
PROP_ZIP                                               NaN
ZONE_CODE                                     M1          
SUBDIVISION       FLINT RIVER COTTON M                    
PARCEL_NO                             000LL/00009/03B     
PREV_VAL                                             26100
CURR_VAL                                             26100
VALCHGDATE                                       8/15/2008
TAXDISTRIC                                               1
HOMEEXEMPT                                           S0   
TOTAL_ACRES                                          11.61
PROPERTY_CLASS                                  COMMERCI

In [87]:
df_tax_sorted['PrimaryID'].value_counts().sum()

421915

In [89]:
df_tax_sorted.index = range(len(df_tax_sorted))

In [99]:
df_tax_sorted[df_tax_sorted['FULL_ADDRESS'].str.contains("2700 ABBEY LN")]

Unnamed: 0,FULL_ADDRESS,ZONE_CODE,PARCEL_NO,PREV_VAL,CURR_VAL,VALCHGDATE,PROPERTY_CLASS,TOTAL_ACRES,YEAR,PrimaryID
234850,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,,RESIDENTIAL,1.03,2009,13892
234851,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,10/20/2009,RESIDENTIAL,1.03,2010,13892
234852,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,10/20/2009,RESIDENTIAL,1.03,2011,13892
234853,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,,RESIDENTIAL,1.03,2012,13892
234854,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,,RESIDENTIAL,1.03,2013,13892
234855,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,,RESIDENTIAL,1.03,2014,13892
234856,2700 ABBEY LN,R1A,00333/00009/001,159060,157560,4/20/2015,RESIDENTIAL,1.03,2015,13892
234857,2700 ABBEY LN,R1A,00333/00009/001,157560,157560,4/20/2015,RESIDENTIAL,1.03,2016,13892
234858,2700 ABBEY LN,R1A,00333/00009/001,157560,157560,4/20/2015,RESIDENTIAL,1.03,2017,13892
234859,2700 ABBEY LN,R1A,00333/00009/001,157560,157560,,COMMERCIAL,1.03,2018,13892


In [91]:
df_tax_sorted['VALCHGDATE'] = df_tax_sorted['VALCHGDATE'].astype(str)

In [92]:
def fix_dates():
    i = 0
    TS_DATES = []
    while i < len(df_tax_sorted):
        if i % 50000 == 0:
            print(str(i))
        try:
            existing_entry = df_tax_sorted.iloc[i]
            existing_address = existing_entry['FULL_ADDRESS']
            existing_csv_year = existing_entry['YEAR']
            existing_date = existing_entry['VALCHGDATE']
            if existing_date == 'nan':
                TS_DATES.append("NA")
                while df_tax_sorted.iloc[i+1]['VALCHGDATE'] == 'nan':
                    TS_DATES.append("NA")
                    i = i + 1
            else:
                TS_DATES.append(existing_date)
                while df_tax_sorted.iloc[i+1]['FULL_ADDRESS'] == existing_address:
                    if df_tax_sorted.iloc[i+1]['VALCHGDATE'] == 'nan':
                        TS_DATES.append(existing_date)
                    else:
                        TS_DATES.append(df_tax_sorted.iloc[i+1]['VALCHGDATE'])
                        existing_date = df_tax_sorted.iloc[i+1]['VALCHGDATE']
                    i = i + 1
        except IndexError:
            print(str(i))
        i = i + 1
    return pd.Series(TS_DATES)

In [93]:
TS_DATES = fix_dates()

0
50000
100000
250000
300000
421914


In [98]:
df_tax_sorted.head(10)

Unnamed: 0,FULL_ADDRESS,ZONE_CODE,PARCEL_NO,PREV_VAL,CURR_VAL,VALCHGDATE,PROPERTY_CLASS,TOTAL_ACRES,YEAR,PrimaryID
0,0 11TH AVE,M1,000LL/00009/03B,26100,26100,8/15/2008,COMMERCIAL,11.61,2009,30720
1,0 11TH AVE,M1,000LL/00009/03B,26100,26100,8/15/2008,COMMERCIAL,11.61,2010,30720
2,0 11TH AVE,M1,000LL/00009/03B,26100,26100,6/13/2011,COMMERCIAL,11.61,2011,30720
3,0 11TH AVE,M1,000LL/00009/03B,26100,26100,8/15/2011,COMMERCIAL,11.61,2012,30720
4,0 11TH AVE,M1,000LL/00009/03B,26100,26100,8/15/2011,COMMERCIAL,11.61,2013,30720
5,0 11TH AVE,M1,000LL/00009/03B,26100,26100,1/24/2014,COMMERCIAL,11.61,2014,30720
6,0 11TH AVE,M1,000LL/00009/03B,26100,26100,4/28/2015,COMMERCIAL,11.61,2015,30720
7,0 11TH AVE,M1,000LL/00009/03B,26100,39400,2/25/2016,COMMERCIAL,11.61,2016,30720
8,0 11TH AVE,M1,000LL/00009/03B,39400,39400,2/25/2016,COMMERCIAL,11.61,2017,30720
9,0 11TH AVE,M1,000LL/00009/03B,39400,39400,2/25/2016,COMMERCIAL,11.61,2018,30720


In [95]:
TS_DATES.head(10)

0    8/15/2008
1    8/15/2008
2    6/13/2011
3    8/15/2011
4    8/15/2011
5    1/24/2014
6    4/28/2015
7    2/25/2016
8    2/25/2016
9    2/25/2016
dtype: object

In [100]:
TS_DATES.iloc[234850:234861]

234850            NA
234851    10/20/2009
234852    10/20/2009
234853    10/20/2009
234854    10/20/2009
234855    10/20/2009
234856     4/20/2015
234857     4/20/2015
234858     4/20/2015
234859     4/20/2015
234860     4/20/2015
dtype: object

In [101]:
TS_DATES.tail()

421910    3/26/2012
421911    3/26/2012
421912    3/26/2012
421913    3/26/2012
421914    3/26/2012
dtype: object

In [102]:
df_tax_sorted['TS_DATES'] = TS_DATES

In [103]:
df_tax_sorted.head(20)

Unnamed: 0,FULL_ADDRESS,ZONE_CODE,PARCEL_NO,PREV_VAL,CURR_VAL,VALCHGDATE,PROPERTY_CLASS,TOTAL_ACRES,YEAR,PrimaryID,TS_DATES
0,0 11TH AVE,M1,000LL/00009/03B,26100,26100,8/15/2008,COMMERCIAL,11.61,2009,30720,8/15/2008
1,0 11TH AVE,M1,000LL/00009/03B,26100,26100,8/15/2008,COMMERCIAL,11.61,2010,30720,8/15/2008
2,0 11TH AVE,M1,000LL/00009/03B,26100,26100,6/13/2011,COMMERCIAL,11.61,2011,30720,6/13/2011
3,0 11TH AVE,M1,000LL/00009/03B,26100,26100,8/15/2011,COMMERCIAL,11.61,2012,30720,8/15/2011
4,0 11TH AVE,M1,000LL/00009/03B,26100,26100,8/15/2011,COMMERCIAL,11.61,2013,30720,8/15/2011
5,0 11TH AVE,M1,000LL/00009/03B,26100,26100,1/24/2014,COMMERCIAL,11.61,2014,30720,1/24/2014
6,0 11TH AVE,M1,000LL/00009/03B,26100,26100,4/28/2015,COMMERCIAL,11.61,2015,30720,4/28/2015
7,0 11TH AVE,M1,000LL/00009/03B,26100,39400,2/25/2016,COMMERCIAL,11.61,2016,30720,2/25/2016
8,0 11TH AVE,M1,000LL/00009/03B,39400,39400,2/25/2016,COMMERCIAL,11.61,2017,30720,2/25/2016
9,0 11TH AVE,M1,000LL/00009/03B,39400,39400,2/25/2016,COMMERCIAL,11.61,2018,30720,2/25/2016


In [104]:
df_tax_sorted.iloc[7]['TS_DATES']

'2/25/2016'

In [105]:
def construct_ymd():
    year = []
    month = []
    day = []
    for i in range(len(df_tax_sorted)):
        if i % 50000 == 0:
            print(str(i))
        current_date = df_tax_sorted.iloc[i]['TS_DATES']
        if current_date == "NA":
            year.append("NA")
            month.append("NA")
            day.append("NA")
        else:
            split = current_date.split("/")
            year.append(split[2])
            month.append(split[0])
            day.append(split[1])
    return pd.DataFrame({'YEAR': year, 'MONTH': month, 'DAY': day})

In [106]:
ymd = construct_ymd()

0
50000
100000
150000
200000
250000
300000
350000
400000


In [107]:
ymd.head(20)

Unnamed: 0,YEAR,MONTH,DAY
0,2008,8,15
1,2008,8,15
2,2011,6,13
3,2011,8,15
4,2011,8,15
5,2014,1,24
6,2015,4,28
7,2016,2,25
8,2016,2,25
9,2016,2,25


In [108]:
df_final = pd.concat([df_tax_sorted, ymd], axis=1)

In [109]:
df_final.iloc[139042:310192]

Unnamed: 0,FULL_ADDRESS,ZONE_CODE,PARCEL_NO,PREV_VAL,CURR_VAL,VALCHGDATE,PROPERTY_CLASS,TOTAL_ACRES,YEAR,PrimaryID,TS_DATES,YEAR.1,MONTH,DAY
139042,1908 S RIVERVIEW CIR,R3,0000K/00042/005,2700,3400,2/25/2013,RESIDENTIAL,0.17,2013,2924,2/25/2013,2013,2,25
139043,1908 S RIVERVIEW CIR,R3,0000K/00042/005,3400,3400,2/25/2013,RESIDENTIAL,0.17,2014,2924,2/25/2013,2013,2,25
139044,1908 S RIVERVIEW CIR,R3,0000K/00042/005,3400,590,4/20/2015,RESIDENTIAL,0.17,2015,2924,4/20/2015,2015,4,20
139045,1908 S RIVERVIEW CIR,R3,0000K/00042/005,590,590,4/20/2015,RESIDENTIAL,0.17,2016,2924,4/20/2015,2015,4,20
139046,1908 S RIVERVIEW CIR,R3,0000K/00042/005,590,590,4/20/2015,RESIDENTIAL,0.17,2017,2924,4/20/2015,2015,4,20
139047,1908 S RIVERVIEW CIR,R3,0000K/00042/005,590,590,4/20/2015,RESIDENTIAL,0.17,2018,2924,4/20/2015,2015,4,20
139048,1908 S RIVERVIEW CIR,R3,0000K/00042/005,590,590,4/20/2015,RESIDENTIAL,0.17,2019,2924,4/20/2015,2015,4,20
139049,1908 S SKYLANE DR,R3,0000K/00041/005,36100,36100,12/10/2007,RESIDENTIAL,0.15,2009,26166,12/10/2007,2007,12,10
139050,1908 S SKYLANE DR,R3,0000K/00041/005,36100,36100,12/10/2007,RESIDENTIAL,0.15,2010,26166,12/10/2007,2007,12,10
139051,1908 S SKYLANE DR,R3,0000K/00041/005,36100,36100,,RESIDENTIAL,0.15,2011,26166,12/10/2007,2007,12,10


In [110]:
#df_final.to_csv('~/Dropbox (Amherst College)/CDS-2019-AlbanyHub/ToDatabase/TotalTax.csv', index=False)

In [112]:
test_t_addr = df_final['FULL_ADDRESS']
test_f_addr = df_addr['Address']
not_found = test_f_addr[~test_f_addr.isin(test_t_addr)]

In [118]:
not_found.shape

(5868,)

In [119]:
test_t_addr.shape

(421915,)

In [120]:
test_f_addr.shape

(30721,)

In [121]:
(30721-5868) / 30721

0.8089905927541421