In [1]:
import pandas as pd
import numpy as np
import Levenshtein
import math

In [2]:
pd.options.display.max_columns = 22
pd.options.display.max_rows = 1000

## Read in all data

In [3]:
df_taxes_list = []
path = "~/Dropbox (Amherst College)/CDS-2019-AlbanyHub/Raw-Data/"
for i in range(2009, 2020):
    df_tmp = pd.read_csv(path + "AY" + str(i) + " Real Property_Personal Property.csv")
    df_tmp['YEAR'] = i
    df_taxes_list.append(df_tmp)
df_full_taxes = pd.concat(df_taxes_list, ignore_index=True)
df_full_taxes.head();

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
df_home = pd.read_csv('~/Dropbox (Amherst College)/CDS-2019-AlbanyHub/Raw-Data/fixed_home_v02.csv')
df_cdbg = pd.read_csv('~/Dropbox (Amherst College)/CDS-2019-AlbanyHub/Raw-Data/fixed_cdbg.csv')

In [5]:
df_addr = pd.read_csv('~/Dropbox (Amherst College)/CDS-2019-AlbanyHub/ToDatabase/addr_junct_table.csv')

## Functions

In [6]:
def genStrip(): # strip off extra white space
    return lambda addr: addr.strip()

In [7]:
def genSpacing(): # fix spacing
    def test(word):
        if word == "  ":
            return ""
        elif type(word) == float:
            if math.isnan(word):
                return ""
        else:
            return word
    return test

In [8]:
def lev_list(address_list, address, i): # will help us find near matches
    temp_list = []
    temp_list_u = []
    for u_address in address_list:
        if Levenshtein.distance(address, u_address.upper()) <= i:
            temp_list.append(address)
            temp_list_u.append(u_address.upper())
    temp_list_series = pd.Series(temp_list)
    temp_list_u_series = pd.Series(temp_list_u)
    return temp_list_u_series

In [9]:
def gen_list_none(address_list, mismatched, i): # just to find how many addresses are so far off from anything in full
    tmp_list = []
    for address in mismatched:
        if address == "UNKNOWN ADDRESS" or address == "SUPPRESSED ADDRESS":
            continue
        if lev_list(address_list, address, i).shape[0] == 0:
            tmp_list.append(address)
    return pd.Series(tmp_list)

In [10]:
def contains_cardinal(addr): # returns None if no cardinal direction, and direction otherwise
    directions = [' E ', ' W ', ' N ', ' S ', ' NE ', ' NW ', ' SW ', ' SE ']
    count = 0
    for car in directions:
        if car in addr:
            if "JOHNNY W WILLIAMS" in addr:
                return None
            return car # tested earlier, only one instance of each in every single address (no double directions)
    return None

In [11]:
def construct_st_name(address): # returns the street name of the address w/o number, direction, or ending
    split = address.split()
    end = split[-1]
    first = "NA"
    start_index = 0
    end_index = len(split) - 1
    if split[0].isnumeric():
        start_index += 1
    if contains_cardinal(address) != None:
        start_index += 1
    first = split[start_index:end_index]
    word = ""
    for a in first:
        word += a + " "
    word = word.strip()
    return word

In [12]:
# f_addresses contains list of 'trusted' addresses
# constructs dictionary from all trusted addresses to create master lookup for endings
def construct_dictionary(f_addresses): 
    endings = {'AVE', 'DR', 'RD', 'ST', 'LN', 'CT', 'BLVD', 'CIR', 
               'WAY', 'PL', 'EXPY', 'ALY', 'TRL', 'PKWY', 'TER'}
    dictionary = {}
    counter = 0
    for i in range(len(f_addresses)):
        split = f_addresses[i].split()
        end = split[-1]
        if end in endings:
            st_name = construct_st_name(f_addresses[i])
            if st_name in dictionary.keys():
                existing_endings = dictionary[st_name] # should be a list of endings, as there could be multiple
                if end not in existing_endings:
                    counter += 1
                    existing_endings.append(end)
            else:
                dictionary[st_name] = [end]
    return dictionary

In [13]:
def construct_word_spacing(word_list):
    word = ""
    for part in word_list:
        word += part + " "
    return word.strip()

In [14]:
def reverse_ending_dictionary(f_addresses, t_addresses, not_found):
    f_addresses_set = set(f_addresses)
    t_addresses_set = set(t_addresses)
    ending_dictionary = construct_dictionary(f_addresses)
    replace_ending = {}
    for i in range(len(not_found)):
        current_address = not_found.iloc[i]
        split = current_address.split()
        no_ending_split = split[:-1]
        word = construct_word_spacing(no_ending_split)
        st_name = construct_st_name(not_found.iloc[i]) #removes number, cardinal direction, ending if exists
        if st_name in ending_dictionary.keys():
            for ending in ending_dictionary[st_name]:
                test_address = word + " " + ending
                if test_address in t_addresses_set:
                    if test_address in replace_ending.keys(): # this structure just uses first option
                        print("multiple addresses found for: " + test_address)
                    else:
                        replace_ending[test_address] = current_address
                # else just ignore
    return replace_ending

In [15]:
def reverse_direction_dictionary(f_addresses, t_addresses, not_found):
    f_addresses_set = set(f_addresses)
    t_addresses_set = set(t_addresses)
    replace_direction = {}
    for i in range(len(not_found)):
        current_address = not_found.iloc[i]
        car = contains_cardinal(current_address)
        if car != None:
            ind = current_address.find(car)
            addr = current_address[:ind] + " " + current_address[ind+3:]
            if addr in t_addresses_set:
                if addr in replace_direction.keys(): # this structure just uses first option
                    print("multiple directions found for: " + current_address)
                else:
                    replace_direction[addr] = current_address
    return replace_direction

In [16]:
def hasNumbers(inputString): # from stackoverflow
    return any(char.isdigit() for char in inputString)

In [17]:
def combine(row):
    word = ""
    for r in row:
        if r != None:
            word += r + " "
    return word.strip()

In [18]:
def number_dictionary():
    string = ['FIRST', 'SECOND', 'THIRD', 'FOURTH', 'FIFTH', 'SIXTH', 'SEVENTH',
             'EIGHTH', 'NINTH', 'TENTH', 'ELEVENTH', 'TWELFTH', 'THIRTEENTH',
             'FOURTEENTH', 'FIFTEENTH', 'SIXTEENTH', 'SEVENTEENTH', 'EIGHTEENTH',
             'NINETEENTH', 'TWENTIETH']
    numeric = ['1ST', '2ND', '3RD', '4TH', '5TH', '6TH', '7TH', '8TH', '9TH',
              '10TH', '11TH', '12TH', '13TH', '14TH', '15TH', '16T', '17TH',
              '18TH', '19TH', '20TH']
    return dict(zip(string, numeric))

In [19]:
def find_numbers(addresses):
    test_addr = addresses.str.split(expand=True)
    test_addr = test_addr.drop(0, axis=1)
    test_addr = test_addr.apply(combine, axis=1)
    uniq = test_addr[test_addr.apply(hasNumbers)].unique()
    return pd.Series(uniq)

In [20]:
def return_not_found(f_addresses, t_addresses):
    return f_addresses[~f_addresses.isin(t_addresses)]

## Cleaning

In [21]:
df_full_taxes['FULL_ADDRESS'] = df_full_taxes['HOUSE_NO'].astype(str).apply(genStrip()) + \
' ' + df_full_taxes['STDIRECT'].apply(genSpacing()) + df_full_taxes["STREET_NAM"].apply(genStrip()) + ' ' + df_full_taxes["STREET_TYPE"].astype(str).apply(genStrip())

In [22]:
f_addresses = df_addr['Address']
t_addresses = pd.Series(df_full_taxes['FULL_ADDRESS'].unique()) # to compare

In [23]:
print(t_addresses[~t_addresses.isin(f_addresses)].shape)
not_found = return_not_found(f_addresses, t_addresses)
print(not_found.shape)

(16456,)
(9130,)


In [24]:
find_numbers(not_found)

0                           W 4TH AVE
1                            15TH AVE
2                  1/2 S SLAPPEY BLVD
3                              US 19S
4                     US HIGHWAY 19 S
5                    US HIGHWAY 280 W
6                           W 3RD AVE
7                             3RD AVE
8                           US HWY 82
9     GILLIONVILLE (SUBSTATION 21) RD
10                           21ST AVE
11                          W 1ST AVE
12                      1/2 DAWSON RD
13                          E 4TH AVE
14                     US HIGHWAY 19S
15                            US 19 S
16                   1/2 KEYSTONE AVE
17                          E 2ND AVE
18                          E 3RD AVE
19                            5TH AVE
20                            6TH AVE
21                          W 2ND AVE
22                            9TH AVE
23                           12TH AVE
24                            8TH AVE
25                          E 1ST AVE
26          

In [25]:
string_to_number = number_dictionary()
t_addresses = t_addresses.replace(string_to_number, regex=True)

In [26]:
return_not_found(f_addresses, t_addresses).shape

(7759,)

In [27]:
ending_dictionary = reverse_ending_dictionary(f_addresses, t_addresses, not_found)
direction_dictionary = reverse_direction_dictionary(f_addresses, t_addresses, not_found)

multiple directions found for: 1310 W HIGHLAND AVE
multiple directions found for: 1106 W 4TH AVE
multiple directions found for: 1111 W 4TH AVE
multiple directions found for: 1301 W 4TH AVE
multiple directions found for: 1202 W 3RD AVE
multiple directions found for: 1205 W 2ND AVE
multiple directions found for: 1209 W 2ND AVE
multiple directions found for: 1218 W 2ND AVE
multiple directions found for: 1403 W 2ND AVE
multiple directions found for: 1413 W LINCOLN AVE
multiple directions found for: 815 S WESTOVER BLVD
multiple directions found for: 1306 E 2ND AVE
multiple directions found for: 1302 E 2ND AVE
multiple directions found for: 1300 E 2ND AVE
multiple directions found for: 1216 E 2ND AVE
multiple directions found for: 1212 E 2ND AVE
multiple directions found for: 1210 E 2ND AVE
multiple directions found for: 1208 E 2ND AVE
multiple directions found for: 1204 E 2ND AVE
multiple directions found for: 1203 E 2ND AVE
multiple directions found for: 1211 E 2ND AVE
multiple directions 

In [28]:
ending_dictionary;

In [29]:
direction_dictionary;

In [30]:
t_addresses = t_addresses.replace(ending_dictionary)
t_addresses = t_addresses.replace(direction_dictionary)

In [31]:
not_found = return_not_found(f_addresses, t_addresses)

In [32]:
not_found.shape

(5473,)

In [33]:
not_found.head(10)

4           509 W GORDON AVE
23             2632 ERICA CT
30        235 BONNY VIEW AVE
33              1200 AUGUSTA
37            205 COLLINS ST
46          1705 E BROAD AVE
50    1001 RADIUM SPRINGS RD
56           109 W BROAD AVE
57             110 SHELBY LN
58             112 SHELBY LN
Name: Address, dtype: object

## Pick out relevant columns and ship for primary id merging

In [34]:
tmp = list(df_full_taxes)[:15]
tmp.append("TOTAL_ACRES")
tmp.extend(list(df_full_taxes)[16:])

In [35]:
df_full_taxes.columns = tmp

In [36]:
to_ship = df_full_taxes[['FULL_ADDRESS', 'ZONE_CODE', 'PARCEL_NO', 'PREV_VAL', 'CURR_VAL', 'VALCHGDATE', 'PROPERTY_CLASS', 'TOTAL_ACRES', 'YEAR']]

In [37]:
to_ship.head()

Unnamed: 0,FULL_ADDRESS,ZONE_CODE,PARCEL_NO,PREV_VAL,CURR_VAL,VALCHGDATE,PROPERTY_CLASS,TOTAL_ACRES,YEAR
0,0 A C L RAILROAD nan,C3,00104/00001/014,13100,13100,11/12/2008,RESIDENTIAL,0.0,2009
1,0 A C L RAILROAD nan,C3,00104/00001/015,26300,26300,11/12/2008,COMMERCIAL,0.39,2009
2,0 A C L RAILROAD nan,C3,00104/00001/016,990,990,11/12/2008,COMMERCIAL,0.25,2009
3,0 A C L RAILROAD nan,AG,00155/00004/013,21400,21400,4/16/2007,RESIDENTIAL,17.19,2009
4,0 A C L RAILROAD nan,AG,00186/00001/006,28600,28600,6/20/2007,RESIDENTIAL,14.3,2009


In [40]:
#to_ship.to_csv(path + 'TotalTax.csv', index=False)

## Here you run the tax merge file

## Check that the csv + primaryid merge worked alright

In [62]:
df_total_tax = pd.read_csv(path + 'TotalTax.csv')

In [63]:
df_total_tax.head()

Unnamed: 0.1,Unnamed: 0,FULL_ADDRESS,ZONE_CODE,PARCEL_NO,PREV_VAL,CURR_VAL,VALCHGDATE,PROPERTY_CLASS,TOTAL_ACRES,YEAR,PrimaryID
0,0,0 A C L RAILROAD nan,C3,00104/00001/014,13100,13100,11/12/2008,RESIDENTIAL,0.0,2009,30589
1,1,0 A C L RAILROAD nan,C3,00104/00001/015,26300,26300,11/12/2008,COMMERCIAL,0.39,2009,30589
2,2,0 A C L RAILROAD nan,C3,00104/00001/016,990,990,11/12/2008,COMMERCIAL,0.25,2009,30589
3,3,0 A C L RAILROAD nan,AG,00155/00004/013,21400,21400,4/16/2007,RESIDENTIAL,17.19,2009,30589
4,4,0 A C L RAILROAD nan,AG,00186/00001/006,28600,28600,6/20/2007,RESIDENTIAL,14.3,2009,30589


In [64]:
na_date = df_full_taxes[df_full_taxes['VALCHGDATE'].isna()]

In [65]:
na_date.shape

(205574, 23)

In [66]:
zone_codes = df_total_tax.groupby('FULL_ADDRESS').count()['ZONE_CODE']

In [67]:
zone_codes[zone_codes % 11 != 0].value_counts()

3       686
4       482
2       472
8       465
7       452
6       430
5       358
9       320
1       281
10      154
14      127
13      111
12       90
15       53
17       46
16       45
19       35
20       29
18       23
21       16
26       14
23       14
25       12
24       10
34        8
28        8
27        7
30        6
31        6
32        6
46        5
37        5
29        5
47        5
42        5
60        4
35        4
45        4
38        4
43        4
48        3
102       2
62        2
41        2
125       2
49        2
53        2
39        2
56        2
40        2
36        2
80        2
57        1
359       1
356       1
332       1
184       1
69        1
140       1
124       1
120       1
112       1
64        1
65        1
461       1
73        1
162       1
174       1
367       1
51        1
59        1
67        1
71        1
75        1
87        1
123       1
135       1
163       1
171       1
211       1
279       1
295       1
166       1
118 

In [68]:
df_full_taxes['FULL_ADDRESS'][df_full_taxes['PERSKEY'].isna()]

0             0 A C L RAILROAD nan
1             0 A C L RAILROAD nan
2             0 A C L RAILROAD nan
3             0 A C L RAILROAD nan
4             0 A C L RAILROAD nan
5             0 A C L RAILROAD nan
6             0 A C L RAILROAD nan
7             0 A C L RAILROAD nan
8             0 A C L RAILROAD nan
9             0 A C L RAILROAD nan
10            0 A C L RAILROAD nan
11            0 A C L RAILROAD nan
12            0 A C L RAILROAD nan
13            0 A C L RAILROAD nan
14            0 A C L RAILROAD nan
15            0 A C L RAILROAD nan
16            0 A C L RAILROAD nan
17            0 A C L RAILROAD nan
18         4408 A C L RAILROAD nan
19                   2700 ABBEY LN
20                   2701 ABBEY LN
21                   2702 ABBEY LN
22                   2703 ABBEY LN
23                   2704 ABBEY LN
24                   2705 ABBEY LN
25                   2706 ABBEY LN
26                   2707 ABBEY LN
27                   2708 ABBEY LN
28                  

## solving an issue with the dates

In [69]:
df_total_tax.drop('Unnamed: 0', axis=1, inplace=True)

In [70]:
df_total_tax.head(200)

Unnamed: 0,FULL_ADDRESS,ZONE_CODE,PARCEL_NO,PREV_VAL,CURR_VAL,VALCHGDATE,PROPERTY_CLASS,TOTAL_ACRES,YEAR,PrimaryID
0,0 A C L RAILROAD nan,C3,00104/00001/014,13100,13100,11/12/2008,RESIDENTIAL,0.0,2009,30589
1,0 A C L RAILROAD nan,C3,00104/00001/015,26300,26300,11/12/2008,COMMERCIAL,0.39,2009,30589
2,0 A C L RAILROAD nan,C3,00104/00001/016,990,990,11/12/2008,COMMERCIAL,0.25,2009,30589
3,0 A C L RAILROAD nan,AG,00155/00004/013,21400,21400,4/16/2007,RESIDENTIAL,17.19,2009,30589
4,0 A C L RAILROAD nan,AG,00186/00001/006,28600,28600,6/20/2007,RESIDENTIAL,14.3,2009,30589
5,0 A C L RAILROAD nan,R4,00104/00002/031,3200,3200,,RESIDENTIAL,0.21,2009,30589
6,0 A C L RAILROAD nan,R4,00104/00002/030,4300,4300,,RESIDENTIAL,0.21,2009,30589
7,0 A C L RAILROAD nan,R4,00104/00002/004,3000,3000,3/6/2009,RESIDENTIAL,0.17,2009,30589
8,0 A C L RAILROAD nan,R6,00104/00001/013,4000,4000,11/12/2008,RESIDENTIAL,1.05,2009,30589
9,0 A C L RAILROAD nan,R4,00104/00002/003,9400,9400,10/31/2008,RESIDENTIAL,0.22,2009,30589


In [71]:
df_total_tax['VALCHGDATE'][0]

'11/12/2008'

In [72]:
df_total_tax[df_total_tax['FULL_ADDRESS'].str.contains("2700 ABBEY LN")]

Unnamed: 0,FULL_ADDRESS,ZONE_CODE,PARCEL_NO,PREV_VAL,CURR_VAL,VALCHGDATE,PROPERTY_CLASS,TOTAL_ACRES,YEAR,PrimaryID
19,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,,RESIDENTIAL,1.03,2009,13777
37848,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,10/20/2009,RESIDENTIAL,1.03,2010,13777
75738,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,10/20/2009,RESIDENTIAL,1.03,2011,13777
113843,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,,RESIDENTIAL,1.03,2012,13777
151904,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,,RESIDENTIAL,1.03,2013,13777
189989,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,,RESIDENTIAL,1.03,2014,13777
228128,2700 ABBEY LN,R1A,00333/00009/001,159060,157560,4/20/2015,RESIDENTIAL,1.03,2015,13777
266160,2700 ABBEY LN,R1A,00333/00009/001,157560,157560,4/20/2015,RESIDENTIAL,1.03,2016,13777
304229,2700 ABBEY LN,R1A,00333/00009/001,157560,157560,4/20/2015,RESIDENTIAL,1.03,2017,13777
342756,2700 ABBEY LN,R1A,00333/00009/001,157560,157560,,COMMERCIAL,1.03,2018,13777


In [73]:
df_total_tax[df_total_tax['FULL_ADDRESS'].str.contains("2701 ABBEY LN")]

Unnamed: 0,FULL_ADDRESS,ZONE_CODE,PARCEL_NO,PREV_VAL,CURR_VAL,VALCHGDATE,PROPERTY_CLASS,TOTAL_ACRES,YEAR,PrimaryID
20,2701 ABBEY LN,R1A,00333/00006/017,175500,175500,10/31/2008,RESIDENTIAL,1.33,2009,13763
37849,2701 ABBEY LN,R1A,00333/00006/017,175500,175500,10/20/2009,RESIDENTIAL,1.33,2010,13763
75739,2701 ABBEY LN,R1A,00333/00006/017,175500,175500,10/20/2009,RESIDENTIAL,1.33,2011,13763
113844,2701 ABBEY LN,R1A,00333/00006/017,175500,175500,10/20/2009,RESIDENTIAL,1.33,2012,13763
151905,2701 ABBEY LN,R1A,00333/00006/017,175500,175500,,RESIDENTIAL,1.33,2013,13763
189990,2701 ABBEY LN,R1A,00333/00006/017,175500,175500,,RESIDENTIAL,1.33,2014,13763
228129,2701 ABBEY LN,R1A,00333/00006/017,175500,174000,4/20/2015,RESIDENTIAL,1.33,2015,13763
266161,2701 ABBEY LN,R1A,00333/00006/017,167000,167000,12/8/2015,RESIDENTIAL,1.33,2016,13763
304230,2701 ABBEY LN,R1A,00333/00006/017,167000,167000,,RESIDENTIAL,1.33,2017,13763
342757,2701 ABBEY LN,R1A,00333/00006/017,167000,167000,,RESIDENTIAL,1.33,2018,13763


In [74]:
df_full_taxes['PERSKEY'].value_counts()

447994.0    124
941408.0     55
967470.0     51
951814.0     51
975244.0     51
954978.0     51
954929.0     51
957476.0     51
962020.0     51
954565.0     51
969620.0     51
719297.0     51
962230.0     51
975382.0     51
957193.0     51
965058.0     51
962833.0     51
976375.0     34
959359.0     34
969695.0     34
699192.0     32
950698.0     30
954371.0     30
646858.0     30
967044.0     30
950050.0     30
956953.0     30
699187.0     26
951781.0     22
650513.0     22
890832.0     22
892227.0     22
300645.0     22
506986.0     22
595437.0     22
869947.0     22
940849.0     21
699208.0     20
965548.0     19
628949.0     19
718157.0     18
954061.0     18
960040.0     18
563054.0     18
301193.0     18
959993.0     18
867012.0     17
268146.0     17
454053.0     16
966446.0     16
966327.0     16
454823.0     16
959211.0     16
606794.0     16
975423.0     16
573701.0     16
958636.0     16
550286.0     16
975833.0     16
959332.0     16
318076.0     16
976019.0     16
891417.0

In [75]:
df_full_taxes[df_full_taxes['FULL_ADDRESS'].str.contains("948 N MAPLE ST")]

Unnamed: 0,OWNER_NAME,HOUSE_NO,STDIRECT,STREET_NAM,STREET_TYPE,UNIT,PROP_ZIP,ZONE_CODE,SUBDIVISION,PARCEL_NO,PREV_VAL,...,VALCHGDATE,TAXDISTRIC,HOMEEXEMPT,TOTAL_ACRES,PROPERTY_CLASS,DIG_STRAT,BUSI_ID,NAICS,PERSKEY,YEAR,FULL_ADDRESS
20161,ALLTEL COMMUNICATIONS INC,948,N,MAPLE,ST,,,M1,GIRADEAU PECAN GROVE,0000R/00001/02A,133200,...,10/31/2008,1,S0,4.65,COMMERCIAL,3,A T & T MOBILITY LLC,513322.0,870266.0,2009,948 N MAPLE ST
20162,ALLTEL COMMUNICATIONS INC,948,N,MAPLE,ST,,,M1,GIRADEAU PECAN GROVE,0000R/00001/02A,133200,...,10/31/2008,1,S0,4.65,COMMERCIAL,3,T MOBILE SOUTH LLC,513322.0,886103.0,2009,948 N MAPLE ST
20163,ALLTEL COMMUNICATIONS INC,948,N,MAPLE,ST,,,M1,GIRADEAU PECAN GROVE,0000R/00001/02A,133200,...,10/31/2008,1,S0,4.65,COMMERCIAL,3,NEXTEL PARTNERS EQUIPMENT CORP,513322.0,931100.0,2009,948 N MAPLE ST
58013,ALLTEL COMMUNICATIONS INC,948,N,MAPLE,ST,,,M1,GIRADEAU PECAN GROVE,0000R/00001/02A,133200,...,10/31/2008,1,S0,4.65,COMMERCIAL,3,A T & T MOBILITY LLC,513322.0,870266.0,2010,948 N MAPLE ST
58014,ALLTEL COMMUNICATIONS INC,948,N,MAPLE,ST,,,M1,GIRADEAU PECAN GROVE,0000R/00001/02A,133200,...,10/31/2008,1,S0,4.65,COMMERCIAL,3,T MOBILE SOUTH LLC,513322.0,886103.0,2010,948 N MAPLE ST
58015,ALLTEL COMMUNICATIONS INC,948,N,MAPLE,ST,,,M1,GIRADEAU PECAN GROVE,0000R/00001/02A,133200,...,10/31/2008,1,S0,4.65,COMMERCIAL,3,NEXTEL PARTNERS EQUIPMENT CORP,513322.0,931100.0,2010,948 N MAPLE ST
96045,ALLIED WIRELESS COMMUNICATIONS GA LLC,948,N,MAPLE,ST,,,M1,GIRADEAU PECAN GROVE,0000R/00001/02A,133200,...,,1,S0,4.65,COMMERCIAL,3,A T & T MOBILITY LLC,513322.0,870266.0,2011,948 N MAPLE ST
96046,ALLIED WIRELESS COMMUNICATIONS GA LLC,948,N,MAPLE,ST,,,M1,GIRADEAU PECAN GROVE,0000R/00001/02A,133200,...,,1,S0,4.65,COMMERCIAL,3,T MOBILE SOUTH LLC,513322.0,886103.0,2011,948 N MAPLE ST
96047,ALLIED WIRELESS COMMUNICATIONS GA LLC,948,N,MAPLE,ST,,,M1,GIRADEAU PECAN GROVE,0000R/00001/02A,133200,...,,1,S0,4.65,COMMERCIAL,3,NEXTEL PARTNERS EQUIPMENT CORP,513322.0,931100.0,2011,948 N MAPLE ST
134147,ALLIED WIRELESS COMMUNICATIONS GA LLC,948,N,MAPLE,ST,,,M1,GIRADEAU PECAN GROVE,0000R/00001/02A,133200,...,,1,S0,4.65,COMMERCIAL,3,A T & T MOBILITY LLC,513322.0,870266.0,2012,948 N MAPLE ST


In [76]:
df_total_tax[df_total_tax['FULL_ADDRESS'].str.contains("948 N MAPLE ST")]

Unnamed: 0,FULL_ADDRESS,ZONE_CODE,PARCEL_NO,PREV_VAL,CURR_VAL,VALCHGDATE,PROPERTY_CLASS,TOTAL_ACRES,YEAR,PrimaryID
20161,948 N MAPLE ST,M1,0000R/00001/02A,133200,133200,10/31/2008,COMMERCIAL,4.65,2009,30589
20162,948 N MAPLE ST,M1,0000R/00001/02A,133200,133200,10/31/2008,COMMERCIAL,4.65,2009,30589
20163,948 N MAPLE ST,M1,0000R/00001/02A,133200,133200,10/31/2008,COMMERCIAL,4.65,2009,30589
58013,948 N MAPLE ST,M1,0000R/00001/02A,133200,133200,10/31/2008,COMMERCIAL,4.65,2010,30589
58014,948 N MAPLE ST,M1,0000R/00001/02A,133200,133200,10/31/2008,COMMERCIAL,4.65,2010,30589
58015,948 N MAPLE ST,M1,0000R/00001/02A,133200,133200,10/31/2008,COMMERCIAL,4.65,2010,30589
96045,948 N MAPLE ST,M1,0000R/00001/02A,133200,133200,,COMMERCIAL,4.65,2011,30589
96046,948 N MAPLE ST,M1,0000R/00001/02A,133200,133200,,COMMERCIAL,4.65,2011,30589
96047,948 N MAPLE ST,M1,0000R/00001/02A,133200,133200,,COMMERCIAL,4.65,2011,30589
134147,948 N MAPLE ST,M1,0000R/00001/02A,133200,133200,,COMMERCIAL,4.65,2012,30589


In [77]:
df_total_tax[['FULL_ADDRESS', 'VALCHGDATE', 'YEAR']].groupby('FULL_ADDRESS')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x13e15f940>

In [78]:
df_tax_sorted = df_total_tax.sort_values(by=['FULL_ADDRESS', 'YEAR', 'VALCHGDATE'])

In [79]:
df_full_taxes.iloc[9235]

OWNER_NAME        3F PROPERTIES LLC                       
HOUSE_NO                                                 0
STDIRECT                                                  
STREET_NAM                       ELEVENTH                 
STREET_TYPE                                           AVE 
UNIT                                                   NaN
PROP_ZIP                                               NaN
ZONE_CODE                                     M1          
SUBDIVISION       FLINT RIVER COTTON M                    
PARCEL_NO                             000LL/00009/03B     
PREV_VAL                                             26100
CURR_VAL                                             26100
VALCHGDATE                                       8/15/2008
TAXDISTRIC                                               1
HOMEEXEMPT                                           S0   
TOTAL_ACRES                                          11.61
PROPERTY_CLASS                                  COMMERCI

In [80]:
df_tax_sorted['PrimaryID'].value_counts().sum()

421915

In [81]:
df_tax_sorted.index = range(len(df_tax_sorted))

In [82]:
df_tax_sorted[df_tax_sorted['FULL_ADDRESS'].str.contains("2700 ABBEY LN")]

Unnamed: 0,FULL_ADDRESS,ZONE_CODE,PARCEL_NO,PREV_VAL,CURR_VAL,VALCHGDATE,PROPERTY_CLASS,TOTAL_ACRES,YEAR,PrimaryID
234850,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,,RESIDENTIAL,1.03,2009,13777
234851,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,10/20/2009,RESIDENTIAL,1.03,2010,13777
234852,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,10/20/2009,RESIDENTIAL,1.03,2011,13777
234853,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,,RESIDENTIAL,1.03,2012,13777
234854,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,,RESIDENTIAL,1.03,2013,13777
234855,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,,RESIDENTIAL,1.03,2014,13777
234856,2700 ABBEY LN,R1A,00333/00009/001,159060,157560,4/20/2015,RESIDENTIAL,1.03,2015,13777
234857,2700 ABBEY LN,R1A,00333/00009/001,157560,157560,4/20/2015,RESIDENTIAL,1.03,2016,13777
234858,2700 ABBEY LN,R1A,00333/00009/001,157560,157560,4/20/2015,RESIDENTIAL,1.03,2017,13777
234859,2700 ABBEY LN,R1A,00333/00009/001,157560,157560,,COMMERCIAL,1.03,2018,13777


In [83]:
df_tax_sorted['VALCHGDATE'] = df_tax_sorted['VALCHGDATE'].astype(str)

In [84]:
def fix_dates():
    i = 0
    TS_DATES = []
    while i < len(df_tax_sorted):
        if i % 50000 == 0:
            print(str(i))
        try:
            existing_entry = df_tax_sorted.iloc[i]
            existing_address = existing_entry['FULL_ADDRESS']
            existing_csv_year = existing_entry['YEAR']
            existing_date = existing_entry['VALCHGDATE']
            if existing_date == 'nan':
                TS_DATES.append("NA")
                while df_tax_sorted.iloc[i+1]['VALCHGDATE'] == 'nan':
                    TS_DATES.append("NA")
                    i = i + 1
            else:
                TS_DATES.append(existing_date)
                while df_tax_sorted.iloc[i+1]['FULL_ADDRESS'] == existing_address:
                    if df_tax_sorted.iloc[i+1]['VALCHGDATE'] == 'nan':
                        TS_DATES.append(existing_date)
                    else:
                        TS_DATES.append(df_tax_sorted.iloc[i+1]['VALCHGDATE'])
                        existing_date = df_tax_sorted.iloc[i+1]['VALCHGDATE']
                    i = i + 1
        except IndexError:
            print(str(i))
        i = i + 1
    return pd.Series(TS_DATES)

In [85]:
TS_DATES = fix_dates()

0
50000
100000
250000
300000
421914


In [86]:
df_tax_sorted.head(10)

Unnamed: 0,FULL_ADDRESS,ZONE_CODE,PARCEL_NO,PREV_VAL,CURR_VAL,VALCHGDATE,PROPERTY_CLASS,TOTAL_ACRES,YEAR,PrimaryID
0,0 A C L RAILROAD,R4,00104/00002/031,5000,5000,,RESIDENTIAL,0.21,2017,30589
1,0 A C L RAILROAD,R4,00104/00002/006,3000,3000,,RESIDENTIAL,0.24,2017,30589
2,0 A C L RAILROAD,R4,00104/00002/031,5000,5000,,RESIDENTIAL,0.21,2018,30589
3,0 A C L RAILROAD,R4,00104/00002/006,3000,3000,,RESIDENTIAL,0.24,2018,30589
4,0 A C L RAILROAD,R4,00104/00002/031,5000,5000,,RESIDENTIAL,0.21,2019,30589
5,0 A C L RAILROAD,R4,00104/00002/006,3000,3000,,RESIDENTIAL,0.24,2019,30589
6,0 A C L RAILROAD,R1B,00122/00001/020,314700,314700,,RESIDENTIAL,152.66,2019,30589
7,0 A C L RAILROAD nan,C5,00140/00001/007,175000,175000,1/13/2009,,43.75,2009,30589
8,0 A C L RAILROAD nan,R4,00104/00002/003,9400,9400,10/31/2008,RESIDENTIAL,0.22,2009,30589
9,0 A C L RAILROAD nan,C3,00104/00001/014,13100,13100,11/12/2008,RESIDENTIAL,0.0,2009,30589


In [87]:
TS_DATES.head(10)

0            NA
1            NA
2            NA
3            NA
4            NA
5            NA
6            NA
7     1/13/2009
8    10/31/2008
9    11/12/2008
dtype: object

In [88]:
TS_DATES.iloc[234850:234861]

234850            NA
234851    10/20/2009
234852    10/20/2009
234853    10/20/2009
234854    10/20/2009
234855    10/20/2009
234856     4/20/2015
234857     4/20/2015
234858     4/20/2015
234859     4/20/2015
234860     4/20/2015
dtype: object

In [89]:
TS_DATES.tail()

421910    3/26/2012
421911    3/26/2012
421912    3/26/2012
421913    3/26/2012
421914    3/26/2012
dtype: object

In [90]:
df_tax_sorted['TS_DATES'] = TS_DATES

In [91]:
df_tax_sorted.head(20)

Unnamed: 0,FULL_ADDRESS,ZONE_CODE,PARCEL_NO,PREV_VAL,CURR_VAL,VALCHGDATE,PROPERTY_CLASS,TOTAL_ACRES,YEAR,PrimaryID,TS_DATES
0,0 A C L RAILROAD,R4,00104/00002/031,5000,5000,,RESIDENTIAL,0.21,2017,30589,
1,0 A C L RAILROAD,R4,00104/00002/006,3000,3000,,RESIDENTIAL,0.24,2017,30589,
2,0 A C L RAILROAD,R4,00104/00002/031,5000,5000,,RESIDENTIAL,0.21,2018,30589,
3,0 A C L RAILROAD,R4,00104/00002/006,3000,3000,,RESIDENTIAL,0.24,2018,30589,
4,0 A C L RAILROAD,R4,00104/00002/031,5000,5000,,RESIDENTIAL,0.21,2019,30589,
5,0 A C L RAILROAD,R4,00104/00002/006,3000,3000,,RESIDENTIAL,0.24,2019,30589,
6,0 A C L RAILROAD,R1B,00122/00001/020,314700,314700,,RESIDENTIAL,152.66,2019,30589,
7,0 A C L RAILROAD nan,C5,00140/00001/007,175000,175000,1/13/2009,,43.75,2009,30589,1/13/2009
8,0 A C L RAILROAD nan,R4,00104/00002/003,9400,9400,10/31/2008,RESIDENTIAL,0.22,2009,30589,10/31/2008
9,0 A C L RAILROAD nan,C3,00104/00001/014,13100,13100,11/12/2008,RESIDENTIAL,0.0,2009,30589,11/12/2008


In [92]:
df_tax_sorted.iloc[7]['TS_DATES']

'1/13/2009'

In [95]:
def construct_ymd():
    year = []
    month = []
    day = []
    for i in range(len(df_tax_sorted)):
        if i % 50000 == 0:
            print(str(i))
        current_date = df_tax_sorted.iloc[i]['TS_DATES']
        if current_date == "NA":
            year.append("NA")
            month.append("NA")
            day.append("NA")
        else:
            split = current_date.split("/")
            year.append(split[2])
            month.append(split[0])
            day.append(split[1])
    return pd.DataFrame({'TS_YEAR': year, 'TS_MONTH': month, 'TS_DAY': day}) #change to TS

In [96]:
ymd = construct_ymd()

0
50000
100000
150000
200000
250000
300000
350000
400000


In [97]:
ymd.head(20)

Unnamed: 0,TS_YEAR,TS_MONTH,TS_DAY
0,,,
1,,,
2,,,
3,,,
4,,,
5,,,
6,,,
7,2009.0,1.0,13.0
8,2008.0,10.0,31.0
9,2008.0,11.0,12.0


In [98]:
df_final = pd.concat([df_tax_sorted, ymd], axis=1)

In [99]:
df_final.iloc[234850:234861]

Unnamed: 0,FULL_ADDRESS,ZONE_CODE,PARCEL_NO,PREV_VAL,CURR_VAL,VALCHGDATE,PROPERTY_CLASS,TOTAL_ACRES,YEAR,PrimaryID,TS_DATES,TS_YEAR,TS_MONTH,TS_DAY
234850,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,,RESIDENTIAL,1.03,2009,13777,,,,
234851,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,10/20/2009,RESIDENTIAL,1.03,2010,13777,10/20/2009,2009.0,10.0,20.0
234852,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,10/20/2009,RESIDENTIAL,1.03,2011,13777,10/20/2009,2009.0,10.0,20.0
234853,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,,RESIDENTIAL,1.03,2012,13777,10/20/2009,2009.0,10.0,20.0
234854,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,,RESIDENTIAL,1.03,2013,13777,10/20/2009,2009.0,10.0,20.0
234855,2700 ABBEY LN,R1A,00333/00009/001,159060,159060,,RESIDENTIAL,1.03,2014,13777,10/20/2009,2009.0,10.0,20.0
234856,2700 ABBEY LN,R1A,00333/00009/001,159060,157560,4/20/2015,RESIDENTIAL,1.03,2015,13777,4/20/2015,2015.0,4.0,20.0
234857,2700 ABBEY LN,R1A,00333/00009/001,157560,157560,4/20/2015,RESIDENTIAL,1.03,2016,13777,4/20/2015,2015.0,4.0,20.0
234858,2700 ABBEY LN,R1A,00333/00009/001,157560,157560,4/20/2015,RESIDENTIAL,1.03,2017,13777,4/20/2015,2015.0,4.0,20.0
234859,2700 ABBEY LN,R1A,00333/00009/001,157560,157560,,COMMERCIAL,1.03,2018,13777,4/20/2015,2015.0,4.0,20.0


In [101]:
#df_final.to_csv('~/Dropbox (Amherst College)/CDS-2019-AlbanyHub/ToDatabase/TotalTax.csv', index=False)

In [112]:
test_t_addr = df_final['FULL_ADDRESS']
test_f_addr = df_addr['Address']
not_found = test_f_addr[~test_f_addr.isin(test_t_addr)]

In [118]:
not_found.shape

(5868,)

In [119]:
test_t_addr.shape

(421915,)

In [120]:
test_f_addr.shape

(30721,)

In [121]:
(30721-5868) / 30721

0.8089905927541421