# Parsing code list documentation



In [8]:
import pathlib
import os
import pandas as pd
import tabula

In [2]:
from tabula import read_pdf

In [3]:
# Print the Java runtime version available to this kernel.
# NOTE(review): presumably here because tabula needs a working Java install — confirm against the tabula-py docs.
!java -version

openjdk version "11.0.8" 2020-07-14
OpenJDK Runtime Environment (build 11.0.8+10-post-Ubuntu-0ubuntu118.04.1)
OpenJDK 64-Bit Server VM (build 11.0.8+10-post-Ubuntu-0ubuntu118.04.1, mixed mode, sharing)


In [4]:
# Location of the code list manual, resolved relative to the parent of the
# current working directory.
pdf_path = (
    pathlib.Path.cwd().parent
    / 'references'
    / 'Property Tax Management System Code List Manual 180614.pdf'
)

In [12]:
def get_page_table(page, loc=0):
    """Return one of the tables tabula extracts from a page of the code list PDF.

    Parameters
    ----------
    page
        Page selector forwarded unchanged to ``tabula.read_pdf(pages=...)``.
    loc : int, default 0
        Position of the wanted table within the sequence of tables found on
        that page. Negative values index from the end, as with any list.

    Returns
    -------
    The table at position ``loc`` among those read from ``pdf_path``.

    Raises
    ------
    IndexError
        If the page yields no table at position ``loc``. The message names
        the page and how many tables were actually found.
    """
    tables = tabula.read_pdf(pdf_path, pages=page)
    try:
        return tables[loc]
    except IndexError as err:
        # Same exception type as plain indexing, but say which page/table was
        # missing so a wrong entry in a page map is easy to track down.
        raise IndexError(
            f"no table at loc={loc} on page {page}: "
            f"tabula found {len(tables)} table(s) on that page"
        ) from err

In [29]:
# Where each code list sits in the PDF. Every entry is either
#   * a bare page number  -> the first table on that page, or
#   * a (page, loc) pair  -> the table at position `loc` on that page.
# Fragments are listed in the order they should be stitched together.
# NOTE(review): 'CountryName' may be intended as 'CountyName' — confirm before
# renaming, because these keys end up as the keys of the saved code dictionary.
pages_for_codes = {
    'PropertyClass': (1, 2, 3, 4, (5, 0)),
    'StreetType': ((5, 1),),
    'NeighborhoodType': ((5, 2),),
    'DataSource': ((6, 0),),
    'ReasonChange': ((6, 1),),
    'LandType': ((6, 2), (7, 0)),
    'InfluenceFactor': ((7, 1),),
    'Occupancy': ((7, 2),),
    'Story': ((8, 0),),
    'Attic': ((8, 1),),
    'Basement': ((8, 2),),
    'CrawlSpace': ((8, 3),),
    'Roofing': ((8, 4),),
    'Heating': ((9, 0),),
    'QualityGrade': ((9, 1),),
    'ConstructionType': ((9, 2), 10, (11, 0)),
    'ResAgImprovement': ((11, 1), 12),
    'CommIndustImprovement': (13, 14, (15, 0)),
    'NeighImprovement': ((15, 1),),
    'TaxingDistrict': (*range(24, 66), (66, 0)),
    'TownshipName': ((66, 1), *range(67, 88)),
    'CountryName': (*range(88, 92), (92, 0)),
    'Waterfront': ((93, 2),),
}

# Hand-entered rows that are appended to the parsed tables of the same name.
# Extra rows may also be needed for: CountryName, TownshipName, TaxingDistrict.
extra_entries = {
    'PropertyClass': pd.DataFrame({
        'CODE': [346, 462, 820],
        'VALUE': [
            'INDUSTRIAL RESEARCH AND DEVELOPMENT FACILITY',
            'GOLF RANGE OR MINIATURE COURSE',
            'LOCALLY ASSESSED PROPERTY OWNED BY A LIGHT, HEAT, OR POWER COMPANY-COMMERCIAL',
        ],
    }),
    'LandType': pd.DataFrame({'CODE': [12], 'VALUE': ['SECONDARY']}),
}

In [23]:
# Spot-check the table at position 2 on page 9 — the first fragment listed
# for 'ConstructionType' in pages_for_codes.
get_page_table(9, loc=2)

Unnamed: 0,CODE,VALUE


In [30]:
# Build one lookup table per code list by stitching together the table
# fragments named in pages_for_codes, plus any hand-entered rows from
# extra_entries. Progress is printed as "<name> <fragment> <fragment> ...".
code_dictionary = {}
for name, pages in pages_for_codes.items():
    print(name, end=' ')
    table_fragments = []
    for pg in pages:
        print(pg, end=' ')
        # A bare page number means the first table on that page; otherwise
        # the entry is a (page, loc) pair selecting a specific table.
        if isinstance(pg, int):
            page_number, table_index = pg, 0
        else:
            page_number, table_index = pg
        table_fragments.append(get_page_table(page_number, loc=table_index))

    # Hand-entered rows go after the parsed fragments, so a single concat
    # handles both the "has extras" and "no extras" cases.
    if name in extra_entries:
        table_fragments.append(extra_entries[name])

    code_df = pd.concat(table_fragments, ignore_index=True)
    # Drop rows with any missing cell, then drop exact duplicate rows.
    code_dictionary[name] = code_df.dropna().drop_duplicates()
    print(' ')

PropertyClass 1 2 3 4 (5, 0)  
StreetType (5, 1)  
NeighborhoodType (5, 2)  
DataSource (6, 0)  
ReasonChange (6, 1)  
LandType (6, 2) (7, 0)  
InfluenceFactor (7, 1)  
Occupancy (7, 2)  
Story (8, 0)  
Attic (8, 1)  
Basement (8, 2)  
CrawlSpace (8, 3)  
Roofing (8, 4)  
Heating (9, 0)  
QualityGrade (9, 1)  
ConstructionType (9, 2) 10 (11, 0)  
ResAgImprovement (11, 1) 12  
CommIndustImprovement 13 14 (15, 0)  
NeighImprovement (15, 1)  
TaxingDistrict 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 (66, 0)  
TownshipName (66, 1) 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87  
CountryName 88 89 90 91 (92, 0)  
Waterfront (93, 2)  


In [31]:
import pickle

In [32]:
# Destination for the pickled code dictionary, resolved relative to the
# parent of the current working directory.
save_path = (
    pathlib.Path.cwd().parent
    / 'data'
    / 'processed'
    / 'code_dictionary.pkl'
)

In [34]:
# Make sure the output folder exists first; otherwise open() raises
# FileNotFoundError when the directory has not been created yet.
save_path.parent.mkdir(parents=True, exist_ok=True)

# Serialise the whole {code list name: table} mapping into a single pickle file.
with open(save_path, 'wb') as fout:
    pickle.dump(code_dictionary, fout)

In [None]:
# Code list 1 (property types):
# pages 1-4, plus the first table on page 5

# Columns for the notes below: code list, page, table, name
# page 5, second table
#

In [6]:
# Show the first element of `dfs`.
# NOTE(review): `dfs` is not assigned in any cell shown above, so this cell
# would raise NameError on a fresh kernel — presumably left over from an
# earlier read_pdf call; confirm and either define `dfs` or drop the cell.
dfs[0]

Unnamed: 0,CODE,VALUE
0,100,AGRICULTURAL - VACANT LAND
1,101,AGRICULTURAL - CASH GRAIN/GENERAL FARM
2,102,AGRICULTURAL - LIVESTOCK OTHER THAN DAIRY OR P...
3,103,AGRICULTURAL - DAIRY FARM
4,104,AGRICULTURAL - POULTRY FARM
5,105,AGRICULTURAL - FRUIT & NUT FARM
6,106,AGRICULTURAL - VEGETABLE FARM
7,107,AGRICULTURAL - TOBACCO FARM
8,108,AGRICULTURAL - NURSERY
9,109,AGRICULTURAL - GREENHOUSES


In [12]:
# List every table tabula extracts from page 5, to see how many there are
# and in what order they come back.
tabula.read_pdf(pdf_path, pages=5)

[    CODE                                              VALUE
 0    821  STATE ASSESSED PROPERTY OWNED BY A LIGHT, HEAT...
 1    825  LOCALLY ASSESSED PROPERTY OWNED BY A LIGHT, HE...
 2    830  LOCALLY ASSESSED PROPERTY OWNED BY A PIPELINE ...
 3    831  STATE ASSESSED PROPERTY OWNED BY A PIPELINE CO...
 4    835  LOCALLY ASSESSED PROPERTY OWNED BY A PIPELINE ...
 5    840  LOCALLY ASSESSED PROPERTY OWNED BY A RAILROAD ...
 6    841  STATE ASSESSED OPERATING PROPERTY OWNED BY A R...
 7    845  LOCALLY ASSESSED PROPERTY OWNED BY A RAILROAD ...
 8    850  LOCALLY ASSESSED PROPERTY OWNED BY A SEWAGE CO...
 9    851  STATE ASSESSED PROPERTY OWNED BY A SEWAGE COMP...
 10   855  LOCALLY ASSESSED PROPERTY OWNED BY A SEWAGE CO...
 11   860  LOCALLY ASSESSED PROPERTY OWNED BY A TELEPHONE...
 12   861  STATE ASSESSED PROPERTY OWNED BY A TELEPHONE, ...
 13   865  LOCALLY ASSESSED PROPERTY OWNED BY A TELEPHONE...
 14   870  LOCALLY ASSESSED PROPERTY OWNED BY A WATER DIS...
 15   871  STATE ASSESSE