In [22]:
import pandas as pd
import os
import json

In [23]:
#Read in the address junction table 
df = pd.read_csv('~/Dropbox/CDS-2019-AlbanyHub/ToDatabase/addr_junct_table.csv')
directory = '/Users/oliviafiol/Dropbox/CDS-2019-AlbanyHub/Processed-Data/attom_json' #dir of json files

In [24]:
#read all json files into a single list
j_list=[]
for filename in os.listdir(directory):
    if filename.endswith('.json'):
        f = open(directory+'/'+filename)
        j_list.append(json.load(f))
        f.close()

In [25]:
#Look at all the values available for a single home
example_entry = j_list[0][0]
example_entry.keys()

dict_keys(['identifier', 'lot', 'area', 'address', 'location', 'summary', 'utilities', 'sale', 'building', 'assessment', 'vintage'])

In [26]:
#concatenate j_list into a single list
data = [y for x in j_list for y in x]
len(data)

13000

In [27]:
#separate these out by the major keys
all_identifiers = [x['identifier'] for x in data if len(x)>0]
all_lot = [x['lot'] for x in data if len(x)>0]
all_area = [x['area'] for x in data if len(x)>0]
all_addresses = [x['address'] for x in data if len(x)>0]
all_location = [x['location'] for x in data if len(x)>0]
all_summary = [x['summary'] for x in data if len(x)>0]
all_utilities = [x['utilities']for x in data if len(x)>0]
all_sale = [x['sale'] for x in data if len(x)>0]
all_building = [x['building'] for x in data if len(x)>0]
all_assessment = [x['assessment'] for x in data if len(x)>0]
all_vintage = [x['vintage'] for x in data if len(x)>0]
len(all_identifiers) #25030 rows were not found when making api calls

10798

In [28]:
#convert data from a multi-layered dict into a single dict for a pandas dataframe
dict_full = {
    'address':[x['line1'] for x in all_addresses],
    'lot_size':[x['lotSize1'] for x in all_lot],
    'zoningType': [x.get('zoningType') for x in all_lot],
    'siteZoningIdent': [x.get('siteZoningIdent') for x in all_lot],
    'propClass': [x['propClass'] for x in all_summary],
    'yearBuilt':[x['yearBuilt'] for x in all_summary],
    'size':[x['size']['grossSizeAdjusted'] for x in all_building],
    'baths':[x['rooms']['bathsTotal'] for x in all_building],
    'beds':[x['rooms']['beds'] for x in all_building],
    'rooms':[x['rooms']['roomsTotal'] for x in all_building],
    'floors':[x['interior'].get('floors') for x in all_building],
    'condition':[x['construction'].get('condition') for x in all_building],
    'foundationType':[x['construction'].get('foundationType') for x in all_building],
    'roofCover':[x['construction'].get('roofCover') for x in all_building],
    'wallType':[x['construction'].get('wallType') for x in all_building],
    'improvementYear':[x['construction'].get('propertyStructureMajorImprovementsYear') for x in all_building],
    'assessment':[x['assessed']['assdTtlValue'] for x in all_assessment],
    'assessment':[x['market']['mktTtlValue'] for x in all_assessment]
}

In [29]:
#Convert to pandas dataframe
dfp = pd.DataFrame(data=dict_full)
dfp.drop_duplicates(subset="address")
dfp.index = range(len(dfp))
len(dfp)

10798

In [30]:
#get the primary ids in the junction table for each address returned
id_dict = {df.loc[x,'Address']:df.loc[x, 'Id'] for x in range(len(df))} #maps address to id, from junct table
NOT_FOUND = id_dict['NOT FOUND']
prim_ids=[id_dict.get(x, NOT_FOUND) for x in dfp['address']] #id of each addresss in new dataframe

In [31]:
#find the locs where x is not in the address junction table
nf = [dfp.loc[x, 'address'] for x in range(len(prim_ids)) if prim_ids[x]==NOT_FOUND]

In [32]:
len(nf)

323

In [33]:
len(prim_ids) # about 3% of addresses were not found

10798

In [34]:
dfp['PrimaryID'] = prim_ids

In [35]:
#replace 0's with None for clarity
dfp[dfp['yearBuilt'] == 0]=None
dfp[dfp['size']==0] = None
dfp[dfp['lot_size'] == 0] = None
#Drop rows where address is Nan
dfp = dfp.dropna(how='all')

In [36]:
dfp.to_csv(directory+'/property_data.csv', index=False)

In [38]:
len(dfp)/len(data) # about 75% of addresses have data

0.7307692307692307

In [39]:
dfp.head()

Unnamed: 0,address,lot_size,zoningType,siteZoningIdent,propClass,yearBuilt,size,baths,beds,rooms,floors,condition,foundationType,roofCover,wallType,improvementYear,assessment,PrimaryID
0,2316 GAIL AVE,0.36,Residential,R1A,Single Family Residence / Townhouse,1968.0,1790.0,2.0,3.0,0.0,WOOD,AVERAGE,MASONRY,ASPHALT,BRICK VENEER,0,110100.0,17000.0
1,2318 GAIL AVE,0.36,Residential,R1A,Single Family Residence / Townhouse,1966.0,1455.0,2.0,3.0,0.0,WOOD,AVERAGE,MASONRY,ASPHALT,BRICK VENEER,1990,109200.0,17001.0
2,2307 BARNESDALE WAY,0.35,Residential,R1A,Single Family Residence / Townhouse,1964.0,1560.0,1.5,3.0,0.0,WOOD,AVERAGE,MASONRY,ASPHALT,BRICK VENEER,0,91600.0,17002.0
3,2305 BARNESDALE WAY,0.32,Residential,R1A,Single Family Residence / Townhouse,1964.0,1653.0,2.0,3.0,0.0,WOOD,AVERAGE,MASONRY,ASPHALT,BRICK VENEER,0,94300.0,17003.0
4,2303 BARNESDALE WAY,0.32,Residential,R1A,Single Family Residence / Townhouse,1964.0,1175.0,1.5,3.0,0.0,WOOD,AVERAGE,MASONRY,ASPHALT,BRICK VENEER,0,75400.0,17004.0
