In [1]:
import pandas as pd
import numpy as np
import os
import json
from fix_addresses_master import *

In [2]:
#Read in the address junction table 
df = pd.read_csv('~/Dropbox (GaTech)/CDS-2019-AlbanyHub/ToDatabase/addr_junct_table.csv')
directory = '/home/mirabel/Dropbox (GaTech)/CDS-2019-AlbanyHub/Processed-Data/attom_json' #dir of json files

In [3]:
#read all json files into a single list
j_list=[]
for filename in os.listdir(directory):
    if filename.endswith('.json'):
        f = open(directory+'/'+filename)
        j_list.append(json.load(f))
        f.close()
    

In [4]:
#Look at all the values available for a single home
example_entry = j_list[0][0]
example_entry.keys()

dict_keys(['identifier', 'lot', 'area', 'address', 'location', 'summary', 'utilities', 'sale', 'building', 'assessment', 'vintage'])

In [5]:
#concatenate j_list into a single list
data = [y for x in j_list for y in x]
len(data)

30721

In [6]:
def prop_split(data):
    #separate these out by the major keys
    all_identifiers = [x['identifier'] for x in data if len(x)>0]
    all_lot = [x['lot'] for x in data if len(x)>0]
    all_area = [x['area'] for x in data if len(x)>0]
    all_addresses = [x['address'] for x in data if len(x)>0]
    all_location = [x['location'] for x in data if len(x)>0]
    all_summary = [x['summary'] for x in data if len(x)>0]
    all_utilities = [x['utilities']for x in data if len(x)>0]
    all_sale = [x['sale'] for x in data if len(x)>0]
    all_building = [x['building'] for x in data if len(x)>0]
    all_assessment = [x['assessment'] for x in data if len(x)>0]
    all_vintage = [x['vintage'] for x in data if len(x)>0]
    #convert data from a multi-layered dict into a single dict for a pandas dataframe
    dict_full = {
        'address':[x['line1'] for x in all_addresses],
        'lot_size':[x['lotSize1'] for x in all_lot],
        'zoningType': [x.get('zoningType') for x in all_lot],
        'siteZoningIdent': [x.get('siteZoningIdent') for x in all_lot],
        'propClass': [x['propClass'] for x in all_summary],
        'yearBuilt':[x['yearBuilt'] for x in all_summary],
        'size':[x['size']['grossSizeAdjusted'] for x in all_building],
        'baths':[x['rooms']['bathsTotal'] for x in all_building],
        'beds':[x['rooms']['beds'] for x in all_building],
        'rooms':[x['rooms']['roomsTotal'] for x in all_building],
        'floors':[x['interior'].get('floors') for x in all_building],
        'condition':[x['construction'].get('condition') for x in all_building],
        'foundationType':[x['construction'].get('foundationType') for x in all_building],
        'roofCover':[x['construction'].get('roofCover') for x in all_building],
        'wallType':[x['construction'].get('wallType') for x in all_building],
        'improvementYear':[x['construction'].get('propertyStructureMajorImprovementsYear') for x in all_building],
        'assessment':[x['assessed']['assdTtlValue'] for x in all_assessment],
        'market':[x['market']['mktTtlValue'] for x in all_assessment]
    }
    return pd.DataFrame(data=dict_full)

In [21]:
#Convert to pandas dataframe
dfp = prop_split(data)
dfp.drop_duplicates(subset="address")
dfp.index = range(len(dfp))
len(dfp)

26931

In [22]:
#get the primary ids in the junction table for each address returned
id_dict = {df.loc[x,'Address']:df.loc[x, 'Id'] for x in range(len(df))} #maps address to id, from junct table
NOT_FOUND = id_dict['NOT FOUND']
prim_ids=[id_dict.get(x, NOT_FOUND) for x in fix_series(dfp['address'])] #id of each addresss in new dataframe
len(prim_ids)

26931

In [23]:
#find the locs where x is not in the address junction table
nf = [dfp.loc[x, 'address'] for x in range(len(prim_ids)) if prim_ids[x]==NOT_FOUND]

In [24]:
len(nf)/len(prim_ids) #about 3% of addresses were not found despite the successful api call

0.022910400653521963

In [26]:
dfp['PrimaryId'] = prim_ids
len(dfp)

26931

In [28]:
#replace 0's with None for clarity
dfp.loc[dfp['yearBuilt'] == 0, 'yearBuilt']=None
dfp.loc[dfp['size']==0, 'size'] = None
dfp.loc[dfp['lot_size'] == 0, 'lot_size'] = None
dfp.loc[dfp['assessment'] ==0, 'assessment']=None
dfp.loc[dfp['market'] == 0, 'market']=None
#Drop rows where address is Not found
dfp.loc[dfp['PrimaryId'] == NOT_FOUND, :] = None
dfp = dfp.dropna(how='all')


### Look at all the missing data and try to load it again

In [34]:
missing = df[~df['Id'].isin(dfp['PrimaryId'])]
missing.to_csv("missing.csv", index=False)

In [30]:
dfp.to_csv(directory+'/property_data.csv', index=False)

## Data Exploration
### 1. Identifiers
Address (Primary ID) - linked with junction table

### 2. Counts of Observations
Get counts and percentages of data

In [None]:
len(dfp)/len(data) # about 86% of addresses have data

In [None]:
len(dfp[dfp['size'].notnull()])/len(data) #about 83% of addresses have info on square footage

In [None]:
len(dfp[dfp['yearBuilt'].notnull()])/len(data) #similarly about 83% of addresses have year built

In [None]:
len(data) #number of addresses in the database

In [None]:
len(dfp) #number of addresses successfully retrieved from database

### 3. Some example records

In [None]:
dfp.loc[[5000, 10000, 15000, 20000, 25000], :]

### 4. Summarization

In [None]:
#all fields
dfp.columns

In [None]:
def get_stats(l):
    print("Min:", min(l))
    print("Max:", max(l))
    print("Mean:",l.mean())

In [None]:
print("Lot Size")
get_stats(dfp['lot_size'])
print("Year Built")
get_stats(dfp['yearBuilt'])
print("Size (sq ft)")
get_stats(dfp['size'])
print('assessment')
get_stats(dfp['assessment'])
print('market')
get_stats(dfp['market'])

In [None]:
# tiny lot size
dfp.loc[dfp['lot_size']<0.02, :]

In [None]:
# large lot size
dfp.loc[dfp['lot_size']>10000, :] # this is an error

In [None]:
# Small size
dfp[dfp['size']<250] # nothing ridiculous

In [None]:
#large size
dfp[dfp['size']>200000] #largest are commercial, industrial, distribution,etc - checks out

#### Question: difference between market and assessment? (market is often >2x assessment value)
https://www.realtor.com/advice/sell/assessed-value-vs-market-value-difference/<br>
Market: what home could sell for<br>
Assessed: Used for property tax<br>

In [None]:
dfp[dfp['market']<500] #most low values are vacant properties, 5200 radium springs is a power plant

In [None]:
dfp[dfp['market']>10000000]

In [None]:
dfp[dfp['assessment']<500] #Vacant properties

#### Categorical values

In [None]:
dfp['zoningType'].value_counts()

In [None]:
dfp['siteZoningIdent'].value_counts()

In [None]:
dfp['propClass'].value_counts()

In [None]:
dfp['floors'].value_counts()

In [None]:
dfp['wallType'].value_counts()

In [None]:
dfp['condition'].value_counts()

In [None]:
dfp['roofCover'].value_counts()

In [None]:
dfp['foundationType'].value_counts()

### 5. Statistical summaries by groups

In [None]:
dfp_g1 = dfp.loc[:,['zoningType','lot_size', 'yearBuilt', 'size', 'market', 'assessment']].groupby(by='zoningType')
pd.set_option('precision', 2)
dfp_g1.agg(['mean', 'min', 'max'])

In [None]:
dfp_g2 = dfp.loc[:, ['propClass','lot_size', 'yearBuilt', 'size', 'market', 'assessment']].groupby(by='propClass')
dfp_g2.agg(['mean', 'min', 'max'])

In [None]:
dfp_g2 = dfp.loc[:, ['condition','lot_size', 'yearBuilt', 'size', 'market', 'assessment']].groupby(by='condition')
dfp_g2.agg(['mean', 'min', 'max'])

In [None]:
dfp_g3 = dfp.groupby(by=['condition', 'propClass']) 
dfp_g3.count()

In [None]:
dfp_g4 = dfp.groupby(by=['siteZoningIdent', 'propClass']) 
dfp_g4.count()

# Read missing back in

In [31]:
missing.index = range(len(missing))
missing.shape

(4410, 7)

In [32]:
missing_dir = '/home/mirabel/Dropbox (GaTech)/CDS-2019-AlbanyHub/Processed-Data/attom_json/missing'
#read all json files into a single list
j_list=[]
flist = os.listdir(missing_dir)
flist.sort()
for filename in flist:
    if filename.endswith('.json'):
        f = open(missing_dir+'/'+filename)
        j_list.append(json.load(f))
        f.close()

#### a) Split the results into 'found', which were retrieved on the second run-through and 'missing_list', which still could not be retrieved.

In [33]:
data_new = [y for x in j_list for y in x]
data_found =[]#The data returned by the attom api
orig_addresses = [] #The addresses used to query the attom api
missing_list = pd.DataFrame(data={'Address':[], 'StatusMSG':[], 'Code':[]}) #the return values of addresses which were still not coded
for i in range(len(data_new)):
    if len(data_new[i])==8 or len(data_new[i])==6:
        missing_list = missing_list.append({'Address':missing.iloc[i, 1], 'StatusMSG':data_new[i]['msg'], 'Code':data_new[i]['code']}, ignore_index=True)
    elif len(data_new[i])==0:
        missing_list = missing_list.append({'Address':missing.iloc[i, 1], 'StatusMSG':'unknown', 'Code':0}, ignore_index=True)
    else:
        data_found.append(data_new[i])
        orig_addresses.append(missing.iloc[i, 1])

IndexError: single positional indexer is out-of-bounds

In [None]:
len(data_found)

In [None]:
#Projects that were found when the api was queried again
dfp2 = prop_split(data_found)
dfp2.drop_duplicates(subset="address")
dfp2.index = range(len(dfp2))
#get the primary ids in the junction table for each address returned
id_dict = {df.loc[x,'Address']:df.loc[x, 'Id'] for x in range(len(df))} #maps address to id, from junct table
NOT_FOUND = id_dict['NOT FOUND']
prim_ids=[id_dict.get(x, NOT_FOUND) for x in orig_addresses] #id of each addresss in new dataframe
dfp2['PrimaryId'] = prim_ids

####  In found, compare the coordinates that were located to the coordinates of the address used to query. If it seems that the api found the wrong location, drop this address from dfp2

In [None]:
#Compare the addresses returned by attom api to those which were queried -> is there a difference?
data_found_lat = [float(x['location']['latitude']) for x in data_found]
data_found_lon = [float(x['location']['longitude']) for x in data_found]
data_found_acc = [x['location']['accuracy'] for x in data_found]
df_subset = df[df['Address'].isin(orig_addresses)] #The set of the dataframe that was used to generate data_found
df_subset.index = range(len(df_subset))

df_compare = pd.DataFrame(data={'found_addresses':dfp2['address'], 'orig_addresses':df_subset['Address'], 'accuracy':data_found_acc,'found_lat':data_found_lat, 'found_lon':data_found_lon, 'orig_lat':df_subset['Xcoord'], 'orig_lon':df_subset['Ycoord']})
df_compare['coord_diff'] = df_compare['found_lat']+df_compare['found_lon']-df_compare['orig_lat']-df_compare['orig_lon']

In [None]:
#Addresses which may have been miscoded: difference in coordinates is significant
df_compare[df_compare['coord_diff']>0.005]
#US highway addresses: found is more accurate
#all else: original is more accurate
#Despite incorrect geocoding, likely only the E/W N/S mixups and wildly incorrect addresses are wrong
#That is, 80, 97, 190, 553, 676, 734
drop_idx = [80, 97, 190, 553, 676, 734]
nf_miscoded = dfp2.loc[drop_idx, 'address']
dfp2.drop(drop_idx, inplace=True)
dfp2.index = range(len(dfp2))

In [None]:
dfp_all = pd.concat([dfp, dfp2], axis=0)

#### c) In missing_list, split the non-found addresses by the error code. Export the addresses which resulted in 'System not responding' to be re-run

In [None]:
missing_list.groupby(by='StatusMSG').count()

In [None]:
missing_list.groupby(by='Code').count()
#-2 = System Not Responding : There is a failure within the api
#1 = SuccessWithoutResult : Your request was successful but returned no results
#210 = Geocoder Search Results Address Not Identified. : The input address could not be identified. Please try again.
#212 = 	Success without results. No data available for this address. : The input address has been located with ZIP level precision, but a record is not available.

In [None]:
missing_list[missing_list['StatusMSG']=='System Not Responding.'].to_csv('missing_notresponding.csv')


In [None]:
missing_list[missing_list['StatusMSG']=='Success without results. No data available for this address.'].head()

In [None]:
missing_list[missing_list['StatusMSG']=='Geocoder Results Address Not Identified.'].head()

#### d) Retrieve the results of step c) and add them to a new dataframe dfp3.

In [None]:
f = open('/home/mirabel/Dropbox/CDS-2019-AlbanyHub/Processed-Data/attom_json/missing/noresponse/json_dump_missing_noresponse0_10.json')
data_nr = json.load(f)
f.close()
dfp3 = prop_split(data_nr)
prim_ids=[id_dict.get(x, NOT_FOUND) for x in dfp3['address']] #id of each addresss in new dataframe
#manually fix one value
prim_ids[6] = id_dict.get('600 POLARIS DR')
dfp3['PrimaryId'] = prim_ids

In [None]:
dfp_all = pd.concat([dfp, dfp2, dfp3], axis=0)

In [None]:
p1 = missing_list[missing_list['StatusMSG']!='System Not Responding.']
p2 = p1.append([{'Address':x} for x in nf_miscoded], ignore_index=True)
#
p2.tail(20)
p2.to_csv('missing_unresolved.csv', index=False)