In [1]:
import numpy as np
import math
import glob
import pandas as pd
import folium
# from folium import plugins
# from folium import *
import json
import datetime
import branca
# These lines set up the plotting functionality and formatting.
# import matplotlib
# matplotlib.use('Agg', warn=False)
# %matplotlib inline
# import matplotlib.pyplot as plots
# plots.style.use('fivethirtyeight')
# import warnings
# warnings.simplefilter(action="ignore", category=FutureWarning)


# 1. Introduction

The point of this study will be to observe housing prices in the City of Charlottesville. 

The Folium module adapts Leaflet maps to Python. The provided coordinates will be used to generate all subsequent maps of the City of Charlottesville. The boundary is shown below using a JSON overlay from OpenData Charlottesville. The houses in this study will almost always be found within this boundary. 

In [2]:
# Load the city-boundary GeoJSON (path is relative to the notebook's working directory).
with open ("data/cityboundary.geojson") as f:
    citylayer = json.load(f)

# Base map centred on Charlottesville; the same centre and zoom are reused by the later maps.
map_with_citylayer = folium.Map(location=[38.0293, -78.4767], zoom_start=13)
folium.GeoJson(citylayer).add_to(map_with_citylayer)
# Last expression of the cell, so the notebook renders the map.
map_with_citylayer

# 2. Preparing the Data

The repository https://opendata.charlottesville.org/ offers data on various utilities in the City of Charlottesville. The Property subsection itself has various datasets. Thus, it is important to first obtain only portions of each table that are relevant and adjust them so that they are ready for manipulation.

## 2.1 Reading/Modifying Sales Table

The _Sales Table_ contains data about sales of particular parcels. It contains their addresses, date of sale, and amount of sale. Firstly, the Street Number and Street Address are combined for ease of use later on. **_sales_** variable is a DataFrame that reads in _Sales Table csv_ and makes a column that combines street number and street name.

In [3]:
# Raw sales records.
sales = pd.read_csv('data/Real_Estate__Sales_.csv')
# 'Combo' = "<StreetNumber> <StreetName>"; it is the key used later to join with the geocoded addresses.
# NOTE(review): the concatenation assumes both columns are read as strings — confirm against the CSV.
sales['Combo'] = sales['StreetNumber'] + ' '+sales['StreetName'] 

**_formatDate(row)_** is an applied function that takes every row of the DataFrame and modifies the SaleDate. The time portion is removed since none of the dates appear to specify an actual time. The forward slashes are replaced with dashes so that the dates follow the 'YYYY-MM-DD' format used by numpy's date objects. Rows that do not have a proper date are first removed from **_sales_**, and the **_formatDate(row)_** function is then applied to the remaining rows.

In [4]:
def formatDate(row):
    """Return the date part of ``row['SaleDate']`` with '/' separators turned into '-'.

    Only the first ten characters are kept; anything after them (the time text) is dropped.
    """
    day_part = row['SaleDate'][:10]
    return '-'.join(day_part.split('/'))

# Drop rows without a sale date, then rewrite every remaining date with formatDate.
sales = (
    sales[sales.SaleDate.notna()]
    .pipe(lambda dated: dated.assign(SaleDate=dated.apply(formatDate, axis=1)))
)

## 2.2 Merging Sales with Residential

The _Residential datasheet_ contains a list of all the residential parcels of the area. This is important to filter out the non-residential parcels located in the _Sales dataset_. The datasheet has a column called "UseCode" that specifies the type of residential building in the parcel. It was discovered that some of the labels in the column were not relevant to family housing (such as parking lots and vacant lands). Filtering was done to remove these parcels from the dataset, and of the fifty or so labels, about ten are left.


**_resid_** is the DataFrame after reading in this _Residential csv_. **_salesResid_** is the result of merging the **_sales_** and **_resid_**. Only parcels of UseCodes in the set **_labels_** are included. The join is done using the ParcelNumber from each dataframe.

In [5]:
# Residential details table; its 'UseCode' column is used to keep only housing parcels.
resid = pd.read_csv('data/Real_Estate__Residential_Details_.csv',header=0)

# UseCode values that are treated as family housing; every other code is dropped.
labels = {'Single Family', 'Duplex', 'Single Family Attached', 'Condominium',
          'Single Family-1 Conversion', 'Triplex', 'Rooming House', 'Condo Main',
          'Single Family-3 Conversion', 'Four-Level Split', 'Three-Level Split',
          'Single Family-2 Conversion', 'Condominium-Flex', 'Quadplex'}

# Join on ParcelNumber (pandas' default join type), so sales of parcels that are absent from
# the filtered residential table are dropped, and each sale row gains its parcel's UseCode.
# NOTE(review): if a ParcelNumber appears more than once in resid, its sales are duplicated here — confirm.
salesResid = pd.merge(sales,resid[resid.UseCode.isin(labels)].loc[:,['ParcelNumber','UseCode']],on="ParcelNumber")

## 2.3 Merging with Geocoded Addresses

The addresses in the table were then geocoded (latitude/longitude coordinates were looked up from the addresses) and stored in a .csv file called _'coordinates.csv'_. This was merged with the combined residential sales table (**_salesResid_**). **_geoSalesResid_** is the table that will be used for subsequent sections. As a baseline, sales with a SaleAmount of 100 or less were assumed to be typos or invalid sales and were filtered out. Additionally, it was found that some parcels had sales with different 'RecordID' values but exactly the same 'SaleDate', 'SaleAmount', and 'ParcelNumber'. These duplicates are also filtered out, keeping one row for each.

In [6]:
# Geocoded addresses; the columns used from it are ADDRESS, LATITUDE and LONGITUDE.
geocoded = pd.read_csv('data/coordinates.csv')

# Attach coordinates by matching the combined street address against ADDRESS.
geoSalesResid = pd.merge(salesResid,geocoded,left_on="Combo",right_on="ADDRESS")
# Keep sales above 100 and drop the address helper columns (ADDRESS already carries that information).
geoSalesResid = geoSalesResid[geoSalesResid.SaleAmount > 100].drop(columns=["StreetName","StreetNumber","Combo"])
# Rows that share parcel, date and amount are treated as the same sale; the first one is kept.
geoSalesResid = geoSalesResid.drop_duplicates(subset=['ParcelNumber','SaleDate','SaleAmount'])
geoSalesResid

Unnamed: 0,RecordID_Int,ParcelNumber,SaleDate,SaleAmount,Unit,UseCode,ADDRESS,LATITUDE,LONGITUDE
2,62,010006000,2003-01-31,545000,,Single Family,2028 BARRACKS RD,38.050765,-78.498080
3,63,010007000,1982-07-27,160000,,Single Family,1930 BARRACKS RD,38.050395,-78.497636
4,66,010009000,1984-08-31,175000,,Single Family,1920 BARRACKS RD,38.049731,-78.497104
7,69,010010000,2018-10-31,575000,,Single Family,1851 WESTVIEW RD,38.048508,-78.498668
8,70,010011000,2018-01-10,970000,,Single Family,1855 WESTVIEW RD,38.049037,-78.498346
...,...,...,...,...,...,...,...,...,...
48019,56275,610318000,2001-07-20,89900,,Single Family Attached,110 MILFORD TER,38.018268,-78.470688
48020,56276,610318000,1999-02-18,93000,,Single Family Attached,110 MILFORD TER,38.018268,-78.470688
48022,56278,610318000,1999-10-21,81500,,Single Family Attached,110 MILFORD TER,38.018268,-78.470688
48023,56279,610318000,2017-03-07,148000,,Single Family Attached,110 MILFORD TER,38.018268,-78.470688


# 3. Defining Boundaries

The goal of this section is to recognize regions (neighborhoods) in Charlottesville that share similar housing prices as well as create visuals based off of these regions. 

## 3.1 Introduction

**_makeRecentSales(cutoffDate)_** is a function that creates a subset of **_geoSalesResid_** based on a cutoff date. It will only look at sales that have happened after the specified date. **_recentSales_** is an example DataFrame created that has sales from the beginning of 2019 to the present. 

In [7]:
def makeRecentSales(cutoffDate):
    """Return the rows of ``geoSalesResid`` whose SaleDate is strictly later than ``cutoffDate``.

    SaleDate holds 'YYYY-MM-DD' strings, so this is a plain string comparison;
    pass ``cutoffDate`` in the same format.
    """
    soldAfterCutoff = geoSalesResid.SaleDate > cutoffDate
    return geoSalesResid.loc[soldAfterCutoff]

# Example subset: everything sold after the last day of 2018.
recentSales = makeRecentSales('2018-12-31')

A Folium map can be generated with circle markers, each placed at a coordinate given by latitude and longitude. **_addMarkersSale(row,currmap)_** is an applied function that reads each row of the DataFrame and adds a circle marker for it to the map. The color is chosen by the **_chooseColorSale(sale)_** function, which defines the sale cutoffs that will be used on the map. It uses the Yellow/Orange/Red color brewer gradient. Each marker's popup also shows the year of the sale and its amount. **_formatSalesNumber(v)_** is a helper function that adds a comma after every third digit (counting from the right) of the sale amount.

In [8]:
def formatSalesNumber(v):
    """Format a numeric string as a dollar amount with thousands separators.

    Parameters
    ----------
    v : str
        The number as text, e.g. ``'1030000'`` or ``'451642.51'``. Everything before the
        first '.' is grouped; the text between the first and second '.' (if any) is kept
        unchanged as the fractional part.

    Returns
    -------
    str
        ``'$'`` + the integer part in comma-separated groups of three + the fractional part,
        e.g. ``'$1,030,000'`` or ``'$451,642.51'``.
    """
    splitstr = v.split('.')
    leftNum = splitstr[0]
    rightNum = '.' + splitstr[1] if len(splitstr) > 1 else ''

    # The leading group holds the 1-3 digits left over once the rest is cut into threes.
    # Every later group starts at head + 3*k, so the offset must advance in steps of
    # three for any number of groups (this is what makes 7+ digit amounts come out right).
    head = len(leftNum) % 3 or 3
    groups = [leftNum[:head]]
    groups.extend(leftNum[start:start + 3] for start in range(head, len(leftNum), 3))
    return '$' + ','.join(groups) + rightNum


def chooseColorSale(sale):
    """Return the hex color of the price tier that ``sale`` falls into.

    Each tier includes its lower bound and excludes its upper bound; anything that is
    not below 1,000,000 gets the darkest color.
    """
    tiers = (
        (89000, '#fef0d9'),
        (150000, '#fdd49e'),
        (400000, '#fdbb84'),
        (800000, '#fc8d59'),
        (1000000, '#e34a33'),
    )
    for upperBound, shade in tiers:
        if sale < upperBound:
            return shade
    return '#b30000'

def addMarkersSale(row,currmap):
    """Add one circle marker for the sale in ``row`` to ``currmap``.

    The popup reads "<year>: <formatted amount>" and the marker is colored by the
    sale's price tier. Returns None.
    """
    amount = row['SaleAmount']
    label = ''.join([row["SaleDate"][:4], ": ", formatSalesNumber(str(amount))])
    shade = chooseColorSale(amount)
    marker = folium.CircleMarker(
        location=[row['LATITUDE'], row['LONGITUDE']],
        popup=label,
        radius=1.5,
        color=shade,
        fill_color=shade,
    )
    marker.add_to(currmap)

A Folium map is created, centered on Charlottesville. **_generateSalesMap()_** is a function that wraps this process of generating a map. The functions described above are used to add a circle marker for every sale recorded from the beginning of 2019 to the present.

In [9]:
def generateSalesMap():
    """Return a map of Charlottesville with one marker per sale made after 2018-12-31.

    Note: a later cell of this notebook defines another function under the same name
    (taking a table and a layout), which replaces this one once that cell has run.
    """
    salesMap = folium.Map(location=[38.0293, -78.4767], zoom_start=13)
    sinceStartOf2019 = makeRecentSales('2018-12-31')
    sinceStartOf2019.apply(lambda sale: addMarkersSale(sale, salesMap), axis=1)
    return salesMap
generateSalesMap()

From the above map, it can be seen that there exist clusters of similar sale prices. GeoJSON files are a good way to represent geographical areas. Cville Open Data provides a set of objects called Planning Neighborhood Areas, and this set is used as a layer. **_generateJSONMap()_** produces a map of Charlottesville with these Planning Areas overlaid.

In [10]:
def generateJSONMap():
    """Return a map of Charlottesville with the regions GeoJSON file drawn on top."""
    areaMap = folium.Map(location=[38.0293, -78.4767], zoom_start=13)
    areaLayer = folium.GeoJson("data/regions.geojson", name='area')
    areaLayer.add_to(areaMap)
    return areaMap
generateJSONMap()

The JSON file is first opened, and the longitude and latitude of the points that make up each JSON object are extracted. The variable **_lay_** stores one list of these coordinates per object.

In [11]:
# Read the planning-area GeoJSON inside a context manager so the file handle is
# closed as soon as parsing finishes.
with open('data/regions.geojson') as regionsFile:
    layoutFile = json.load(regionsFile)

# First coordinate list of every feature, in file order.
# NOTE(review): for a GeoJSON Polygon this is the outer ring — confirm that no feature is a MultiPolygon.
arr = [feature['geometry']['coordinates'][0] for feature in layoutFile['features']]
# Kept as a Series so that each coordinate list can be tested with Series.apply.
lay = pd.Series(arr)

To combine these individual parcel points with the imported JSON layer into a cohesive visualization, the Point in Polygon problem must be solved. The following section will describe the algorithm that ultimately determines whether a parcel (point) is located within a particular JSON object (polygon). 

## 3.2 Point in Polygon Algorithm

With the imported JSON polygon layers, each point in the previous section should be grouped with each polygon. This is a point in polygon problem: determine whether or not a point is in a polygon defined by its vertices. 

A way to solve this problem is with ray-casting. Take a point and extend it "infinitely" along an arbitrary direction (it is now a ray). Count the number of times the ray intersects a side of the polygon. If the number of intersections is even, then the point is not in the polygon. If the number of intersections is odd, then the point is in the polygon.

![Solving PIP with Ray-casting](https://upload.wikimedia.org/wikipedia/commons/c/c9/RecursiveEvenPolygon.svg)


The first issue is, given two pairs of points (that define two line segments), to determine whether the segments intersect. The solution is to check the orientation of these points: for two segments that cross, the endpoints of each segment lie on opposite sides of the other segment. An algorithm written by Kite is used. The main function below is intersects(s1,s2), and it takes in two tuples, each holding the two endpoints of a segment.

**Note: All credit for the functions _on_segment(p,q,r)_, _orientation(p,q,r)_, and _intersects(s1,s2)_ goes to Kite [https://www.kite.com/python/answers/how-to-check-if-two-line-segments-intersect-in-python].** 

In [12]:
def on_segment(p,q,r):
    """Return True when r lies within the axis-aligned bounding box of p and q (edges included).

    Each point is indexed as point[0], point[1]. The check is only a box test; callers
    that need "r is on segment p-q" must already know the three points are collinear.
    """
    within_first = min(p[0], q[0]) <= r[0] <= max(p[0], q[0])
    within_second = min(p[1], q[1]) <= r[1] <= max(p[1], q[1])
    return bool(within_first and within_second)

def orientation(p,q,r):
    """Classify the turn made by the ordered points p, q, r.

    Returns 0 when the turn expression is exactly zero (the points are collinear),
    1 when it is positive and -1 otherwise.
    """
    turn = ((q[1] - p[1]) * (r[0] - q[0])) - ((q[0] - p[0]) * (r[1] - q[1]))
    if turn == 0:
        return 0
    return 1 if turn > 0 else -1

def intersects(s1,s2):
    """Return True when the segments s1 and s2 share at least one point.

    Each segment is a pair of endpoints. Two cases count as an intersection: the
    endpoints of each segment have different orientations relative to the other
    segment, or an endpoint is collinear with the other segment and falls within
    that segment's bounding box.
    """
    (p1, q1), (p2, q2) = s1, s2

    o1 = orientation(p1, q1, p2)
    o2 = orientation(p1, q1, q2)
    o3 = orientation(p2, q2, p1)
    o4 = orientation(p2, q2, q1)

    # General case: the segments properly cross each other.
    if o1 != o2 and o3 != o4:
        return True

    # Collinear special cases, tried in order: (orientation, segment start, segment end, tested point).
    collinearCases = (
        (o1, p1, q1, p2),
        (o2, p1, q1, q2),
        (o3, p2, q2, p1),
        (o4, p2, q2, q1),
    )
    return any(o == 0 and on_segment(a, b, c) for o, a, b, c in collinearCases)

Now the point in polygon algorithm can be applied for this scenario. The main function is **_pointinpolygons(row,layout)_**, which will be applied across the entire table with each row (point) as its argument. It tests the point against every polygon in the JSON file and returns the number of the first polygon that contains it (the polygon's position in the file, counted from 1), or 0 if the point is in none of the polygons.

The **_makeray(point)_** function receives a point and returns a sequence with two tuples inside that represent the ray. For this, each point is extended horizontally. Additionally, since all points reside in Charlottesville, the left/right longitude are the west-most/east-most points of the area, respectively. 

The **_pointinpoly(polygons,raysegment)_** function is the individual function that is applied inside pointinpolygons. It builds the line segments (edges) of the polygon from its list of vertices. It is assumed that the JSON file is configured so that the vertices can be connected in index order (index 0 to index 1, index 1 to index 2, etc.) until the very last one, which is connected back to index 0. It then counts the number of intersections between these edges and the ray, as described above for ray-casting.

In [13]:
# Takes one table row (a point) and returns the number of the JSON polygon containing it, or 0 for none
def pointinpolygons(row,layout):
    """Return the 1-based position in ``layout`` of the first polygon containing the row's point.

    ``row`` must provide 'LATITUDE' and 'LONGITUDE'; ``layout`` is a Series of polygons,
    each a list of vertices. Returns 0 when no polygon contains the point.
    """
    rayseg = makeray((row['LATITUDE'], row['LONGITUDE']))
    containing = layout.apply(pointinpoly, args=(rayseg,))
    hits = containing[containing]
    if hits.shape[0] == 0:
        return 0
    return hits.index[0] + 1
    

# Builds the ray for a point: a segment at constant point[0] that runs from the point to a fixed longitude bound
def makeray(point):
    """Return ``(point, far_end)``: a segment from ``point`` to one of two fixed longitude bounds.

    ``point`` is ``(latitude, longitude)``. Points whose longitude is greater than the
    midpoint of the two bounds are extended to the left bound; all others are extended
    to the right bound.
    """
    # Bounds for Longitude
    leftb = -78.647930
    rightb = -78.411250
    midb = leftb + (rightb-leftb)/2

    farLongitude = leftb if point[1] > midb else rightb
    return (point, (point[0], farLongitude))
    
# Decides whether the ray's starting point lies inside one JSON polygon
def pointinpoly(polygons,raysegment):
    """Return True when ``raysegment`` crosses the polygon's edges an odd number of times.

    ``polygons`` is the vertex list of a single polygon. Each vertex is read as
    (vertex[1], vertex[0]), i.e. its two entries are swapped so they are in the same
    order as the ray's points. Consecutive vertices form the edges, and the last vertex
    is joined back to the first.
    """
    vertexCount = len(polygons)
    crossings = 0
    for i in range(vertexCount):
        start = polygons[i]
        # The modulo wraps the final edge around to vertex 0.
        end = polygons[(i + 1) % vertexCount]
        edge = ((start[1], start[0]), (end[1], end[0]))
        if intersects(edge, raysegment):
            crossings += 1

    # Odd number of crossings -> inside; even (including zero) -> outside.
    return crossings % 2 == 1

The function is used to insert a column that assigns each point to its polygon region. If it is not in any of the polygons, then it is assigned 0. The recentSales table that contained sales after 2018 adds a new column that categorizes each parcel into an area.

In [14]:
# assign() returns a new frame with 'Region' as the last column (or overwritten in place
# if it is already there), so this cell can be re-run safely; DataFrame.insert raises
# ValueError when the column already exists.
recentSales = recentSales.assign(Region=recentSales.apply(pointinpolygons,axis=1,args=(lay,)))

To verify the accuracy of the algorithm, the same table is used to create another map with the JSON layers. After filtering out the sales without a region, there does not seem to be any point that lies outside the polygons. **_generateSalesMapWithJson()_** uses **_recentSales_** and filters out the sales without a region (classified with 0).

In [15]:
def generateSalesMapWithJson():
    """Return a map with markers for the ``recentSales`` rows that have a region, plus the regions layer."""
    regionMap = folium.Map(location=[38.0293, -78.4767], zoom_start=13)
    # Region 0 means "in no polygon"; only sales with a region are drawn.
    located = recentSales[recentSales.Region > 0]
    located.apply(lambda sale: addMarkersSale(sale, regionMap), axis=1)
    folium.GeoJson("data/regions.geojson", name='area').add_to(regionMap)
    return regionMap

generateSalesMapWithJson()

## 3.3 Grouping the points

The pandas group function can be used to group all the points by region and determine the average sale amount within that region. **_salesByRegion_** is an example of grouping the entire **_recentSales_** table by region and finding the average of their sale prices to form a Series.

In [16]:
# Pick SaleAmount before aggregating: the table also holds text columns (dates, addresses,
# use codes), and averaging every column fails on pandas >= 2.0, where non-numeric
# columns are no longer dropped silently.
salesByRegion = recentSales.groupby('Region')['SaleAmount'].mean()
salesByRegion

Region
1     670985.467742
2     544344.571429
3     319768.657143
4     364567.980198
5     454100.333333
6     235972.153846
7     305440.586420
8     247008.225806
9     362783.363636
10    313123.349206
11    592000.000000
12    105000.000000
13    332421.292994
14    332452.313725
15    725444.100000
16    640250.000000
17    291418.384615
18    181000.000000
19    649639.571429
Name: SaleAmount, dtype: float64

A style function tells Folium how to format the JSON layers on the map. This is necessary to customize GeoJSON layers in Folium. The color tiers similarly follow the function **_chooseColorSale()_** in section 3.1.

In [17]:
def stylefunction(x):
    """Return the Folium style dictionary for one GeoJSON feature.

    The outline/fill color comes from the feature's 'SaleAverage' property, using
    the same price tiers and colors as chooseColorSale.
    """
    sale = x['properties']['SaleAverage']
    palette = (
        (89000, '#fef0d9'),
        (150000, '#fdd49e'),
        (400000, '#fdbb84'),
        (800000, '#fc8d59'),
        (1000000, '#e34a33'),
    )
    # First tier whose upper bound lies above the value; darkest color when there is none.
    color = next((shade for bound, shade in palette if sale < bound), '#b30000')
    return {'weight': 5, 'color': color,'fill': True, 'fillOpacity':0.5}

## 3.4 Creation of the Map

The **_generateSalesMap(salesTable,lay)_** function will do the following:

1. Use pointinpolygons function on salesTable, producing a Series that categorizes each parcel into a JSON region. Insert that Series into the salesTable (points in no regions are given "0").
2. Group the salesTable by the regions and determine the mean/median/count of each. Make a DataFrame for all three with the numerical indices as a column. Uses **_produceAGroup(t,func,label)_ function**
3. Load JSON file of planning areas and update each layer with the three values
4. Load JSON file of city boundary and add median/average/count to JSON
5. Create a Folium Map and include additional Tile options
6. FeatureGroup: Add a GeoJSON of city boundary that includes average/median of whole city
7. FeatureGroup: Add the markers of individual parcels onto the map
8. FeatureGroup: Add a Choropleth using the DataFrame from #2 and the JSON from #3 based on averages
9. FeatureGroup: Add a Choropleth using the DataFrame from #2 and the JSON from #3 based on medians
10. Add LayerControl and save map

In [72]:
def produceAGroup(t,func,label,numRegions=20):
    """Summarise ``t`` per region and return the result as a two-column DataFrame.

    Parameters
    ----------
    t : DataFrame
        Table with a 'Region' column.
    func : {'mean', 'median', 'count'}
        Statistic to compute per region.
    label : str
        Column to aggregate for 'mean' and 'median'. It is not used for 'count', which
        counts the non-null entries of the first non-grouping column.
    numRegions : int, default 20
        Number of region rows (0 .. numRegions-1) that the result always contains.

    Returns
    -------
    DataFrame
        Column 'Regions' (the region numbers) plus 'Avg', 'Med' or 'Count'. Regions
        without any row in ``t`` get the value 0.

    Raises
    ------
    ValueError
        If ``func`` is not one of the three supported names.
    """
    grouped = t.groupby('Region')
    if func == 'mean':
        # Aggregate only the requested column: averaging the text columns of the table
        # raises TypeError on pandas >= 2.0.
        column, values = 'Avg', grouped[label].mean()
    elif func == 'median':
        column, values = 'Med', grouped[label].median()
    elif func == 'count':
        column, values = 'Count', grouped.count().iloc[:,0]
    else:
        raise ValueError("func must be 'mean', 'median' or 'count', got " + repr(func))

    # Aligning on the index leaves NaN for regions without sales; those become 0.
    regions = pd.Series(data=np.arange(numRegions))
    return pd.DataFrame({'Regions': regions, column: values}).fillna(value=0)
    

def checkIfZero(s,label,n):
    """Return ``s[label][n]`` rounded to two decimals as a string, or '0' if it cannot be read.

    '0' is returned when the label or the position does not exist (KeyError/IndexError)
    or when the stored value cannot be rounded (TypeError). Any other exception,
    including KeyboardInterrupt, propagates to the caller.
    """
    try:
        return str(round(s[label][n],2))
    except (LookupError, TypeError):
        return '0'

def addMarkersSaleGroup(row,currmap,group):
    """Add one circle marker for the sale in ``row`` to the feature group ``group``.

    The popup reads "<UseCode>(<year>): <formatted amount>" and the marker is colored
    by the sale's price tier. ``currmap`` is accepted but not used by this function.
    Returns None.
    """
    price = row['SaleAmount']
    label = ''.join([row["UseCode"], '(', row["SaleDate"][:4], "): ", formatSalesNumber(str(price))])
    shade = chooseColorSale(price)
    group.add_child(
        folium.CircleMarker(
            location=[row['LATITUDE'], row['LONGITUDE']],
            popup=label,
            radius=1.5,
            color=shade,
            fill_color=shade,
        )
    )

def generateSalesMap(salesTable,lay):
    """Build the layered sales map, save it as HTML and return the table with regions.

    Note: this definition replaces the zero-argument ``generateSalesMap()`` defined in
    an earlier cell of the notebook.

    Parameters
    ----------
    salesTable : DataFrame
        Sales with 'LATITUDE', 'LONGITUDE', 'SaleAmount', 'SaleDate' and 'UseCode'.
        The caller's frame is left untouched; a 'Region' column that is already
        present is recomputed.
    lay : Series
        One vertex list per planning region, as expected by ``pointinpolygons``.

    Returns
    -------
    DataFrame
        A copy of ``salesTable`` with a 'Region' column (0 = in no region).

    Side effects: prints the per-region averages and writes
    "maps/salesMapLast20Years.html".
    """
    # 1. Classify each parcel into a JSON region, if possible (otherwise it is given region 0).
    # assign() works on a new frame, so the argument is not modified and a table that
    # already has a 'Region' column (where DataFrame.insert would raise) is accepted.
    salesTable = salesTable.assign(Region=salesTable.apply(pointinpolygons,axis=1,args=(lay,)))

    # 2. Group by region number and find the average/median sale amount and the number of sales
    # in each, even for region 0. Each result is a DataFrame that has the region numbers as a column.
    salesByRegionAvg = produceAGroup(salesTable,'mean','SaleAmount')
    salesByRegionMed = produceAGroup(salesTable,'median','SaleAmount')
    salesByRegionCount = produceAGroup(salesTable,'count','SaleAmount')

    print(salesByRegionAvg)

    # 3. Load the GeoJSON file of regions and add the three statistics as properties of each feature.
    # Features are matched to region numbers by their position in the file, counted from 1.
    with open ("data/regions.geojson") as f:
        regionlayer = json.load(f)
    for regionNumber, feature in enumerate(regionlayer['features'], start=1):
        properties = feature['properties']
        properties['AVG PRICE'] = formatSalesNumber(checkIfZero(salesByRegionAvg,'Avg',regionNumber))
        properties['MED PRICE'] = formatSalesNumber(checkIfZero(salesByRegionMed,'Med',regionNumber))
        properties['NUM SALES'] = salesByRegionCount.Count[regionNumber]

    # 4. Load the GeoJSON of the city boundary and add the average/median/count of the whole table to it.
    with open ("data/cityboundary.geojson") as f:
        citylayer = json.load(f)
    cityProperties = citylayer['features'][0]['properties']
    cityProperties['AVG PRICE'] = formatSalesNumber(str(round(salesTable.SaleAmount.mean(),2)))
    cityProperties['MED PRICE'] = formatSalesNumber(str(round(salesTable.SaleAmount.median(),2)))
    cityProperties['NUM SALES'] = salesTable.shape[0]

    # 5. Create the Folium map and add the tile layer options.
    # NOTE(review): newer folium releases may no longer resolve 'Stamen Toner' by name —
    # confirm against the installed version.
    finalMap = folium.Map(location=[38.0293, -78.4767], zoom_start=13)
    folium.raster_layers.TileLayer('Open Street Map').add_to(finalMap)
    folium.raster_layers.TileLayer('Stamen Toner').add_to(finalMap)
    folium.raster_layers.TileLayer('CartoDB Positron').add_to(finalMap)

    # 6. Group: GeoJson of the Charlottesville city boundary, colored by the city-wide average.
    cityavg = salesTable.SaleAmount.mean()
    style_function = lambda x: {'fillColor': chooseColorSale(cityavg), 'color': chooseColorSale(cityavg)}
    folium.GeoJson(citylayer, name='City of Charlottesville Boundary',
                   style_function=style_function,
                   tooltip=folium.GeoJsonTooltip(['NAME','AVG PRICE','MED PRICE', 'NUM SALES'])).add_to(finalMap)

    # 7. Group: markers for each individual sale (hidden until enabled in the layer control).
    saleMarkers = folium.FeatureGroup(name='Sale Price Markers',show=False)
    salesTable.apply(addMarkersSaleGroup,axis=1,args=(finalMap,saleMarkers))
    finalMap.add_child(saleMarkers)

    # 8. Group: Choropleth of the planning regions' average sale.
    # NOTE(review): key_on assumes each feature's OBJECTID equals its region number — confirm.
    choroAvg = folium.Choropleth(geo_data=regionlayer, key_on='feature.properties.OBJECTID',
                      data=salesByRegionAvg,columns=['Regions','Avg'],
                      fill_color='YlOrRd',legend_name='Sale Amount ($): Average',name='Sale Amount Average', show=False,
                      highlight=True).add_to(finalMap)
    choroAvg.geojson.add_child(folium.GeoJsonTooltip(['OBJECTID','NAME','AVG PRICE','NUM SALES']))

    # 9. Group: Choropleth of the planning regions' median sale.
    choroMed = folium.Choropleth(geo_data=regionlayer, key_on='feature.properties.OBJECTID',
                      data=salesByRegionMed,columns=['Regions','Med'],
                      fill_color='YlOrRd',legend_name='Sale Amount ($): Median',name='Sale Amount Median', show=False,
                      highlight=True).add_to(finalMap)
    choroMed.geojson.add_child(folium.GeoJsonTooltip(['OBJECTID','NAME','MED PRICE','NUM SALES']))

    # 10. Add the LayerControl and save the map (the "maps" directory must already exist).
    folium.LayerControl().add_to(finalMap)
    finalMap.save("maps/salesMapLast20Years.html")
    return salesTable

a = generateSalesMap(makeRecentSales('1999-12-31'),lay)

    Regions            Avg
0         0       0.000000
1         1  451642.506110
2         2  346723.063202
3         3  232653.032316
4         4  360120.421186
5         5  319752.805492
6         6  193094.946429
7         7  237921.176829
8         8  199390.198043
9         9  368008.992733
10       10  296849.270079
11       11  231531.921569
12       12  123802.826087
13       13  260627.560757
14       14  280721.436518
15       15  535288.480769
16       16  420401.931990
17       17  223753.168285
18       18  202902.883721
19       19  420604.602637


In [73]:
# Table returned by generateSalesMap above: the sales after 1999-12-31 with their 'Region' column.
a

Unnamed: 0,RecordID_Int,ParcelNumber,SaleDate,SaleAmount,Unit,UseCode,ADDRESS,LATITUDE,LONGITUDE,Region
2,62,010006000,2003-01-31,545000,,Single Family,2028 BARRACKS RD,38.050765,-78.498080,16
7,69,010010000,2018-10-31,575000,,Single Family,1851 WESTVIEW RD,38.048508,-78.498668,16
8,70,010011000,2018-01-10,970000,,Single Family,1855 WESTVIEW RD,38.049037,-78.498346,16
12,74,010012000,2015-05-04,512000,,Single Family,1857 WESTVIEW RD,38.049108,-78.499057,16
16,78,010014000,2007-06-26,975000,,Single Family,1885 WESTVIEW RD,38.049347,-78.499412,16
...,...,...,...,...,...,...,...,...,...,...
48013,56269,610317000,2009-12-14,102128,,Single Family Attached,108 MILFORD TER,38.018362,-78.470659,7
48016,56272,610317000,2010-03-29,122500,,Single Family Attached,108 MILFORD TER,38.018362,-78.470659,7
48019,56275,610318000,2001-07-20,89900,,Single Family Attached,110 MILFORD TER,38.018268,-78.470688,7
48023,56279,610318000,2017-03-07,148000,,Single Family Attached,110 MILFORD TER,38.018268,-78.470688,7


# 4: Sales History Aggregation

The section below observes which parcels have had multiple sales throughout the years. A quick check of the dimensions of the table show that there are quite a few parcels that have more than one sale in the records. 

In [24]:
# Number of recorded sales per parcel (per-group count of the first remaining column).
grpbysale = geoSalesResid.groupby('ParcelNumber').count().iloc[:, 0]

# Parcels that appear in more than one sale.
multisales = grpbysale[grpbysale > 1]
print(f'Total sales: {geoSalesResid.shape[0]}')
print(f'Unique parcels: {grpbysale.shape[0]}')
print(f'Unique parcels with two or more sales: {multisales.shape[0]}')

Total sales: 30618
Unique parcels: 12309
Unique parcels with two or more sales: 9107


## 4.1: Creating a Dictionary of Sales 

A couple of functions are written to provide different ways of grouping the table and produce a sales history depending on conditions.
- **_normal(parcel)_**: used to clump up all listed sales under a possible parcel as a dictionary. 
- **_byYear(parcel,year=2000)_**: adds a condition of only including sales after a specified year (set as a default parameter). 

In [25]:
def normal(parcel):
    """Return ``{SaleDate: SaleAmount}`` for every row of ``parcel``.

    ``parcel`` is the sub-table of one parcel's sales. The two columns are looked up
    by name, so the result does not depend on column positions (for example on whether
    the grouping column is part of the frame handed over by a groupby). When several
    rows share a date, the last one wins.
    """
    dates = parcel['SaleDate'].to_numpy()
    amounts = parcel['SaleAmount'].to_numpy()
    return dict(zip(dates, amounts))

def byYear(parcel,year=2000):
    """Return ``{SaleDate: SaleAmount}`` for the rows of ``parcel`` sold in ``year`` or later.

    The year is read from the first four characters of SaleDate. The two columns are
    looked up by name, so the result does not depend on column positions. When several
    kept rows share a date, the last one wins.
    """
    dates = parcel['SaleDate'].to_numpy()
    amounts = parcel['SaleAmount'].to_numpy()
    return {
        saledate: amount
        for saledate, amount in zip(dates, amounts)
        if int(saledate[:4]) >= year
    }

The **_getSalesHistory(table,norm=True,multi=True)_** function returns a dictionary that maps each parcel to its sales history, built with one of the two functions above (**_normal_** when norm is True, **_byYear_** otherwise). The multi parameter is meant to restrict the result to parcels whose history contains more than one sale.

In [26]:
def getSalesHistory(table, norm=True, multi=True, year=2000):
    """Return ``{ParcelNumber: {SaleDate: SaleAmount}}`` for the sales in ``table``.

    Parameters
    ----------
    table : DataFrame
        Sales table with a 'ParcelNumber' column plus the columns read by
        ``normal`` / ``byYear``.
    norm : bool, default True
        True  -> every sale of a parcel is used (``normal``).
        False -> only sales from ``year`` onward are used (``byYear``).
    multi : bool, default True
        True  -> keep only parcels whose history has two or more sales.
        False -> also keep parcels with exactly one sale.
        The filter is applied in both modes.
    year : int, default 2000
        First year included when ``norm`` is False.

    Parcels whose history ends up empty are never returned.
    """
    history = {}
    # Iterating the groupby hands each parcel's full sub-table to the extractor.
    for parcelNumber, parcel in table.groupby('ParcelNumber'):
        sales = normal(parcel) if norm else byYear(parcel, year)
        if len(sales) > 1 or (not multi and len(sales) == 1):
            history[parcelNumber] = sales
    return history

Below, a sale history dictionary is created. Norm is set to False so that only sales occurring after the year 1999 are included. Multi is left at True so that, in addition to the previous condition, only parcels with more than one such sale are included. In short, these two conditions mean that for the parcels in the dictionary:

- Each sale will be from Jan 1, 2000 and beyond
- Every parcel history will have two or more sales

In [27]:
# norm=False -> histories are built with byYear (sales from its default year, 2000, onward);
# multi keeps its default of True -> only parcels with two or more such sales are kept.
saleHistory = getSalesHistory(geoSalesResid,norm=False)
# NOTE(review): this echoes the whole dictionary (several thousand parcels) into the cell output.
saleHistory

{'010017000': {'2017-06-08': 636000, '2018-10-19': 1030000},
 '010017100': {'2000-09-14': 459500, '2005-11-10': 654750},
 '010019000': {'2002-05-23': 85000,
  '2006-07-07': 479500,
  '2003-03-03': 360000,
  '2001-11-01': 70000,
  '2008-08-07': 549000},
 '010020000': {'2010-08-04': 962500, '2001-01-24': 275000},
 '010024A00': {'2002-06-12': 125000,
  '2014-06-30': 940000,
  '2007-08-29': 895000},
 '010027000': {'2008-08-04': 483000, '2017-12-08': 695000},
 '010031000': {'2003-06-06': 425000, '2008-12-15': 545000},
 '010034000': {'2018-11-15': 430000, '2019-02-15': 470000},
 '010036000': {'2014-03-21': 695940, '2010-11-01': 389000},
 '010037000': {'2016-05-05': 740000, '2001-07-02': 345000},
 '010038000': {'2018-04-09': 685000, '2019-03-29': 1375000},
 '010039000': {'2007-07-13': 520000, '2007-05-25': 520000},
 '010041000': {'2004-03-03': 530000, '2019-06-06': 1350000},
 '010043000': {'2020-01-07': 680000,
  '2018-07-17': 660000,
  '2011-05-25': 315000,
  '2015-05-06': 590000},
 '0100450

In [51]:
# Number of parcels that have a sales history in the dictionary.
len(saleHistory)

4805

Below is a collection of functions to determine properties of sale histories. The first, **_strictincr(saleHistory)_**, determines if a given parcel's sale history has been strictly increasing. This means that, when the sales are ordered by date from earliest to latest, every sale amount is larger than the one before it.

The second, **_finalincr(saleHistory)_**, compares only the first and last sales of a given parcel's sale history. It returns True as long as the latest sale is not lower than the earliest one (equal amounts also count as True).

In [28]:
def strictincr(saleHistory):
    """Return ``{parcel: bool}`` telling whether each parcel's sale amounts strictly increase.

    ``saleHistory`` maps each parcel to ``{SaleDate: SaleAmount}``. Sales are visited in
    ascending order of their date keys, and every amount must be greater than the previous
    one (the first must be greater than 0). A parcel with an empty history maps to True.
    """
    strictinc = {}
    for parcel in saleHistory:
        history = saleHistory[parcel]
        previous = 0
        rising = True
        # sorted() on the dictionary orders its date keys directly.
        for saleday in sorted(history):
            if previous < history[saleday]:
                previous = history[saleday]
            else:
                rising = False
                break
        strictinc[parcel] = rising
    return strictinc

def finalincr(saleHistory):
    """Flag, per parcel, whether the latest sale is at least the earliest sale.

    Parameters
    ----------
    saleHistory : dict
        Maps parcel number -> {sale date string: sale amount}. The date keys
        are sorted as plain strings, so they must use a format whose
        lexicographic order is chronological (e.g. 'YYYY-MM-DD').

    Returns
    -------
    dict
        Maps parcel number -> bool. False only when the earliest sale amount
        is greater than the latest one; equal amounts (including a parcel
        with a single sale) give True.

    Raises
    ------
    IndexError
        If a parcel has an empty sale history.
    """
    finalinc = {}
    for parcel, history in saleHistory.items():
        # Plain sorted() replaces the per-parcel DataFrame that was only used
        # to order the date keys.
        datesOrd = sorted(history)
        firstsale = history[datesOrd[0]]
        lastsale = history[datesOrd[-1]]
        finalinc[parcel] = not (firstsale > lastsale)
    return finalinc

Two single-column DataFrames, **_strictTab_** and **_finalTab_**, are created from the two functions above. One holds a boolean for whether each parcel's price strictly increased from sale to sale, while the other only records whether the last sale was at least as high as the first.

In [29]:
# Turn each {parcel: bool} dict into a one-column DataFrame indexed by parcel number.
strictTab = pd.DataFrame.from_dict(strictincr(saleHistory),orient='index',columns=['strictincr'])
finalTab = pd.DataFrame.from_dict(finalincr(saleHistory),orient='index',columns=['finalincr'])

In [52]:
# Show the per-parcel first-vs-last sale flags.
finalTab

Unnamed: 0,finalincr
010017000,True
010017100,True
010019000,True
010020000,True
010024A00,True
...,...
610313000,True
610314000,True
610315000,True
610317000,True


In [36]:
# Share of parcels flagged True in each table. The flag column is selected by
# name: using an integer key on the label-indexed result of .sum() relies on a
# positional fallback that pandas has deprecated.
print("Proportion of Parcels that increased in price every subsequent sale: "+ str(round(strictTab['strictincr'].sum()/len(saleHistory),5)))
print("Proportion of Parcels that increased in price from first to last sale: "+ str(round(finalTab['finalincr'].sum()/len(saleHistory),5)))

Proportion of Parcels that increased in price every subsequent sale: 0.62706
Proportion of Parcels that increased in price from first to last sale: 0.78668


# 4.2: Producing a Map

The above functions can be adapted into applied functions that produce a series showing the percent change between the first and last sales of a parcel. **_incrPercent(row,saleHistory)_** divides the difference between the last and first sales by the first sale and multiplies by 100 to produce a percentage. **_incrPercentDay(row,saleHistory)_** performs the same calculation, but then divides the value by the number of days elapsed between the two sales. This serves to normalize for time.

In [37]:
def incrPercent(row,saleHistory):
    """Percent change between a parcel's earliest and latest sale.

    Written to be used with ``DataFrame.apply(..., axis=1)``.

    Parameters
    ----------
    row
        Row whose ``name`` is the parcel number; no other field is read.
    saleHistory : dict
        Maps parcel number -> {sale date string: sale amount}. The date keys
        are sorted as plain strings, so they must use a format whose
        lexicographic order is chronological (e.g. 'YYYY-MM-DD').

    Returns
    -------
    float
        ``(last - first) / first * 100``.
    """
    history = saleHistory[row.name]
    # Plain sorted() replaces the DataFrame that was only used to order the keys.
    datesOrd = sorted(history)
    firstsale = history[datesOrd[0]]
    lastsale = history[datesOrd[-1]]
    return (lastsale-firstsale)/firstsale * 100

def incrPercentDay(row,saleHistory):
    """Percent change between earliest and latest sale, per elapsed day.

    Written to be used with ``DataFrame.apply(..., axis=1)``.

    Parameters
    ----------
    row
        Row whose ``name`` is the parcel number; no other field is read.
    saleHistory : dict
        Maps parcel number -> {sale date string: sale amount}, with dates
        formatted as 'YYYY-MM-DD'.

    Returns
    -------
    float
        ``(last - first) / first * 100 / days`` where ``days`` is the number
        of days between the earliest and latest sale dates. Returns 0.0 when
        no days have elapsed (the parcel has a single sale date), since there
        is no change to spread over time.
    """
    history = saleHistory[row.name]
    # Plain sorted() replaces the DataFrame that was only used to order the keys.
    datesOrd = sorted(history)

    firstdate = datetime.datetime.strptime(datesOrd[0], "%Y-%m-%d").date()
    lastdate = datetime.datetime.strptime(datesOrd[-1], "%Y-%m-%d").date()

    diff = (lastdate-firstdate).days
    if diff == 0:
        # First and last sale are the same entry; avoid dividing by zero days.
        return 0.0
    firstsale = history[datesOrd[0]]
    lastsale = history[datesOrd[-1]]
    return (lastsale-firstsale)/firstsale * 100/diff

Since **_finalTab_** from section 4.1 records which parcels have increased from first to last sale, its rows flagged True are fed into the two percent change functions and the two resulting Series are added as columns of **_percentTab_**. It is then merged with the use-all DataFrame after the sale dates before 2000 have been filtered out, **_geoSalesResid_**, to produce **_geoSalesResidPercent_**. 

In [59]:
# Keep only the parcels flagged True in finalTab, then attach the overall and
# the per-day percent change for each of them.
percentTab = finalTab[finalTab.finalincr].copy()
percentTab['perc'] = percentTab.apply(incrPercent, axis=1, args=(saleHistory,))
percentTab['percNorm'] = percentTab.apply(incrPercentDay, axis=1, args=(saleHistory,))

# Join the percentages onto the recent sales by parcel number and keep a
# single row per parcel.
geoSalesResidPercent = (
    makeRecentSales('1999-12-31')
    .merge(percentTab, left_on='ParcelNumber', right_index=True)
    .drop_duplicates('ParcelNumber')
)
geoSalesResidPercent

Unnamed: 0,RecordID_Int,ParcelNumber,SaleDate,SaleAmount,Unit,UseCode,ADDRESS,LATITUDE,LONGITUDE,finalincr,perc,percNorm
24,86,010017000,2017-06-08,636000,,Single Family-1 Conversion,1893 WESTVIEW RD,38.050093,-78.499944,True,61.949686,0.124397
31,93,010017100,2000-09-14,459500,,Single Family-2 Conversion,1895 WESTVIEW RD,38.050473,-78.499333,True,42.491839,0.022566
36,99,010019000,2002-05-23,85000,,Single Family,1890 WESTVIEW RD,38.050086,-78.500280,True,684.285714,0.276927
43,106,010020000,2010-08-04,962500,,Single Family,1888 WESTVIEW RD,38.049628,-78.500349,True,250.000000,0.071860
62,125,010024A00,2002-06-12,125000,,Single Family,1876 WESTVIEW RD,38.048490,-78.499412,True,652.000000,0.148148
...,...,...,...,...,...,...,...,...,...,...,...,...
47995,56251,610313000,2001-03-23,83000,,Single Family Attached,100 MILFORD TER,38.018602,-78.470619,True,116.746988,0.074790
48000,56256,610314000,2002-07-31,117900,,Single Family Attached,102 MILFORD TER,38.018528,-78.470644,True,6.022053,0.001819
48005,56261,610315000,2005-11-01,174900,,Single Family Attached,104 MILFORD TER,38.018473,-78.470647,True,6.346484,0.007005
48013,56269,610317000,2009-12-14,102128,,Single Family Attached,108 MILFORD TER,38.018362,-78.470659,True,19.947517,0.189976


New color functions have to be chosen as the numerical values differ from sales prices. 
1. **_chooseColorPerc(sale)_**: colors for percent change between first/last sale
2. **_chooseColorPercNorm(sale)_**: colors for normalized percent change
3. **_addMarkersPerc(row,currmap,norm,group)_**: similar to **_addMarkersSaleGroup(row,currmap,group)_**, except that it must also keep track of whether or not the value has been normalized. 

In [60]:
def chooseColorPerc(sale):
    """Pick the marker colour for a first-to-last percent change.

    The bands are contiguous: <5, 5-20, 20-50, 50-100, 100-200 and >=200.
    Previously nothing covered 10-20, so those values fell through to the
    darkest colour; the second band now extends up to 20 to close that gap.

    Parameters
    ----------
    sale : number
        Percent change between a parcel's first and last sale.

    Returns
    -------
    str
        Hex colour string, light for small changes and dark for large ones.
        Values that compare False against every bound (e.g. NaN) get the
        darkest colour.
    """
    if sale < 5:
        return '#fef0d9'
    elif sale < 20:
        return '#fdd49e'
    elif sale < 50:
        return '#fdbb84'
    elif sale < 100:
        return '#fc8d59'
    elif sale < 200:
        return '#e34a33'
    else:
        return '#b30000'
    
    
def chooseColorPercNorm(sale):
    """Pick the marker colour for a per-day (normalized) percent change.

    Parameters
    ----------
    sale : number
        Percent change per day between a parcel's first and last sale.

    Returns
    -------
    str
        Hex colour string for the first band whose upper bound exceeds
        ``sale``; the darkest colour when no bound does.
    """
    # (exclusive upper bound, colour), ordered from the lowest band upward.
    bands = (
        (0.01, '#fef0d9'),
        (0.05, '#fdd49e'),
        (0.10, '#fdbb84'),
        (0.25, '#fc8d59'),
        (0.5, '#e34a33'),
    )
    for upper, shade in bands:
        if sale < upper:
            return shade
    return '#b30000'
    

def addMarkersPerc(row,currmap,norm,group):
    """Add one circle marker for a parcel's percent change to ``group``.

    Parameters
    ----------
    row
        Row providing 'LATITUDE', 'LONGITUDE' and either 'perc' or 'percNorm'.
    currmap
        Not used by this function.
    norm
        Truthy to plot the per-day value ('percNorm'); falsy to plot the
        plain percent change ('perc').
    group
        Container the marker is attached to with ``add_child``.
    """
    # Pick the value and its matching colour scale once, then build one marker.
    if norm:
        value, pickColor = row['percNorm'], chooseColorPercNorm
    else:
        value, pickColor = row['perc'], chooseColorPerc

    shade = pickColor(value)
    marker = folium.CircleMarker(
        location=[row['LATITUDE'], row['LONGITUDE']],
        popup=str(value) + '%',
        radius=1.5,
        color=shade,
        fill_color=shade,
    )
    group.add_child(marker)
    return

The **_generateMapPercent(table,lay)_** function will do the following:

1. Use pointinpolygons function on salesTable, producing a Series that categorizes each parcel into a JSON region. Insert that Series into the salesTable (points in no regions are given "0").
2. Group the table by the regions categorized and determine the mean
3. Make it a DataFrame with the numerical indices as a column 
4. Load JSON file of planning areas and update each layer with the calculated average
5. Load JSON file of city boundary and add median/average sale to JSON
6. Create a Folium Map and include additional Tile options
7. FeatureGroup: Add a GeoJSON of city boundary that includes average/median of whole city
8. FeatureGroup: Add the markers of individual parcels onto the map
9. FeatureGroup: Add a Choropleth using DataFrame from #3 and JSON from #4 based on averages
10. FeatureGroup: Add a Choropleth using DataFrame from #3 and JSON from #4 based on median
11. Add LayerControl and save map

In [62]:
def addMarkersSaleGroup(row,currmap,group):
    """Add one circle marker for a sale to ``group``, coloured by sale amount.

    Parameters
    ----------
    row
        Row providing 'SaleAmount', 'UseCode', 'SaleDate', 'LATITUDE' and
        'LONGITUDE'. The first four characters of 'SaleDate' are shown in the
        popup.
    currmap
        Not used by this function.
    group
        Container the marker is attached to with ``add_child``.
    """
    amount = row['SaleAmount']
    yearText = row["SaleDate"][:4]
    # Popup text: use code, then the leading characters of the sale date in
    # parentheses, then the formatted amount.
    label = row["UseCode"] + '(' + yearText + "): " + formatSalesNumber(str(amount))
    shade = chooseColorSale(amount)
    marker = folium.CircleMarker(
        location=[row['LATITUDE'], row['LONGITUDE']],
        popup=label,
        radius=1.5,
        color=shade,
        fill_color=shade,
    )
    group.add_child(marker)
    return

    
def generateMapPercent(table,lay):
    """Build the percent-change map and save it to ``maps/percent.html``.

    Parameters
    ----------
    table : pandas.DataFrame
        One row per parcel. The ``perc`` and ``percNorm`` columns are read
        here, and every row is handed to ``pointinpolygons``. The frame passed
        in is not modified.
    lay
        Region layer forwarded unchanged to ``pointinpolygons``.

    Returns
    -------
    None
        The map is written to ``maps/percent.html`` as a side effect.
    """
    # 1. Classify each parcel into a region. assign() returns a new frame, so
    #    the caller's table is left untouched.
    table = table.assign(Region=table.apply(pointinpolygons,axis=1,args=(lay,)))

    # 2. Per-region mean and median of both percent measures, plus the number
    #    of rows per region.
    percByRegionAvg = produceAGroup(table,'mean','perc')
    percByRegionMed = produceAGroup(table,'median','perc')

    percNormByRegionAvg = produceAGroup(table,'mean','percNorm')
    percNormByRegionMed = produceAGroup(table,'median','percNorm')

    countByRegion = table.groupby(['Region']).count().iloc[:,0]

    # 3. Load the regions GeoJSON and attach the statistics to every feature.
    with open ("data/regions.geojson") as f:
        regionlayer = json.load(f)
    # NOTE(review): `count` is incremented once per feature but never passed
    # to checkIfZero, so every feature is given the result of identical calls.
    # Confirm whether the region number was meant to be an argument here.
    count = 1
    for i in regionlayer['features']:
        i['properties']['% DIFF AVG'] = checkIfZero(percByRegionAvg, 'Avg')
        i['properties']['% DIFF MED'] = checkIfZero(percByRegionMed, 'Med')
        i['properties']['% NORM DIFF AVG'] = checkIfZero(percNormByRegionAvg, 'Avg')
        i['properties']['% NORM DIFF MED'] = checkIfZero(percNormByRegionMed, 'Med')
        i['properties']['NUM PARCELS'] = checkIfZero(countByRegion, 'Count')
        count += 1

    # 4. Load the city boundary GeoJSON and attach the city-wide statistics.
    #    The median key is '% DIFF MED' (it used to be spelled '% DIFF MID'),
    #    matching the region features and the normalized keys.
    with open ("data/cityboundary.geojson") as f:
        citylayer = json.load(f)
    cityProps = citylayer['features'][0]['properties']
    cityProps['% DIFF AVG'] = str(round(table.perc.mean(),2))
    cityProps['% DIFF MED'] = str(round(table.perc.median(),2))
    cityProps['% NORM DIFF AVG'] = str(round(table.percNorm.mean(),2))
    cityProps['% NORM DIFF MED'] = str(round(table.percNorm.median(),2))

    # 5. Create the Folium map and add the extra tile layers.
    finalMap = folium.Map(location=[38.0293, -78.4767], zoom_start=13)
    folium.raster_layers.TileLayer('Open Street Map').add_to(finalMap)
    folium.raster_layers.TileLayer('Stamen Toner').add_to(finalMap)
    folium.raster_layers.TileLayer('CartoDB Positron').add_to(finalMap)

    # 6. City boundary layer. The city-wide average is a percent change, so it
    #    is coloured with the percent scale rather than the sale-price scale,
    #    and the tooltip lists the properties that were attached in step 4.
    cityColor = chooseColorPerc(table.perc.mean())
    style_function = lambda x: {'fillColor': cityColor, 'color': cityColor}
    folium.GeoJson(citylayer, name='City of Charlottesville Boundary',
                   style_function=style_function,
                   tooltip=folium.GeoJsonTooltip(['NAME','% DIFF AVG','% DIFF MED',
                                                  '% NORM DIFF AVG','% NORM DIFF MED'])).add_to(finalMap)

    # The choropleth and marker layers below are currently disabled.
#     folium.Choropleth(geo_data=regionlayer,name='Percent Incr',data=percByReg,columns=['Regions','perc'],
#                       key_on='feature.properties.OBJECTID',fill_color='YlOrRd',
#                       legend_name='Percent Increase').add_to(finalMap)

#     folium.Choropleth(geo_data=regionlayer,name='Percent Incr/Day',data=percNByReg,columns=['Regions','percNorm'],
#                       key_on='feature.properties.OBJECTID',fill_color='YlOrRd',
#                       legend_name='Percent Increase/Days',show=False).add_to(finalMap)


#     percMarkers = folium.FeatureGroup(name='Perc Incr Markers')
#     table.apply(addMarkersPerc,axis=1,args=(finalMap,False,percMarkers,))
#     finalMap.add_child(percMarkers)
#     percMarkersNorm = folium.FeatureGroup(name='Perc Incr/Day Markers', show=False)
#     table.apply(addMarkersPerc,axis=1,args=(finalMap,True,percMarkersNorm,))
#     finalMap.add_child(percMarkersNorm)

    # 7. Layer control, then write the map to disk.
    folium.LayerControl().add_to(finalMap)

    finalMap.save("maps/percent.html")
    return

# Work on a copy so that geoSalesResidPercent itself is not altered.
generateMapPercent(geoSalesResidPercent.copy(),lay)