In [1]:
from copy import deepcopy

from geopy.geocoders import Nominatim
import pandas as pd

geolocator = Nominatim(user_agent='House Buying')

In [2]:
towns = [
    'Acton', 
    'Sudbury',
    'Wayland', 
    'Carlisle',
    'Bedford', 
    'Boxborough', 
    'Concord', 
    'Lincoln', 
    'Lexington', 
    'Weston',
]

def lat_lon(address, town):
    geocode_result = geolocator.geocode(f'{address}, {town}, MA')
    if geocode_result is not None:
        raw_dict = geocode_result.raw
        return (float(raw_dict['lat']), float(raw_dict['lon']))
    else:
        print(f'\tFAILED TO FIND ADDRESS FOR {address}')
        return (0.0, 0.0)

def load_lat_lon_cache():
    # Load lat-lon pairs from cache file
    lat_lon_cache = dict()
    with open('lat_lon_cache.txt', 'r') as fd:
        for line in fd:
            parts = line.replace('\n', '').split()
            address = ' '.join(parts[:-3])
            lat = float(parts[-3])
            lon = float(parts[-2])
            town_idx = int(parts[-1])
            lat_lon_cache[address] = (lat, lon, town_idx)
    return lat_lon_cache
            
def save_lat_lon_cache(lat_lon_cache):
    with open('lat_lon_cache.txt', 'w') as fd:
        for address in lat_lon_cache.keys():
            town_idx, lat, lon = lat_lon_cache[address]
            fd.write(f'{address} {town_idx} {lat} {lon}\n')
            
def build_dataset():
    # Load lat-lon pairs from cache file
    lat_lon_cache = load_lat_lon_cache()
    
    combined_df = None
    for town_idx in range(len(towns)):
        town = towns[town_idx]
        df = pd.DataFrame(pd.read_excel(f'house_buying.xlsx', sheet_name=town))

        drop_columns = list()
        for column in df.columns:
            if 'Unnamed' in column:
                drop_columns.append(column)
        df.drop(columns=drop_columns, inplace=True)
        df.dropna(inplace=True)
        
        df['town_idx'] = town_idx

        lats = list()
        lons = list()
        for i, row in df.iterrows():
            address = row['Address']
            if address in lat_lon_cache and lat_lon_cache[address][-1] == town_idx:
                # print(f'Getting address {address}, {town}, MA from lat/lon cache')
                lat, lon = lat_lon_cache[address][:2]
            else:
                # print(f'Fetching lat/lon for address {address}, {town}, MA')
                lat, lon = lat_lon(address, town)
                lat_lon_cache[address] = (lat, lon, town_idx)
            lats.append(lat)
            lons.append(lon)
        df['lat'] = lats
        df['lon'] = lons
        
        if combined_df is None:
            combined_df = deepcopy(df)
        else:
            combined_df = pd.concat([combined_df, deepcopy(df)])
            
    save_lat_lon_cache(lat_lon_cache)
    
    return combined_df            

In [3]:
dataset = build_dataset()

In [42]:
# https://towardsdatascience.com/pythons-geocoding-convert-a-list-of-addresses-into-a-map-f522ef513fd6
import folium
from folium.plugins import HeatMap, MarkerCluster

# Filter dataset
# filtered_ds = dataset
filtered_ds = dataset.loc[dataset['town_idx'].isin([
    towns.index('Acton'), 
    towns.index('Sudbury'),
    towns.index('Carlisle'),
])]

# Create a map object and center it to the avarage coordinates to m
center = filtered_ds[["lat", "lon"]].mean().to_list()
m = folium.Map(location=center,
               zoom_start=11)

# Anything greater than or equal to the key (up to next key) will use its value as color
color_map = {
    0: 'lightgreen',
    4500: 'green',
    5000: 'darkgreen',
    5500: 'orange',
    6000: 'lightred',
    6250: 'red',
    6500: 'darkred',
}
color_thresholds = sorted(color_map.keys())

use_cluster = True
use_markers = True
use_heatmap = True

if use_heatmap:
    heatmap_data = filtered_ds[['lat', 'lon']].values.tolist()
    HeatMap(heatmap_data, max_zoom=1, min_opacity=100, radius=30, blur=30).add_to(m)

# if the points are too close to each other, cluster them,
# create a cluster overlay with MarkerCluster, add to m
if use_cluster:
    marker_cluster = MarkerCluster().add_to(m)

# draw the markers and assign popup and hover texts
# add the markers the the cluster layers so that they are automatically clustered
for i,r in filtered_ds.iterrows():
    location = (r["lat"], r["lon"])
    
    monthly = float(r["Monthly cost from Yale's calculator assuming $5k insurance"])
    
    if use_markers:
        color_idx = 0
        while color_idx < len(color_thresholds) - 1:
            if monthly < color_thresholds[color_idx]:
                color_idx -= 1
                break
            else:
                color_idx += 1
        color = color_map[color_thresholds[color_idx]]

        popup_msg = f"""{r["Square Footage"]} sq.ft.
{r["Acres"]} acres
${r["2020 or 2019 Taxes (yellow indicates estimate)"]} taxes
${r["Annual HOA Fee (yellow indicates estimate)"]} HOA fee"""

        tooltip_msg = f"""{r["Address"]}, {towns[r["town_idx"]]}, MA (
${monthly} per month,
sold for ${r["Sell Price"]})"""

        marker = folium.Marker(
            location=location,
            icon=folium.Icon(color=color),
            popup=popup_msg,
            tooltip=tooltip_msg)
        if use_cluster:
            marker.add_to(marker_cluster)
        else:
            marker.add_to(m)
    
# display the map
m

In [69]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Filter dataset
filtered_ds = dataset
"""
filtered_ds = dataset.loc[dataset['town_idx'].isin([
    towns.index('Acton'), 
    towns.index('Sudbury'),
    towns.index('Carlisle'),
])]
"""

features = [
    '2020 or 2019 Taxes (yellow indicates estimate)',
    "Square Footage",
    "Sell Price",
    "lat",
    "lon",
    "Acres",
]
predict = "town_idx"


shuffled_ds = filtered_ds.sample(frac=1.0)

X = shuffled_ds[features].values
Y = shuffled_ds[predict].values

num_feats = 2
model = LogisticRegression(solver='lbfgs')
rfe = RFE(model, n_features_to_select=num_feats)
fit = rfe.fit(X, Y)

print('Features: %s' % features)
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)

Features: ['2020 or 2019 Taxes (yellow indicates estimate)', 'Square Footage', 'Sell Price', 'lat', 'lon', 'Acres']
Num Features: 2
Selected Features: [ True  True False False False False]
Feature Ranking: [1 1 2 4 3 5]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist